1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001,2002,2003,2004 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any later version.
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
40 #include "file-coding.h"
42 Lisp_Object Qcoding_system_error;
44 Lisp_Object Vkeyboard_coding_system;
45 Lisp_Object Vterminal_coding_system;
46 Lisp_Object Vcoding_system_for_read;
47 Lisp_Object Vcoding_system_for_write;
48 Lisp_Object Vfile_name_coding_system;
50 Lisp_Object Vcoded_charset_entity_reference_alist;
52 /* Table of symbols identifying each coding category. */
53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
/* Per-category coding state kept together in one struct.  The
   fcd_description_1 table that follows this struct describes its
   Lisp_Object arrays by offset.
   NOTE(review): the struct name suggests this is the data handled by
   the dumper -- confirm.  The closing lines of the struct are not
   visible in this listing. */
57 struct file_coding_dump {
58 /* Coding system currently associated with each coding category. */
59 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
61 /* Table of all coding categories in decreasing order of priority.
62 This describes a permutation of the possible coding categories. */
63 int coding_category_by_priority[CODING_CATEGORY_LAST];
/* 65536-entry table, compiled only when MULE is defined and UTF2000
   is not. */
65 #if defined(MULE) && !defined(UTF2000)
66 Lisp_Object ucs_to_mule_table[65536];
/* Describes the Lisp_Object arrays inside struct file_coding_dump.
   Each entry gives the kind of field, its offset in the struct and
   the number of elements. */
70 static const struct lrecord_description fcd_description_1[] = {
71 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
/* Same guard as the ucs_to_mule_table member itself. */
72 #if defined(MULE) && !defined(UTF2000)
73 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
/* Pairs the size of struct file_coding_dump with a field description;
   the remaining initializer lines are not visible in this listing. */
78 static const struct struct_description fcd_description = {
79 sizeof (struct file_coding_dump),
83 Lisp_Object mule_to_ucs_table;
85 Lisp_Object Qcoding_systemp;
/* Symbols used by this file.  The role given for each group below is
   taken from where the symbol is compared in the visible code. */

/* TYPE symbols: Qno_conversion, Qccl and Qiso2022 are compared
   against the TYPE argument in Fmake_coding_system. */
87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
88 /* Qinternal in general.c */
/* Property keys accepted by Fmake_coding_system for every type. */
90 Lisp_Object Qmnemonic, Qeol_type;
/* Values recognized by symbol_to_eol_type and returned by
   eol_type_to_symbol. */
91 Lisp_Object Qcr, Qcrlf, Qlf;
92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
/* Property keys whose values are stored unchecked in the matching
   coding system slots by Fmake_coding_system. */
93 Lisp_Object Qpost_read_conversion;
94 Lisp_Object Qpre_write_conversion;
/* Further TYPE symbols compared against in Fmake_coding_system. */
97 Lisp_Object Qucs4, Qutf16, Qutf8;
98 Lisp_Object Qbig5, Qshift_jis;
/* Property keys handled by Fmake_coding_system when the type is
   CODESYS_ISO2022 (Qcharset_g0 is also checked for CODESYS_UTF8). */
99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
102 Lisp_Object Qno_iso6429;
103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
104 Lisp_Object Qescape_quoted;
105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
108 Lisp_Object Qutf_8_mcs;
/* Boolean property keys stored through
   CODING_SYSTEM_DISABLE_COMPOSITION and
   CODING_SYSTEM_USE_ENTITY_REFERENCE in Fmake_coding_system. */
109 Lisp_Object Qdisable_composition;
110 Lisp_Object Quse_entity_reference;
111 Lisp_Object Qd, Qx, QX;
113 Lisp_Object Qencode, Qdecode;
/* Hash table with coding system names as keys and coding system
   objects as values; Ffind_coding_system looks names up in it with
   Fgethash. */
115 Lisp_Object Vcoding_system_hash_table;
117 int enable_multibyte_characters;
120 /* Additional information used by the ISO2022 decoder and detector. */
121 struct iso2022_decoder
123 /* CHARSET holds the character sets currently assigned to the G0
124 through G3 variables. It is initialized from the array
125 INITIAL_CHARSET in CODESYS. */
126 Lisp_Object charset[4];
128 /* Which registers are currently invoked into the left (GL) and
129 right (GR) halves of the 8-bit encoding space? */
130 int register_left, register_right;
132 /* ISO_ESC holds a value indicating part of an escape sequence
133 that has already been seen. */
134 enum iso_esc_flag esc;
136 /* This records the bytes we've seen so far in an escape sequence,
137 in case the sequence is invalid (we spit out the bytes unchanged). */
138 unsigned char esc_bytes[8];
140 /* Index for next byte to store in ISO escape sequence. */
143 #ifdef ENABLE_COMPOSITE_CHARS
144 /* Stuff seen so far when composing a string. */
145 unsigned_char_dynarr *composite_chars;
148 /* If we saw an invalid designation sequence for a particular
149 register, we flag it here and switch to ASCII. The next time we
150 see a valid designation for this register, we turn off the flag
151 and do the designation normally, but pretend the sequence was
152 invalid. The effect of all this is that (most of the time) the
153 escape sequences for both the switch to the unknown charset, and
154 the switch back to the known charset, get inserted literally into
155 the buffer and saved out as such. The hope is that we can
156 preserve the escape sequences so that the resulting written out
157 file makes sense. If we don't do any of this, the designation
158 to the invalid charset will be preserved but that switch back
159 to the known charset will probably get eaten because it was
160 the same charset that was already present in the register. */
161 unsigned char invalid_designated[4];
163 /* We try to do similar things as above for direction-switching
164 sequences. If we encountered a direction switch while an
165 invalid designation was present, or an invalid designation
166 just after a direction switch (i.e. no valid designation
167 encountered yet), we insert the direction-switch escape
168 sequence literally into the output stream, and later on
169 insert the corresponding direction-restoring escape sequence literally also. */
171 unsigned int switched_dir_and_no_valid_charset_yet :1;
172 unsigned int invalid_switch_dir :1;
174 /* Tells the decoder to output the escape sequence literally
175 even though it was valid. Used in the games we play to
176 avoid lossage when we encounter invalid designations. */
177 unsigned int output_literally :1;
178 /* We encountered a direction switch followed by an invalid
179 designation. We didn't output the direction switch
180 literally because we didn't know about the invalid designation;
181 but we have to do so now. */
182 unsigned int output_direction_sequence :1;
185 EXFUN (Fcopy_coding_system, 2);
187 struct detection_state;
/* Forward declarations of the per-coding-system routines.  Each
   family below has a detector, a stream decoder, and a pair of
   per-character encode / finish functions.
   NOTE(review): the return-type line of text_encode_generic is not
   visible in this listing. */
190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
191 unsigned_char_dynarr *dst, Lstream_data_count n);
/* Shift-JIS. */
193 static int detect_coding_sjis (struct detection_state *st,
194 const Extbyte *src, Lstream_data_count n);
195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
196 unsigned_char_dynarr *dst, Lstream_data_count n);
197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
198 unsigned_char_dynarr *dst, unsigned int *flags);
199 void char_finish_shift_jis (struct encoding_stream *str,
200 unsigned_char_dynarr *dst, unsigned int *flags);
/* Big5. */
202 static int detect_coding_big5 (struct detection_state *st,
203 const Extbyte *src, Lstream_data_count n);
204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
205 unsigned_char_dynarr *dst, Lstream_data_count n);
206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
207 unsigned_char_dynarr *dst, unsigned int *flags);
208 void char_finish_big5 (struct encoding_stream *str,
209 unsigned_char_dynarr *dst, unsigned int *flags);
/* UCS-4. */
211 static int detect_coding_ucs4 (struct detection_state *st,
212 const Extbyte *src, Lstream_data_count n);
213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
214 unsigned_char_dynarr *dst, Lstream_data_count n);
215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
216 unsigned_char_dynarr *dst, unsigned int *flags);
217 void char_finish_ucs4 (struct encoding_stream *str,
218 unsigned_char_dynarr *dst, unsigned int *flags);
/* UTF-16. */
220 static int detect_coding_utf16 (struct detection_state *st,
221 const Extbyte *src, Lstream_data_count n);
222 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
223 unsigned_char_dynarr *dst, Lstream_data_count n);
224 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
225 unsigned_char_dynarr *dst, unsigned int *flags);
226 void char_finish_utf16 (struct encoding_stream *str,
227 unsigned_char_dynarr *dst, unsigned int *flags);
/* UTF-8. */
229 static int detect_coding_utf8 (struct detection_state *st,
230 const Extbyte *src, Lstream_data_count n);
231 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
232 unsigned_char_dynarr *dst, Lstream_data_count n);
233 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
234 unsigned_char_dynarr *dst, unsigned int *flags);
235 void char_finish_utf8 (struct encoding_stream *str,
236 unsigned_char_dynarr *dst, unsigned int *flags);
/* ISO 2022, including helpers that work on struct iso2022_decoder. */
238 static int postprocess_iso2022_mask (int mask);
239 static void reset_iso2022 (Lisp_Object coding_system,
240 struct iso2022_decoder *iso);
241 static int detect_coding_iso2022 (struct detection_state *st,
242 const Extbyte *src, Lstream_data_count n);
243 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
244 unsigned_char_dynarr *dst, Lstream_data_count n);
245 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
246 unsigned_char_dynarr *dst, unsigned int *flags);
247 void char_finish_iso2022 (struct encoding_stream *str,
248 unsigned_char_dynarr *dst, unsigned int *flags);
/* No-conversion coding and the mule_decode / mule_encode entry
   points; all four take a stream, a source buffer, a destination
   dynarr and a byte count. */
250 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
251 unsigned_char_dynarr *dst, Lstream_data_count n);
252 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
253 unsigned_char_dynarr *dst, Lstream_data_count n);
254 static void mule_decode (Lstream *decoding, const Extbyte *src,
255 unsigned_char_dynarr *dst, Lstream_data_count n);
256 static void mule_encode (Lstream *encoding, const Bufbyte *src,
257 unsigned_char_dynarr *dst, Lstream_data_count n);
/* codesys_prop and a dynarr of them, followed by the description
   tables for both.
   NOTE(review): the body of struct codesys_prop and the opening of
   the dynarr typedef are not visible in this listing; the only member
   referenced below is sym. */
259 typedef struct codesys_prop codesys_prop;
268 Dynarr_declare (codesys_prop);
269 } codesys_prop_dynarr;
/* The sym member of codesys_prop is the one Lisp_Object described. */
271 static const struct lrecord_description codesys_prop_description_1[] = {
272 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
276 static const struct struct_description codesys_prop_description = {
277 sizeof (codesys_prop),
278 codesys_prop_description_1
/* The dynarr is described in terms of its element description. */
281 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
282 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
286 static const struct struct_description codesys_prop_dynarr_description = {
287 sizeof (codesys_prop_dynarr),
288 codesys_prop_dynarr_description_1
/* The single global dynarr of coding system properties. */
291 codesys_prop_dynarr *the_codesys_prop_dynarr;
/* NOTE(review): only one enumerator of this enum is visible in this
   listing. */
293 enum codesys_prop_enum
296 CODESYS_PROP_ISO2022,
301 /************************************************************************/
302 /* Coding system functions */
303 /************************************************************************/
305 static Lisp_Object mark_coding_system (Lisp_Object);
306 static void print_coding_system (Lisp_Object, Lisp_Object, int);
307 static void finalize_coding_system (void *header, int for_disksave);
/* Description of one charset_conversion_spec: its two Lisp_Object
   members, from_charset and to_charset. */
310 static const struct lrecord_description ccs_description_1[] = {
311 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
312 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
316 static const struct struct_description ccs_description = {
317 sizeof (charset_conversion_spec),
/* Description of a dynarr of charset_conversion_spec, built on the
   element description above.  coding_system_description refers to
   ccsd_description for the iso2022 input_conv and output_conv
   members. */
321 static const struct lrecord_description ccsd_description_1[] = {
322 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
326 static const struct struct_description ccsd_description = {
327 sizeof (charset_conversion_spec_dynarr),
/* Field description for Lisp_Coding_System, passed to
   DEFINE_LRECORD_IMPLEMENTATION.  Lists the slots common to every
   coding system first, then the iso2022 and ccl members, then the
   CCS priority list.
   NOTE(review): any preprocessor guards around the later entries and
   the table terminator are not visible in this listing. */
332 static const struct lrecord_description coding_system_description[] = {
333 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
336 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
338 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
339 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
340 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
/* Four initial charsets, one per register. */
342 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
/* Conversion spec dynarrs are reached through a pointer and described
   by ccsd_description. */
343 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
344 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
345 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
346 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
348 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
/* Defines the coding-system lrecord type, naming its mark, print and
   finalize functions and its field description.  The two 0 arguments
   and the trailing arguments are passed as written; the end of the
   invocation is not visible in this listing. */
354 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
355 mark_coding_system, print_coding_system,
356 finalize_coding_system,
357 0, 0, coding_system_description,
/* Mark function named in the DEFINE_LRECORD_IMPLEMENTATION for
   coding-system.  Calls mark_object on the Lisp_Object slots of OBJ
   and returns the post-read-conversion slot rather than marking it
   itself.
   NOTE(review): presumably the caller marks the returned object --
   confirm.  The return-type line, the braces, the declaration of i
   and several case labels are not visible in this listing. */
361 mark_coding_system (Lisp_Object obj)
363 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
/* Slots marked for every coding system, whatever its type. */
365 mark_object (CODING_SYSTEM_NAME (codesys));
366 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
367 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
368 mark_object (CODING_SYSTEM_EOL_LF (codesys));
369 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
370 mark_object (CODING_SYSTEM_EOL_CR (codesys));
/* Slots that depend on the coding system type. */
372 switch (CODING_SYSTEM_TYPE (codesys))
376 case CODESYS_ISO2022:
/* All four initial charsets. */
377 for (i = 0; i < 4; i++)
378 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
/* Both charsets of every input conversion spec, when the dynarr
   exists. */
379 if (codesys->iso2022.input_conv)
381 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
383 struct charset_conversion_spec *ccs =
384 Dynarr_atp (codesys->iso2022.input_conv, i);
385 mark_object (ccs->from_charset);
386 mark_object (ccs->to_charset);
/* Likewise for the output conversion specs. */
389 if (codesys->iso2022.output_conv)
391 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
393 struct charset_conversion_spec *ccs =
394 Dynarr_atp (codesys->iso2022.output_conv, i);
395 mark_object (ccs->from_charset);
396 mark_object (ccs->to_charset);
/* NOTE(review): the case label governing the next two lines is not
   visible in this listing; only registers 0 and 1 are marked here. */
403 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
404 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
/* CCL decode and encode programs; the case label is not visible in
   this listing. */
409 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
410 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
417 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
419 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
/* The last slot is returned instead of being passed to mark_object. */
421 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
/* Print function named in the DEFINE_LRECORD_IMPLEMENTATION for
   coding-system.  Writes the opening text, the name of the coding
   system and a closing angle bracket to PRINTCHARFUN.
   NOTE(review): the third parameter and the condition guarding the
   error call are not visible in this listing; presumably the error is
   raised when readable output was requested -- confirm. */
425 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
428 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
430 error ("printing unreadable object #<coding_system 0x%x>",
433 write_c_string ("#<coding_system ", printcharfun);
/* The name slot is printed with print_internal, passing 1 as the
   last argument. */
434 print_internal (c->name, printcharfun, 1);
435 write_c_string (">", printcharfun);
/* Finalizer named in the DEFINE_LRECORD_IMPLEMENTATION for
   coding-system.  HEADER is the Lisp_Coding_System being finalized.
   When FOR_DISKSAVE is zero and the type is CODESYS_ISO2022, frees
   the input and output conversion dynarrs and clears both pointers.
   NOTE(review): the return-type line, braces and any other case
   labels are not visible in this listing. */
439 finalize_coding_system (void *header, int for_disksave)
441 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
442 /* Since coding systems never go away, this function is not
443 necessary. But it would be necessary if we changed things
444 so that coding systems could go away. */
445 if (!for_disksave) /* see comment in lstream.c */
447 switch (CODING_SYSTEM_TYPE (c))
450 case CODESYS_ISO2022:
/* Each dynarr is freed only if present, and its pointer is zeroed
   afterwards. */
451 if (c->iso2022.input_conv)
453 Dynarr_free (c->iso2022.input_conv);
454 c->iso2022.input_conv = 0;
456 if (c->iso2022.output_conv)
458 Dynarr_free (c->iso2022.output_conv);
459 c->iso2022.output_conv = 0;
/* Convert SYMBOL to an end-of-line type.
   SYMBOL must pass CHECK_SYMBOL.  The mapping is:
     nil    -> EOL_AUTODETECT
     Qlf    -> EOL_LF
     Qcrlf  -> EOL_CRLF
     Qcr    -> EOL_CR
   Any other symbol is reported through signal_simple_error.
   NOTE(review): the return-type line is not visible in this listing. */
470 symbol_to_eol_type (Lisp_Object symbol)
472 CHECK_SYMBOL (symbol);
473 if (NILP (symbol)) return EOL_AUTODETECT;
474 if (EQ (symbol, Qlf)) return EOL_LF;
475 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
476 if (EQ (symbol, Qcr)) return EOL_CR;
478 signal_simple_error ("Unrecognized eol type", symbol);
479 return EOL_AUTODETECT; /* not reached */
/* Convert the end-of-line type TYPE back to a Lisp value; the inverse
   of symbol_to_eol_type.  EOL_LF, EOL_CRLF and EOL_CR give Qlf,
   Qcrlf and Qcr, and EOL_AUTODETECT gives Qnil.
   NOTE(review): the return-type line, the switch statement and any
   default case are not visible in this listing. */
483 eol_type_to_symbol (eol_type_t type)
488 case EOL_LF: return Qlf;
489 case EOL_CRLF: return Qcrlf;
490 case EOL_CR: return Qcr;
491 case EOL_AUTODETECT: return Qnil;
/* Create the three end-of-line variants of CODESYS.
   For each of the suffixes -unix, -dos and -mac, the coding system is
   copied with Fcopy_coding_system under the name NAME followed by the
   suffix, the copy gets the matching eol type, and the copy is stored
   in the CODING_SYSTEM_EOL_LF, CODING_SYSTEM_EOL_CRLF or
   CODING_SYSTEM_EOL_CR slot of CODESYS.  When CODESYS has a string
   mnemonic, each copy gets that mnemonic with an abbreviation
   appended.
   NOTE(review): the return-type line, braces, the declaration of mlen
   and the conditions inside the macro are not visible in this
   listing. */
496 setup_eol_coding_systems (Lisp_Coding_System *codesys)
498 Lisp_Object codesys_obj;
499 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
/* len + 7 leaves room for the longest suffix appended below (five
   characters) and the terminating NUL. */
500 char *codesys_name = (char *) alloca (len + 7);
502 char *codesys_mnemonic=0;
504 Lisp_Object codesys_name_sym, sub_codesys_obj;
508 XSETCODING_SYSTEM (codesys_obj, codesys);
/* Copy the base name; the suffix is written after it by the macro. */
510 memcpy (codesys_name,
511 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
/* The mnemonic buffer is only set up when the mnemonic is a string;
   otherwise codesys_mnemonic stays 0. */
513 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
515 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
516 codesys_mnemonic = (char *) alloca (mlen + 7);
517 memcpy (codesys_mnemonic,
518 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
/* One expansion per variant: op_sys is the name suffix without the
   dash, op_sys_abbr the text appended to the mnemonic, and Type both
   the eol type assigned to the copy and, through token pasting, the
   slot of CODESYS that receives it. */
521 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
522 strcpy (codesys_name + len, "-" op_sys); \
524 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
525 codesys_name_sym = intern (codesys_name); \
526 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
527 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
529 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
530 build_string (codesys_mnemonic); \
531 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
534 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
535 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
536 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
539 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
540 Return t if OBJECT is a coding system.
541 A coding system is an object that defines how text containing multiple
542 character sets is encoded into a stream of (typically 8-bit) bytes.
543 The coding system is used to decode the stream into a series of
544 characters (which may be from multiple charsets) when the text is read
545 from a file or process, and is used to encode the text back into the
546 same format when it is written out to a file or process.
548 For example, many ISO2022-compliant coding systems (such as Compound
549 Text, which is used for inter-client data under the X Window System)
550 use escape sequences to switch between different charsets -- Japanese
551 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
552 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
553 `make-coding-system' for more information.
555 Coding systems are normally identified using a symbol, and the
556 symbol is accepted in place of the actual coding system object whenever
557 a coding system is called for. (This is similar to how faces work.)
561 return CODING_SYSTEMP (object) ? Qt : Qnil;
564 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
565 Retrieve the coding system of the given name.
567 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
568 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
569 If there is no such coding system, nil is returned. Otherwise the
570 associated coding system object is returned.
572 (coding_system_or_name))
574 if (NILP (coding_system_or_name))
575 coding_system_or_name = Qbinary;
576 else if (CODING_SYSTEMP (coding_system_or_name))
577 return coding_system_or_name;
579 CHECK_SYMBOL (coding_system_or_name);
583 coding_system_or_name =
584 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
586 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
587 return coding_system_or_name;
591 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
592 Retrieve the coding system of the given name.
593 Same as `find-coding-system' except that if there is no such
594 coding system, an error is signaled instead of returning nil.
598 Lisp_Object coding_system = Ffind_coding_system (name);
600 if (NILP (coding_system))
601 signal_simple_error ("No such coding system", name);
602 return coding_system;
605 /* We store the coding systems in hash tables with the names as the key and the
606 actual coding system object as the value. Occasionally we need to use them
607 in a list format. These routines provide us with that. */
/* Closure handed to add_coding_system_to_list_mapper: it holds a
   pointer to the list variable being built. */
608 struct coding_system_list_closure
610 Lisp_Object *coding_system_list;
/* Mapping function that Fcoding_system_list passes to elisp_maphash.
   Conses KEY onto the front of the list that the closure points to.
   VALUE is not used in the visible lines.
   NOTE(review): the return-type line, braces and return statement are
   not visible in this listing. */
614 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
615 void *coding_system_list_closure)
617 /* This function can GC */
/* Recover the typed closure from the untyped argument. */
618 struct coding_system_list_closure *cscl =
619 (struct coding_system_list_closure *) coding_system_list_closure;
620 Lisp_Object *coding_system_list = cscl->coding_system_list;
/* Update the list variable in the caller through the pointer. */
622 *coding_system_list = Fcons (key, *coding_system_list);
626 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
627 Return a list of the names of all defined coding systems.
631 Lisp_Object coding_system_list = Qnil;
633 struct coding_system_list_closure coding_system_list_closure;
635 GCPRO1 (coding_system_list);
636 coding_system_list_closure.coding_system_list = &coding_system_list;
637 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
638 &coding_system_list_closure);
641 return coding_system_list;
644 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
645 Return the name of the given coding system.
649 coding_system = Fget_coding_system (coding_system);
650 return XCODING_SYSTEM_NAME (coding_system);
/* Allocate a Lisp_Coding_System record of TYPE named NAME and give
   its slots their default values.  Fmake_coding_system calls this
   before applying the property list.
   NOTE(review): braces, preprocessor guards, the declaration of i,
   several right-hand sides and the return statement are not visible
   in this listing. */
653 static Lisp_Coding_System *
654 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
656 Lisp_Coding_System *codesys =
657 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
/* Zero the whole record first, then set the slots explicitly. */
659 zero_lcrecord (codesys);
660 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
661 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
662 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
663 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
664 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
665 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
666 CODING_SYSTEM_TYPE (codesys) = type;
667 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
670 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
/* Type-specific defaults.  For CODESYS_ISO2022 all four initial
   charsets start as Qnil. */
672 if (type == CODESYS_ISO2022)
675 for (i = 0; i < 4; i++)
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
/* NOTE(review): the values assigned to the four registers for
   CODESYS_UTF8 are not visible in this listing. */
679 if (type == CODESYS_UTF8)
681 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
683 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
685 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
687 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
/* For CODESYS_BIG5 only the value for register 1,
   Vcharset_chinese_big5, is visible in this listing. */
690 else if (type == CODESYS_BIG5)
692 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
694 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
695 = Vcharset_chinese_big5;
696 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
698 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
/* CCL coding systems start with no decode or encode program. */
702 else if (type == CODESYS_CCL)
704 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
705 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
708 CODING_SYSTEM_NAME (codesys) = name;
714 /* Given a list of charset conversion specs as specified in a Lisp
715 program, parse it into STORE_HERE. */
718 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
719 Lisp_Object spec_list)
723 EXTERNAL_LIST_LOOP (rest, spec_list)
725 Lisp_Object car = XCAR (rest);
726 Lisp_Object from, to;
727 struct charset_conversion_spec spec;
/* Each element must be a list of exactly two items. */
729 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
730 signal_simple_error ("Invalid charset conversion spec", car);
/* Both items are resolved to charsets with Fget_charset. */
731 from = Fget_charset (XCAR (car));
732 to = Fget_charset (XCAR (XCDR (car)));
/* The two charsets must agree in both XCHARSET_CHARS and
   XCHARSET_DIMENSION; otherwise an error is signaled. */
733 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
734 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
735 signal_simple_error_2
736 ("Attempted conversion between different charset types",
738 spec.from_charset = from;
739 spec.to_charset = to;
/* The spec is added to STORE_HERE by value. */
741 Dynarr_add (store_here, spec);
745 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
746 specs, return the equivalent as the Lisp programmer would see it.
748 If LOAD_HERE is 0, return Qnil. */
751 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
/* Each spec becomes a two-element list of its from and to charsets,
   consed onto the front of RESULT, so RESULT is built in reverse. */
758 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
760 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
761 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
/* Reverse so the returned list has the same order as LOAD_HERE. */
764 return Fnreverse (result);
769 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
770 Register symbol NAME as a coding system.
772 TYPE describes the conversion method used and should be one of
775 Automatic conversion. XEmacs attempts to detect the coding system
778 No conversion. Use this for binary files and such. On output,
779 graphic characters that are not in ASCII or Latin-1 will be
780 replaced by a ?. (For a no-conversion-encoded buffer, these
781 characters will only be present if you explicitly insert them.)
783 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
785 ISO 10646 UCS-4 encoding.
787 ISO 10646 UTF-8 encoding.
789 Any ISO2022-compliant encoding. Among other things, this includes
790 JIS (the Japanese encoding commonly used for e-mail), EUC (the
791 standard Unix encoding for Japanese and other languages), and
792 Compound Text (the encoding used in X11). You can specify more
793 specific information about the conversion with the PROPS argument.
795 Big5 (the encoding commonly used for Taiwanese).
797 The conversion is performed using a user-written pseudo-code
798 program. CCL (Code Conversion Language) is the name of this
801 Write out or read in the raw contents of the memory representing
802 the buffer's text. This is primarily useful for debugging
803 purposes, and is only enabled when XEmacs has been compiled with
804 DEBUG_XEMACS defined (via the --debug configure option).
805 WARNING: Reading in a file using 'internal conversion can result
806 in an internal inconsistency in the memory representing a
807 buffer's text, which will produce unpredictable results and may
808 cause XEmacs to crash. Under normal circumstances you should
809 never use 'internal conversion.
811 DOC-STRING is a string describing the coding system.
813 PROPS is a property list, describing the specific nature of the
814 character set. Recognized properties are:
817 String to be displayed in the modeline when this coding system is
821 End-of-line conversion to be used. It should be one of
824 Automatically detect the end-of-line type (LF, CRLF,
825 or CR). Also generate subsidiary coding systems named
826 `NAME-unix', `NAME-dos', and `NAME-mac', that are
827 identical to this coding system but have an EOL-TYPE
828 value of 'lf, 'crlf, and 'cr, respectively.
830 The end of a line is marked externally using ASCII LF.
831 Since this is also the way that XEmacs represents an
832 end-of-line internally, specifying this option results
833 in no end-of-line conversion. This is the standard
834 format for Unix text files.
836 The end of a line is marked externally using ASCII
837 CRLF. This is the standard format for MS-DOS text
840 The end of a line is marked externally using ASCII CR.
841 This is the standard format for Macintosh text files.
843 Automatically detect the end-of-line type but do not
844 generate subsidiary coding systems. (This value is
845 converted to nil when stored internally, and
846 `coding-system-property' will return nil.)
849 If non-nil, composition/decomposition for combining characters
852 'use-entity-reference
853 If non-nil, SGML style entity-reference is used for non-system-characters.
855 'post-read-conversion
856 Function called after a file has been read in, to perform the
857 decoding. Called with two arguments, START and END, denoting
858 a region of the current buffer to be decoded.
860 'pre-write-conversion
861 Function called before a file is written out, to perform the
862 encoding. Called with two arguments, START and END, denoting
863 a region of the current buffer to be encoded.
866 The following additional properties are recognized if TYPE is 'iso2022:
872 The character set initially designated to the G0 - G3 registers.
873 The value should be one of
875 -- A charset object (designate that character set)
876 -- nil (do not ever use this register)
877 -- t (no character set is initially designated to
878 the register, but may be later on; this automatically
879 sets the corresponding `force-g*-on-output' property)
885 If non-nil, send an explicit designation sequence on output before
886 using the specified register.
889 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
890 "ESC $ B" on output in place of the full designation sequences
891 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
894 If non-nil, don't designate ASCII to G0 at each end of line on output.
895 Setting this to non-nil also suppresses other state-resetting that
896 normally happens at the end of a line.
899 If non-nil, don't designate ASCII to G0 before control chars on output.
902 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
906 If non-nil, use locking-shift (SO/SI) instead of single-shift
907 or designation by escape sequence.
910 If non-nil, don't use ISO6429's direction specification.
913 If non-nil, literal control characters that are the same as
914 the beginning of a recognized ISO2022 or ISO6429 escape sequence
915 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
916 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
917 so that they can be properly distinguished from an escape sequence.
918 (Note that doing this results in a non-portable encoding.) This
919 encoding flag is used for byte-compiled files. Note that ESC
920 is a good choice for a quoting character because there are no
921 escape sequences whose second byte is a character from the Control-0
922 or Control-1 character sets; this is explicitly disallowed by the
925 'input-charset-conversion
926 A list of conversion specifications, specifying conversion of
927 characters in one charset to another when decoding is performed.
928 Each specification is a list of two elements: the source charset,
929 and the destination charset.
931 'output-charset-conversion
932 A list of conversion specifications, specifying conversion of
933 characters in one charset to another when encoding is performed.
934 The form of each specification is the same as for
935 'input-charset-conversion.
938 The following additional properties are recognized (and required)
942 CCL program used for decoding (converting to internal format).
945 CCL program used for encoding (converting to external format).
947 (name, type, doc_string, props))
949 Lisp_Coding_System *codesys;
950 enum coding_system_type ty;
951 int need_to_setup_eol_systems = 1;
953 /* Convert type to constant */
954 if (NILP (type) || EQ (type, Qundecided))
955 { ty = CODESYS_AUTODETECT; }
957 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
958 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
959 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
960 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
961 else if (EQ (type, Qutf16)) { ty = CODESYS_UTF16; }
962 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
963 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
965 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
967 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
970 signal_simple_error ("Invalid coding system type", type);
974 codesys = allocate_coding_system (ty, name);
976 if (NILP (doc_string))
977 doc_string = build_string ("");
979 CHECK_STRING (doc_string);
980 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
983 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
985 if (EQ (key, Qmnemonic))
988 CHECK_STRING (value);
989 CODING_SYSTEM_MNEMONIC (codesys) = value;
992 else if (EQ (key, Qeol_type))
994 need_to_setup_eol_systems = NILP (value);
997 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1000 else if (EQ (key, Qpost_read_conversion))
1001 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1002 else if (EQ (key, Qpre_write_conversion))
1003 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1005 else if (EQ (key, Qdisable_composition))
1006 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1007 else if (EQ (key, Quse_entity_reference))
1008 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1011 else if (ty == CODESYS_ISO2022)
1013 #define FROB_INITIAL_CHARSET(charset_num) \
1014 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1015 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1017 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1018 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1019 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1020 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1022 #define FROB_FORCE_CHARSET(charset_num) \
1023 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1025 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1026 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1027 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1028 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1030 #define FROB_BOOLEAN_PROPERTY(prop) \
1031 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1033 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1034 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1035 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1036 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1037 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1038 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1039 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1041 else if (EQ (key, Qinput_charset_conversion))
1043 codesys->iso2022.input_conv =
1044 Dynarr_new (charset_conversion_spec);
1045 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1048 else if (EQ (key, Qoutput_charset_conversion))
1050 codesys->iso2022.output_conv =
1051 Dynarr_new (charset_conversion_spec);
1052 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1056 signal_simple_error ("Unrecognized property", key);
1059 else if (ty == CODESYS_UTF8)
1061 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062 else if (EQ (key, Qcharset_g1))
1063 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1064 else if (EQ (key, Qcharset_g2))
1065 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1067 signal_simple_error ("Unrecognized property", key);
1069 else if (ty == CODESYS_BIG5)
1071 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1072 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1074 signal_simple_error ("Unrecognized property", key);
1077 else if (EQ (type, Qccl))
1080 struct ccl_program test_ccl;
1083 /* Check key first. */
1084 if (EQ (key, Qdecode))
1085 suffix = "-ccl-decode";
1086 else if (EQ (key, Qencode))
1087 suffix = "-ccl-encode";
1089 signal_simple_error ("Unrecognized property", key);
1091 /* If value is vector, register it as a ccl program
1092 associated with an newly created symbol for
1093 backward compatibility. */
1094 if (VECTORP (value))
1096 sym = Fintern (concat2 (Fsymbol_name (name),
1097 build_string (suffix)),
1099 Fregister_ccl_program (sym, value);
1103 CHECK_SYMBOL (value);
1106 /* check if the given ccl programs are valid. */
1107 if (setup_ccl_program (&test_ccl, sym) < 0)
1108 signal_simple_error ("Invalid CCL program", value);
1110 if (EQ (key, Qdecode))
1111 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1112 else if (EQ (key, Qencode))
1113 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1118 signal_simple_error ("Unrecognized property", key);
1122 if (need_to_setup_eol_systems)
1123 setup_eol_coding_systems (codesys);
1126 Lisp_Object codesys_obj;
1127 XSETCODING_SYSTEM (codesys_obj, codesys);
1128 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1133 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1134 Copy OLD-CODING-SYSTEM to NEW-NAME.
1135 If NEW-NAME does not name an existing coding system, a new one will
1138 (old_coding_system, new_name))
1140 Lisp_Object new_coding_system;
1141 old_coding_system = Fget_coding_system (old_coding_system);
1142 new_coding_system = Ffind_coding_system (new_name);
1143 if (NILP (new_coding_system))
1145 XSETCODING_SYSTEM (new_coding_system,
1146 allocate_coding_system
1147 (XCODING_SYSTEM_TYPE (old_coding_system),
1149 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1153 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1154 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1155 memcpy (((char *) to ) + sizeof (to->header),
1156 ((char *) from) + sizeof (from->header),
1157 sizeof (*from) - sizeof (from->header));
1158 to->name = new_name;
1160 return new_coding_system;
1163 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1164 Return t if OBJECT names a coding system, and is not a coding system alias.
1168 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1172 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1173 Return t if OBJECT is a coding system alias.
1174 All coding system aliases are created by `define-coding-system-alias'.
1178 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1182 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1183 Return the coding-system symbol for which symbol ALIAS is an alias.
1187 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1188 if (SYMBOLP (aliasee))
1191 signal_simple_error ("Symbol is not a coding system alias", alias);
1192 return Qnil; /* To keep the compiler happy */
1196 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1198 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1202 /* A maphash function, for removing dangling coding system aliases. */
1204 dangling_coding_system_alias_p (Lisp_Object alias,
1205 Lisp_Object aliasee,
1206 void *dangling_aliases)
1208 if (SYMBOLP (aliasee)
1209 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1211 (*(int *) dangling_aliases)++;
1218 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1219 Define symbol ALIAS as an alias for coding system ALIASEE.
1221 You can use this function to redefine an alias that has already been defined,
1222 but you cannot redefine a name which is the canonical name for a coding system.
1223 \(a canonical name of a coding system is what is returned when you call
1224 `coding-system-name' on a coding system).
1226 ALIASEE itself can be an alias, which allows you to define nested aliases.
1228 You are forbidden, however, from creating alias loops or `dangling' aliases.
1229 These will be detected, and an error will be signaled if you attempt to do so.
1231 If ALIASEE is nil, then ALIAS will simply be undefined.
1233 See also `coding-system-alias-p', `coding-system-aliasee',
1234 and `coding-system-canonical-name-p'.
1238 Lisp_Object real_coding_system, probe;
1240 CHECK_SYMBOL (alias);
1242 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1244 ("Symbol is the canonical name of a coding system and cannot be redefined",
1249 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1250 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1251 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1253 Fremhash (alias, Vcoding_system_hash_table);
1255 /* Undefine subsidiary aliases,
1256 presumably created by a previous call to this function */
1257 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1258 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1259 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1261 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1262 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1263 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1266 /* Undefine dangling coding system aliases. */
1268 int dangling_aliases;
1271 dangling_aliases = 0;
1272 elisp_map_remhash (dangling_coding_system_alias_p,
1273 Vcoding_system_hash_table,
1275 } while (dangling_aliases > 0);
1281 if (CODING_SYSTEMP (aliasee))
1282 aliasee = XCODING_SYSTEM_NAME (aliasee);
1284 /* Checks that aliasee names a coding-system */
1285 real_coding_system = Fget_coding_system (aliasee);
1287 /* Check for coding system alias loops */
1288 if (EQ (alias, aliasee))
1289 alias_loop: signal_simple_error_2
1290 ("Attempt to create a coding system alias loop", alias, aliasee);
1292 for (probe = aliasee;
1294 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1296 if (EQ (probe, alias))
1300 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1302 /* Set up aliases for subsidiaries.
1303 #### There must be a better way to handle subsidiary coding systems. */
1305 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1307 for (i = 0; i < countof (suffixes); i++)
1309 Lisp_Object alias_subsidiary =
1310 append_suffix_to_symbol (alias, suffixes[i]);
1311 Lisp_Object aliasee_subsidiary =
1312 append_suffix_to_symbol (aliasee, suffixes[i]);
1314 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1315 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1318 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1319 but it doesn't look intentional, so I'd rather return something
1320 meaningful or nothing at all. */
1325 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1327 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1328 Lisp_Object new_coding_system;
1330 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1331 return coding_system;
1335 case EOL_AUTODETECT: return coding_system;
1336 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1337 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1338 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1339 default: abort (); return Qnil;
1342 return NILP (new_coding_system) ? coding_system : new_coding_system;
1345 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1346 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1348 (coding_system, eol_type))
1350 coding_system = Fget_coding_system (coding_system);
1352 return subsidiary_coding_system (coding_system,
1353 symbol_to_eol_type (eol_type));
1357 /************************************************************************/
1358 /* Coding system accessors */
1359 /************************************************************************/
1361 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1362 Return the doc string for CODING-SYSTEM.
1366 coding_system = Fget_coding_system (coding_system);
1367 return XCODING_SYSTEM_DOC_STRING (coding_system);
1370 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1371 Return the type of CODING-SYSTEM.
1375 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1378 case CODESYS_AUTODETECT: return Qundecided;
1380 case CODESYS_SHIFT_JIS: return Qshift_jis;
1381 case CODESYS_ISO2022: return Qiso2022;
1382 case CODESYS_BIG5: return Qbig5;
1383 case CODESYS_UCS4: return Qucs4;
1384 case CODESYS_UTF16: return Qutf16;
1385 case CODESYS_UTF8: return Qutf8;
1386 case CODESYS_CCL: return Qccl;
1388 case CODESYS_NO_CONVERSION: return Qno_conversion;
1390 case CODESYS_INTERNAL: return Qinternal;
#ifdef MULE
/* Return the name of the charset initially designated to register GNUM
   of CODING_SYSTEM, or nil when that slot does not hold a charset
   object.  GNUM is used as an index without checking; callers must pass
   a value in the range 0 .. 3. */
Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
{
  Lisp_Object cs
    = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);

  return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
}

DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
Return initial charset of CODING-SYSTEM designated to GNUM.
GNUM allows 0 .. 3.
*/
       (coding_system, gnum))
{
  coding_system = Fget_coding_system (coding_system);
  CHECK_INT (gnum);

  /* Only the registers G0 .. G3 exist; reject anything else instead of
     indexing outside the initial-charset table. */
  if (XINT (gnum) < 0 || XINT (gnum) > 3)
    signal_simple_error ("Invalid charset register number", gnum);

  return coding_system_charset (coding_system, XINT (gnum));
}
#endif /* MULE */
1418 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1419 Return the PROP property of CODING-SYSTEM.
1421 (coding_system, prop))
1424 enum coding_system_type type;
1426 coding_system = Fget_coding_system (coding_system);
1427 CHECK_SYMBOL (prop);
1428 type = XCODING_SYSTEM_TYPE (coding_system);
1430 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1431 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1434 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1436 case CODESYS_PROP_ALL_OK:
1439 case CODESYS_PROP_ISO2022:
1440 if (type != CODESYS_ISO2022)
1442 ("Property only valid in ISO2022 coding systems",
1446 case CODESYS_PROP_CCL:
1447 if (type != CODESYS_CCL)
1449 ("Property only valid in CCL coding systems",
1459 signal_simple_error ("Unrecognized property", prop);
1461 if (EQ (prop, Qname))
1462 return XCODING_SYSTEM_NAME (coding_system);
1463 else if (EQ (prop, Qtype))
1464 return Fcoding_system_type (coding_system);
1465 else if (EQ (prop, Qdoc_string))
1466 return XCODING_SYSTEM_DOC_STRING (coding_system);
1467 else if (EQ (prop, Qmnemonic))
1468 return XCODING_SYSTEM_MNEMONIC (coding_system);
1469 else if (EQ (prop, Qeol_type))
1470 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1471 else if (EQ (prop, Qeol_lf))
1472 return XCODING_SYSTEM_EOL_LF (coding_system);
1473 else if (EQ (prop, Qeol_crlf))
1474 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1475 else if (EQ (prop, Qeol_cr))
1476 return XCODING_SYSTEM_EOL_CR (coding_system);
1477 else if (EQ (prop, Qpost_read_conversion))
1478 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1479 else if (EQ (prop, Qpre_write_conversion))
1480 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1483 else if (EQ (prop, Qdisable_composition))
1484 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1485 else if (EQ (prop, Quse_entity_reference))
1486 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1488 else if (type == CODESYS_ISO2022)
1490 if (EQ (prop, Qcharset_g0))
1491 return coding_system_charset (coding_system, 0);
1492 else if (EQ (prop, Qcharset_g1))
1493 return coding_system_charset (coding_system, 1);
1494 else if (EQ (prop, Qcharset_g2))
1495 return coding_system_charset (coding_system, 2);
1496 else if (EQ (prop, Qcharset_g3))
1497 return coding_system_charset (coding_system, 3);
1499 #define FORCE_CHARSET(charset_num) \
1500 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1501 (coding_system, charset_num) ? Qt : Qnil)
1503 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1504 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1505 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1506 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1508 #define LISP_BOOLEAN(prop) \
1509 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1511 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1512 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1513 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1514 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1515 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1516 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1517 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1519 else if (EQ (prop, Qinput_charset_conversion))
1521 unparse_charset_conversion_specs
1522 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1523 else if (EQ (prop, Qoutput_charset_conversion))
1525 unparse_charset_conversion_specs
1526 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1530 else if (type == CODESYS_CCL)
1532 if (EQ (prop, Qdecode))
1533 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1534 else if (EQ (prop, Qencode))
1535 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1543 return Qnil; /* not reached */
1547 /************************************************************************/
1548 /* Coding category functions */
1549 /************************************************************************/
1552 decode_coding_category (Lisp_Object symbol)
1556 CHECK_SYMBOL (symbol);
1557 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1558 if (EQ (coding_category_symbol[i], symbol))
1561 signal_simple_error ("Unrecognized coding category", symbol);
1562 return 0; /* not reached */
1565 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1566 Return a list of all recognized coding categories.
1571 Lisp_Object list = Qnil;
1573 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1574 list = Fcons (coding_category_symbol[i], list);
1578 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1579 Change the priority order of the coding categories.
1580 LIST should be list of coding categories, in descending order of
1581 priority. Unspecified coding categories will be lower in priority
1582 than all specified ones, in the same relative order they were in
1587 int category_to_priority[CODING_CATEGORY_LAST];
1591 /* First generate a list that maps coding categories to priorities. */
1593 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1594 category_to_priority[i] = -1;
1596 /* Highest priority comes from the specified list. */
1598 EXTERNAL_LIST_LOOP (rest, list)
1600 int cat = decode_coding_category (XCAR (rest));
1602 if (category_to_priority[cat] >= 0)
1603 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1604 category_to_priority[cat] = i++;
1607 /* Now go through the existing categories by priority to retrieve
1608 the categories not yet specified and preserve their priority
1610 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1612 int cat = fcd->coding_category_by_priority[j];
1613 if (category_to_priority[cat] < 0)
1614 category_to_priority[cat] = i++;
1617 /* Now we need to construct the inverse of the mapping we just
1620 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1621 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1623 /* Phew! That was confusing. */
1627 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1628 Return a list of coding categories in descending order of priority.
1633 Lisp_Object list = Qnil;
1635 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1636 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1641 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1642 Change the coding system associated with a coding category.
1644 (coding_category, coding_system))
1646 int cat = decode_coding_category (coding_category);
1648 coding_system = Fget_coding_system (coding_system);
1649 fcd->coding_category_system[cat] = coding_system;
1653 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1654 Return the coding system associated with a coding category.
1658 int cat = decode_coding_category (coding_category);
1659 Lisp_Object sys = fcd->coding_category_system[cat];
1662 return XCODING_SYSTEM_NAME (sys);
1667 /************************************************************************/
1668 /* Detecting the encoding of data */
1669 /************************************************************************/
/* State carried between successive calls of the detection routines;
   detect_eol_type and detect_coding_type both take a pointer to one.
   NOTE(review): most member declarations of this structure are not
   visible here, so the members are not documented individually. */
1671 struct detection_state
1673 eol_type_t eol_type;
1716 struct iso2022_decoder iso;
1718 int high_byte_count;
1719 unsigned int saw_single_shift:1;
/* Return 1 if C is a control character that may reasonably occur in a
   text file, 0 for anything else. */
static int
acceptable_control_char_p (int c)
{
  switch (c)
    {
      /* Allow and ignore control characters that you might
         reasonably see in a text file */
    case '\r':
    case '\n':
    case '\t':
    case  7: /* bell */
    case  8: /* backspace */
    case 11: /* vertical tab */
    case 12: /* form feed */
    case 26: /* MS-DOS C-z junk */
    case 31: /* '^_' -- for info */
      return 1;
    default:
      return 0;
    }
}
/* Return nonzero if MASK has no bit or exactly one bit set. */
static int
mask_has_at_most_one_bit_p (int mask)
{
  /* Perhaps the only thing useful you learn from intensive Microsoft
     technical interviews.  Clearing the lowest set bit leaves zero
     exactly when at most one bit was set.  The arithmetic is done on
     the unsigned value so that a mask consisting of the sign bit alone
     does not overflow in `mask - 1'. */
  unsigned int bits = (unsigned int) mask;

  return (bits & (bits - 1)) == 0;
}
1762 detect_eol_type (struct detection_state *st, const Extbyte *src,
1763 Lstream_data_count n)
1767 unsigned char c = *(unsigned char *)src++;
1770 if (st->eol.just_saw_cr)
1772 else if (st->eol.seen_anything)
1775 else if (st->eol.just_saw_cr)
1778 st->eol.just_saw_cr = 1;
1780 st->eol.just_saw_cr = 0;
1781 st->eol.seen_anything = 1;
1784 return EOL_AUTODETECT;
1787 /* Attempt to determine the encoding and EOL type of the given text.
1788 Before calling this function for the first type, you must initialize
1789 st->eol_type as appropriate and initialize st->mask to ~0.
1791 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1794 st->mask holds the determined coding category mask, or ~0 if only
1795 ASCII has been seen so far.
1799 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1800 is present in st->mask
1801 1 == definitive answers are here for both st->eol_type and st->mask
1805 detect_coding_type (struct detection_state *st, const Extbyte *src,
1806 Lstream_data_count n, int just_do_eol)
1808 if (st->eol_type == EOL_AUTODETECT)
1809 st->eol_type = detect_eol_type (st, src, n);
1812 return st->eol_type != EOL_AUTODETECT;
1814 if (!st->seen_non_ascii)
1816 for (; n; n--, src++)
1818 unsigned char c = *(unsigned char *) src;
1819 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1821 st->seen_non_ascii = 1;
1823 st->shift_jis.mask = ~0;
1826 st->utf16.mask = ~0;
1828 st->iso2022.mask = ~0;
1838 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1839 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1840 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1841 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1842 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1843 st->big5.mask = detect_coding_big5 (st, src, n);
1844 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1845 st->utf8.mask = detect_coding_utf8 (st, src, n);
1846 if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1847 st->utf16.mask = detect_coding_utf16 (st, src, n);
1848 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1849 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1852 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1853 | st->utf8.mask | st->ucs4.mask;
1856 int retval = mask_has_at_most_one_bit_p (st->mask);
1857 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1858 return retval && st->eol_type != EOL_AUTODETECT;
1863 coding_system_from_mask (int mask)
1867 /* If the file was entirely or basically ASCII, use the
1868 default value of `buffer-file-coding-system'. */
1869 Lisp_Object retval =
1870 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1873 retval = Ffind_coding_system (retval);
1877 (Qbad_variable, Qwarning,
1878 "Invalid `default-buffer-file-coding-system', set to nil");
1879 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1883 retval = Fget_coding_system (Qraw_text);
1891 mask = postprocess_iso2022_mask (mask);
1893 /* Look through the coding categories by priority and find
1894 the first one that is allowed. */
1895 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1897 cat = fcd->coding_category_by_priority[i];
1898 if ((mask & (1 << cat)) &&
1899 !NILP (fcd->coding_category_system[cat]))
1903 return fcd->coding_category_system[cat];
1905 return Fget_coding_system (Qraw_text);
1909 /* Given a seekable read stream and potential coding system and EOL type
1910 as specified, do any autodetection that is called for. If the
1911 coding system and/or EOL type are not `autodetect', they will be left
1912 alone; but this function will never return an autodetect coding system
1915 This function does not automatically fetch subsidiary coding systems;
1916 that should be unnecessary with the explicit eol-type argument. */
1918 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1919 /* number of leading lines to check for a coding cookie */
1920 #define LINES_TO_CHECK 2
1923 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1924 eol_type_t *eol_type_in_out)
1926 struct detection_state decst;
1928 if (*eol_type_in_out == EOL_AUTODETECT)
1929 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1932 decst.eol_type = *eol_type_in_out;
1935 /* If autodetection is called for, do it now. */
1936 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1937 || *eol_type_in_out == EOL_AUTODETECT)
1940 Lisp_Object coding_system = Qnil;
1942 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1944 int lines_checked = 0;
1946 /* Look for initial "-*-"; mode line prefix */
1948 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1950 && lines_checked < LINES_TO_CHECK;
1952 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1954 Extbyte *local_vars_beg = p + 3;
1955 /* Look for final "-*-"; mode line suffix */
1956 for (p = local_vars_beg,
1957 scan_end = buf + nread - LENGTH ("-*-");
1959 && lines_checked < LINES_TO_CHECK;
1961 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1963 Extbyte *suffix = p;
1964 /* Look for "coding:" */
1965 for (p = local_vars_beg,
1966 scan_end = suffix - LENGTH ("coding:?");
1969 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1970 && (p == local_vars_beg
1971 || (*(p-1) == ' ' ||
1977 p += LENGTH ("coding:");
1978 while (*p == ' ' || *p == '\t') p++;
1980 /* Get coding system name */
1981 save = *suffix; *suffix = '\0';
1982 /* Characters valid in a MIME charset name (rfc 1521),
1983 and in a Lisp symbol name. */
1984 n = strspn ( (char *) p,
1985 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1986 "abcdefghijklmnopqrstuvwxyz"
1992 save = p[n]; p[n] = '\0';
1994 Ffind_coding_system (intern ((char *) p));
2001 /* #### file must use standard EOLs or we miss 2d line */
2002 /* #### not to mention this is broken for UTF-16 DOS files */
2003 else if (*p == '\n' || *p == '\r')
2006 /* skip past multibyte (DOS) newline */
2007 if (*p == '\r' && *(p+1) == '\n') p++;
2011 /* #### file must use standard EOLs or we miss 2d line */
2012 /* #### not to mention this is broken for UTF-16 DOS files */
2013 else if (*p == '\n' || *p == '\r')
2016 /* skip past multibyte (DOS) newline */
2017 if (*p == '\r' && *(p+1) == '\n') p++;
2020 if (NILP (coding_system))
2023 if (detect_coding_type (&decst, buf, nread,
2024 XCODING_SYSTEM_TYPE (*codesys_in_out)
2025 != CODESYS_AUTODETECT))
2027 nread = Lstream_read (stream, buf, sizeof (buf));
2033 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2034 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2037 if (detect_coding_type (&decst, buf, nread, 1))
2039 nread = Lstream_read (stream, buf, sizeof (buf));
2045 *eol_type_in_out = decst.eol_type;
2046 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2048 if (NILP (coding_system))
2049 *codesys_in_out = coding_system_from_mask (decst.mask);
2051 *codesys_in_out = coding_system;
2055 /* If we absolutely can't determine the EOL type, just assume LF. */
2056 if (*eol_type_in_out == EOL_AUTODETECT)
2057 *eol_type_in_out = EOL_LF;
2059 Lstream_rewind (stream);
2062 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2063 Detect coding system of the text in the region between START and END.
2064 Return a list of possible coding systems ordered by priority.
2065 If only ASCII characters are found, return 'undecided or one of
2066 its subsidiary coding systems according to a detected end-of-line
2067 type. Optional arg BUFFER defaults to the current buffer.
2069 (start, end, buffer))
2071 Lisp_Object val = Qnil;
2072 struct buffer *buf = decode_buffer (buffer, 0);
2074 Lisp_Object instream, lb_instream;
2075 Lstream *istr, *lb_istr;
2076 struct detection_state decst;
2077 struct gcpro gcpro1, gcpro2;
2079 get_buffer_range_char (buf, start, end, &b, &e, 0);
2080 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2081 lb_istr = XLSTREAM (lb_instream);
2082 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2083 istr = XLSTREAM (instream);
2084 GCPRO2 (instream, lb_instream);
2086 decst.eol_type = EOL_AUTODETECT;
2090 Extbyte random_buffer[4096];
2091 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2095 if (detect_coding_type (&decst, random_buffer, nread, 0))
2099 if (decst.mask == ~0)
2100 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2108 decst.mask = postprocess_iso2022_mask (decst.mask);
2110 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2112 int sys = fcd->coding_category_by_priority[i];
2113 if (decst.mask & (1 << sys))
2115 Lisp_Object codesys = fcd->coding_category_system[sys];
2116 if (!NILP (codesys))
2117 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2118 val = Fcons (codesys, val);
2122 Lstream_close (istr);
2124 Lstream_delete (istr);
2125 Lstream_delete (lb_istr);
2130 /************************************************************************/
2131 /* Converting to internal Mule format ("decoding") */
2132 /************************************************************************/
2134 /* A decoding stream is a stream used for decoding text (i.e.
2135 converting from some external format to internal format).
2136 The decoding-stream object keeps track of the actual coding
2137 stream, the stream that is at the other end, and data that
2138 needs to be persistent across the lifetime of the stream. */
/* Handle the EOL stuff related to just-read-in character C.
   EOL_TYPE is the EOL type of the coding stream.
   FLAGS is the current value of FLAGS in the coding stream, and may
   be modified by this macro. (The macro only looks at the
   CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
   bytes are to be written. You need to also define a local goto
   label "label_continue_loop" that is at the end of the main
   character-reading loop.

   If C is a CR character, then this macro handles it entirely and
   jumps to label_continue_loop. Otherwise, this macro does not add
   anything to DST, and continues normally. You should continue
   processing C normally after this macro.

   (One exception, visible in the second branch below: when a CR is
   pending under EOL_CRLF and C is not LF, the deferred CR itself is
   added to DST before C is processed.) */

#define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)		\
do {								\
  if (c == '\r')						\
    {								\
      if (eol_type == EOL_CR)					\
	Dynarr_add (dst, '\n');					\
      else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)	\
	Dynarr_add (dst, c);					\
      else							\
	flags |= CODING_STATE_CR;				\
      goto label_continue_loop;					\
    }								\
  else if (flags & CODING_STATE_CR)				\
    {	/* eol_type == CODING_SYSTEM_EOL_CRLF */		\
      if (c != '\n')						\
	Dynarr_add (dst, '\r');					\
      flags &= ~CODING_STATE_CR;				\
    }								\
} while (0)
/* C should be a binary character in the range 0 - 255; convert
   to internal format and add to Dynarr DST. */

#ifdef UTF2000
/* Internal format is UTF-8 here: bytes above the ASCII range become a
   two-byte sequence. */
#define DECODE_ADD_BINARY_CHAR(c, dst)		\
do {						\
  if (BYTE_ASCII_P (c))				\
    Dynarr_add (dst, c);			\
  else						\
    {						\
      Dynarr_add (dst, (c >> 6) | 0xc0);	\
      Dynarr_add (dst, (c & 0x3f) | 0x80);	\
    }						\
} while (0)

/* Append code point C to DST as a UTF-8 style byte sequence of one to
   six bytes, the length being chosen by the magnitude of C. */
INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
INLINE_HEADER void
DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
{
  if ( c <= 0x7f )
    {
      Dynarr_add (dst, c);
    }
  else if ( c <= 0x7ff )
    {
      Dynarr_add (dst, (c >> 6) | 0xc0);
      Dynarr_add (dst, (c & 0x3f) | 0x80);
    }
  else if ( c <= 0xffff )
    {
      Dynarr_add (dst,  (c >> 12) | 0xe0);
      Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
      Dynarr_add (dst,  (c        & 0x3f) | 0x80);
    }
  else if ( c <= 0x1fffff )
    {
      Dynarr_add (dst,  (c >> 18) | 0xf0);
      Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
      Dynarr_add (dst,  (c        & 0x3f) | 0x80);
    }
  else if ( c <= 0x3ffffff )
    {
      Dynarr_add (dst,  (c >> 24) | 0xf8);
      Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
      Dynarr_add (dst,  (c        & 0x3f) | 0x80);
    }
  else
    {
      Dynarr_add (dst,  (c >> 30) | 0xfc);
      Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
      Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
      Dynarr_add (dst,  (c        & 0x3f) | 0x80);
    }
}
#else
/* Leading-byte internal format: non-ASCII bytes are prefixed with the
   leading byte of the Control-1 or Latin-1 charset. */
#define DECODE_ADD_BINARY_CHAR(c, dst)		\
do {						\
  if (BYTE_ASCII_P (c))				\
    Dynarr_add (dst, c);			\
  else if (BYTE_C1_P (c))			\
    {						\
      Dynarr_add (dst, LEADING_BYTE_CONTROL_1);	\
      Dynarr_add (dst, c + 0x20);		\
    }						\
  else						\
    {						\
      Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
      Dynarr_add (dst, c);			\
    }						\
} while (0)
#endif
/* Output the partially built character CH through
   DECODE_ADD_BINARY_CHAR.  The expansion refers to a variable named
   `dst' that is not a macro parameter, so one must be in scope wherever
   this macro is used. */
2251 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2255 DECODE_ADD_BINARY_CHAR (ch, dst); \
/* End-of-input handling for the decoders.  When CODING_STATE_END is set
   in FLAGS, the pending character CH is emitted with
   DECODE_OUTPUT_PARTIAL_CHAR; a '\r' is appended to DST when
   CODING_STATE_CR is set.
   NOTE(review): confirm that the CR test is nested inside the END test. */
2260 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2262 if (flags & CODING_STATE_END) \
2264 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2265 if (flags & CODING_STATE_CR) \
2266 Dynarr_add (dst, '\r'); \
/* Fetch the `decoding'-specific data of lstream STREAM; expands to
   LSTREAM_TYPE_DATA. */
2270 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
/* Size in bytes of the entity-reference scratch buffers: er_buf in
   struct decoding_stream and the local format array in
   char_encode_as_entity_reference(). */
2272 #define ER_BUF_SIZE 24
/* Per-stream state of a decoding lstream.  This is the data area whose
   size is passed to DEFINE_LSTREAM_IMPLEMENTATION for lstream_decoding
   and that DECODING_STREAM_DATA returns. */
2274 struct decoding_stream
2276 /* Coding system that governs the conversion. */
2277 Lisp_Coding_System *codesys;
2279 /* Stream that we read the encoded data from or
2280 write the decoded data to. */
2283 /* If we are reading, then we can return only a fixed amount of
2284 data, so if the conversion resulted in too much data, we store it
2285 here for retrieval the next time around. */
2286 unsigned_char_dynarr *runoff;
2288 /* FLAGS holds flags indicating the current state of the decoding.
2289 Some of these flags are dependent on the coding system. */
2292 /* CPOS holds a partially built-up code-point of character. */
2295 /* EOL_TYPE specifies the type of end-of-line conversion that
2296 currently applies. We need to keep this separate from the
2297 EOL type stored in CODESYS because the latter might indicate
2298 automatic EOL-type detection while the former will always
2299 indicate a particular EOL type. */
2300 eol_type_t eol_type;
2302 /* Additional ISO2022 information. We define the structure above
2303 because it's also needed by the detection routines. */
2304 struct iso2022_decoder iso2022;
2306 /* Additional information (the state of the running CCL program)
2307 used by the CCL decoder. */
2308 struct ccl_program ccl;
2310 /* counter for UTF-8 or UCS-4 */
2311 unsigned char counter;
/* Entity-reference recognizer state: ER_BUF holds the bytes collected
   so far by decode_add_er_char() and ER_COUNTER is how many of them
   are in use. */
2314 unsigned char er_counter;
2315 unsigned char er_buf[ER_BUF_SIZE];
/* Composition state used by COMPOSE_ADD_CHAR and COMPOSE_FLUSH_CHARS:
   the characters currently held back, how many there are, and the
   Lisp table consulted (with Fassq) for the next character. */
2317 unsigned combined_char_count;
2318 Emchar combined_chars[16];
2319 Lisp_Object combining_table;
/* Detection state handed to detect_coding_type() by mule_decode(). */
2321 struct detection_state decst;
/* Forward declarations of the decoding-stream functions defined below
   (presumably installed as the methods of lstream_decoding -- the
   installation is not part of this section), followed by the definition
   of the "decoding" lstream implementation, whose per-stream data is a
   struct decoding_stream. */
2324 static Lstream_data_count decoding_reader (Lstream *stream,
2325 unsigned char *data, Lstream_data_count size);
2326 static Lstream_data_count decoding_writer (Lstream *stream,
2327 const unsigned char *data, Lstream_data_count size);
2328 static int decoding_rewinder (Lstream *stream);
2329 static int decoding_seekable_p (Lstream *stream);
2330 static int decoding_flusher (Lstream *stream);
2331 static int decoding_closer (Lstream *stream);
2333 static Lisp_Object decoding_marker (Lisp_Object stream);
2335 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2336 sizeof (struct decoding_stream));
/* Marker for a decoding stream.  Marks the lstream at the other end of
   STREAM and, when that lstream's implementation supplies a marker of
   its own, returns the result of calling it on the other-end stream. */
2339 decoding_marker (Lisp_Object stream)
2341 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2342 Lisp_Object str_obj;
2344 /* We do not need to mark the coding systems or charsets stored
2345 within the stream because they are stored in a global list
2346 and automatically marked. */
2348 XSETLSTREAM (str_obj, str);
2349 mark_object (str_obj);
2350 if (str->imp->marker)
2351 return (str->imp->marker) (str_obj);
2356 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2357 so we read data from the other end, decode it, and store it into DATA. */
/* Decoded bytes are staged in str->runoff.  Each pass first moves at
   most SIZE bytes from the front of the runoff into DATA, then refills
   the runoff by reading raw bytes from str->other_end into the unused
   part of DATA and passing them to mule_decode().  CODING_STATE_END is
   set in str->flags once the other end is exhausted, and
   mule_decode() is still called after that so it can emit final output.

   Returns the number of bytes stored into DATA.  When nothing was
   stored the result is -1 if error_occurred is set and 0 otherwise. */
2359 static Lstream_data_count
2360 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2362 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2363 unsigned char *orig_data = data;
2364 Lstream_data_count read_size;
2365 int error_occurred = 0;
2367 /* We need to interface to mule_decode(), which expects to take some
2368 amount of data and store the result into a Dynarr. We have
2369 mule_decode() store into str->runoff, and take data from there
2372 /* We loop until we have enough data, reading chunks from the other
2373 end and decoding it. */
2376 /* Take data from the runoff if we can. Make sure to take at
2377 most SIZE bytes, and delete the data from the runoff. */
2378 if (Dynarr_length (str->runoff) > 0)
2380 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2381 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2382 Dynarr_delete_many (str->runoff, 0, chunk);
2388 break; /* No more room for data */
2390 if (str->flags & CODING_STATE_END)
2391 /* This means that on the previous iteration, we hit the EOF on
2392 the other end. We loop once more so that mule_decode() can
2393 output any final stuff it may be holding, or any "go back
2394 to a sane state" escape sequences. (This latter makes sense
2395 during encoding.) */
2398 /* Exhausted the runoff, so get some more. DATA has at least
2399 SIZE bytes left of storage in it, so it's OK to read directly
2400 into it. (We'll be overwriting above, after we've decoded it
2401 into the runoff.) */
2402 read_size = Lstream_read (str->other_end, data, size);
2409 /* There might be some more end data produced in the translation.
2410 See the comment above. */
2411 str->flags |= CODING_STATE_END;
2412 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
/* DATA still equals ORIG_DATA when no byte was delivered. */
2415 if (data - orig_data == 0)
2416 return error_occurred ? -1 : 0;
2418 return data - orig_data;
/* Write method of a decoding stream.  The SIZE bytes at DATA are decoded
   into str->runoff with mule_decode(); the whole runoff is then offered
   to str->other_end with Lstream_write(), and the RETVAL bytes that
   call reports are removed from the front of the runoff.  As the
   comment at the end of the body says, the function's result describes
   the incoming data, not RETVAL (the return statement itself is not
   visible here -- TODO confirm what is returned on a write error). */
2421 static Lstream_data_count
2422 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2424 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2425 Lstream_data_count retval;
2427 /* Decode all our data into the runoff, and then attempt to write
2428 it all out to the other end. Remove whatever chunk we succeeded
2430 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2431 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2432 Dynarr_length (str->runoff));
2434 Dynarr_delete_many (str->runoff, 0, retval);
2435 /* Do NOT return retval. The return value indicates how much
2436 of the incoming data was written, not how many bytes were
/* Put STR back into its initial decoding state for the coding system
   currently stored in str->codesys. */
2442 reset_decoding_stream (struct decoding_stream *str)
/* Coding systems with private decoder state re-initialize it here. */
2445 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2447 Lisp_Object coding_system;
2448 XSETCODING_SYSTEM (coding_system, str->codesys);
2449 reset_iso2022 (coding_system, &str->iso2022);
2451 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2453 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
/* Forget any partially collected entity reference and composition. */
2458 str->er_counter = 0;
2459 str->combined_char_count = 0;
2460 str->combining_table = Qnil;
/* While the coding system or its EOL type is still to be detected,
   the detection state starts over with every candidate allowed. */
2462 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2463 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2466 str->decst.eol_type = EOL_AUTODETECT;
2467 str->decst.mask = ~0;
2469 str->flags = str->cpos = 0;
2473 decoding_rewinder (Lstream *stream)
2475 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2476 reset_decoding_stream (str);
2477 Dynarr_reset (str->runoff);
2478 return Lstream_rewind (str->other_end);
2482 decoding_seekable_p (Lstream *stream)
2484 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2485 return Lstream_seekable_p (str->other_end);
2489 decoding_flusher (Lstream *stream)
2491 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2492 return Lstream_flush (str->other_end);
/* Close method of a decoding stream.  A stream opened for writing first
   gets CODING_STATE_END set and an empty decoding_writer() call so that
   pending output is pushed through.  The runoff (and, with
   ENABLE_COMPOSITE_CHARS, the ISO2022 composite-character array if
   present) is freed, then the other end is closed and its result
   returned. */
2496 decoding_closer (Lstream *stream)
2498 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2499 if (stream->flags & LSTREAM_FL_WRITE)
2501 str->flags |= CODING_STATE_END;
2502 decoding_writer (stream, 0, 0);
2504 Dynarr_free (str->runoff);
2506 #ifdef ENABLE_COMPOSITE_CHARS
2507 if (str->iso2022.composite_chars)
2508 Dynarr_free (str->iso2022.composite_chars);
2511 return Lstream_close (str->other_end);
/* Return the coding system decoding stream STREAM is using: the
   subsidiary of str->codesys that corresponds to the stream's current
   EOL type, as computed by subsidiary_coding_system(). */
2515 decoding_stream_coding_system (Lstream *stream)
2517 Lisp_Object coding_system;
2518 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2520 XSETCODING_SYSTEM (coding_system, str->codesys);
2521 return subsidiary_coding_system (coding_system, str->eol_type);
/* Switch decoding stream LSTR to coding system CODESYS.  When CODESYS
   names a definite EOL type (anything but EOL_AUTODETECT) it replaces
   the stream's eol_type; the stream's conversion state is then reset.
   NOTE(review): CS is computed here but its store into str->codesys is
   not among the visible lines -- confirm it happens before the reset. */
2525 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2527 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2528 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2530 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2531 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2532 reset_decoding_stream (str);
2535 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2536 stream for writing, no automatic code detection will be performed.
2537 The reason for this is that automatic code detection requires a
2538 seekable input. Things will also fail if you open a decoding
2539 stream for reading using a non-fully-specified coding system and
2540 a non-seekable input stream. */
/* Create a decoding lstream layered over STREAM, using CODESYS.  The
   new stream gets an empty runoff and starts with EOL_AUTODETECT.  For
   mode "r" on a seekable STREAM, determine_real_coding_system() fixes
   the coding system and EOL type up front.  The coding system is then
   installed and the detection state initialized from the resulting EOL
   type with every coding category still allowed.
   NOTE(review): the warning above seems to predate the detection done
   inside mule_decode(), whose comment says it covers writing streams
   and non-seekable reading streams -- confirm which statement is
   current. */
2543 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2546 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2547 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2551 str->other_end = stream;
2552 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2553 str->eol_type = EOL_AUTODETECT;
2554 if (!strcmp (mode, "r")
2555 && Lstream_seekable_p (stream))
2556 /* We can determine the coding system now. */
2557 determine_real_coding_system (stream, &codesys, &str->eol_type);
2558 set_decoding_stream_coding_system (lstr, codesys);
2559 str->decst.eol_type = str->eol_type;
2560 str->decst.mask = ~0;
2561 XSETLSTREAM (obj, lstr);
/* Create a decoding stream in mode "r": reading from the result yields
   the data of STREAM decoded according to CODESYS. */
2566 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2568 return make_decoding_stream_1 (stream, codesys, "r");
/* Create a decoding stream in mode "w": data written to the result is
   decoded according to CODESYS and passed on to STREAM. */
2572 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2574 return make_decoding_stream_1 (stream, codesys, "w");
2577 /* Note: the decode_coding_* functions all take the same
2578 arguments as mule_decode(), which is to say some SRC data of
2579 size N, which is to be stored into dynamic array DST.
2580 DECODING is the stream within which the decoding is
2581 taking place, but no data is actually read from or
2582 written to that stream; that is handled in decoding_reader()
2583 or decoding_writer(). This allows the same functions to
2584 be used for both reading and writing. */
/* Decode the N bytes at SRC into DST according to the coding system of
   stream DECODING.  While the coding system type or the EOL type is
   still undetermined, detect_coding_type() is first run on this chunk;
   if that yields a different coding system the stream is switched to
   it, carrying CODING_STATE_END across the switch.  The switch
   statement then hands the data to the decoder for the coding system
   type. */
2587 mule_decode (Lstream *decoding, const Extbyte *src,
2588 unsigned_char_dynarr *dst, Lstream_data_count n)
2590 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2592 /* If necessary, do encoding-detection now. We do this when
2593 we're a writing stream or a non-seekable reading stream,
2594 meaning that we can't just process the whole input,
2595 rewind, and start over. */
2597 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2598 str->eol_type == EOL_AUTODETECT)
2600 Lisp_Object codesys;
2602 XSETCODING_SYSTEM (codesys, str->codesys);
2603 detect_coding_type (&str->decst, src, n,
2604 CODING_SYSTEM_TYPE (str->codesys) !=
2605 CODESYS_AUTODETECT);
2606 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2607 str->decst.mask != ~0)
2608 /* #### This is cheesy. What we really ought to do is
2609 buffer up a certain amount of data so as to get a
2610 less random result. */
2611 codesys = coding_system_from_mask (str->decst.mask);
2612 str->eol_type = str->decst.eol_type;
2613 if (XCODING_SYSTEM (codesys) != str->codesys)
2615 /* Preserve the CODING_STATE_END flag in case it was set.
2616 If we erase it, bad things might happen. */
2617 int was_end = str->flags & CODING_STATE_END;
2618 set_decoding_stream_coding_system (decoding, codesys);
2620 str->flags |= CODING_STATE_END;
2624 switch (CODING_SYSTEM_TYPE (str->codesys))
2627 case CODESYS_INTERNAL:
2628 Dynarr_add_many (dst, src, n);
2631 case CODESYS_AUTODETECT:
2632 /* If we got this far and still haven't decided on the coding
2633 system, then do no conversion. */
2634 case CODESYS_NO_CONVERSION:
2635 decode_coding_no_conversion (decoding, src, dst, n);
2638 case CODESYS_SHIFT_JIS:
2639 decode_coding_sjis (decoding, src, dst, n);
2642 decode_coding_big5 (decoding, src, dst, n);
2645 decode_coding_ucs4 (decoding, src, dst, n);
2648 decode_coding_utf16 (decoding, src, dst, n);
2651 decode_coding_utf8 (decoding, src, dst, n);
2654 str->ccl.last_block = str->flags & CODING_STATE_END;
2655 /* When applying ccl program to stream, MUST NOT set NULL
2657 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2658 dst, n, 0, CCL_MODE_DECODING);
2660 case CODESYS_ISO2022:
2661 decode_coding_iso2022 (decoding, src, dst, n);
/* Lisp primitive `decode-coding-region'.  The region is converted in
   place by pumping it through a chain of lstreams: a buffer input
   stream over the region feeds an encoding output stream using the
   `binary' coding system, which feeds a decoding output stream using
   CODING-SYSTEM, which feeds a buffer output stream positioned at the
   start of the region.  Text is moved in pieces of at most
   sizeof (tempbuf) bytes, and the source text consumed by each piece is
   removed with buffer_delete_range().  Afterwards the input stream and
   the head of the output chain are closed and all four lstreams are
   deleted.  The region must be writable (barf_if_buffer_read_only). */
2669 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2670 Decode the text between START and END which is encoded in CODING-SYSTEM.
2671 This is useful if you've read in encoded text from a file without decoding
2672 it (e.g. you read in a JIS-formatted file but used the `binary' or
2673 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2674 Return length of decoded text.
2675 BUFFER defaults to the current buffer if unspecified.
2677 (start, end, coding_system, buffer))
2680 struct buffer *buf = decode_buffer (buffer, 0);
2681 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2682 Lstream *istr, *ostr;
2683 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2685 get_buffer_range_char (buf, start, end, &b, &e, 0);
2687 barf_if_buffer_read_only (buf, b, e);
2689 coding_system = Fget_coding_system (coding_system);
2690 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2691 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2692 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2694 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2695 Fget_coding_system (Qbinary));
2696 istr = XLSTREAM (instream);
2697 ostr = XLSTREAM (outstream);
2698 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2700 /* The chain of streams looks like this:
2702 [BUFFER] <----- send through
2703 ------> [ENCODE AS BINARY]
2704 ------> [DECODE AS SPECIFIED]
2710 char tempbuf[1024]; /* some random amount */
2711 Bufpos newpos, even_newer_pos;
2712 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2713 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2717 newpos = lisp_buffer_stream_startpos (istr);
2718 Lstream_write (ostr, tempbuf, size_in_bytes);
2719 even_newer_pos = lisp_buffer_stream_startpos (istr);
2720 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2723 Lstream_close (istr);
2724 Lstream_close (ostr);
2726 Lstream_delete (istr);
2727 Lstream_delete (ostr);
2728 Lstream_delete (XLSTREAM (de_outstream));
2729 Lstream_delete (XLSTREAM (lb_outstream));
2734 /************************************************************************/
2735 /* Converting to an external encoding ("encoding") */
2736 /************************************************************************/
2738 /* An encoding stream is an output stream. When you create the
2739 stream, you specify the coding system that governs the encoding
2740 and another stream that the resulting encoded data is to be
2741 sent to, and then start sending data to it. */
/* Fetch the `encoding'-specific data of lstream STREAM; expands to
   LSTREAM_TYPE_DATA. */
2743 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
/* Per-stream state of an encoding lstream.  This is the data area whose
   size is passed to DEFINE_LSTREAM_IMPLEMENTATION for lstream_encoding
   and that ENCODING_STREAM_DATA returns.

   The ENCODE_CHAR and FINISH members near the end are the per-coding-
   system handlers chosen by reset_encoding_stream():
   text_encode_generic() calls ENCODE_CHAR for each completed character
   and FINISH when the end of the data is reached on a character
   boundary.  current_char_boundary carries text_encode_generic()'s
   position inside a multi-byte character from one call to the next.

   NOTE(review): the comment on CH below speaks of one- and two-byte
   characters only, while text_encode_generic() accumulates six bits per
   byte into it -- the comment looks out of date; confirm. */
2745 struct encoding_stream
2747 /* Coding system that governs the conversion. */
2748 Lisp_Coding_System *codesys;
2750 /* Stream that we read the encoded data from or
2751 write the decoded data to. */
2754 /* If we are reading, then we can return only a fixed amount of
2755 data, so if the conversion resulted in too much data, we store it
2756 here for retrieval the next time around. */
2757 unsigned_char_dynarr *runoff;
2759 /* FLAGS holds flags indicating the current state of the encoding.
2760 Some of these flags are dependent on the coding system. */
2763 /* CH holds a partially built-up character. Since we only deal
2764 with one- and two-byte characters at the moment, we only use
2765 this to store the first byte of a two-byte character. */
2768 /* Additional information used by the ISO2022 encoder. */
2771 /* CHARSET holds the character sets currently assigned to the G0
2772 through G3 registers. It is initialized from the array
2773 INITIAL_CHARSET in CODESYS. */
2774 Lisp_Object charset[4];
2776 /* Which registers are currently invoked into the left (GL) and
2777 right (GR) halves of the 8-bit encoding space? */
2778 int register_left, register_right;
2780 /* Whether we need to explicitly designate the charset in the
2781 G? register before using it. It is initialized from the
2782 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2783 unsigned char force_charset_on_output[4];
2785 /* Other state variables that need to be preserved across
2787 Lisp_Object current_charset;
2789 int current_char_boundary;
2792 void (*encode_char) (struct encoding_stream *str, Emchar c,
2793 unsigned_char_dynarr *dst, unsigned int *flags);
2794 void (*finish) (struct encoding_stream *str,
2795 unsigned_char_dynarr *dst, unsigned int *flags);
2797 /* Additional information (the state of the running CCL program)
2798 used by the CCL encoder. */
2799 struct ccl_program ccl;
/* Forward declarations of the encoding-stream functions defined below
   (presumably installed as the methods of lstream_encoding -- the
   installation is not part of this section), followed by the definition
   of the "encoding" lstream implementation, whose per-stream data is a
   struct encoding_stream. */
2803 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2804 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2805 Lstream_data_count size);
2806 static int encoding_rewinder (Lstream *stream);
2807 static int encoding_seekable_p (Lstream *stream);
2808 static int encoding_flusher (Lstream *stream);
2809 static int encoding_closer (Lstream *stream);
2811 static Lisp_Object encoding_marker (Lisp_Object stream);
2813 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2814 sizeof (struct encoding_stream));
/* Marker for an encoding stream.  Marks the lstream at the other end of
   STREAM and, when that lstream's implementation supplies a marker of
   its own, returns the result of calling it on the other-end stream. */
2817 encoding_marker (Lisp_Object stream)
2819 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2820 Lisp_Object str_obj;
2822 /* We do not need to mark the coding systems or charsets stored
2823 within the stream because they are stored in a global list
2824 and automatically marked. */
2826 XSETLSTREAM (str_obj, str);
2827 mark_object (str_obj);
2828 if (str->imp->marker)
2829 return (str->imp->marker) (str_obj);
2834 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2835 so we read data from the other end, encode it, and store it into DATA. */
2837 static Lstream_data_count
2838 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2840 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2841 unsigned char *orig_data = data;
2842 Lstream_data_count read_size;
2843 int error_occurred = 0;
2845 /* We need to interface to mule_encode(), which expects to take some
2846 amount of data and store the result into a Dynarr. We have
2847 mule_encode() store into str->runoff, and take data from there
2850 /* We loop until we have enough data, reading chunks from the other
2851 end and encoding it. */
2854 /* Take data from the runoff if we can. Make sure to take at
2855 most SIZE bytes, and delete the data from the runoff. */
2856 if (Dynarr_length (str->runoff) > 0)
2858 int chunk = min ((int) size, Dynarr_length (str->runoff));
2859 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2860 Dynarr_delete_many (str->runoff, 0, chunk);
2866 break; /* No more room for data */
2868 if (str->flags & CODING_STATE_END)
2869 /* This means that on the previous iteration, we hit the EOF on
2870 the other end. We loop once more so that mule_encode() can
2871 output any final stuff it may be holding, or any "go back
2872 to a sane state" escape sequences. (This latter makes sense
2873 during encoding.) */
2876 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2877 left of storage in it, so it's OK to read directly into it.
2878 (We'll be overwriting above, after we've encoded it into the
2880 read_size = Lstream_read (str->other_end, data, size);
2887 /* There might be some more end data produced in the translation.
2888 See the comment above. */
2889 str->flags |= CODING_STATE_END;
2890 mule_encode (stream, data, str->runoff, read_size);
2893 if (data == orig_data)
2894 return error_occurred ? -1 : 0;
2896 return data - orig_data;
/* Write method of an encoding stream.  The SIZE bytes at DATA are
   encoded into str->runoff with mule_encode(); the whole runoff is then
   offered to str->other_end with Lstream_write(), and the RETVAL bytes
   that call reports are removed from the front of the runoff.  As the
   comment at the end of the body says, the function's result describes
   the incoming data, not RETVAL (the return statement itself is not
   visible here -- TODO confirm what is returned on a write error). */
2899 static Lstream_data_count
2900 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2902 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2903 Lstream_data_count retval;
2905 /* Encode all our data into the runoff, and then attempt to write
2906 it all out to the other end. Remove whatever chunk we succeeded
2908 mule_encode (stream, data, str->runoff, size);
2909 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2910 Dynarr_length (str->runoff));
2912 Dynarr_delete_many (str->runoff, 0, retval);
2913 /* Do NOT return retval. The return value indicates how much
2914 of the incoming data was written, not how many bytes were
/* Put STR back into its initial encoding state for the coding system in
   str->codesys.  The switch selects the encode_char / finish handler
   pair that text_encode_generic() will call, and sets up any state
   private to the coding system type. */
2920 reset_encoding_stream (struct encoding_stream *str)
2923 switch (CODING_SYSTEM_TYPE (str->codesys))
2925 case CODESYS_ISO2022:
2929 str->encode_char = &char_encode_iso2022;
2930 str->finish = &char_finish_iso2022;
/* Load the four charset registers and their force-designation flags
   from the coding system. */
2931 for (i = 0; i < 4; i++)
2933 str->iso2022.charset[i] =
2934 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2935 str->iso2022.force_charset_on_output[i] =
2936 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
/* Register 0 starts out invoked on the left, register 1 on the right. */
2938 str->iso2022.register_left = 0;
2939 str->iso2022.register_right = 1;
2940 str->iso2022.current_charset = Qnil;
2941 str->iso2022.current_half = 0;
2945 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2948 str->encode_char = &char_encode_utf8;
2949 str->finish = &char_finish_utf8;
2952 str->encode_char = &char_encode_utf16;
2953 str->finish = &char_finish_utf16;
2956 str->encode_char = &char_encode_ucs4;
2957 str->finish = &char_finish_ucs4;
2959 case CODESYS_SHIFT_JIS:
2960 str->encode_char = &char_encode_shift_jis;
2961 str->finish = &char_finish_shift_jis;
2964 str->encode_char = &char_encode_big5;
2965 str->finish = &char_finish_big5;
/* State common to all coding system types. */
2971 str->iso2022.current_char_boundary = 0;
2972 str->flags = str->ch = 0;
2976 encoding_rewinder (Lstream *stream)
2978 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2979 reset_encoding_stream (str);
2980 Dynarr_reset (str->runoff);
2981 return Lstream_rewind (str->other_end);
2985 encoding_seekable_p (Lstream *stream)
2987 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2988 return Lstream_seekable_p (str->other_end);
2992 encoding_flusher (Lstream *stream)
2994 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2995 return Lstream_flush (str->other_end);
2999 encoding_closer (Lstream *stream)
3001 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3002 if (stream->flags & LSTREAM_FL_WRITE)
3004 str->flags |= CODING_STATE_END;
3005 encoding_writer (stream, 0, 0);
3007 Dynarr_free (str->runoff);
3008 return Lstream_close (str->other_end);
/* Return, as a Lisp object, the coding system stored in encoding stream
   STREAM. */
3012 encoding_stream_coding_system (Lstream *stream)
3014 Lisp_Object coding_system;
3015 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3017 XSETCODING_SYSTEM (coding_system, str->codesys);
3018 return coding_system;
/* Switch encoding stream LSTR to coding system CODESYS and reset the
   stream's conversion state.
   NOTE(review): CS is computed here but its store into str->codesys is
   not among the visible lines -- confirm it happens before the reset. */
3022 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3024 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3025 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3027 reset_encoding_stream (str);
/* Create an encoding lstream layered over STREAM, using CODESYS.  The
   new stream gets an empty runoff, remembers STREAM as its other end
   and has the coding system installed; the lstream is then wrapped in
   the Lisp object OBJ. */
3031 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3034 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3035 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3039 str->runoff = Dynarr_new (unsigned_char);
3040 str->other_end = stream;
3041 set_encoding_stream_coding_system (lstr, codesys);
3042 XSETLSTREAM (obj, lstr);
/* Create an encoding stream in mode "r": reading from the result yields
   the data of STREAM encoded according to CODESYS. */
3047 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3049 return make_encoding_stream_1 (stream, codesys, "r");
/* Create an encoding stream in mode "w": data written to the result is
   encoded according to CODESYS and passed on to STREAM. */
3053 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3055 return make_encoding_stream_1 (stream, codesys, "w");
3058 /* Convert N bytes of internally-formatted data stored in SRC to an
3059 external format, according to the encoding stream ENCODING.
3060 Store the encoded data into DST. */
/* CODESYS_INTERNAL copies the bytes unchanged.  CODESYS_AUTODETECT and
   CODESYS_NO_CONVERSION both go through encode_coding_no_conversion().
   The ccl_driver() call runs the stream's CCL program in
   CCL_MODE_ENCODING, substituting an empty string for a null SRC and
   flagging the last block from CODING_STATE_END.  text_encode_generic()
   handles the remaining coding system types (presumably as the default
   case -- TODO confirm). */
3063 mule_encode (Lstream *encoding, const Bufbyte *src,
3064 unsigned_char_dynarr *dst, Lstream_data_count n)
3066 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3068 switch (CODING_SYSTEM_TYPE (str->codesys))
3071 case CODESYS_INTERNAL:
3072 Dynarr_add_many (dst, src, n);
3075 case CODESYS_AUTODETECT:
3076 /* If we got this far and still haven't decided on the coding
3077 system, then do no conversion. */
3078 case CODESYS_NO_CONVERSION:
3079 encode_coding_no_conversion (encoding, src, dst, n);
3083 str->ccl.last_block = str->flags & CODING_STATE_END;
3084 /* When applying ccl program to stream, MUST NOT set NULL
3086 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3087 dst, n, 0, CCL_MODE_ENCODING);
3091 text_encode_generic (encoding, src, dst, n);
/* Lisp primitive `encode-coding-region'.  The region is converted in
   place by pumping it through a chain of lstreams: a buffer input
   stream over the region feeds an encoding output stream using
   CODING-SYSTEM, which feeds a decoding output stream using the
   `binary' coding system, which feeds a buffer output stream positioned
   at the start of the region.  Text is moved in pieces of at most
   sizeof (tempbuf) bytes, and the source text consumed by each piece is
   removed with buffer_delete_range().  The result is RETLEN as a Lisp
   integer, computed from the input stream's final start position
   relative to the start of the region.  Afterwards the input stream and
   the head of the output chain are closed and all four lstreams are
   deleted.  The region must be writable (barf_if_buffer_read_only). */
3095 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3096 Encode the text between START and END using CODING-SYSTEM.
3097 This will, for example, convert Japanese characters into stuff such as
3098 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3099 text. BUFFER defaults to the current buffer if unspecified.
3101 (start, end, coding_system, buffer))
3104 struct buffer *buf = decode_buffer (buffer, 0);
3105 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3106 Lstream *istr, *ostr;
3107 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3109 get_buffer_range_char (buf, start, end, &b, &e, 0);
3111 barf_if_buffer_read_only (buf, b, e);
3113 coding_system = Fget_coding_system (coding_system);
3114 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3115 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3116 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3117 Fget_coding_system (Qbinary));
3118 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3120 istr = XLSTREAM (instream);
3121 ostr = XLSTREAM (outstream);
3122 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3123 /* The chain of streams looks like this:
3125 [BUFFER] <----- send through
3126 ------> [ENCODE AS SPECIFIED]
3127 ------> [DECODE AS BINARY]
3132 char tempbuf[1024]; /* some random amount */
3133 Bufpos newpos, even_newer_pos;
3134 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3135 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3139 newpos = lisp_buffer_stream_startpos (istr);
3140 Lstream_write (ostr, tempbuf, size_in_bytes);
3141 even_newer_pos = lisp_buffer_stream_startpos (istr);
3142 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3148 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3149 Lstream_close (istr);
3150 Lstream_close (ostr);
3152 Lstream_delete (istr);
3153 Lstream_delete (ostr);
3154 Lstream_delete (XLSTREAM (de_outstream));
3155 Lstream_delete (XLSTREAM (lb_outstream));
3156 return make_int (retlen);
/* Generic encoder for the coding system types that supply encode_char
   and finish handlers.  It reassembles characters from the N bytes of
   internally-formatted text at SRC and passes each completed character
   to str->encode_char, which writes to DST.

   FLAGS, the partial character CH and CHAR_BOUNDARY are loaded from STR
   on entry and CHAR_BOUNDARY is stored back at the end.  CHAR_BOUNDARY
   appears to count the trailing bytes still expected: at 0 a character
   is passed on directly, at 1 the last six bits are merged into CH and
   the result is passed on, and otherwise six more bits are accumulated
   (NOTE(review): the code that sets and decrements it is not visible
   here -- confirm).  str->finish runs when CODING_STATE_END is set and
   the data ends on a character boundary. */
3163 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3164 unsigned_char_dynarr *dst, Lstream_data_count n)
3167 unsigned char char_boundary;
3168 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3169 unsigned int flags = str->flags;
3170 Emchar ch = str->ch;
3172 char_boundary = str->iso2022.current_char_boundary;
3178 if (char_boundary == 0)
3206 (*str->encode_char) (str, c, dst, &flags);
3208 else if (char_boundary == 1)
3210 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3216 ch = (ch << 6) | (c & 0x3f);
3221 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3223 (*str->finish) (str, dst, &flags);
3228 str->iso2022.current_char_boundary = char_boundary;
3233 /************************************************************************/
3234 /* entity reference */
3235 /************************************************************************/
/* Give up on a partially collected entity reference: copy the
   str->er_counter bytes buffered in str->er_buf to DST unchanged and
   empty the buffer.  Does nothing when the buffer is already empty. */
3238 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3240 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3242 if ( str->er_counter > 0)
3244 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3245 str->er_counter = 0;
3249 EXFUN (Fregexp_quote, 1);
/* Feed the decoded character C through the entity-reference recognizer
   of STR, writing finished output to DST.

   With an empty buffer, '&' may start a reference when the coding
   system uses entity references (a further condition is not visible
   here); otherwise C is emitted with DECODE_ADD_UCS_CHAR.

   When a collected reference is complete, the buffered text is made
   into a Lisp string and matched against one pattern per entry of
   Vcoded_charset_entity_reference_alist: "^&", the regexp-quoted
   prefix, then a decimal, lower-case hex or upper-case hex number
   group.  On a match the number is converted to a character in the
   entry's charset and emitted.  Failing that, "^&MCS-" followed by
   upper-case hex digits is tried and the number itself is emitted as
   the character.  If nothing matches, the buffered bytes and a ';' are
   copied to DST literally.  The buffer is emptied in every case.

   While collecting, a full buffer (ER_BUF_SIZE bytes) or a character
   C >= 0x7F aborts the reference: the buffered bytes are copied out
   and C is emitted.  Any other character is appended to the buffer. */
3251 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3252 unsigned_char_dynarr* dst);
3254 decode_add_er_char (struct decoding_stream *str, Emchar c,
3255 unsigned_char_dynarr* dst)
3257 if (str->er_counter == 0)
3259 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3262 str->er_buf[0] = '&';
3266 DECODE_ADD_UCS_CHAR (c, dst);
3270 Lisp_Object string = make_string (str->er_buf,
3277 Lisp_Object char_type;
3280 for ( rest = Vcoded_charset_entity_reference_alist;
3281 !NILP (rest); rest = Fcdr (rest) )
3287 char_type = XCDR (ccs);
3292 if (NILP (ccs = Ffind_charset (ccs)))
3301 pat = Fregexp_quote (pat);
3308 pat = concat3 (build_string ("^&"),
3309 pat, build_string ("\\([0-9]+\\)$"));
3312 else if (EQ (ret, Qx))
3314 pat = concat3 (build_string ("^&"),
3315 pat, build_string ("\\([0-9a-f]+\\)$"));
3318 else if (EQ (ret, QX))
3320 pat = concat3 (build_string ("^&"),
3321 pat, build_string ("\\([0-9A-F]+\\)$"));
3327 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3330 = XINT (Fstring_to_number
3331 (Fsubstring (string,
3332 Fmatch_beginning (make_int (1)),
3333 Fmatch_end (make_int (1))),
3337 ? DECODE_CHAR (ccs, code, 0)
3338 : decode_builtin_char (ccs, code);
3340 DECODE_ADD_UCS_CHAR (chr, dst);
3344 if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3345 string, Qnil, Qnil)))
3348 = XUINT (Fstring_to_number
3349 (Fsubstring (string,
3350 Fmatch_beginning (make_int (1)),
3351 Fmatch_end (make_int (1))),
3354 DECODE_ADD_UCS_CHAR (code, dst);
3358 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3359 Dynarr_add (dst, ';');
3362 str->er_counter = 0;
3364 else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3366 Dynarr_add_many (dst, str->er_buf, str->er_counter);
3367 str->er_counter = 0;
3368 DECODE_ADD_UCS_CHAR (c, dst);
3371 str->er_buf[str->er_counter++] = c;
/* Write into BUF, as a NUL-terminated string, an entity reference that
   denotes the character CH.

   The entries of Vcoded_charset_entity_reference_alist are tried in
   order.  An entry is used when its charset exists, yields a
   non-negative code point for CH, and passes the char_type test
   (char_type is nil, or DECODE_CHAR of that code point differs from
   CH).  For such an entry a printf format is assembled in FORMAT: the
   entry's prefix string, '%', an optional zero-padded width of 2 to 8
   columns, a 'd', 'x' or 'X' conversion and a closing ';'.  The code
   point is then printed with that format.  The final sprintf call
   produces the fallback form "&MCS-" plus eight upper-case hex digits
   and ';'.

   NOTE(review): both sprintf calls are unbounded, and strncpy does not
   NUL-terminate FORMAT by itself.  The fallback alone writes 15 bytes
   including the terminator, so BUF must be at least that large --
   confirm the size every caller provides. */
3374 void char_encode_as_entity_reference (Emchar ch, char* buf);
3376 char_encode_as_entity_reference (Emchar ch, char* buf)
3378 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3381 Lisp_Object char_type;
3382 int format_columns, idx;
3383 char format[ER_BUF_SIZE];
3385 while (!NILP (rest))
3391 char_type = XCDR (ccs);
3396 if (!NILP (ccs = Ffind_charset (ccs)))
3398 int code_point = charset_code_point (ccs, ch, 0);
3400 if ( (code_point >= 0)
3401 && (NILP (char_type)
3402 || DECODE_CHAR (ccs, code_point, 0) != ch) )
3408 if ( STRINGP (ret) &&
3409 ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3412 strncpy (&format[1], XSTRING_DATA (ret), idx);
3422 format[idx++] = '%';
3423 format_columns = XINT (ret);
3424 if ( (2 <= format_columns) && (format_columns <= 8)
3425 && (idx + format_columns <= ER_BUF_SIZE - 1) )
3427 format [idx++] = '0';
3428 format [idx++] = '0' + format_columns;
3437 format [idx++] = 'd';
3438 else if (EQ (ret, Qx))
3439 format [idx++] = 'x';
3440 else if (EQ (ret, QX))
3441 format [idx++] = 'X';
3444 format [idx++] = ';';
3447 sprintf (buf, format, code_point);
3454 sprintf (buf, "&MCS-%08X;", ch);
3458 /************************************************************************/
3459 /* character composition */
3460 /************************************************************************/
3461 extern Lisp_Object Qcomposition;
/* Abandon the composition in progress: pass every character held in
   str->combined_chars, in order, to decode_add_er_char() and clear the
   composition state (count and combining table). */
3464 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3466 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3470 for (i = 0; i < str->combined_char_count; i++)
3471 decode_add_er_char (str, str->combined_chars[i], dst);
3472 str->combined_char_count = 0;
3473 str->combining_table = Qnil;
/* Add the decoded character CHARACTER to the output of STR, composing
   it with a held-back character where the `composition' char-feature
   says so.

   If the coding system disables composition, CHARACTER goes straight to
   decode_add_er_char().  If no combining table is active, CHARACTER's
   own composition feature is looked up: it is either emitted at once
   or held back as combined_chars[0] with that feature as the new
   combining table.  If a table is active, CHARACTER is looked up in it
   with Fassq.  A hit yields the composed character CHAR2, which is in
   turn either emitted (clearing the state) or held back with its own
   table.  On a miss the held characters are flushed with
   COMPOSE_FLUSH_CHARS and CHARACTER is treated as a fresh character.
   NOTE(review): the tests that choose between "emit" and "hold back"
   are not visible here -- confirm what they check. */
3476 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3477 unsigned_char_dynarr* dst);
3479 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3480 Emchar character, unsigned_char_dynarr* dst)
3482 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3483 decode_add_er_char (str, character, dst);
3484 else if (!CONSP (str->combining_table))
3487 = Fchar_feature (make_char (character), Qcomposition, Qnil,
3491 decode_add_er_char (str, character, dst);
3494 str->combined_chars[0] = character;
3495 str->combined_char_count = 1;
3496 str->combining_table = ret;
3502 = Fcdr (Fassq (make_char (character), str->combining_table));
3506 Emchar char2 = XCHARVAL (ret);
3507 Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3512 decode_add_er_char (str, char2, dst);
3513 str->combined_char_count = 0;
3514 str->combining_table = Qnil;
3518 str->combined_chars[0] = char2;
3519 str->combined_char_count = 1;
3520 str->combining_table = ret2;
3525 ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3528 COMPOSE_FLUSH_CHARS (str, dst);
3530 decode_add_er_char (str, character, dst);
3533 str->combined_chars[0] = character;
3534 str->combined_char_count = 1;
3535 str->combining_table = ret;
3540 #else /* not UTF2000 */
/* Variants used when UTF2000 is not defined: flushing expands to nothing
   and adding a character expands directly to DECODE_ADD_UCS_CHAR (the
   STR argument is not used).  */
3541 #define COMPOSE_FLUSH_CHARS(str, dst)
3542 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3543 #endif /* UTF2000 */
3546 /************************************************************************/
3547 /* Shift-JIS methods */
3548 /************************************************************************/
3550 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3551 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3552 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3553 encoded by "position-code + 0x80". A character of JISX0208
3554 (DIMENSION2_CHARS94 character set) is encoded in 2 bytes, but the two
3555 position-codes are divided and shifted so that they fit in the range
3558 --- CODE RANGE of Shift-JIS ---
3559 (character set) (range)
3561 JISX0201-Kana 0xA1 .. 0xDF
3562 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3563 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3564 -------------------------------
3568 /* Is this the first byte of a Shift-JIS two-byte char? */
3570 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3571 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3573 /* Is this the second byte of a Shift-JIS two-byte char? */
3575 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3576 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
/* Is this a single byte in the range 0xA1 .. 0xDF?  The Shift-JIS
   decoder below treats such a byte as a JISX0201 katakana code.  */
3578 #define BYTE_SJIS_KATAKANA_P(c) \
3579 ((c) >= 0xA1 && (c) <= 0xDF)
/* Shift-JIS detector: examines bytes taken from SRC, keeping its state
   in st->shift_jis between bytes.

   ESC, SI and SO are tested for first.  Apart from that the detector
   tracks, in st->shift_jis.in_second_byte, whether the previous byte
   opened a two-byte character: a byte in 0x80 .. 0x9F or >= 0xE0 sets
   the flag, and the byte that follows clears it again.  Note that this
   range is slightly wider than BYTE_SJIS_TWO_BYTE_1_P.  The exit shown
   returns CODING_CATEGORY_SHIFT_JIS_MASK.

   NOTE(review): the action taken for ESC/SI/SO, and any validation of
   the second byte, are not among the statements below -- presumably
   they reject the category; confirm.  */
3582 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3586 unsigned char c = *(unsigned char *)src++;
3587 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3589 if (st->shift_jis.in_second_byte)
3591 st->shift_jis.in_second_byte = 0;
3595 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3596 st->shift_jis.in_second_byte = 1;
3598 return CODING_CATEGORY_SHIFT_JIS_MASK;
3601 /* Convert Shift-JIS data to internal format. */
/* Bytes are taken one at a time from SRC and the decoded result is
   appended to DST.  A pending first byte of a two-byte character is
   carried in CPOS.

   - With a first byte pending, a valid second byte is converted with
     DECODE_SJIS and emitted as a JISX0208 character; otherwise both
     bytes are passed through as binary.
   - With nothing pending, end-of-line handling is applied, then the byte
     is classified: first byte of a two-byte character, single-byte
     JISX0201 katakana, or something emitted through the latin-jisx0201
     / binary paths.

   Two emission styles appear side by side (DECODE_ADD_UCS_CHAR on a
   MAKE_CHAR value, and raw leading byte + position codes via
   Dynarr_add).  NOTE(review): presumably these are alternatives chosen
   by a preprocessor conditional that is not shown -- confirm.  */
3604 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3605 unsigned_char_dynarr *dst, Lstream_data_count n)
3607 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
/* Work on local copies of the state stored in the stream.  */
3608 unsigned int flags = str->flags;
3609 unsigned int cpos = str->cpos;
3610 eol_type_t eol_type = str->eol_type;
3614 unsigned char c = *(unsigned char *)src++;
3618 /* Previous character was first byte of Shift-JIS Kanji char. */
3619 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3621 unsigned char e1, e2;
/* CPOS is the first byte, C the second; E1/E2 receive the result.  */
3623 DECODE_SJIS (cpos, c, e1, e2);
3625 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3629 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3630 Dynarr_add (dst, e1);
3631 Dynarr_add (dst, e2);
/* Pass the pending first byte and the current byte through as binary.  */
3636 DECODE_ADD_BINARY_CHAR (cpos, dst);
3637 DECODE_ADD_BINARY_CHAR (c, dst);
3643 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3644 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3646 else if (BYTE_SJIS_KATAKANA_P (c))
3649 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3652 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3653 Dynarr_add (dst, c);
3658 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3662 DECODE_ADD_BINARY_CHAR (c, dst);
3664 label_continue_loop:;
3667 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3673 /* Convert internal character representation to Shift_JIS. */
/* Encodes the single character CH and appends the resulting bytes to
   DST.  The end-of-line convention comes from the coding system of STR.

   Two encoding strategies appear below.  The first asks
   charset_code_point for CH in a fixed order of charsets
   (latin-jisx0201, japanese-jisx0208-1990, katakana-jisx0201,
   japanese-jisx0208, ascii) and uses the first non-negative answer.
   The second splits CH with BREAKUP_CHAR and dispatches on the charset.
   Both emit '?' when CH cannot be represented.
   NOTE(review): presumably the two strategies are alternatives chosen
   by a preprocessor conditional that is not shown -- confirm.  */
3676 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3677 unsigned_char_dynarr *dst, unsigned int *flags)
3679 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Line ending: a CR is emitted unless the EOL type is LF or autodetect,
   and CH itself is emitted unless the EOL type is CR.  */
3683 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3684 Dynarr_add (dst, '\r');
3685 if (eol_type != EOL_CR)
3686 Dynarr_add (dst, ch);
3690 unsigned int s1, s2;
3692 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3694 if (code_point >= 0)
3695 Dynarr_add (dst, code_point);
3696 else if ((code_point
3697 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
/* Two-byte code point: high and low octets, each with bit 7 set, are
   converted to the Shift-JIS byte pair S1/S2.  */
3700 ENCODE_SJIS ((code_point >> 8) | 0x80,
3701 (code_point & 0xFF) | 0x80, s1, s2);
3702 Dynarr_add (dst, s1);
3703 Dynarr_add (dst, s2);
3705 else if ((code_point
3706 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
/* Katakana is written as a single byte with bit 7 set.  */
3708 Dynarr_add (dst, code_point | 0x80);
3709 else if ((code_point
3710 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3713 ENCODE_SJIS ((code_point >> 8) | 0x80,
3714 (code_point & 0xFF) | 0x80, s1, s2);
3715 Dynarr_add (dst, s1);
3716 Dynarr_add (dst, s2);
3718 else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3720 Dynarr_add (dst, code_point);
3722 Dynarr_add (dst, '?');
/* BREAKUP_CHAR-based strategy: CHARSET, C1 and C2 are filled in from CH.  */
3724 Lisp_Object charset;
3725 unsigned int c1, c2;
3727 BREAKUP_CHAR (ch, charset, c1, c2);
3729 if (EQ(charset, Vcharset_katakana_jisx0201))
3731 Dynarr_add (dst, c1 | 0x80);
3735 Dynarr_add (dst, c1);
3737 else if (EQ(charset, Vcharset_japanese_jisx0208))
3739 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3740 Dynarr_add (dst, s1);
3741 Dynarr_add (dst, s2);
3744 Dynarr_add (dst, '?');
3750 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3751 unsigned int *flags)
/* Lisp primitive.  The car and cdr of CODE are checked with CHECK_INT and
   stored in unsigned char variables, so only their low-order byte is
   considered.  When the first passes BYTE_SJIS_TWO_BYTE_1_P and the
   second BYTE_SJIS_TWO_BYTE_2_P, the pair is converted with DECODE_SJIS
   and a Vcharset_japanese_jisx0208 character is built from the low seven
   bits of each result.
   NOTE(review): the value returned for an invalid pair is not among the
   statements shown -- confirm.  */
3755 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3756 Decode a JISX0208 character of Shift-JIS coding-system.
3757 CODE is the character code in Shift-JIS as a cons of two bytes.
3758 Return the corresponding character.
3762 unsigned char c1, c2, s1, s2;
3765 CHECK_INT (XCAR (code));
3766 CHECK_INT (XCDR (code));
3767 s1 = XINT (XCAR (code));
3768 s2 = XINT (XCDR (code));
3769 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3770 BYTE_SJIS_TWO_BYTE_2_P (s2))
3772 DECODE_SJIS (s1, s2, c1, c2);
3773 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3774 c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive.  CHARACTER is coerced/checked with
   CHECK_CHAR_COERCE_INT and split with BREAKUP_CHAR.  When its charset
   is Vcharset_japanese_jisx0208, the two position codes (with bit 7 set)
   are converted with ENCODE_SJIS and returned as a cons of two integers.
   NOTE(review): the value returned for any other charset is not among
   the statements shown -- confirm.  */
3780 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3781 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3782 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3786 Lisp_Object charset;
3789 CHECK_CHAR_COERCE_INT (character);
3790 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3791 if (EQ (charset, Vcharset_japanese_jisx0208))
3793 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3794 return Fcons (make_int (s1), make_int (s2));
3801 /************************************************************************/
3803 /************************************************************************/
3805 /* BIG5 is a coding system encoding two character sets: ASCII and
3806 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3807 character set and is encoded in two bytes.
3809 --- CODE RANGE of BIG5 ---
3810 (character set) (range)
3812 Big5 (1st byte) 0xA1 .. 0xFE
3813 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3814 --------------------------
3816 Since the number of characters in Big5 is larger than maximum
3817 characters in Emacs' charset (96x96), it can't be handled as one
3818 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3819 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3820 contains frequently used characters and the latter contains less
3821 frequently used characters. */
/* Is this the first byte of a Big5 two-byte char?  Two alternative
   ranges follow (0x81 .. 0xFE and 0xA1 .. 0xFE).
   NOTE(review): presumably a preprocessor conditional, not shown here,
   selects one of them -- confirm.  */
3824 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3825 ((c) >= 0x81 && (c) <= 0xFE)
3827 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3828 ((c) >= 0xA1 && (c) <= 0xFE)
3831 /* Is this the second byte of a Big5 two-byte char? */
3833 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3834 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3836 /* Number of Big5 characters which have the same code in 1st byte.
   This evaluates to 94 + 63 = 157: the second-byte values
   0xA1 .. 0xFE plus 0x40 .. 0x7E.  */
3838 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3840 /* Code conversion macros. These are macros because they are used in
3841 inner loops during code conversion.
3843 Note that temporary variables in macros introduce the classic
3844 dynamic-scoping problems with variable names. We use capital-
3845 lettered variables in the assumption that XEmacs does not use
3846 capital letters in variables except in a very formalized way
3849 /* Convert Big5 code (b1, b2) into its internal string representation
3852 /* There is a much simpler way to split the Big5 charset into two.
3853 For the moment I'm going to leave the algorithm as-is because it
3854 claims to separate out the most-used characters into a single
3855 charset, which perhaps will lead to optimizations in various
3858 The way the algorithm works is something like this:
3860 Big5 can be viewed as a 94x157 charset, where the row is
3861 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3862 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3863 the split between low and high column numbers is apparently
3864 meaningless; ascending rows produce less and less frequent chars.
3865 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3866 the first charset, and the upper half (0xC9 .. 0xFE) to the
3867 second. To do the conversion, we convert the character into
3868 a single number where 0 .. 156 is the first row, 157 .. 313
3869 is the second, etc. That way, the characters are ordered by
3870 decreasing frequency. Then we just chop the space in two
3871 and coerce the result into a 94x94 space.
/* Arguments: B1/B2 are the two Big5 bytes (copied into locals, so each
   is evaluated once); LB receives one of the two Big5 leading bytes; C1
   and C2 receive the position codes, each offset by 0xA1.

   The bytes are folded into a linear index: (row - 0xA1) rows of
   BIG5_SAME_ROW columns plus the column, where second bytes below 0x7F
   are rebased by 0x40 and the others by 0x62 (so 0xA1 follows directly
   after 0x7E).  For LEADING_BYTE_CHINESE_BIG5_2 the index is rebased by
   the rows 0xA1 .. 0xC8.  The index is finally split into two
   base-(0xFF - 0xA1) digits.  */
3874 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3876 int B1 = b1, B2 = b2; \
3878 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3882 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3886 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3887 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3889 c1 = I / (0xFF - 0xA1) + 0xA1; \
3890 c2 = I % (0xFF - 0xA1) + 0xA1; \
3893 /* Convert the internal string representation of a Big5 character
3894 (lb, c1, c2) into Big5 code (b1, b2). */
/* C1 and C2 are taken with the 0xA1 offset still applied and folded into
   a linear index (base 0xFF - 0xA1).  When LB is
   LEADING_BYTE_CHINESE_BIG5_2 the index is advanced past the rows
   0xA1 .. 0xC8.  The index is then split by BIG5_SAME_ROW into the
   first byte (plus 0xA1) and a column; columns below 0x3F map to
   0x40 .. 0x7E and the rest to 0xA1 upward (offset 0x62).  */
3896 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3898 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3900 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3902 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3904 b1 = I / BIG5_SAME_ROW + 0xA1; \
3905 b2 = I % BIG5_SAME_ROW; \
3906 b2 += b2 < 0x3F ? 0x40 : 0x62; \
/* Big5 detector: examines bytes taken from SRC, keeping its state in
   st->big5 between bytes.

   ESC, SI, SO and bytes in 0x80 .. 0xA0 are tested for first.  Apart
   from that the detector tracks, in st->big5.in_second_byte, whether
   the previous byte opened a two-byte character.  While the flag is set
   it is cleared and the byte is checked against the ranges that cannot
   be a second byte (below 0x40, or 0x80 .. 0xA0).  The exit shown
   returns CODING_CATEGORY_BIG5_MASK.

   NOTE(review): the actions attached to the two range tests, and the
   condition under which in_second_byte is set, are not among the
   statements below -- confirm.  */
3910 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3914 unsigned char c = *(unsigned char *)src++;
3915 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3917 || (c >= 0x80 && c <= 0xA0)
3921 if (st->big5.in_second_byte)
3923 st->big5.in_second_byte = 0;
3924 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3934 st->big5.in_second_byte = 1;
3936 return CODING_CATEGORY_BIG5_MASK;
3939 /* Convert Big5 data to internal format. */
/* Bytes are taken one at a time from SRC and the decoded result is
   appended to DST.  A pending first byte of a two-byte character is
   carried in CPOS.

   With a first byte pending and a valid second byte, two conversions
   appear below: one builds a 16-bit code point and looks it up with
   decode_defined_char / DECODE_CHAR, the other uses DECODE_BIG5 and
   writes three bytes.  An invalid second byte makes both bytes go out
   as binary.
   NOTE(review): presumably the paired variants here and further down
   are alternatives chosen by a preprocessor conditional that is not
   shown -- confirm.  */
3942 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3943 unsigned_char_dynarr *dst, Lstream_data_count n)
3945 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
/* Work on local copies of the state stored in the stream.  */
3946 unsigned int flags = str->flags;
3947 unsigned int cpos = str->cpos;
3948 eol_type_t eol_type = str->eol_type;
3951 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3952 (decoding)->codesys, 1);
3957 unsigned char c = *(unsigned char *)src++;
3960 /* Previous character was first byte of Big5 char. */
3961 if (BYTE_BIG5_TWO_BYTE_2_P (c))
/* First byte in the high octet, second byte in the low octet.  */
3964 int code_point = (cpos << 8) | c;
3965 Emchar char_id = decode_defined_char (ccs, code_point, 0);
3969 = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3970 DECODE_ADD_UCS_CHAR (char_id, dst);
/* DECODE_BIG5 fills B1 with the leading byte and B2/B3 with the two
   position codes.  */
3972 unsigned char b1, b2, b3;
3973 DECODE_BIG5 (cpos, c, b1, b2, b3);
3974 Dynarr_add (dst, b1);
3975 Dynarr_add (dst, b2);
3976 Dynarr_add (dst, b3);
/* Pass the pending first byte and the current byte through as binary.  */
3981 DECODE_ADD_BINARY_CHAR (cpos, dst);
3982 DECODE_ADD_BINARY_CHAR (c, dst);
3988 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3989 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3991 decode_flush_er_chars (str, dst);
3996 decode_flush_er_chars (str, dst);
3997 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3998 DECODE_ADD_BINARY_CHAR (c, dst);
4002 /* DECODE_ADD_BINARY_CHAR (c, dst); */
4003 decode_add_er_char (str, c, dst);
4006 label_continue_loop:;
4009 /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
/* At end of input: call decode_flush_er_chars, output a half-read
   character via DECODE_OUTPUT_PARTIAL_CHAR, and write a CR when the
   CODING_STATE_CR flag is set.  */
4010 if (flags & CODING_STATE_END)
4012 decode_flush_er_chars (str, dst);
4013 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4014 if (flags & CODING_STATE_CR)
4015 Dynarr_add (dst, '\r');
4022 /* Convert internally-formatted data to Big5. */
/* Encodes the single character CH and appends the resulting bytes to
   DST.  The end-of-line convention comes from the coding system of STR.

   charset_code_point is asked for CH in this order: ascii, the charset
   obtained from CODING_SYSTEM_ISO2022_INITIAL_CHARSET (..., 1),
   chinese-big5, chinese-big5-1, chinese-big5-2.  The first non-negative
   answer is used.  If none matches and the coding system enables entity
   references, char_encode_as_entity_reference supplies the text to
   append.  '?' is the last resort.  */
4025 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4026 unsigned_char_dynarr *dst, unsigned int *flags)
4028 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Line ending: a CR is emitted unless the EOL type is LF or autodetect,
   and CH itself is emitted unless the EOL type is CR.  */
4032 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4033 Dynarr_add (dst, '\r');
4034 if (eol_type != EOL_CR)
4035 Dynarr_add (dst, ch);
4042 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4044 if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4045 Dynarr_add (dst, code_point);
4046 else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
/* Two-byte code point, written high octet first.  */
4048 Dynarr_add (dst, code_point >> 8);
4049 Dynarr_add (dst, code_point & 0xFF);
4051 else if ((code_point
4052 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4054 Dynarr_add (dst, code_point >> 8);
4055 Dynarr_add (dst, code_point & 0xFF);
4057 else if ((code_point
4058 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
/* The two octets of the code point are position codes starting at 33.
   They are folded into a linear index (0xFF - 0xA1 per row) and
   re-split into Big5 bytes, using the same arithmetic as ENCODE_BIG5.  */
4061 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4062 + ((code_point & 0xFF) - 33);
4063 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4064 unsigned char b2 = I % BIG5_SAME_ROW;
4066 b2 += b2 < 0x3F ? 0x40 : 0x62;
4067 Dynarr_add (dst, b1);
4068 Dynarr_add (dst, b2);
4070 else if ((code_point
4071 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4074 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4075 + ((code_point & 0xFF) - 33);
4076 unsigned char b1, b2;
/* Characters of the second charset start after rows 0xA1 .. 0xC8.  */
4078 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4079 b1 = I / BIG5_SAME_ROW + 0xA1;
4080 b2 = I % BIG5_SAME_ROW;
4081 b2 += b2 < 0x3F ? 0x40 : 0x62;
4082 Dynarr_add (dst, b1);
4083 Dynarr_add (dst, b2);
4085 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4089 char_encode_as_entity_reference (ch, buf);
4090 Dynarr_add_many (dst, buf, strlen (buf));
4093 Dynarr_add (dst, '?');
4100 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4101 unsigned int *flags)
/* Lisp primitive.  The car and cdr of CODE are checked with CHECK_INT and
   stored in unsigned char variables, so only their low-order byte is
   considered.  When the first passes BYTE_BIG5_TWO_BYTE_1_P and the
   second BYTE_BIG5_TWO_BYTE_2_P, DECODE_BIG5 yields a leading byte and
   two position codes.  The charset is found with
   CHARSET_BY_LEADING_BYTE and the character is built from the low seven
   bits of each position code.
   NOTE(review): the value returned for an invalid pair is not among the
   statements shown -- confirm.  */
4106 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4107 Decode a Big5 character CODE of BIG5 coding-system.
4108 CODE is the character code in BIG5, a cons of two integers.
4109 Return the corresponding character.
4113 unsigned char c1, c2, b1, b2;
4116 CHECK_INT (XCAR (code));
4117 CHECK_INT (XCDR (code));
4118 b1 = XINT (XCAR (code));
4119 b2 = XINT (XCDR (code));
4120 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4121 BYTE_BIG5_TWO_BYTE_2_P (b2))
4123 Charset_ID leading_byte;
4124 Lisp_Object charset;
4125 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4126 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4127 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
/* Lisp primitive.  CHARACTER is coerced/checked with
   CHECK_CHAR_COERCE_INT and split with BREAKUP_CHAR.  When its charset
   is Vcharset_chinese_big5_1 or Vcharset_chinese_big5_2, ENCODE_BIG5 is
   applied to the charset's leading byte and the two position codes
   (with bit 7 set).  The resulting bytes are returned as a cons of two
   integers.
   NOTE(review): the value returned for any other charset is not among
   the statements shown -- confirm.  */
4133 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4134 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4135 Return the corresponding character code in Big5.
4139 Lisp_Object charset;
4142 CHECK_CHAR_COERCE_INT (character);
4143 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4144 if (EQ (charset, Vcharset_chinese_big5_1) ||
4145 EQ (charset, Vcharset_chinese_big5_2))
4147 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4149 return Fcons (make_int (b1), make_int (b2));
4156 /************************************************************************/
4158 /************************************************************************/
/* UCS-4 detector: examines bytes taken from SRC and dispatches on
   st->ucs4.in_byte.  One of the cases resets in_byte to 0.  The exit
   shown returns CODING_CATEGORY_UCS4_MASK.
   NOTE(review): in_byte presumably counts the position inside the
   current four-byte unit; the individual cases are not among the
   statements below -- confirm.  */
4161 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4165 unsigned char c = *(unsigned char *)src++;
4166 switch (st->ucs4.in_byte)
4175 st->ucs4.in_byte = 0;
4181 return CODING_CATEGORY_UCS4_MASK;
/* Decode UCS-4 input from SRC into DST.  Bytes are accumulated into
   CPOS most significant first (cpos << 8 | c).  The completed value is
   handed to DECODE_ADD_UCS_CHAR.  COUNTER, loaded from and saved back
   to str->counter, tracks progress through the current unit.

   NOTE(review): the end-of-input test below checks COUNTER against
   CODING_STATE_END, whereas decode_coding_big5 and decode_coding_utf8
   test FLAGS -- confirm which is intended.  */
4185 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4186 unsigned_char_dynarr *dst, Lstream_data_count n)
4188 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4189 unsigned int flags = str->flags;
4190 unsigned int cpos = str->cpos;
4191 unsigned char counter = str->counter;
4195 unsigned char c = *(unsigned char *)src++;
4203 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4208 cpos = ( cpos << 8 ) | c;
4212 if (counter & CODING_STATE_END)
4213 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4217 str->counter = counter;
/* Append CH to DST as four bytes, most significant byte first.  Neither
   STR nor FLAGS is used by the statements shown, and CH is written
   as-is, with no charset lookup.  */
4221 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4222 unsigned_char_dynarr *dst, unsigned int *flags)
4224 Dynarr_add (dst, ch >> 24);
4225 Dynarr_add (dst, ch >> 16);
4226 Dynarr_add (dst, ch >> 8);
4227 Dynarr_add (dst, ch );
4231 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4232 unsigned int *flags)
4237 /************************************************************************/
4238 /* UTF-16 methods */
4239 /************************************************************************/
/* UTF-16 detector.  The only statement shown returns
   CODING_CATEGORY_UTF16_MASK, and it uses neither ST nor the input.  */
4242 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4244 return CODING_CATEGORY_UTF16_MASK;
/* Decode UTF-16 input from SRC into DST.

   str->counter packs two values: the low two bits are the position
   inside the current unit (COUNTER) and the remaining bits are
   BYTE_ORDER.  With BYTE_ORDER 0 the first byte of a 16-bit unit is the
   low octet (code = (c << 8) | cpos); otherwise it is the high octet.
   The two octets of a unit can also be swapped, in connection with a
   BYTE_ORDER test.  A unit in 0xD800 .. 0xDBFF is recognised as the
   first half of a surrogate pair.  The second half is assembled two
   bytes later and the pair is combined as
   (x - 0xD800) * 0x400 + (y - 0xDC00) plus whatever follows on the
   continuation line.  Any other unit is passed through end-of-line
   handling and DECODE_ADD_UCS_CHAR.

   NOTE(review): the end-of-input test below checks COUNTER (already
   masked to two bits) against CODING_STATE_END, whereas
   decode_coding_big5 and decode_coding_utf8 test FLAGS -- confirm
   which is intended.  */
4248 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4249 unsigned_char_dynarr *dst, Lstream_data_count n)
4251 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4252 unsigned int flags = str->flags;
4253 unsigned int cpos = str->cpos;
4254 unsigned char counter = str->counter & 3;
4255 unsigned char byte_order = str->counter >> 2;
4256 eol_type_t eol_type = str->eol_type;
4260 unsigned char c = *(unsigned char *)src++;
4266 else if (counter == 1)
4270 if (byte_order == 0)
4271 code = (c << 8) | cpos;
4273 code = (cpos << 8) | c;
4276 code = ((code & 0xFF) << 8) | (code >> 8);
4277 if ( byte_order == 0 )
4282 if ( (0xD800 <= code) && (code <= 0xDBFF) )
4293 DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4294 DECODE_ADD_UCS_CHAR (code, dst);
4298 else if (counter == 2)
4300 cpos = (cpos << 8) | c;
4308 ? (c << 8) | (cpos & 0xFF)
4309 : ((cpos & 0xFF) << 8) | c;
4311 DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4316 label_continue_loop:;
4318 if (counter & CODING_STATE_END)
4319 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
/* Re-pack byte order and position for the next call.  */
4323 str->counter = (byte_order << 2) | counter;
/* Append CH to DST as UTF-16 with the low octet of every 16-bit unit
   written first.  Two forms appear below: a single unit holding CH, and
   a pair of units Y (0xD800 + offset / 0x400) and Z
   (0xDC00 + offset % 0x400), where the offset is CH - 0x10000.
   NOTE(review): the test that selects between the two forms is not
   among the statements shown -- presumably CH below 0x10000 takes the
   single-unit form; confirm.  */
4327 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4328 unsigned_char_dynarr *dst, unsigned int *flags)
4332 Dynarr_add (dst, ch);
4333 Dynarr_add (dst, ch >> 8);
4337 int y = ((ch - 0x10000) / 0x400) + 0xD800;
4338 int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4340 Dynarr_add (dst, y);
4341 Dynarr_add (dst, y >> 8);
4342 Dynarr_add (dst, z);
4343 Dynarr_add (dst, z >> 8);
4348 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4349 unsigned int *flags)
4354 /************************************************************************/
4356 /************************************************************************/
/* UTF-8 detector: examines bytes taken from SRC and dispatches on
   st->utf8.in_byte.  ESC, SI and SO are tested for.  in_byte is set to
   a value between 5 and 1 in one group of statements, and a byte is
   checked for the continuation-byte pattern 10xxxxxx
   ((c & 0xc0) == 0x80).  The exit shown returns
   CODING_CATEGORY_UTF8_MASK.
   NOTE(review): in_byte presumably holds the number of continuation
   bytes still expected; the conditions guarding each assignment and
   the actions on a failed check are not among the statements below --
   confirm.  */
4359 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4363 unsigned char c = *(unsigned char *)src++;
4364 switch (st->utf8.in_byte)
4367 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4370 st->utf8.in_byte = 5;
4372 st->utf8.in_byte = 4;
4374 st->utf8.in_byte = 3;
4376 st->utf8.in_byte = 2;
4378 st->utf8.in_byte = 1;
4383 if ((c & 0xc0) != 0x80)
4389 return CODING_CATEGORY_UTF8_MASK;
/* Write to DST, as binary bytes, the part of a UTF-8 sequence that had
   already been read when decoding of it was abandoned.  CPOS holds the
   payload bits gathered so far and COUNTER selects the branch.

   Within a branch the magnitude of CPOS (below 1 << 6, 1 << 12,
   1 << 18 or 1 << 24) decides how many bytes are written.  The top bits
   are or-ed into a lead-byte marker (0xC0, 0xE0, 0xF0, 0xF8 or 0xFC)
   and every further group of six bits is written as 0x80 | bits.

   NOTE(review): judging by the markers used in each branch, COUNTER
   appears to be the number of continuation bytes that were still
   outstanding -- confirm against the caller.  */
4393 decode_output_utf8_partial_char (unsigned char counter,
4395 unsigned_char_dynarr *dst)
4398 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4399 else if (counter == 4)
4401 if (cpos < (1 << 6))
4402 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4405 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4406 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4409 else if (counter == 3)
4411 if (cpos < (1 << 6))
4412 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4413 else if (cpos < (1 << 12))
4415 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4416 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4420 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4421 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4422 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4425 else if (counter == 2)
4427 if (cpos < (1 << 6))
4428 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4429 else if (cpos < (1 << 12))
4431 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4432 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4434 else if (cpos < (1 << 18))
4436 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4437 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4438 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4442 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4443 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4444 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4445 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4450 if (cpos < (1 << 6))
4451 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4452 else if (cpos < (1 << 12))
4454 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4455 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4457 else if (cpos < (1 << 18))
4459 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4460 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4461 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4463 else if (cpos < (1 << 24))
4465 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4466 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4467 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4468 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4472 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4473 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4474 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4475 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4476 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
/* Decode UTF-8 input from SRC into DST.  CPOS accumulates the payload
   bits of the sequence being read and COUNTER, loaded from and saved
   back to str->counter, tracks progress through it.

   The statements below show these cases:
   - a byte that flushes pending composition and entity-reference state,
     goes through end-of-line handling and is emitted with
     DECODE_ADD_UCS_CHAR;
   - a byte below 0xC0 passed to COMPOSE_ADD_CHAR;
   - lead-byte classes delimited by 0xF0, 0xF8 and 0xFC;
   - a continuation byte (10xxxxxx) whose six payload bits are shifted
     into CPOS.  A finished value is looked up with decode_defined_char
     and the result handed to COMPOSE_ADD_CHAR;
   - a malformed sequence, for which pending state is flushed, the
     partial sequence is written back with
     decode_output_utf8_partial_char and the offending byte is emitted
     as binary.

   NOTE(review): the tests that introduce the first and last cases are
   not among the statements shown -- confirm.  */
4482 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4483 unsigned_char_dynarr *dst, Lstream_data_count n)
4485 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4486 unsigned int flags = str->flags;
4487 unsigned int cpos = str->cpos;
4488 eol_type_t eol_type = str->eol_type;
4489 unsigned char counter = str->counter;
4492 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4493 (decoding)->codesys, 0);
4498 unsigned char c = *(unsigned char *)src++;
4503 COMPOSE_FLUSH_CHARS (str, dst);
4504 decode_flush_er_chars (str, dst);
4505 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4506 DECODE_ADD_UCS_CHAR (c, dst);
4508 else if ( c < 0xC0 )
4509 /* decode_add_er_char (str, c, dst); */
4510 COMPOSE_ADD_CHAR (str, c, dst);
4513 /* decode_flush_er_chars (str, dst); */
4519 else if ( c < 0xF0 )
4524 else if ( c < 0xF8 )
4529 else if ( c < 0xFC )
4541 else if ( (c & 0xC0) == 0x80 )
/* Append the six payload bits of the continuation byte.  */
4543 cpos = ( cpos << 6 ) | ( c & 0x3f );
4550 char_id = decode_defined_char (ccs, cpos, 0);
4557 COMPOSE_ADD_CHAR (str, char_id, dst);
4566 COMPOSE_FLUSH_CHARS (str, dst);
4567 decode_flush_er_chars (str, dst);
4568 decode_output_utf8_partial_char (counter, cpos, dst);
4569 DECODE_ADD_BINARY_CHAR (c, dst);
4573 label_continue_loop:;
/* At end of input flush everything that is still pending.  */
4576 if (flags & CODING_STATE_END)
4578 COMPOSE_FLUSH_CHARS (str, dst);
4579 decode_flush_er_chars (str, dst);
4582 decode_output_utf8_partial_char (counter, cpos, dst);
4589 str->counter = counter;
/* Encode the single character CH as UTF-8 and append it to DST.  The
   end-of-line convention comes from the coding system of STR.

   Characters up to 0x7f are written as one byte.  For anything else a
   code point is requested from the charset given by
   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (..., 0).  When that fails
   (negative, or above 0xEFFFF) further sources are consulted through
   Fchar_feature.  If those fail too and the coding system enables
   entity references, char_encode_as_entity_reference supplies the text
   to append.  The resulting code point is then written in the
   multi-byte UTF-8 form that matches its magnitude.  */
4593 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4594 unsigned_char_dynarr *dst, unsigned int *flags)
4596 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
/* Line ending: a CR is emitted unless the EOL type is LF or autodetect,
   and CH itself is emitted unless the EOL type is CR.  */
4600 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4601 Dynarr_add (dst, '\r');
4602 if (eol_type != EOL_CR)
4603 Dynarr_add (dst, ch);
4605 else if (ch <= 0x7f)
4607 Dynarr_add (dst, ch);
4612 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4613 int code_point = charset_code_point (ucs_ccs, ch, 0);
4615 if ( (code_point < 0) || (code_point > 0xEFFFF) )
4618 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4622 && INTP (ret = Fchar_feature (make_char (ch),
4625 code_point = XINT (ret);
4626 else if ( !NILP (map =
4627 CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4629 && INTP (ret = Fchar_feature (make_char (ch),
4632 code_point = XINT (ret);
4633 else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4637 char_encode_as_entity_reference (ch, buf);
4638 Dynarr_add_many (dst, buf, strlen (buf));
/* Length ladder: two bytes up to 0x7ff, three up to 0xffff, four up to
   0x1fffff, five up to 0x3ffffff, six otherwise.  The lead byte carries
   the length marker and every following byte carries six payload bits
   under the 0x80 marker.  */
4644 if (code_point <= 0x7ff)
4646 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4647 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4649 else if (code_point <= 0xffff)
4651 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4652 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4653 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4655 else if (code_point <= 0x1fffff)
4657 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4658 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4659 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4660 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4662 else if (code_point <= 0x3ffffff)
4664 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4665 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4666 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4667 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4668 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4672 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4673 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4674 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4675 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4676 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4677 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4683 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4684 unsigned int *flags)
4689 /************************************************************************/
4690 /* ISO2022 methods */
4691 /************************************************************************/
4693 /* The following note describes the coding system ISO2022 briefly.
4694 Since the intention of this note is to help understand the
4695 functions in this file, some parts are NOT ACCURATE or OVERLY
4696 SIMPLIFIED. For thorough understanding, please refer to the
4697 original document of ISO2022.
4699 ISO2022 provides many mechanisms to encode several character sets
4700 in 7-bit and 8-bit environments. For 7-bit environments, all text
4701 is encoded using bytes less than 128. This may make the encoded
4702 text a little bit longer, but the text passes more easily through
4703 several gateways, some of which strip off MSB (Most Significant Bit).
4705 There are two kinds of character sets: control character set and
4706 graphic character set. The former contains control characters such
4707 as `newline' and `escape' to provide control functions (control
4708 functions are also provided by escape sequences). The latter
4709 contains graphic characters such as 'A' and '-'. Emacs recognizes
4710 two control character sets and many graphic character sets.
4712 Graphic character sets are classified into one of the following
4713 four classes, according to the number of bytes (DIMENSION) and
4714 number of characters in one dimension (CHARS) of the set:
4715 - DIMENSION1_CHARS94
4716 - DIMENSION1_CHARS96
4717 - DIMENSION2_CHARS94
4718 - DIMENSION2_CHARS96
4720 In addition, each character set is assigned an identification tag,
4721 unique for each set, called "final character" (denoted as <F>
4722 hereafter). The <F> of each character set is decided by ECMA(*)
4723 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4724 (0x30..0x3F are for private use only).
4726 Note (*): ECMA = European Computer Manufacturers Association
4728 Here are examples of graphic character set [NAME(<F>)]:
4729 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4730 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4731 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4732 o DIMENSION2_CHARS96 -- none for the moment
4734 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4735 C0 [0x00..0x1F] -- control character plane 0
4736 GL [0x20..0x7F] -- graphic character plane 0
4737 C1 [0x80..0x9F] -- control character plane 1
4738 GR [0xA0..0xFF] -- graphic character plane 1
4740 A control character set is directly designated and invoked to C0 or
4741 C1 by an escape sequence. The most common case is that:
4742 - ISO646's control character set is designated/invoked to C0, and
4743 - ISO6429's control character set is designated/invoked to C1,
4744 and usually these designations/invocations are omitted in encoded
4745 text. In a 7-bit environment, only C0 can be used, and a control
4746 character for C1 is encoded by an appropriate escape sequence to
4747 fit into the environment. All control characters for C1 are
4748 defined to have corresponding escape sequences.
4750 A graphic character set is at first designated to one of four
4751 graphic registers (G0 through G3), then these graphic registers are
4752 invoked to GL or GR. These designations and invocations can be
4753 done independently. The most common case is that G0 is invoked to
4754 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4755 these invocations and designations are omitted in encoded text.
4756 In a 7-bit environment, only GL can be used.
4758 When a graphic character set of CHARS94 is invoked to GL, codes
4759 0x20 and 0x7F of the GL area work as control characters SPACE and
4760 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4763 There are two ways of invocation: locking-shift and single-shift.
4764 With locking-shift, the invocation lasts until the next different
4765 invocation, whereas with single-shift, the invocation affects the
4766 following character only and doesn't affect the locking-shift
4767 state. Invocations are done by the following control characters or
4770 ----------------------------------------------------------------------
4771 abbrev function cntrl escape seq description
4772 ----------------------------------------------------------------------
4773 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4774 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4775 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4776 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4777 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4778 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4779 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4780 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4781 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4782 ----------------------------------------------------------------------
4783 (*) These are not used by any known coding system.
4785 Control characters for these functions are defined by macros
4786 ISO_CODE_XXX in `coding.h'.
4788 Designations are done by the following escape sequences:
4789 ----------------------------------------------------------------------
4790 escape sequence description
4791 ----------------------------------------------------------------------
4792 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4793 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4794 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4795 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4796 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4797 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4798 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4799 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4800 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4801 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4802 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4803 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4804 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4805 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4806 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4807 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4808 ----------------------------------------------------------------------
4810 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4811 of dimension 1, chars 94, and final character <F>, etc...
4813 Note (*): Although these designations are not allowed in ISO2022,
4814 Emacs accepts them on decoding, and produces them on encoding
4815 CHARS96 character sets in a coding system which is characterized as
4816 7-bit environment, non-locking-shift, and non-single-shift.
4818 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4819 '(' can be omitted. We refer to this as "short-form" hereafter.
4821 Now you may notice that there are a lot of ways for encoding the
4822 same multilingual text in ISO2022. Actually, there exist many
4823 coding systems such as Compound Text (used in X11's inter-client
4824 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4825 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4826 localized platforms), and all of these are variants of ISO2022.
4828 In addition to the above, Emacs handles two more kinds of escape
4829 sequences: ISO6429's direction specification and Emacs' private
4830 sequence for specifying character composition.
4832 ISO6429's direction specification takes the following form:
4833 o CSI ']' -- end of the current direction
4834 o CSI '0' ']' -- end of the current direction
4835 o CSI '1' ']' -- start of left-to-right text
4836 o CSI '2' ']' -- start of right-to-left text
4837 The control character CSI (0x9B: control sequence introducer) is
4838 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4840 Character composition specification takes the following form:
4841 o ESC '0' -- start character composition
4842 o ESC '1' -- end character composition
4843 Since these are not standard escape sequences of any ISO standard,
4844 their use with these meanings is restricted to Emacs only. */
/* Put the ISO2022 decoder state ISO back into its starting configuration.
   Each of the four registers has its invalid-designation mark cleared;
   when CODING_SYSTEM is non-nil the coding system's initial charset for
   that register is consulted, and the fallback visible here stores Qt.
   Escape parsing returns to ISO_ESC_NOTHING with no buffered escape
   bytes, register_left becomes 0, register_right becomes 1, and the
   direction-switch and literal-output flags are all cleared.
   CODING_SYSTEM may be Qnil (the detector calls it that way).  */
4847 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4851 for (i = 0; i < 4; i++)
4853 if (!NILP (coding_system))
4855 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4857 iso->charset[i] = Qt;
4858 iso->invalid_designated[i] = 0;
/* No escape sequence in progress and nothing buffered.  */
4860 iso->esc = ISO_ESC_NOTHING;
4861 iso->esc_bytes_index = 0;
4862 iso->register_left = 0;
4863 iso->register_right = 1;
4864 iso->switched_dir_and_no_valid_charset_yet = 0;
4865 iso->invalid_switch_dir = 0;
4866 iso->output_direction_sequence = 0;
4867 iso->output_literally = 0;
/* Reset the composite-character buffer, but only if one has already
   been allocated.  */
4868 #ifdef ENABLE_COMPOSITE_CHARS
4869 if (iso->composite_chars)
4870 Dynarr_reset (iso->composite_chars);
4875 fit_to_be_escape_quoted (unsigned char c)
4892 /* Parse one byte of an ISO2022 escape sequence.
4893 If the result is an invalid escape sequence, return 0 and
4894 do not change anything in STR. Otherwise, if the result is
4895 an incomplete escape sequence, update ISO2022.ESC and
4896 ISO2022.ESC_BYTES and return -1. Otherwise, update
4897 all the state variables (but not ISO2022.ESC_BYTES) and
4900 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4901 or invocation of an invalid character set and treat that as
4902 an unrecognized escape sequence. */
/* Arguments: CODESYS is the coding system being decoded (Qnil while doing
   coding-system detection), ISO is the decoder state that is updated,
   C is the byte to parse, FLAGS points at the caller's CODING_STATE_*
   bits (CODING_STATE_ESCAPE is set here while a sequence is pending),
   and CHECK_INVALID_CHARSETS makes designation or invocation of an
   unknown charset count as an error.  */
4905 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4906 unsigned char c, unsigned int *flags,
4907 int check_invalid_charsets)
4909 /* (1) If we're at the end of a designation sequence, CS is the
4910 charset being designated and REG is the register to designate
4913 (2) If we're at the end of a locking-shift sequence, REG is
4914 the register to invoke and HALF (0 == left, 1 == right) is
4915 the half to invoke it into.
4917 (3) If we're at the end of a single-shift sequence, REG is
4918 the register to invoke. */
4919 Lisp_Object cs = Qnil;
4922 /* NOTE: This code does goto's all over the place.
4923 The reason for this is that we're basically implementing
4924 a state machine here, and hierarchical languages like C
4925 don't really provide a clean way of doing this. */
4927 if (! (*flags & CODING_STATE_ESCAPE))
4928 /* At beginning of escape sequence; we need to reset our
4929 escape-state variables. */
4930 iso->esc = ISO_ESC_NOTHING;
4932 iso->output_literally = 0;
4933 iso->output_direction_sequence = 0;
4937 case ISO_ESC_NOTHING:
4938 iso->esc_bytes_index = 0;
4941 case ISO_CODE_ESC: /* Start escape sequence */
4942 *flags |= CODING_STATE_ESCAPE;
4946 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4947 *flags |= CODING_STATE_ESCAPE;
4948 iso->esc = ISO_ESC_5_11;
4951 case ISO_CODE_SO: /* locking shift 1 */
4954 case ISO_CODE_SI: /* locking shift 0 */
4958 case ISO_CODE_SS2: /* single shift */
4961 case ISO_CODE_SS3: /* single shift */
4965 default: /* Other control characters */
4972 /**** single shift ****/
4974 case 'N': /* single shift 2 */
4977 case 'O': /* single shift 3 */
4981 /**** locking shift ****/
4983 case '~': /* locking shift 1 right */
4986 case 'n': /* locking shift 2 */
4989 case '}': /* locking shift 2 right */
4992 case 'o': /* locking shift 3 */
4995 case '|': /* locking shift 3 right */
4999 #ifdef ENABLE_COMPOSITE_CHARS
5000 /**** composite ****/
5003 iso->esc = ISO_ESC_START_COMPOSITE;
5004 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5005 CODING_STATE_COMPOSITE;
5009 iso->esc = ISO_ESC_END_COMPOSITE;
5010 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5011 ~CODING_STATE_COMPOSITE;
5013 #endif /* ENABLE_COMPOSITE_CHARS */
5015 /**** directionality ****/
5018 iso->esc = ISO_ESC_5_11;
5021 /**** designation ****/
5023 case '$': /* multibyte charset prefix */
5024 iso->esc = ISO_ESC_2_4;
5028 if (0x28 <= c && c <= 0x2F)
5030 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5034 /* This function is called with CODESYS equal to nil when
5035 doing coding-system detection. */
5037 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5038 && fit_to_be_escape_quoted (c))
5040 iso->esc = ISO_ESC_LITERAL;
5041 *flags &= CODING_STATE_ISO2022_LOCK;
5051 /**** directionality ****/
5053 case ISO_ESC_5_11: /* ISO6429 direction control */
5056 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5057 goto directionality;
5059 if (c == '0') iso->esc = ISO_ESC_5_11_0;
5060 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5061 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5065 case ISO_ESC_5_11_0:
5068 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5069 goto directionality;
5073 case ISO_ESC_5_11_1:
/* Mask rather than assign, exactly as the two cases above do.  A plain
   assignment would switch on every CODING_STATE_ISO2022_LOCK bit other
   than R2L instead of keeping only the lock bits that are currently
   set.  */
5076 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5077 goto directionality;
5081 case ISO_ESC_5_11_2:
5084 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5085 goto directionality;
5090 iso->esc = ISO_ESC_DIRECTIONALITY;
5091 /* Various junk here to attempt to preserve the direction sequences
5092 literally in the text if they would otherwise be swallowed due
5093 to invalid designations that don't show up as actual charset
5094 changes in the text. */
5095 if (iso->invalid_switch_dir)
5097 /* We already inserted a direction switch literally into the
5098 text. We assume (#### this may not be right) that the
5099 next direction switch is the one going the other way,
5100 and we need to output that literally as well. */
5101 iso->output_literally = 1;
5102 iso->invalid_switch_dir = 0;
5108 /* If we are in the thrall of an invalid designation,
5109 then stick the directionality sequence literally into the
5110 output stream so it ends up in the original text again. */
5111 for (jj = 0; jj < 4; jj++)
5112 if (iso->invalid_designated[jj])
5116 iso->output_literally = 1;
5117 iso->invalid_switch_dir = 1;
5120 /* Indicate that we haven't yet seen a valid designation,
5121 so that if a switch-dir is directly followed by an
5122 invalid designation, both get inserted literally. */
5123 iso->switched_dir_and_no_valid_charset_yet = 1;
5128 /**** designation ****/
5131 if (0x28 <= c && c <= 0x2F)
5133 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
/* Short form: with final byte '@', 'A' or 'B' the intermediate '(' may
   be omitted (see the designation table earlier in this file).  */
5136 if (0x40 <= c && c <= 0x42)
5139 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5140 *flags & CODING_STATE_R2L ?
5141 CHARSET_RIGHT_TO_LEFT :
5142 CHARSET_LEFT_TO_RIGHT);
5153 if (c < '0' || c > '~')
5154 return 0; /* bad final byte */
5156 if (iso->esc >= ISO_ESC_2_8 &&
5157 iso->esc <= ISO_ESC_2_15)
5159 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5160 single = 1; /* single-byte */
5161 reg = (iso->esc - ISO_ESC_2_8) & 3;
5163 else if (iso->esc >= ISO_ESC_2_4_8 &&
5164 iso->esc <= ISO_ESC_2_4_15)
5166 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5167 single = -1; /* multi-byte */
5168 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5172 /* Can this ever be reached? -slb */
5176 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5177 *flags & CODING_STATE_R2L ?
5178 CHARSET_RIGHT_TO_LEFT :
5179 CHARSET_LEFT_TO_RIGHT);
/* Remember the byte so that an incomplete or invalid sequence can be
   written out literally by the caller.  */
5185 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5189 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5190 /* can't invoke something that ain't there. */
5192 iso->esc = ISO_ESC_SINGLE_SHIFT;
5193 *flags &= CODING_STATE_ISO2022_LOCK;
5195 *flags |= CODING_STATE_SS2;
5197 *flags |= CODING_STATE_SS3;
5201 if (check_invalid_charsets &&
5202 !CHARSETP (iso->charset[reg]))
5203 /* can't invoke something that ain't there. */
5206 iso->register_right = reg;
5208 iso->register_left = reg;
5209 *flags &= CODING_STATE_ISO2022_LOCK;
5210 iso->esc = ISO_ESC_LOCKING_SHIFT;
5214 if (NILP (cs) && check_invalid_charsets)
5216 iso->invalid_designated[reg] = 1;
5217 iso->charset[reg] = Vcharset_ascii;
5218 iso->esc = ISO_ESC_DESIGNATE;
5219 *flags &= CODING_STATE_ISO2022_LOCK;
5220 iso->output_literally = 1;
5221 if (iso->switched_dir_and_no_valid_charset_yet)
5223 /* We encountered a switch-direction followed by an
5224 invalid designation. Ensure that the switch-direction
5225 gets outputted; otherwise it will probably get eaten
5226 when the text is written out again. */
5227 iso->switched_dir_and_no_valid_charset_yet = 0;
5228 iso->output_direction_sequence = 1;
5229 /* And make sure that the switch-dir going the other
5230 way gets outputted, as well. */
5231 iso->invalid_switch_dir = 1;
5235 /* This function is called with CODESYS equal to nil when
5236 doing coding-system detection. */
5237 if (!NILP (codesys))
5239 charset_conversion_spec_dynarr *dyn =
5240 XCODING_SYSTEM (codesys)->iso2022.input_conv;
5246 for (i = 0; i < Dynarr_length (dyn); i++)
5248 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5249 if (EQ (cs, spec->from_charset))
5250 cs = spec->to_charset;
5255 iso->charset[reg] = cs;
5256 iso->esc = ISO_ESC_DESIGNATE;
5257 *flags &= CODING_STATE_ISO2022_LOCK;
5258 if (iso->invalid_designated[reg])
5260 iso->invalid_designated[reg] = 0;
5261 iso->output_literally = 1;
5263 if (iso->switched_dir_and_no_valid_charset_yet)
5264 iso->switched_dir_and_no_valid_charset_yet = 0;
/* Examine the N bytes at SRC and narrow down which ISO2022 coding
   categories they could belong to.  The working state (candidate mask,
   parser flags, count of consecutive high bytes, single-shift marker and
   the escape parser itself) lives in ST->iso2022 and is initialised only
   once, guarded by `initted'.  All five ISO categories start out as
   candidates; they are struck from the mask as contradicting bytes are
   seen.  Escape sequences are recognised with parse_iso2022_esc, called
   with a nil coding system and with invalid-charset checking turned
   off.  */
5269 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5273 /* #### There are serious deficiencies in the recognition mechanism
5274 here. This needs to be much smarter if it's going to cut it.
5275 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5276 it should be detected as Latin-1.
5277 All the ISO2022 stuff in this file should be synced up with the
5278 code from FSF Emacs-20.4, in which Mule should be more or less stable.
5279 Perhaps we should wait till R2L works in FSF Emacs? */
5281 if (!st->iso2022.initted)
5283 reset_iso2022 (Qnil, &st->iso2022.iso);
5284 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5285 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5286 CODING_CATEGORY_ISO_8_1_MASK |
5287 CODING_CATEGORY_ISO_8_2_MASK |
5288 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5289 st->iso2022.flags = 0;
5290 st->iso2022.high_byte_count = 0;
5291 st->iso2022.saw_single_shift = 0;
5292 st->iso2022.initted = 1;
5295 mask = st->iso2022.mask;
5299 unsigned char c = *(unsigned char *)src++;
5302 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5303 st->iso2022.high_byte_count++;
/* A run of high bytes has ended: judge it, then start counting afresh.  */
5307 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5309 if (st->iso2022.high_byte_count & 1)
5310 /* odd number of high bytes; assume not iso-8-2 */
5311 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5313 st->iso2022.high_byte_count = 0;
5314 st->iso2022.saw_single_shift = 0;
5316 mask &= ~CODING_CATEGORY_ISO_7_MASK;
5318 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5319 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5320 { /* control chars */
5323 /* Allow and ignore control characters that you might
5324 reasonably see in a text file */
5329 case 8: /* backspace */
5330 case 11: /* vertical tab */
5331 case 12: /* form feed */
5332 case 26: /* MS-DOS C-z junk */
5333 case 31: /* '^_' -- for info */
5334 goto label_continue_loop;
5341 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5344 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5345 &st->iso2022.flags, 0))
5347 switch (st->iso2022.iso.esc)
/* A designation rules out both plain 8-bit categories.  */
5349 case ISO_ESC_DESIGNATE:
5350 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5351 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
/* A locking shift settles the matter: only the lock-shift category
   remains and scanning stops.  */
5353 case ISO_ESC_LOCKING_SHIFT:
5354 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5355 goto ran_out_of_chars;
5356 case ISO_ESC_SINGLE_SHIFT:
5357 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5358 st->iso2022.saw_single_shift = 1;
5367 goto ran_out_of_chars;
5370 label_continue_loop:;
/* Tidy up a detection MASK once scanning is over: if the seven-bit ISO
   category is still a candidate, the three eight-bit ISO categories
   (ISO_8_DESIGNATE, ISO_8_1, ISO_8_2) are removed from it.
   NOTE(review): the return statement is not visible in this excerpt;
   presumably the adjusted mask is returned -- confirm.  */
5379 postprocess_iso2022_mask (int mask)
5381 /* #### kind of cheesy */
5382 /* If seven-bit ISO is allowed, then assume that the encoding is
5383 entirely seven-bit and turn off the eight-bit ones. */
5384 if (mask & CODING_CATEGORY_ISO_7_MASK)
5385 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5386 CODING_CATEGORY_ISO_8_1_MASK |
5387 CODING_CATEGORY_ISO_8_2_MASK);
5391 /* If FLAGS is a null pointer or specifies right-to-left motion,
5392 output a switch-dir-to-left-to-right sequence to DST.
5393 Also update FLAGS if it is not a null pointer.
5394 If INTERNAL_P is set, we are outputting in internal format and
5395 need to handle the CSI differently. */
/* CODESYS selects between the two spellings of CSI.  The sequence
   written is CSI '0' ']', which the ISO6429 notes earlier in this file
   describe as "end of the current direction".  */
5398 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5399 unsigned_char_dynarr *dst,
5400 unsigned int *flags,
5403 if (!flags || (*flags & CODING_STATE_R2L))
/* Seven-bit coding systems spell CSI as the two bytes ESC '['.  */
5405 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5407 Dynarr_add (dst, ISO_CODE_ESC);
5408 Dynarr_add (dst, '[');
/* In internal format the raw CSI byte goes through
   DECODE_ADD_BINARY_CHAR rather than being appended directly.  */
5410 else if (internal_p)
5411 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5413 Dynarr_add (dst, ISO_CODE_CSI);
5414 Dynarr_add (dst, '0');
5415 Dynarr_add (dst, ']');
/* NOTE(review): the header comment says FLAGS is only updated when it is
   non-null; the guarding test is presumably on a line missing from this
   excerpt -- confirm.  */
5417 *flags &= ~CODING_STATE_R2L;
5421 /* If FLAGS is a null pointer or specifies a direction different from
5422 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5423 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5424 sequence to DST. Also update FLAGS if it is not a null pointer.
5425 If INTERNAL_P is set, we are outputting in internal format and
5426 need to handle the CSI differently. */
/* Switching to left-to-right is delegated to
   restore_left_to_right_direction.  Switching to right-to-left writes
   CSI '2' ']' here, and is skipped altogether when CODESYS has its
   NO_ISO6429 property set.  */
5429 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5430 unsigned_char_dynarr *dst, unsigned int *flags,
5433 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5434 direction == CHARSET_LEFT_TO_RIGHT)
5435 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5436 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5437 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5438 direction == CHARSET_RIGHT_TO_LEFT)
/* Seven-bit coding systems spell CSI as the two bytes ESC '['.  */
5440 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5442 Dynarr_add (dst, ISO_CODE_ESC);
5443 Dynarr_add (dst, '[');
5445 else if (internal_p)
5446 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5448 Dynarr_add (dst, ISO_CODE_CSI);
5449 Dynarr_add (dst, '2');
5450 Dynarr_add (dst, ']');
/* NOTE(review): as with the left-to-right case, the null check on FLAGS
   is presumably on a line missing from this excerpt -- confirm.  */
5452 *flags |= CODING_STATE_R2L;
5456 /* Convert ISO2022-format data to internal format. */
/* Decodes the N bytes at SRC for the decoding stream DECODING and
   appends the result to DST.  The stream's flags, cpos, counter and
   eol_type are copied into locals on entry.  Each input byte takes one
   of three paths: continuation of a pending escape sequence
   (CODING_STATE_ESCAPE set), a C0/C1 control character, or a graphic
   character.  Bytes that cannot be interpreted are emitted through
   DECODE_ADD_BINARY_CHAR so that they survive in the output.  */
5459 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5460 unsigned_char_dynarr *dst, Lstream_data_count n)
5462 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5463 unsigned int flags = str->flags;
5464 unsigned int cpos = str->cpos;
5465 unsigned char counter = str->counter;
5466 eol_type_t eol_type = str->eol_type;
5467 #ifdef ENABLE_COMPOSITE_CHARS
5468 unsigned_char_dynarr *real_dst = dst;
5470 Lisp_Object coding_system;
5472 XSETCODING_SYSTEM (coding_system, str->codesys);
/* While a composition is open, decoded output is collected in the
   composite buffer instead of the caller's DST.  */
5474 #ifdef ENABLE_COMPOSITE_CHARS
5475 if (flags & CODING_STATE_COMPOSITE)
5476 dst = str->iso2022.composite_chars;
5477 #endif /* ENABLE_COMPOSITE_CHARS */
5481 unsigned char c = *(unsigned char *)src++;
5482 if (flags & CODING_STATE_ESCAPE)
5483 { /* Within ESC sequence */
5484 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5489 switch (str->iso2022.esc)
5491 #ifdef ENABLE_COMPOSITE_CHARS
5492 case ISO_ESC_START_COMPOSITE:
5493 if (str->iso2022.composite_chars)
5494 Dynarr_reset (str->iso2022.composite_chars);
5496 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5497 dst = str->iso2022.composite_chars;
5499 case ISO_ESC_END_COMPOSITE:
5501 Bufbyte comstr[MAX_EMCHAR_LEN];
5503 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5504 Dynarr_length (dst));
5506 len = set_charptr_emchar (comstr, emch);
5507 Dynarr_add_many (dst, comstr, len);
5510 #endif /* ENABLE_COMPOSITE_CHARS */
5512 case ISO_ESC_LITERAL:
5513 COMPOSE_FLUSH_CHARS (str, dst);
5514 decode_flush_er_chars (str, dst);
5515 DECODE_ADD_BINARY_CHAR (c, dst);
5519 /* Everything else handled already */
5524 /* Attempted error recovery. */
5525 if (str->iso2022.output_direction_sequence)
5526 ensure_correct_direction (flags & CODING_STATE_R2L ?
5527 CHARSET_RIGHT_TO_LEFT :
5528 CHARSET_LEFT_TO_RIGHT,
5529 str->codesys, dst, 0, 1);
5530 /* More error recovery. */
5531 if (!retval || str->iso2022.output_literally)
5533 /* Output the (possibly invalid) sequence */
5535 COMPOSE_FLUSH_CHARS (str, dst);
5536 decode_flush_er_chars (str, dst);
5537 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5538 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5539 flags &= CODING_STATE_ISO2022_LOCK;
5541 n++, src--;/* Repeat the loop with the same character. */
5544 /* No sense in reprocessing the final byte of the
5545 escape sequence; it could mess things up anyway.
5547 COMPOSE_FLUSH_CHARS (str, dst);
5548 decode_flush_er_chars (str, dst);
5549 DECODE_ADD_BINARY_CHAR (c, dst);
5555 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5556 { /* Control characters */
5558 /***** Error-handling *****/
5560 /* If we were in the middle of a character, dump out the
5561 partial character. */
5564 COMPOSE_FLUSH_CHARS (str, dst);
5565 decode_flush_er_chars (str, dst);
5569 DECODE_ADD_BINARY_CHAR
5570 ((unsigned char)(cpos >> (counter * 8)), dst);
5575 /* If we just saw a single-shift character, dump it out.
5576 This may dump out the wrong sort of single-shift character,
5577 but at least it will give an indication that something went
5579 if (flags & CODING_STATE_SS2)
5581 COMPOSE_FLUSH_CHARS (str, dst);
5582 decode_flush_er_chars (str, dst);
5583 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5584 flags &= ~CODING_STATE_SS2;
5586 if (flags & CODING_STATE_SS3)
5588 COMPOSE_FLUSH_CHARS (str, dst);
5589 decode_flush_er_chars (str, dst);
5590 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5591 flags &= ~CODING_STATE_SS3;
5594 /***** Now handle the control characters. *****/
5600 COMPOSE_FLUSH_CHARS (str, dst);
5601 decode_flush_er_chars (str, dst);
5602 if (eol_type == EOL_CR)
5603 Dynarr_add (dst, '\n');
5604 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5605 Dynarr_add (dst, c);
5607 flags |= CODING_STATE_CR;
5608 goto label_continue_loop;
5610 else if (flags & CODING_STATE_CR)
5611 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5613 Dynarr_add (dst, '\r');
5614 flags &= ~CODING_STATE_CR;
5617 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5620 flags &= CODING_STATE_ISO2022_LOCK;
/* A control byte the escape parser rejects is passed through as-is.  */
5622 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5624 COMPOSE_FLUSH_CHARS (str, dst);
5625 decode_flush_er_chars (str, dst);
5626 DECODE_ADD_BINARY_CHAR (c, dst);
5630 { /* Graphic characters */
5631 Lisp_Object charset;
5640 COMPOSE_FLUSH_CHARS (str, dst);
5641 decode_flush_er_chars (str, dst);
5642 if (eol_type == EOL_CR)
5643 Dynarr_add (dst, '\n');
5644 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5645 Dynarr_add (dst, c);
5647 flags |= CODING_STATE_CR;
5648 goto label_continue_loop;
5650 else if (flags & CODING_STATE_CR)
5651 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5653 Dynarr_add (dst, '\r');
5654 flags &= ~CODING_STATE_CR;
5657 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5660 /* Now determine the charset. */
/* A pending single shift selects register 2 or 3 for this character;
   otherwise bytes outside the ASCII range use register_right and the
   rest use register_left.  */
5661 reg = ((flags & CODING_STATE_SS2) ? 2
5662 : (flags & CODING_STATE_SS3) ? 3
5663 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5664 : str->iso2022.register_left);
5665 charset = str->iso2022.charset[reg];
5667 /* Error checking: */
5668 if (! CHARSETP (charset)
5669 || str->iso2022.invalid_designated[reg]
5670 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5671 && XCHARSET_CHARS (charset) == 94))
5672 /* Mrmph. We are trying to invoke a register that has no
5673 or an invalid charset in it, or trying to add a character
5674 outside the range of the charset. Insert that char literally
5675 to preserve it for the output. */
5677 COMPOSE_FLUSH_CHARS (str, dst);
5678 decode_flush_er_chars (str, dst);
5682 DECODE_ADD_BINARY_CHAR
5683 ((unsigned char)(cpos >> (counter * 8)), dst);
5686 DECODE_ADD_BINARY_CHAR (c, dst);
5691 /* Things are probably hunky-dory. */
5693 /* Fetch reverse charset, maybe. */
5694 if (((flags & CODING_STATE_R2L) &&
5695 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5697 (!(flags & CODING_STATE_R2L) &&
5698 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5700 Lisp_Object new_charset =
5701 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5702 if (!NILP (new_charset))
5703 charset = new_charset;
5708 if (XCHARSET_DIMENSION (charset) == counter)
5710 COMPOSE_ADD_CHAR (str,
5711 DECODE_CHAR (charset,
5712 ((cpos & 0x7F7F7F) << 8)
5719 cpos = (cpos << 8) | c;
5721 lb = XCHARSET_LEADING_BYTE (charset);
5722 switch (XCHARSET_REP_BYTES (charset))
5725 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5726 Dynarr_add (dst, c & 0x7F);
5729 case 2: /* one-byte official */
5730 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5731 Dynarr_add (dst, lb);
5732 Dynarr_add (dst, c | 0x80);
5735 case 3: /* one-byte private or two-byte official */
5736 if (XCHARSET_PRIVATE_P (charset))
5738 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5739 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5740 Dynarr_add (dst, lb);
5741 Dynarr_add (dst, c | 0x80);
5747 Dynarr_add (dst, lb);
5748 Dynarr_add (dst, ch | 0x80);
5749 Dynarr_add (dst, c | 0x80);
5757 default: /* two-byte private */
5760 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5761 Dynarr_add (dst, lb);
5762 Dynarr_add (dst, ch | 0x80);
5763 Dynarr_add (dst, c | 0x80);
5773 flags &= CODING_STATE_ISO2022_LOCK;
5776 label_continue_loop:;
/* At end of input, flush anything still buffered, including a partially
   read character.  */
5779 if (flags & CODING_STATE_END)
5781 COMPOSE_FLUSH_CHARS (str, dst);
5782 decode_flush_er_chars (str, dst);
5783 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5787 str->counter = counter;
5791 /***** ISO2022 encoder *****/
5793 /* Designate CHARSET into register REG. */
/* Records CHARSET as the charset of register REG in the encoder state
   STR.  A non-charset value (an initial nil or t) is only recorded.
   Otherwise the register's previous charset is compared by chars,
   dimension and final byte, together with the force_charset_on_output
   mark (NOTE(review): the statement following that test is missing from
   this excerpt; presumably it returns without output -- confirm).  The
   coding system's output_conv list may substitute another charset, and
   the designation is then appended to DST: ESC, '$' in some branches,
   an intermediate byte taken from "()*+" or ",-./" indexed by REG, and
   finally the charset's final byte.  The branch conditions are not
   visible here; the designation table earlier in this file lists the
   sequences.  */
5796 iso2022_designate (Lisp_Object charset, unsigned char reg,
5797 struct encoding_stream *str, unsigned_char_dynarr *dst)
5799 static const char inter94[] = "()*+";
5800 static const char inter96[] = ",-./";
5801 unsigned short chars;
5802 unsigned char dimension;
5803 unsigned char final;
5804 Lisp_Object old_charset = str->iso2022.charset[reg];
5806 str->iso2022.charset[reg] = charset;
5807 if (!CHARSETP (charset))
5808 /* charset might be an initial nil or t. */
5810 chars = XCHARSET_CHARS (charset);
5811 dimension = XCHARSET_DIMENSION (charset);
5812 final = XCHARSET_FINAL (charset);
5813 if (!str->iso2022.force_charset_on_output[reg] &&
5814 CHARSETP (old_charset) &&
5815 XCHARSET_CHARS (old_charset) == chars &&
5816 XCHARSET_DIMENSION (old_charset) == dimension &&
5817 XCHARSET_FINAL (old_charset) == final)
5820 str->iso2022.force_charset_on_output[reg] = 0;
5823 charset_conversion_spec_dynarr *dyn =
5824 str->codesys->iso2022.output_conv;
/* Apply the coding system's output charset conversions.  */
5830 for (i = 0; i < Dynarr_length (dyn); i++)
5832 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5833 if (EQ (charset, spec->from_charset))
5834 charset = spec->to_charset;
5839 Dynarr_add (dst, ISO_CODE_ESC);
5844 Dynarr_add (dst, inter94[reg]);
5847 Dynarr_add (dst, '$');
5849 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5852 Dynarr_add (dst, inter94[reg]);
5857 Dynarr_add (dst, inter96[reg]);
5860 Dynarr_add (dst, '$');
5861 Dynarr_add (dst, inter96[reg]);
5865 Dynarr_add (dst, final);
/* Make sure register 0 is the one recorded in register_left of the
   encoder state STR: if it is not, append SI (shift-in) to DST and
   record the change.  */
5869 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5871 if (str->iso2022.register_left != 0)
5873 Dynarr_add (dst, ISO_CODE_SI);
5874 str->iso2022.register_left = 0;
/* Make sure register 1 is the one recorded in register_left of the
   encoder state STR: if it is not, append SO (shift-out) to DST and
   record the change.  */
5879 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5881 if (str->iso2022.register_left != 1)
5883 Dynarr_add (dst, ISO_CODE_SO);
5884 str->iso2022.register_left = 1;
/* Encode the single character CH in ISO2022 form, appending the bytes
   to DST.  STR holds the encoder state (register contents, current
   charset and half), which is read on entry and whose current_charset
   and current_half are stored back at the end.  FLAGS carries the
   direction state shared with the direction helpers.  The visible code
   deals with three kinds of input: ASCII-range characters and newline
   (which may reset direction, shift state and the initial
   designations), characters in the range 0x80-0x9f, and everything
   else, for which a charset and register are chosen, designated and
   invoked before the code point bytes are written.  */
5889 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5890 unsigned_char_dynarr *dst, unsigned int *flags)
5892 unsigned char charmask;
5893 Lisp_Coding_System* codesys = str->codesys;
5894 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5896 Lisp_Object charset = str->iso2022.current_charset;
5897 int half = str->iso2022.current_half;
5898 int code_point = -1;
5902 restore_left_to_right_direction (codesys, dst, flags, 0);
5904 /* Make sure G0 contains ASCII */
5905 if ((ch > ' ' && ch < ISO_CODE_DEL)
5906 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5908 ensure_normal_shift (str, dst);
5909 iso2022_designate (Vcharset_ascii, 0, str, dst);
5912 /* If necessary, restore everything to the default state
5914 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5916 restore_left_to_right_direction (codesys, dst, flags, 0);
5918 ensure_normal_shift (str, dst);
5920 for (i = 0; i < 4; i++)
5922 Lisp_Object initial_charset =
5923 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5924 iso2022_designate (initial_charset, i, str, dst);
5929 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5930 Dynarr_add (dst, '\r');
5931 if (eol_type != EOL_CR)
5932 Dynarr_add (dst, ch);
5936 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5937 && fit_to_be_escape_quoted (ch))
5938 Dynarr_add (dst, ISO_CODE_ESC);
5939 Dynarr_add (dst, ch);
5942 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5944 charmask = (half == 0 ? 0x00 : 0x80);
5946 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5947 && fit_to_be_escape_quoted (ch))
5948 Dynarr_add (dst, ISO_CODE_ESC);
5949 /* you asked for it ... */
5950 Dynarr_add (dst, ch);
5956 /* Now determine which register to use. */
/* First choice: a register whose current (or initial) charset is able
   to encode CH, i.e. charset_code_point yields a non-negative value.  */
5958 for (i = 0; i < 4; i++)
5960 if ((CHARSETP (charset = str->iso2022.charset[i])
5961 && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
5965 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5966 && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
/* Otherwise walk Vdefault_coded_charset_priority_list, which is modified
   temporarily during the search and restored afterwards.  */
5974 Lisp_Object original_default_coded_charset_priority_list
5975 = Vdefault_coded_charset_priority_list;
5977 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5979 code_point = ENCODE_CHAR (ch, charset);
5980 if (XCHARSET_FINAL (charset))
5982 Vdefault_coded_charset_priority_list
5983 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5984 Vdefault_coded_charset_priority_list));
5986 code_point = ENCODE_CHAR (ch, charset);
5987 if (!XCHARSET_FINAL (charset))
5989 charset = Vcharset_ascii;
5993 Vdefault_coded_charset_priority_list
5994 = original_default_coded_charset_priority_list;
5996 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5997 codesys, dst, flags, 0);
6001 if (XCHARSET_GRAPHIC (charset) != 0)
6003 if (!NILP (str->iso2022.charset[1]) &&
6004 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6005 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6007 else if (!NILP (str->iso2022.charset[2]))
6009 else if (!NILP (str->iso2022.charset[3]))
6018 iso2022_designate (charset, reg, str, dst);
6020 /* Now invoke that register. */
6024 ensure_normal_shift (str, dst);
6028 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6030 ensure_shift_out (str, dst);
/* Single shifts: seven-bit coding systems use the ESC 'N' and ESC 'O'
   forms, the others use the SS2 and SS3 control bytes.  */
6037 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6039 Dynarr_add (dst, ISO_CODE_ESC);
6040 Dynarr_add (dst, 'N');
6045 Dynarr_add (dst, ISO_CODE_SS2);
6050 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6052 Dynarr_add (dst, ISO_CODE_ESC);
6053 Dynarr_add (dst, 'O');
6058 Dynarr_add (dst, ISO_CODE_SS3);
/* Bytes written for the right half have their high bit set.  */
6066 charmask = (half == 0 ? 0x00 : 0x80);
/* Write the code point one byte at a time, most significant byte first;
   the number of bytes follows the charset's dimension.  */
6068 switch (XCHARSET_DIMENSION (charset))
6071 Dynarr_add (dst, (code_point & 0xFF) | charmask);
6074 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6075 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6078 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6079 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6080 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6083 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6084 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6085 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6086 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
6092 str->iso2022.current_charset = charset;
6093 str->iso2022.current_half = half;
/* Finish an ISO2022-encoded stream by returning the output to its
   default state: switch back to left-to-right direction if needed, make
   sure register 0 is shifted in, and designate the coding system's
   initial charset into each of the four registers again.  Any escape
   sequences this requires are appended to DST.  */
6097 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6098 unsigned int *flags)
6100 Lisp_Coding_System* codesys = str->codesys;
6103 restore_left_to_right_direction (codesys, dst, flags, 0);
6104 ensure_normal_shift (str, dst);
6105 for (i = 0; i < 4; i++)
6107 Lisp_Object initial_charset
6108 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6109 iso2022_designate (initial_charset, i, str, dst);
6114 /************************************************************************/
6115 /* No-conversion methods */
6116 /************************************************************************/
6118 /* This is used when reading in "binary" files -- i.e. files that may
6119 contain all 256 possible byte values and that are not to be
6120 interpreted as being in any particular decoding. */
/* Each of the N bytes at SRC goes through end-of-line handling for the
   stream's eol_type and is then appended to DST with
   DECODE_ADD_BINARY_CHAR; no charset interpretation takes place.
   End-of-conversion handling runs after the loop.  */
6122 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6123 unsigned_char_dynarr *dst, Lstream_data_count n)
6125 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6126 unsigned int flags = str->flags;
6127 unsigned int cpos = str->cpos;
6128 eol_type_t eol_type = str->eol_type;
6132 unsigned char c = *(unsigned char *)src++;
6134 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6135 DECODE_ADD_BINARY_CHAR (c, dst);
6136 label_continue_loop:;
6139 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
/* Write the N bytes of internal-format text at SRC to DST as raw bytes,
   for the encoding stream ENCODING.  Newlines are expanded according to
   the coding system's eol type: a CR is added unless the type is LF or
   autodetect, and the LF itself is dropped for the CR type.  There are
   two variants.  The first (apparently the UTF2000 build, judging by the
   `#else' comment) reassembles multi-byte sequences six bits at a time,
   tracking the bytes still expected in char_boundary, and writes the low
   eight bits of the result.  The second copies ASCII bytes, passes
   through characters whose leading byte is Latin-1, writes Control-1
   characters with 0x20 subtracted, and writes '~' for anything else.  */
6146 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6147 unsigned_char_dynarr *dst, Lstream_data_count n)
6150 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6151 unsigned int flags = str->flags;
6152 unsigned int ch = str->ch;
6153 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6155 unsigned char char_boundary = str->iso2022.current_char_boundary;
/* char_boundary == 0: C starts a new character.  The thresholds below
   classify the lead byte; the bodies of these branches are missing from
   this excerpt.  */
6162 if (char_boundary == 0)
6168 else if ( c >= 0xf8 )
6173 else if ( c >= 0xf0 )
6178 else if ( c >= 0xe0 )
6183 else if ( c >= 0xc0 )
6193 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6194 Dynarr_add (dst, '\r');
6195 if (eol_type != EOL_CR)
6196 Dynarr_add (dst, c);
6199 Dynarr_add (dst, c);
/* Last continuation byte: fold in its six payload bits and write the low
   eight bits of the accumulated value.  */
6202 else if (char_boundary == 1)
6204 ch = ( ch << 6 ) | ( c & 0x3f );
6205 Dynarr_add (dst, ch & 0xff);
6210 ch = ( ch << 6 ) | ( c & 0x3f );
6213 #else /* not UTF2000 */
6216 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6217 Dynarr_add (dst, '\r');
6218 if (eol_type != EOL_CR)
6219 Dynarr_add (dst, '\n');
6222 else if (BYTE_ASCII_P (c))
6225 Dynarr_add (dst, c);
6227 else if (BUFBYTE_LEADING_BYTE_P (c))
6230 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6231 c == LEADING_BYTE_CONTROL_1)
6234 Dynarr_add (dst, '~'); /* untranslatable character */
6238 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6239 Dynarr_add (dst, c);
6240 else if (ch == LEADING_BYTE_CONTROL_1)
6243 Dynarr_add (dst, c - 0x20);
6245 /* else it should be the second or third byte of an
6246 untranslatable character, so ignore it */
6249 #endif /* not UTF2000 */
/* Save the continuation state so that a character split across calls can
   be completed by the next one.  */
6255 str->iso2022.current_char_boundary = char_boundary;
6261 /************************************************************************/
6262 /* Initialization */
6263 /************************************************************************/
/* Symbol-level initialization for this file: set up the coding_system
   lrecord implementation, define the Qcoding_system_error error (with
   Qio_error passed as its second argument), register the subrs with
   DEFSUBR, and define the symbols this file refers to, including the
   entries of coding_category_symbol[].  */
6266 syms_of_file_coding (void)
6268 INIT_LRECORD_IMPLEMENTATION (coding_system);
6270 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
/* Subrs defined in this file.  */
6272 DEFSUBR (Fcoding_system_p);
6273 DEFSUBR (Ffind_coding_system);
6274 DEFSUBR (Fget_coding_system);
6275 DEFSUBR (Fcoding_system_list);
6276 DEFSUBR (Fcoding_system_name);
6277 DEFSUBR (Fmake_coding_system);
6278 DEFSUBR (Fcopy_coding_system);
6279 DEFSUBR (Fcoding_system_canonical_name_p);
6280 DEFSUBR (Fcoding_system_alias_p);
6281 DEFSUBR (Fcoding_system_aliasee);
6282 DEFSUBR (Fdefine_coding_system_alias);
6283 DEFSUBR (Fsubsidiary_coding_system);
6285 DEFSUBR (Fcoding_system_type);
6286 DEFSUBR (Fcoding_system_doc_string);
6288 DEFSUBR (Fcoding_system_charset);
6290 DEFSUBR (Fcoding_system_property);
6292 DEFSUBR (Fcoding_category_list);
6293 DEFSUBR (Fset_coding_priority_list);
6294 DEFSUBR (Fcoding_priority_list);
6295 DEFSUBR (Fset_coding_category_system);
6296 DEFSUBR (Fcoding_category_system);
6298 DEFSUBR (Fdetect_coding_region);
6299 DEFSUBR (Fdecode_coding_region);
6300 DEFSUBR (Fencode_coding_region);
6302 DEFSUBR (Fdecode_shift_jis_char);
6303 DEFSUBR (Fencode_shift_jis_char);
6304 DEFSUBR (Fdecode_big5_char);
6305 DEFSUBR (Fencode_big5_char);
/* Symbols, each given with its Lisp print name.  */
6307 defsymbol (&Qcoding_systemp, "coding-system-p");
6308 defsymbol (&Qno_conversion, "no-conversion");
6309 defsymbol (&Qraw_text, "raw-text");
6311 defsymbol (&Qbig5, "big5");
6312 defsymbol (&Qshift_jis, "shift-jis");
6313 defsymbol (&Qucs4, "ucs-4");
6314 defsymbol (&Qutf8, "utf-8");
6315 defsymbol (&Qutf16, "utf-16");
6316 defsymbol (&Qccl, "ccl");
6317 defsymbol (&Qiso2022, "iso2022");
/* These four are also registered as coding-system properties
   (CODESYS_PROP_ALL_OK) in complex_vars_of_file_coding.  */
6319 defsymbol (&Qmnemonic, "mnemonic");
6320 defsymbol (&Qeol_type, "eol-type");
6321 defsymbol (&Qpost_read_conversion, "post-read-conversion");
6322 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6324 defsymbol (&Qcr, "cr");
6325 defsymbol (&Qlf, "lf");
6326 defsymbol (&Qcrlf, "crlf");
6327 defsymbol (&Qeol_cr, "eol-cr");
6328 defsymbol (&Qeol_lf, "eol-lf");
6329 defsymbol (&Qeol_crlf, "eol-crlf");
/* Symbols registered with CODESYS_PROP_ISO2022 in
   complex_vars_of_file_coding.  */
6331 defsymbol (&Qcharset_g0, "charset-g0");
6332 defsymbol (&Qcharset_g1, "charset-g1");
6333 defsymbol (&Qcharset_g2, "charset-g2");
6334 defsymbol (&Qcharset_g3, "charset-g3");
6335 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6336 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6337 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6338 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6339 defsymbol (&Qno_iso6429, "no-iso6429");
6340 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6341 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6343 defsymbol (&Qshort, "short");
6344 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6345 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6346 defsymbol (&Qseven, "seven");
6347 defsymbol (&Qlock_shift, "lock-shift");
6348 defsymbol (&Qescape_quoted, "escape-quoted");
6351 defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6352 defsymbol (&Qdisable_composition, "disable-composition");
6353 defsymbol (&Quse_entity_reference, "use-entity-reference");
6354 defsymbol (&Qd, "d");
6355 defsymbol (&Qx, "x");
6356 defsymbol (&QX, "X");
6358 defsymbol (&Qencode, "encode");
6359 defsymbol (&Qdecode, "decode");
/* Symbols stored in the coding_category_symbol[] table, one slot per
   coding category index.  */
6362 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6364 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6366 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6368 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6370 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6372 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6374 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6376 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6378 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6380 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6383 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
/* Declare the lstream methods of the two stream types used by this
   file.  Both `decoding' and `encoding' get the same seven methods:
   reader, writer, rewinder, seekable_p, flusher, closer and marker.  */
6388 lstream_type_create_file_coding (void)
/* Methods of the `decoding' stream type.  */
6390 LSTREAM_HAS_METHOD (decoding, reader);
6391 LSTREAM_HAS_METHOD (decoding, writer);
6392 LSTREAM_HAS_METHOD (decoding, rewinder);
6393 LSTREAM_HAS_METHOD (decoding, seekable_p);
6394 LSTREAM_HAS_METHOD (decoding, flusher);
6395 LSTREAM_HAS_METHOD (decoding, closer);
6396 LSTREAM_HAS_METHOD (decoding, marker);
/* Methods of the `encoding' stream type.  */
6398 LSTREAM_HAS_METHOD (encoding, reader);
6399 LSTREAM_HAS_METHOD (encoding, writer);
6400 LSTREAM_HAS_METHOD (encoding, rewinder);
6401 LSTREAM_HAS_METHOD (encoding, seekable_p);
6402 LSTREAM_HAS_METHOD (encoding, flusher);
6403 LSTREAM_HAS_METHOD (encoding, closer);
6404 LSTREAM_HAS_METHOD (encoding, marker);
/* Variable-level initialization for this file: allocate the
   file_coding_dump structure `fcd' and register it with
   dump_add_root_struct_ptr, reset the per-category tables, provide
   the `file-coding' feature, and define the Lisp variables of this
   file together with their initial values (Qnil for the coding-system
   variables and the entity-reference alist, 1 for
   enable_multibyte_characters).  */
6408 vars_of_file_coding (void)
/* One structure holds the coding-category state of this file.  */
6412 fcd = xnew (struct file_coding_dump);
6413 dump_add_root_struct_ptr (&fcd, &fcd_description);
6415 /* Initialize to something reasonable ... */
6416 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6418 fcd->coding_category_system[i] = Qnil;
6419 fcd->coding_category_by_priority[i] = i;
6422 Fprovide (intern ("file-coding"));
6424 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6425 Coding system used for TTY keyboard input.
6426 Not used under a windowing system.
6428 Vkeyboard_coding_system = Qnil;
6430 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6431 Coding system used for TTY display output.
6432 Not used under a windowing system.
6434 Vterminal_coding_system = Qnil;
6436 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6437 Overriding coding system used when reading from a file or process.
6438 You should bind this variable with `let', but do not set it globally.
6439 If this is non-nil, it specifies the coding system that will be used
6440 to decode input on read operations, such as from a file or process.
6441 It overrides `buffer-file-coding-system-for-read',
6442 `insert-file-contents-pre-hook', etc. Use those variables instead of
6443 this one for permanent changes to the environment. */ );
6444 Vcoding_system_for_read = Qnil;
6446 DEFVAR_LISP ("coding-system-for-write",
6447 &Vcoding_system_for_write /*
6448 Overriding coding system used when writing to a file or process.
6449 You should bind this variable with `let', but do not set it globally.
6450 If this is non-nil, it specifies the coding system that will be used
6451 to encode output for write operations, such as to a file or process.
6452 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6453 Use those variables instead of this one for permanent changes to the
6455 Vcoding_system_for_write = Qnil;
6457 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6458 Coding system used to convert pathnames when accessing files.
6460 Vfile_name_coding_system = Qnil;
6462 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6463 &Vcoded_charset_entity_reference_alist /*
6464 Alist of coded-charset vs corresponding entity-reference.
6465 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6466 CCS is coded-charset.
6467 CODE-COLUMNS is columns of code-point of entity-reference.
6468 CODE-TYPE is format type of code-point of entity-reference.
6469 `d' means decimal value and `x' means hexadecimal value.
6471 Vcoded_charset_entity_reference_alist = Qnil;
6473 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6474 Non-nil means the buffer contents are regarded as multi-byte form
6475 of characters, not a binary code. This affects the display, file I/O,
6476 and behaviors of various editing commands.
6478 Setting this to nil does not do anything.
6480 enable_multibyte_characters = 1;
6484 complex_vars_of_file_coding (void)
6486 staticpro (&Vcoding_system_hash_table);
6487 Vcoding_system_hash_table =
6488 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6490 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6491 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6493 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6495 struct codesys_prop csp; \
6497 csp.prop_type = (Prop_Type); \
6498 Dynarr_add (the_codesys_prop_dynarr, csp); \
6501 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6502 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6503 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6504 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6505 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6506 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6507 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6509 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6510 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6511 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6512 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6513 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6514 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6515 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6516 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6517 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6518 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6519 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6520 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6521 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6522 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6523 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6524 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6525 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6527 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6528 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6530 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6531 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6534 /* Need to create this here or we're really screwed. */
6536 (Qraw_text, Qno_conversion,
6537 build_string ("Raw text, which means it converts only line-break-codes."),
6538 list2 (Qmnemonic, build_string ("Raw")));
6541 (Qbinary, Qno_conversion,
6542 build_string ("Binary, which means it does not convert anything."),
6543 list4 (Qeol_type, Qlf,
6544 Qmnemonic, build_string ("Binary")));
6550 ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6551 list2 (Qmnemonic, build_string ("MTF8")));
6554 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6556 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6558 Fdefine_coding_system_alias (Qterminal, Qbinary);
6559 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6561 /* Need this for bootstrapping */
6562 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6563 Fget_coding_system (Qraw_text);
6566 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6567 = Fget_coding_system (Qutf_8_mcs);
6570 #if defined(MULE) && !defined(UTF2000)
6574 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6575 fcd->ucs_to_mule_table[i] = Qnil;
6577 staticpro (&mule_to_ucs_table);
6578 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6579 #endif /* defined(MULE) && !defined(UTF2000) */