1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
6 This file is part of XEmacs.
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
23 /* Synched up with: Mule 2.3. Not in FSF. */
25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
39 #include "file-coding.h"
41 Lisp_Object Qcoding_system_error;
43 Lisp_Object Vkeyboard_coding_system;
44 Lisp_Object Vterminal_coding_system;
45 Lisp_Object Vcoding_system_for_read;
46 Lisp_Object Vcoding_system_for_write;
47 Lisp_Object Vfile_name_coding_system;
49 Lisp_Object Vcoded_charset_entity_reference_alist;
51 /* Table of symbols identifying each coding category. */
52 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
56 struct file_coding_dump {
57 /* Coding system currently associated with each coding category. */
58 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
60 /* Table of all coding categories in decreasing order of priority.
61 This describes a permutation of the possible coding categories. */
62 int coding_category_by_priority[CODING_CATEGORY_LAST];
64 #if defined(MULE) && !defined(UTF2000)
65 Lisp_Object ucs_to_mule_table[65536];
69 static const struct lrecord_description fcd_description_1[] = {
70 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
71 #if defined(MULE) && !defined(UTF2000)
72 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
77 static const struct struct_description fcd_description = {
78 sizeof (struct file_coding_dump),
82 Lisp_Object mule_to_ucs_table;
84 Lisp_Object Qcoding_systemp;
86 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
87 /* Qinternal in general.c */
89 Lisp_Object Qmnemonic, Qeol_type;
90 Lisp_Object Qcr, Qcrlf, Qlf;
91 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
92 Lisp_Object Qpost_read_conversion;
93 Lisp_Object Qpre_write_conversion;
96 Lisp_Object Qucs4, Qutf8;
97 Lisp_Object Qbig5, Qshift_jis;
98 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
99 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
100 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
101 Lisp_Object Qno_iso6429;
102 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
103 Lisp_Object Qescape_quoted;
104 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
107 Lisp_Object Qdisable_composition;
108 Lisp_Object Quse_entity_reference;
109 Lisp_Object Qd, Qx, QX;
111 Lisp_Object Qencode, Qdecode;
113 Lisp_Object Vcoding_system_hash_table;
115 int enable_multibyte_characters;
118 /* Additional information used by the ISO2022 decoder and detector. */
119 struct iso2022_decoder
121 /* CHARSET holds the character sets currently assigned to the G0
122 through G3 variables. It is initialized from the array
123 INITIAL_CHARSET in CODESYS. */
124 Lisp_Object charset[4];
126 /* Which registers are currently invoked into the left (GL) and
127 right (GR) halves of the 8-bit encoding space? */
128 int register_left, register_right;
130 /* ISO_ESC holds a value indicating part of an escape sequence
131 that has already been seen. */
132 enum iso_esc_flag esc;
134 /* This records the bytes we've seen so far in an escape sequence,
135 in case the sequence is invalid (we spit out the bytes unchanged). */
136 unsigned char esc_bytes[8];
138 /* Index for next byte to store in ISO escape sequence. */
141 #ifdef ENABLE_COMPOSITE_CHARS
142 /* Stuff seen so far when composing a string. */
143 unsigned_char_dynarr *composite_chars;
146 /* If we saw an invalid designation sequence for a particular
147 register, we flag it here and switch to ASCII. The next time we
148 see a valid designation for this register, we turn off the flag
149 and do the designation normally, but pretend the sequence was
150 invalid. The effect of all this is that (most of the time) the
151 escape sequences for both the switch to the unknown charset, and
152 the switch back to the known charset, get inserted literally into
153 the buffer and saved out as such. The hope is that we can
154 preserve the escape sequences so that the resulting written out
155 file makes sense. If we don't do any of this, the designation
156 to the invalid charset will be preserved but that switch back
157 to the known charset will probably get eaten because it was
158 the same charset that was already present in the register. */
159 unsigned char invalid_designated[4];
161 /* We try to do similar things as above for direction-switching
162 sequences. If we encountered a direction switch while an
163 invalid designation was present, or an invalid designation
164 just after a direction switch (i.e. no valid designation
165 encountered yet), we insert the direction-switch escape
166 sequence literally into the output stream, and later on
167 insert the corresponding direction-restoring escape sequence
169 unsigned int switched_dir_and_no_valid_charset_yet :1;
170 unsigned int invalid_switch_dir :1;
172 /* Tells the decoder to output the escape sequence literally
173 even though it was valid. Used in the games we play to
174 avoid lossage when we encounter invalid designations. */
175 unsigned int output_literally :1;
176 /* We encountered a direction switch followed by an invalid
177 designation. We didn't output the direction switch
178 literally because we didn't know about the invalid designation;
179 but we have to do so now. */
180 unsigned int output_direction_sequence :1;
183 EXFUN (Fcopy_coding_system, 2);
185 struct detection_state;
188 text_encode_generic (Lstream *encoding, const Bufbyte *src,
189 unsigned_char_dynarr *dst, size_t n);
191 static int detect_coding_sjis (struct detection_state *st,
192 const Extbyte *src, size_t n);
193 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
194 unsigned_char_dynarr *dst, size_t n);
195 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
196 unsigned_char_dynarr *dst, unsigned int *flags);
197 void char_finish_shift_jis (struct encoding_stream *str,
198 unsigned_char_dynarr *dst, unsigned int *flags);
200 static int detect_coding_big5 (struct detection_state *st,
201 const Extbyte *src, size_t n);
202 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
203 unsigned_char_dynarr *dst, size_t n);
204 void char_encode_big5 (struct encoding_stream *str, Emchar c,
205 unsigned_char_dynarr *dst, unsigned int *flags);
206 void char_finish_big5 (struct encoding_stream *str,
207 unsigned_char_dynarr *dst, unsigned int *flags);
209 static int detect_coding_ucs4 (struct detection_state *st,
210 const Extbyte *src, size_t n);
211 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
212 unsigned_char_dynarr *dst, size_t n);
213 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
214 unsigned_char_dynarr *dst, unsigned int *flags);
215 void char_finish_ucs4 (struct encoding_stream *str,
216 unsigned_char_dynarr *dst, unsigned int *flags);
218 static int detect_coding_utf8 (struct detection_state *st,
219 const Extbyte *src, size_t n);
220 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
221 unsigned_char_dynarr *dst, size_t n);
222 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
223 unsigned_char_dynarr *dst, unsigned int *flags);
224 void char_finish_utf8 (struct encoding_stream *str,
225 unsigned_char_dynarr *dst, unsigned int *flags);
227 static int postprocess_iso2022_mask (int mask);
228 static void reset_iso2022 (Lisp_Object coding_system,
229 struct iso2022_decoder *iso);
230 static int detect_coding_iso2022 (struct detection_state *st,
231 const Extbyte *src, size_t n);
232 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
233 unsigned_char_dynarr *dst, size_t n);
234 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
235 unsigned_char_dynarr *dst, unsigned int *flags);
236 void char_finish_iso2022 (struct encoding_stream *str,
237 unsigned_char_dynarr *dst, unsigned int *flags);
239 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
240 unsigned_char_dynarr *dst, size_t n);
241 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
242 unsigned_char_dynarr *dst, size_t n);
243 static void mule_decode (Lstream *decoding, const Extbyte *src,
244 unsigned_char_dynarr *dst, size_t n);
245 static void mule_encode (Lstream *encoding, const Bufbyte *src,
246 unsigned_char_dynarr *dst, size_t n);
248 typedef struct codesys_prop codesys_prop;
257 Dynarr_declare (codesys_prop);
258 } codesys_prop_dynarr;
260 static const struct lrecord_description codesys_prop_description_1[] = {
261 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
265 static const struct struct_description codesys_prop_description = {
266 sizeof (codesys_prop),
267 codesys_prop_description_1
270 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
271 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
275 static const struct struct_description codesys_prop_dynarr_description = {
276 sizeof (codesys_prop_dynarr),
277 codesys_prop_dynarr_description_1
280 codesys_prop_dynarr *the_codesys_prop_dynarr;
282 enum codesys_prop_enum
285 CODESYS_PROP_ISO2022,
290 /************************************************************************/
291 /* Coding system functions */
292 /************************************************************************/
294 static Lisp_Object mark_coding_system (Lisp_Object);
295 static void print_coding_system (Lisp_Object, Lisp_Object, int);
296 static void finalize_coding_system (void *header, int for_disksave);
299 static const struct lrecord_description ccs_description_1[] = {
300 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
301 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
305 static const struct struct_description ccs_description = {
306 sizeof (charset_conversion_spec),
310 static const struct lrecord_description ccsd_description_1[] = {
311 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
315 static const struct struct_description ccsd_description = {
316 sizeof (charset_conversion_spec_dynarr),
321 static const struct lrecord_description coding_system_description[] = {
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
324 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
325 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
326 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
327 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
331 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
332 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
333 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
334 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
335 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
337 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
343 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
344 mark_coding_system, print_coding_system,
345 finalize_coding_system,
346 0, 0, coding_system_description,
350 mark_coding_system (Lisp_Object obj)
352 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
354 mark_object (CODING_SYSTEM_NAME (codesys));
355 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
356 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
357 mark_object (CODING_SYSTEM_EOL_LF (codesys));
358 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
359 mark_object (CODING_SYSTEM_EOL_CR (codesys));
361 switch (CODING_SYSTEM_TYPE (codesys))
365 case CODESYS_ISO2022:
366 for (i = 0; i < 4; i++)
367 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
368 if (codesys->iso2022.input_conv)
370 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
372 struct charset_conversion_spec *ccs =
373 Dynarr_atp (codesys->iso2022.input_conv, i);
374 mark_object (ccs->from_charset);
375 mark_object (ccs->to_charset);
378 if (codesys->iso2022.output_conv)
380 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
382 struct charset_conversion_spec *ccs =
383 Dynarr_atp (codesys->iso2022.output_conv, i);
384 mark_object (ccs->from_charset);
385 mark_object (ccs->to_charset);
392 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
393 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
398 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
399 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
406 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
408 mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
410 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
414 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
417 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
419 error ("printing unreadable object #<coding_system 0x%x>",
422 write_c_string ("#<coding_system ", printcharfun);
423 print_internal (c->name, printcharfun, 1);
424 write_c_string (">", printcharfun);
428 finalize_coding_system (void *header, int for_disksave)
430 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
431 /* Since coding systems never go away, this function is not
432 necessary. But it would be necessary if we changed things
433 so that coding systems could go away. */
434 if (!for_disksave) /* see comment in lstream.c */
436 switch (CODING_SYSTEM_TYPE (c))
439 case CODESYS_ISO2022:
440 if (c->iso2022.input_conv)
442 Dynarr_free (c->iso2022.input_conv);
443 c->iso2022.input_conv = 0;
445 if (c->iso2022.output_conv)
447 Dynarr_free (c->iso2022.output_conv);
448 c->iso2022.output_conv = 0;
459 symbol_to_eol_type (Lisp_Object symbol)
461 CHECK_SYMBOL (symbol);
462 if (NILP (symbol)) return EOL_AUTODETECT;
463 if (EQ (symbol, Qlf)) return EOL_LF;
464 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
465 if (EQ (symbol, Qcr)) return EOL_CR;
467 signal_simple_error ("Unrecognized eol type", symbol);
468 return EOL_AUTODETECT; /* not reached */
472 eol_type_to_symbol (eol_type_t type)
477 case EOL_LF: return Qlf;
478 case EOL_CRLF: return Qcrlf;
479 case EOL_CR: return Qcr;
480 case EOL_AUTODETECT: return Qnil;
485 setup_eol_coding_systems (Lisp_Coding_System *codesys)
487 Lisp_Object codesys_obj;
488 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
489 char *codesys_name = (char *) alloca (len + 7);
491 char *codesys_mnemonic=0;
493 Lisp_Object codesys_name_sym, sub_codesys_obj;
497 XSETCODING_SYSTEM (codesys_obj, codesys);
499 memcpy (codesys_name,
500 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
502 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
504 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
505 codesys_mnemonic = (char *) alloca (mlen + 7);
506 memcpy (codesys_mnemonic,
507 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
510 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
511 strcpy (codesys_name + len, "-" op_sys); \
513 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
514 codesys_name_sym = intern (codesys_name); \
515 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
516 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
518 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
519 build_string (codesys_mnemonic); \
520 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
523 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
524 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
525 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
528 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
529 Return t if OBJECT is a coding system.
530 A coding system is an object that defines how text containing multiple
531 character sets is encoded into a stream of (typically 8-bit) bytes.
532 The coding system is used to decode the stream into a series of
533 characters (which may be from multiple charsets) when the text is read
534 from a file or process, and is used to encode the text back into the
535 same format when it is written out to a file or process.
537 For example, many ISO2022-compliant coding systems (such as Compound
538 Text, which is used for inter-client data under the X Window System)
539 use escape sequences to switch between different charsets -- Japanese
540 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
541 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
542 `make-coding-system' for more information.
544 Coding systems are normally identified using a symbol, and the
545 symbol is accepted in place of the actual coding system object whenever
546 a coding system is called for. (This is similar to how faces work.)
550 return CODING_SYSTEMP (object) ? Qt : Qnil;
553 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
554 Retrieve the coding system of the given name.
556 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
557 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
558 If there is no such coding system, nil is returned. Otherwise the
559 associated coding system object is returned.
561 (coding_system_or_name))
563 if (NILP (coding_system_or_name))
564 coding_system_or_name = Qbinary;
565 else if (CODING_SYSTEMP (coding_system_or_name))
566 return coding_system_or_name;
568 CHECK_SYMBOL (coding_system_or_name);
572 coding_system_or_name =
573 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
575 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
576 return coding_system_or_name;
580 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
581 Retrieve the coding system of the given name.
582 Same as `find-coding-system' except that if there is no such
583 coding system, an error is signaled instead of returning nil.
587 Lisp_Object coding_system = Ffind_coding_system (name);
589 if (NILP (coding_system))
590 signal_simple_error ("No such coding system", name);
591 return coding_system;
594 /* We store the coding systems in hash tables with the names as the key and the
595 actual coding system object as the value. Occasionally we need to use them
596 in a list format. These routines provide us with that. */
597 struct coding_system_list_closure
599 Lisp_Object *coding_system_list;
603 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
604 void *coding_system_list_closure)
606 /* This function can GC */
607 struct coding_system_list_closure *cscl =
608 (struct coding_system_list_closure *) coding_system_list_closure;
609 Lisp_Object *coding_system_list = cscl->coding_system_list;
611 *coding_system_list = Fcons (key, *coding_system_list);
615 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
616 Return a list of the names of all defined coding systems.
620 Lisp_Object coding_system_list = Qnil;
622 struct coding_system_list_closure coding_system_list_closure;
624 GCPRO1 (coding_system_list);
625 coding_system_list_closure.coding_system_list = &coding_system_list;
626 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
627 &coding_system_list_closure);
630 return coding_system_list;
633 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
634 Return the name of the given coding system.
638 coding_system = Fget_coding_system (coding_system);
639 return XCODING_SYSTEM_NAME (coding_system);
642 static Lisp_Coding_System *
643 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
645 Lisp_Coding_System *codesys =
646 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
648 zero_lcrecord (codesys);
649 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
650 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
651 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
652 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
653 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
654 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
655 CODING_SYSTEM_TYPE (codesys) = type;
656 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
659 CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
661 if (type == CODESYS_ISO2022)
664 for (i = 0; i < 4; i++)
665 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
668 if (type == CODESYS_BIG5)
670 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
672 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
673 = Vcharset_chinese_big5;
674 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
676 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
680 else if (type == CODESYS_CCL)
682 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
683 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
686 CODING_SYSTEM_NAME (codesys) = name;
692 /* Given a list of charset conversion specs as specified in a Lisp
693 program, parse it into STORE_HERE. */
696 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
697 Lisp_Object spec_list)
701 EXTERNAL_LIST_LOOP (rest, spec_list)
703 Lisp_Object car = XCAR (rest);
704 Lisp_Object from, to;
705 struct charset_conversion_spec spec;
707 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
708 signal_simple_error ("Invalid charset conversion spec", car);
709 from = Fget_charset (XCAR (car));
710 to = Fget_charset (XCAR (XCDR (car)));
711 if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
712 (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
713 signal_simple_error_2
714 ("Attempted conversion between different charset types",
716 spec.from_charset = from;
717 spec.to_charset = to;
719 Dynarr_add (store_here, spec);
723 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
724 specs, return the equivalent as the Lisp programmer would see it.
726 If LOAD_HERE is 0, return Qnil. */
729 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
736 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
738 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
739 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
742 return Fnreverse (result);
747 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
748 Register symbol NAME as a coding system.
750 TYPE describes the conversion method used and should be one of
753 Automatic conversion. XEmacs attempts to detect the coding system
756 No conversion. Use this for binary files and such. On output,
757 graphic characters that are not in ASCII or Latin-1 will be
758 replaced by a ?. (For a no-conversion-encoded buffer, these
759 characters will only be present if you explicitly insert them.)
761 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
763 ISO 10646 UCS-4 encoding.
765 ISO 10646 UTF-8 encoding.
767 Any ISO2022-compliant encoding. Among other things, this includes
768 JIS (the Japanese encoding commonly used for e-mail), EUC (the
769 standard Unix encoding for Japanese and other languages), and
770 Compound Text (the encoding used in X11). You can specify more
771 specific information about the conversion with the PROPS argument.
773 Big5 (the encoding commonly used for Taiwanese).
775 The conversion is performed using a user-written pseudo-code
776 program. CCL (Code Conversion Language) is the name of this
779 Write out or read in the raw contents of the memory representing
780 the buffer's text. This is primarily useful for debugging
781 purposes, and is only enabled when XEmacs has been compiled with
782 DEBUG_XEMACS defined (via the --debug configure option).
783 WARNING: Reading in a file using 'internal conversion can result
784 in an internal inconsistency in the memory representing a
785 buffer's text, which will produce unpredictable results and may
786 cause XEmacs to crash. Under normal circumstances you should
787 never use 'internal conversion.
789 DOC-STRING is a string describing the coding system.
791 PROPS is a property list, describing the specific nature of the
792 character set. Recognized properties are:
795 String to be displayed in the modeline when this coding system is
799 End-of-line conversion to be used. It should be one of
802 Automatically detect the end-of-line type (LF, CRLF,
803 or CR). Also generate subsidiary coding systems named
804 `NAME-unix', `NAME-dos', and `NAME-mac', that are
805 identical to this coding system but have an EOL-TYPE
806 value of 'lf, 'crlf, and 'cr, respectively.
808 The end of a line is marked externally using ASCII LF.
809 Since this is also the way that XEmacs represents an
810 end-of-line internally, specifying this option results
811 in no end-of-line conversion. This is the standard
812 format for Unix text files.
814 The end of a line is marked externally using ASCII
815 CRLF. This is the standard format for MS-DOS text
818 The end of a line is marked externally using ASCII CR.
819 This is the standard format for Macintosh text files.
821 Automatically detect the end-of-line type but do not
822 generate subsidiary coding systems. (This value is
823 converted to nil when stored internally, and
824 `coding-system-property' will return nil.)
827 If non-nil, composition/decomposition for combining characters
830 'use-entity-reference
831 If non-nil, SGML style entity-reference is used for non-system-characters.
833 'post-read-conversion
834 Function called after a file has been read in, to perform the
835 decoding. Called with two arguments, START and END, denoting
836 a region of the current buffer to be decoded.
838 'pre-write-conversion
839 Function called before a file is written out, to perform the
840 encoding. Called with two arguments, START and END, denoting
841 a region of the current buffer to be encoded.
844 The following additional properties are recognized if TYPE is 'iso2022:
850 The character set initially designated to the G0 - G3 registers.
851 The value should be one of
853 -- A charset object (designate that character set)
854 -- nil (do not ever use this register)
855 -- t (no character set is initially designated to
856 the register, but may be later on; this automatically
857 sets the corresponding `force-g*-on-output' property)
863 If non-nil, send an explicit designation sequence on output before
864 using the specified register.
867 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
868 "ESC $ B" on output in place of the full designation sequences
869 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
872 If non-nil, don't designate ASCII to G0 at each end of line on output.
873 Setting this to non-nil also suppresses other state-resetting that
874 normally happens at the end of a line.
877 If non-nil, don't designate ASCII to G0 before control chars on output.
880 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
884 If non-nil, use locking-shift (SO/SI) instead of single-shift
885 or designation by escape sequence.
888 If non-nil, don't use ISO6429's direction specification.
891 If non-nil, literal control characters that are the same as
892 the beginning of a recognized ISO2022 or ISO6429 escape sequence
893 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
894 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
895 so that they can be properly distinguished from an escape sequence.
896 (Note that doing this results in a non-portable encoding.) This
897 encoding flag is used for byte-compiled files. Note that ESC
898 is a good choice for a quoting character because there are no
899 escape sequences whose second byte is a character from the Control-0
900 or Control-1 character sets; this is explicitly disallowed by the
903 'input-charset-conversion
904 A list of conversion specifications, specifying conversion of
905 characters in one charset to another when decoding is performed.
906 Each specification is a list of two elements: the source charset,
907 and the destination charset.
909 'output-charset-conversion
910 A list of conversion specifications, specifying conversion of
911 characters in one charset to another when encoding is performed.
912 The form of each specification is the same as for
913 'input-charset-conversion.
916 The following additional properties are recognized (and required)
920 CCL program used for decoding (converting to internal format).
923 CCL program used for encoding (converting to external format).
925 (name, type, doc_string, props))
927 Lisp_Coding_System *codesys;
928 enum coding_system_type ty;
929 int need_to_setup_eol_systems = 1;
931 /* Convert type to constant */
932 if (NILP (type) || EQ (type, Qundecided))
933 { ty = CODESYS_AUTODETECT; }
935 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
936 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
937 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
938 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
939 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
940 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
942 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
944 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
947 signal_simple_error ("Invalid coding system type", type);
951 codesys = allocate_coding_system (ty, name);
953 if (NILP (doc_string))
954 doc_string = build_string ("");
956 CHECK_STRING (doc_string);
957 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
960 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
962 if (EQ (key, Qmnemonic))
965 CHECK_STRING (value);
966 CODING_SYSTEM_MNEMONIC (codesys) = value;
969 else if (EQ (key, Qeol_type))
971 need_to_setup_eol_systems = NILP (value);
974 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
977 else if (EQ (key, Qpost_read_conversion))
978 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
979 else if (EQ (key, Qpre_write_conversion))
980 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
982 else if (EQ (key, Qdisable_composition))
983 CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
984 else if (EQ (key, Quse_entity_reference))
985 CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
988 else if (ty == CODESYS_ISO2022)
990 #define FROB_INITIAL_CHARSET(charset_num) \
991 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
992 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
994 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
995 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
996 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
997 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
999 #define FROB_FORCE_CHARSET(charset_num) \
1000 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1002 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1003 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1004 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1005 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1007 #define FROB_BOOLEAN_PROPERTY(prop) \
1008 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1010 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
1011 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1012 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1013 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
1014 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1015 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1016 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1018 else if (EQ (key, Qinput_charset_conversion))
1020 codesys->iso2022.input_conv =
1021 Dynarr_new (charset_conversion_spec);
1022 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1025 else if (EQ (key, Qoutput_charset_conversion))
1027 codesys->iso2022.output_conv =
1028 Dynarr_new (charset_conversion_spec);
1029 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1033 signal_simple_error ("Unrecognized property", key);
1036 else if (ty == CODESYS_BIG5)
1038 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1039 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1041 signal_simple_error ("Unrecognized property", key);
1044 else if (EQ (type, Qccl))
1047 struct ccl_program test_ccl;
1050 /* Check key first. */
1051 if (EQ (key, Qdecode))
1052 suffix = "-ccl-decode";
1053 else if (EQ (key, Qencode))
1054 suffix = "-ccl-encode";
1056 signal_simple_error ("Unrecognized property", key);
1058 /* If value is vector, register it as a ccl program
1059 associated with an newly created symbol for
1060 backward compatibility. */
1061 if (VECTORP (value))
1063 sym = Fintern (concat2 (Fsymbol_name (name),
1064 build_string (suffix)),
1066 Fregister_ccl_program (sym, value);
1070 CHECK_SYMBOL (value);
1073 /* check if the given ccl programs are valid. */
1074 if (setup_ccl_program (&test_ccl, sym) < 0)
1075 signal_simple_error ("Invalid CCL program", value);
1077 if (EQ (key, Qdecode))
1078 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1079 else if (EQ (key, Qencode))
1080 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1085 signal_simple_error ("Unrecognized property", key);
1089 if (need_to_setup_eol_systems)
1090 setup_eol_coding_systems (codesys);
1093 Lisp_Object codesys_obj;
1094 XSETCODING_SYSTEM (codesys_obj, codesys);
1095 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1100 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1101 Copy OLD-CODING-SYSTEM to NEW-NAME.
1102 If NEW-NAME does not name an existing coding system, a new one will
1105 (old_coding_system, new_name))
1107 Lisp_Object new_coding_system;
1108 old_coding_system = Fget_coding_system (old_coding_system);
1109 new_coding_system = Ffind_coding_system (new_name);
1110 if (NILP (new_coding_system))
1112 XSETCODING_SYSTEM (new_coding_system,
1113 allocate_coding_system
1114 (XCODING_SYSTEM_TYPE (old_coding_system),
1116 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1120 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1121 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1122 memcpy (((char *) to ) + sizeof (to->header),
1123 ((char *) from) + sizeof (from->header),
1124 sizeof (*from) - sizeof (from->header));
1125 to->name = new_name;
1127 return new_coding_system;
1130 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1131 Return t if OBJECT names a coding system, and is not a coding system alias.
1135 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1139 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1140 Return t if OBJECT is a coding system alias.
1141 All coding system aliases are created by `define-coding-system-alias'.
1145 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1149 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1150 Return the coding-system symbol for which symbol ALIAS is an alias.
1154 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1155 if (SYMBOLP (aliasee))
1158 signal_simple_error ("Symbol is not a coding system alias", alias);
1159 return Qnil; /* To keep the compiler happy */
1163 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1165 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1169 /* A maphash function, for removing dangling coding system aliases. */
1171 dangling_coding_system_alias_p (Lisp_Object alias,
1172 Lisp_Object aliasee,
1173 void *dangling_aliases)
1175 if (SYMBOLP (aliasee)
1176 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1178 (*(int *) dangling_aliases)++;
1185 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1186 Define symbol ALIAS as an alias for coding system ALIASEE.
1188 You can use this function to redefine an alias that has already been defined,
1189 but you cannot redefine a name which is the canonical name for a coding system.
1190 \(a canonical name of a coding system is what is returned when you call
1191 `coding-system-name' on a coding system).
1193 ALIASEE itself can be an alias, which allows you to define nested aliases.
1195 You are forbidden, however, from creating alias loops or `dangling' aliases.
1196 These will be detected, and an error will be signaled if you attempt to do so.
1198 If ALIASEE is nil, then ALIAS will simply be undefined.
1200 See also `coding-system-alias-p', `coding-system-aliasee',
1201 and `coding-system-canonical-name-p'.
1205 Lisp_Object real_coding_system, probe;
1207 CHECK_SYMBOL (alias);
1209 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1211 ("Symbol is the canonical name of a coding system and cannot be redefined",
1216 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1217 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1218 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1220 Fremhash (alias, Vcoding_system_hash_table);
1222 /* Undefine subsidiary aliases,
1223 presumably created by a previous call to this function */
1224 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1225 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1226 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1228 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1229 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1230 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1233 /* Undefine dangling coding system aliases. */
1235 int dangling_aliases;
1238 dangling_aliases = 0;
1239 elisp_map_remhash (dangling_coding_system_alias_p,
1240 Vcoding_system_hash_table,
1242 } while (dangling_aliases > 0);
1248 if (CODING_SYSTEMP (aliasee))
1249 aliasee = XCODING_SYSTEM_NAME (aliasee);
1251 /* Checks that aliasee names a coding-system */
1252 real_coding_system = Fget_coding_system (aliasee);
1254 /* Check for coding system alias loops */
1255 if (EQ (alias, aliasee))
1256 alias_loop: signal_simple_error_2
1257 ("Attempt to create a coding system alias loop", alias, aliasee);
1259 for (probe = aliasee;
1261 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1263 if (EQ (probe, alias))
1267 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1269 /* Set up aliases for subsidiaries.
1270 #### There must be a better way to handle subsidiary coding systems. */
1272 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1274 for (i = 0; i < countof (suffixes); i++)
1276 Lisp_Object alias_subsidiary =
1277 append_suffix_to_symbol (alias, suffixes[i]);
1278 Lisp_Object aliasee_subsidiary =
1279 append_suffix_to_symbol (aliasee, suffixes[i]);
1281 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1282 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1285 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1286 but it doesn't look intentional, so I'd rather return something
1287 meaningful or nothing at all. */
1292 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1294 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1295 Lisp_Object new_coding_system;
1297 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1298 return coding_system;
1302 case EOL_AUTODETECT: return coding_system;
1303 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1304 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1305 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1306 default: abort (); return Qnil;
1309 return NILP (new_coding_system) ? coding_system : new_coding_system;
1312 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1313 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1315 (coding_system, eol_type))
1317 coding_system = Fget_coding_system (coding_system);
1319 return subsidiary_coding_system (coding_system,
1320 symbol_to_eol_type (eol_type));
1324 /************************************************************************/
1325 /* Coding system accessors */
1326 /************************************************************************/
1328 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1329 Return the doc string for CODING-SYSTEM.
1333 coding_system = Fget_coding_system (coding_system);
1334 return XCODING_SYSTEM_DOC_STRING (coding_system);
1337 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1338 Return the type of CODING-SYSTEM.
1342 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1345 case CODESYS_AUTODETECT: return Qundecided;
1347 case CODESYS_SHIFT_JIS: return Qshift_jis;
1348 case CODESYS_ISO2022: return Qiso2022;
1349 case CODESYS_BIG5: return Qbig5;
1350 case CODESYS_UCS4: return Qucs4;
1351 case CODESYS_UTF8: return Qutf8;
1352 case CODESYS_CCL: return Qccl;
1354 case CODESYS_NO_CONVERSION: return Qno_conversion;
1356 case CODESYS_INTERNAL: return Qinternal;
1363 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1366 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1368 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1371 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1372 Return initial charset of CODING-SYSTEM designated to GNUM.
1375 (coding_system, gnum))
1377 coding_system = Fget_coding_system (coding_system);
1380 return coding_system_charset (coding_system, XINT (gnum));
1384 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1385 Return the PROP property of CODING-SYSTEM.
1387 (coding_system, prop))
1390 enum coding_system_type type;
1392 coding_system = Fget_coding_system (coding_system);
1393 CHECK_SYMBOL (prop);
1394 type = XCODING_SYSTEM_TYPE (coding_system);
1396 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1397 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1400 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1402 case CODESYS_PROP_ALL_OK:
1405 case CODESYS_PROP_ISO2022:
1406 if (type != CODESYS_ISO2022)
1408 ("Property only valid in ISO2022 coding systems",
1412 case CODESYS_PROP_CCL:
1413 if (type != CODESYS_CCL)
1415 ("Property only valid in CCL coding systems",
1425 signal_simple_error ("Unrecognized property", prop);
1427 if (EQ (prop, Qname))
1428 return XCODING_SYSTEM_NAME (coding_system);
1429 else if (EQ (prop, Qtype))
1430 return Fcoding_system_type (coding_system);
1431 else if (EQ (prop, Qdoc_string))
1432 return XCODING_SYSTEM_DOC_STRING (coding_system);
1433 else if (EQ (prop, Qmnemonic))
1434 return XCODING_SYSTEM_MNEMONIC (coding_system);
1435 else if (EQ (prop, Qeol_type))
1436 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1437 else if (EQ (prop, Qeol_lf))
1438 return XCODING_SYSTEM_EOL_LF (coding_system);
1439 else if (EQ (prop, Qeol_crlf))
1440 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1441 else if (EQ (prop, Qeol_cr))
1442 return XCODING_SYSTEM_EOL_CR (coding_system);
1443 else if (EQ (prop, Qpost_read_conversion))
1444 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1445 else if (EQ (prop, Qpre_write_conversion))
1446 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1449 else if (EQ (prop, Qdisable_composition))
1450 return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1451 else if (EQ (prop, Quse_entity_reference))
1452 return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1454 else if (type == CODESYS_ISO2022)
1456 if (EQ (prop, Qcharset_g0))
1457 return coding_system_charset (coding_system, 0);
1458 else if (EQ (prop, Qcharset_g1))
1459 return coding_system_charset (coding_system, 1);
1460 else if (EQ (prop, Qcharset_g2))
1461 return coding_system_charset (coding_system, 2);
1462 else if (EQ (prop, Qcharset_g3))
1463 return coding_system_charset (coding_system, 3);
1465 #define FORCE_CHARSET(charset_num) \
1466 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1467 (coding_system, charset_num) ? Qt : Qnil)
1469 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1470 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1471 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1472 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1474 #define LISP_BOOLEAN(prop) \
1475 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1477 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1478 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1479 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1480 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1481 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1482 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1483 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1485 else if (EQ (prop, Qinput_charset_conversion))
1487 unparse_charset_conversion_specs
1488 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1489 else if (EQ (prop, Qoutput_charset_conversion))
1491 unparse_charset_conversion_specs
1492 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1496 else if (type == CODESYS_CCL)
1498 if (EQ (prop, Qdecode))
1499 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1500 else if (EQ (prop, Qencode))
1501 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1509 return Qnil; /* not reached */
1513 /************************************************************************/
1514 /* Coding category functions */
1515 /************************************************************************/
1518 decode_coding_category (Lisp_Object symbol)
1522 CHECK_SYMBOL (symbol);
1523 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1524 if (EQ (coding_category_symbol[i], symbol))
1527 signal_simple_error ("Unrecognized coding category", symbol);
1528 return 0; /* not reached */
1531 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1532 Return a list of all recognized coding categories.
1537 Lisp_Object list = Qnil;
1539 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1540 list = Fcons (coding_category_symbol[i], list);
1544 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1545 Change the priority order of the coding categories.
1546 LIST should be list of coding categories, in descending order of
1547 priority. Unspecified coding categories will be lower in priority
1548 than all specified ones, in the same relative order they were in
1553 int category_to_priority[CODING_CATEGORY_LAST];
1557 /* First generate a list that maps coding categories to priorities. */
1559 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1560 category_to_priority[i] = -1;
1562 /* Highest priority comes from the specified list. */
1564 EXTERNAL_LIST_LOOP (rest, list)
1566 int cat = decode_coding_category (XCAR (rest));
1568 if (category_to_priority[cat] >= 0)
1569 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1570 category_to_priority[cat] = i++;
1573 /* Now go through the existing categories by priority to retrieve
1574 the categories not yet specified and preserve their priority
1576 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1578 int cat = fcd->coding_category_by_priority[j];
1579 if (category_to_priority[cat] < 0)
1580 category_to_priority[cat] = i++;
1583 /* Now we need to construct the inverse of the mapping we just
1586 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1587 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1589 /* Phew! That was confusing. */
1593 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1594 Return a list of coding categories in descending order of priority.
1599 Lisp_Object list = Qnil;
1601 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1602 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1607 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1608 Change the coding system associated with a coding category.
1610 (coding_category, coding_system))
1612 int cat = decode_coding_category (coding_category);
1614 coding_system = Fget_coding_system (coding_system);
1615 fcd->coding_category_system[cat] = coding_system;
1619 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1620 Return the coding system associated with a coding category.
1624 int cat = decode_coding_category (coding_category);
1625 Lisp_Object sys = fcd->coding_category_system[cat];
1628 return XCODING_SYSTEM_NAME (sys);
1633 /************************************************************************/
1634 /* Detecting the encoding of data */
1635 /************************************************************************/
1637 struct detection_state
1639 eol_type_t eol_type;
1675 struct iso2022_decoder iso;
1677 int high_byte_count;
1678 unsigned int saw_single_shift:1;
1691 acceptable_control_char_p (int c)
1695 /* Allow and ignore control characters that you might
1696 reasonably see in a text file */
1701 case 8: /* backspace */
1702 case 11: /* vertical tab */
1703 case 12: /* form feed */
1704 case 26: /* MS-DOS C-z junk */
1705 case 31: /* '^_' -- for info */
1713 mask_has_at_most_one_bit_p (int mask)
1715 /* Perhaps the only thing useful you learn from intensive Microsoft
1716 technical interviews */
1717 return (mask & (mask - 1)) == 0;
1721 detect_eol_type (struct detection_state *st, const Extbyte *src,
1726 unsigned char c = *(unsigned char *)src++;
1729 if (st->eol.just_saw_cr)
1731 else if (st->eol.seen_anything)
1734 else if (st->eol.just_saw_cr)
1737 st->eol.just_saw_cr = 1;
1739 st->eol.just_saw_cr = 0;
1740 st->eol.seen_anything = 1;
1743 return EOL_AUTODETECT;
1746 /* Attempt to determine the encoding and EOL type of the given text.
1747 Before calling this function for the first type, you must initialize
1748 st->eol_type as appropriate and initialize st->mask to ~0.
1750 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1753 st->mask holds the determined coding category mask, or ~0 if only
1754 ASCII has been seen so far.
1758 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1759 is present in st->mask
1760 1 == definitive answers are here for both st->eol_type and st->mask
1764 detect_coding_type (struct detection_state *st, const Extbyte *src,
1765 size_t n, int just_do_eol)
1767 if (st->eol_type == EOL_AUTODETECT)
1768 st->eol_type = detect_eol_type (st, src, n);
1771 return st->eol_type != EOL_AUTODETECT;
1773 if (!st->seen_non_ascii)
1775 for (; n; n--, src++)
1777 unsigned char c = *(unsigned char *) src;
1778 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1780 st->seen_non_ascii = 1;
1782 st->shift_jis.mask = ~0;
1786 st->iso2022.mask = ~0;
1796 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1797 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1798 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1799 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1800 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1801 st->big5.mask = detect_coding_big5 (st, src, n);
1802 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1803 st->utf8.mask = detect_coding_utf8 (st, src, n);
1804 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1805 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1808 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1809 | st->utf8.mask | st->ucs4.mask;
1812 int retval = mask_has_at_most_one_bit_p (st->mask);
1813 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1814 return retval && st->eol_type != EOL_AUTODETECT;
1819 coding_system_from_mask (int mask)
1823 /* If the file was entirely or basically ASCII, use the
1824 default value of `buffer-file-coding-system'. */
1825 Lisp_Object retval =
1826 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1829 retval = Ffind_coding_system (retval);
1833 (Qbad_variable, Qwarning,
1834 "Invalid `default-buffer-file-coding-system', set to nil");
1835 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1839 retval = Fget_coding_system (Qraw_text);
1847 mask = postprocess_iso2022_mask (mask);
1849 /* Look through the coding categories by priority and find
1850 the first one that is allowed. */
1851 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1853 cat = fcd->coding_category_by_priority[i];
1854 if ((mask & (1 << cat)) &&
1855 !NILP (fcd->coding_category_system[cat]))
1859 return fcd->coding_category_system[cat];
1861 return Fget_coding_system (Qraw_text);
1865 /* Given a seekable read stream and potential coding system and EOL type
1866 as specified, do any autodetection that is called for. If the
1867 coding system and/or EOL type are not `autodetect', they will be left
1868 alone; but this function will never return an autodetect coding system
1871 This function does not automatically fetch subsidiary coding systems;
1872 that should be unnecessary with the explicit eol-type argument. */
1874 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1877 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1878 eol_type_t *eol_type_in_out)
1880 struct detection_state decst;
1882 if (*eol_type_in_out == EOL_AUTODETECT)
1883 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1886 decst.eol_type = *eol_type_in_out;
1889 /* If autodetection is called for, do it now. */
1890 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1891 || *eol_type_in_out == EOL_AUTODETECT)
1894 Lisp_Object coding_system = Qnil;
1896 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1899 /* Look for initial "-*-"; mode line prefix */
1901 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1906 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1908 Extbyte *local_vars_beg = p + 3;
1909 /* Look for final "-*-"; mode line suffix */
1910 for (p = local_vars_beg,
1911 scan_end = buf + nread - LENGTH ("-*-");
1916 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1918 Extbyte *suffix = p;
1919 /* Look for "coding:" */
1920 for (p = local_vars_beg,
1921 scan_end = suffix - LENGTH ("coding:?");
1924 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1925 && (p == local_vars_beg
1926 || (*(p-1) == ' ' ||
1932 p += LENGTH ("coding:");
1933 while (*p == ' ' || *p == '\t') p++;
1935 /* Get coding system name */
1936 save = *suffix; *suffix = '\0';
1937 /* Characters valid in a MIME charset name (rfc 1521),
1938 and in a Lisp symbol name. */
1939 n = strspn ( (char *) p,
1940 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1941 "abcdefghijklmnopqrstuvwxyz"
1947 save = p[n]; p[n] = '\0';
1949 Ffind_coding_system (intern ((char *) p));
1959 if (NILP (coding_system))
1962 if (detect_coding_type (&decst, buf, nread,
1963 XCODING_SYSTEM_TYPE (*codesys_in_out)
1964 != CODESYS_AUTODETECT))
1966 nread = Lstream_read (stream, buf, sizeof (buf));
1972 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1973 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1976 if (detect_coding_type (&decst, buf, nread, 1))
1978 nread = Lstream_read (stream, buf, sizeof (buf));
1984 *eol_type_in_out = decst.eol_type;
1985 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1987 if (NILP (coding_system))
1988 *codesys_in_out = coding_system_from_mask (decst.mask);
1990 *codesys_in_out = coding_system;
1994 /* If we absolutely can't determine the EOL type, just assume LF. */
1995 if (*eol_type_in_out == EOL_AUTODETECT)
1996 *eol_type_in_out = EOL_LF;
1998 Lstream_rewind (stream);
2001 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2002 Detect coding system of the text in the region between START and END.
2003 Return a list of possible coding systems ordered by priority.
2004 If only ASCII characters are found, return 'undecided or one of
2005 its subsidiary coding systems according to a detected end-of-line
2006 type. Optional arg BUFFER defaults to the current buffer.
2008 (start, end, buffer))
2010 Lisp_Object val = Qnil;
2011 struct buffer *buf = decode_buffer (buffer, 0);
2013 Lisp_Object instream, lb_instream;
2014 Lstream *istr, *lb_istr;
2015 struct detection_state decst;
2016 struct gcpro gcpro1, gcpro2;
2018 get_buffer_range_char (buf, start, end, &b, &e, 0);
2019 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2020 lb_istr = XLSTREAM (lb_instream);
2021 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2022 istr = XLSTREAM (instream);
2023 GCPRO2 (instream, lb_instream);
2025 decst.eol_type = EOL_AUTODETECT;
2029 Extbyte random_buffer[4096];
2030 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2034 if (detect_coding_type (&decst, random_buffer, nread, 0))
2038 if (decst.mask == ~0)
2039 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2047 decst.mask = postprocess_iso2022_mask (decst.mask);
2049 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2051 int sys = fcd->coding_category_by_priority[i];
2052 if (decst.mask & (1 << sys))
2054 Lisp_Object codesys = fcd->coding_category_system[sys];
2055 if (!NILP (codesys))
2056 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2057 val = Fcons (codesys, val);
2061 Lstream_close (istr);
2063 Lstream_delete (istr);
2064 Lstream_delete (lb_istr);
2069 /************************************************************************/
2070 /* Converting to internal Mule format ("decoding") */
2071 /************************************************************************/
2073 /* A decoding stream is a stream used for decoding text (i.e.
2074 converting from some external format to internal format).
2075 The decoding-stream object keeps track of the actual coding
2076 stream, the stream that is at the other end, and data that
2077 needs to be persistent across the lifetime of the stream. */
2079 /* Handle the EOL stuff related to just-read-in character C.
2080 EOL_TYPE is the EOL type of the coding stream.
2081 FLAGS is the current value of FLAGS in the coding stream, and may
2082 be modified by this macro. (The macro only looks at the
2083 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2084 bytes are to be written. You need to also define a local goto
2085 label "label_continue_loop" that is at the end of the main
2086 character-reading loop.
2088 If C is a CR character, then this macro handles it entirely and
2089 jumps to label_continue_loop. Otherwise, this macro does not add
2090 anything to DST, and continues normally. You should continue
2091 processing C normally after this macro. */
2093 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2097 if (eol_type == EOL_CR) \
2098 Dynarr_add (dst, '\n'); \
2099 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2100 Dynarr_add (dst, c); \
2102 flags |= CODING_STATE_CR; \
2103 goto label_continue_loop; \
2105 else if (flags & CODING_STATE_CR) \
2106 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2108 Dynarr_add (dst, '\r'); \
2109 flags &= ~CODING_STATE_CR; \
2113 /* C should be a binary character in the range 0 - 255; convert
2114 to internal format and add to Dynarr DST. */
2117 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2119 if (BYTE_ASCII_P (c)) \
2120 Dynarr_add (dst, c); \
2123 Dynarr_add (dst, (c >> 6) | 0xc0); \
2124 Dynarr_add (dst, (c & 0x3f) | 0x80); \
2128 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2130 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2134 Dynarr_add (dst, c);
2136 else if ( c <= 0x7ff )
2138 Dynarr_add (dst, (c >> 6) | 0xc0);
2139 Dynarr_add (dst, (c & 0x3f) | 0x80);
2141 else if ( c <= 0xffff )
2143 Dynarr_add (dst, (c >> 12) | 0xe0);
2144 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2145 Dynarr_add (dst, (c & 0x3f) | 0x80);
2147 else if ( c <= 0x1fffff )
2149 Dynarr_add (dst, (c >> 18) | 0xf0);
2150 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2151 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2152 Dynarr_add (dst, (c & 0x3f) | 0x80);
2154 else if ( c <= 0x3ffffff )
2156 Dynarr_add (dst, (c >> 24) | 0xf8);
2157 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2158 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2159 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2160 Dynarr_add (dst, (c & 0x3f) | 0x80);
2164 Dynarr_add (dst, (c >> 30) | 0xfc);
2165 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2166 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2167 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2168 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
2169 Dynarr_add (dst, (c & 0x3f) | 0x80);
2173 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2175 if (BYTE_ASCII_P (c)) \
2176 Dynarr_add (dst, c); \
2177 else if (BYTE_C1_P (c)) \
2179 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2180 Dynarr_add (dst, c + 0x20); \
2184 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2185 Dynarr_add (dst, c); \
2190 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2194 DECODE_ADD_BINARY_CHAR (ch, dst); \
2199 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2201 if (flags & CODING_STATE_END) \
2203 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2204 if (flags & CODING_STATE_CR) \
2205 Dynarr_add (dst, '\r'); \
2209 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2211 struct decoding_stream
2213 /* Coding system that governs the conversion. */
2214 Lisp_Coding_System *codesys;
2216 /* Stream that we read the encoded data from or
2217 write the decoded data to. */
2220 /* If we are reading, then we can return only a fixed amount of
2221 data, so if the conversion resulted in too much data, we store it
2222 here for retrieval the next time around. */
2223 unsigned_char_dynarr *runoff;
2225 /* FLAGS holds flags indicating the current state of the decoding.
2226 Some of these flags are dependent on the coding system. */
2229 /* CPOS holds a partially built-up code-point of character. */
2232 /* EOL_TYPE specifies the type of end-of-line conversion that
2233 currently applies. We need to keep this separate from the
2234 EOL type stored in CODESYS because the latter might indicate
2235 automatic EOL-type detection while the former will always
2236 indicate a particular EOL type. */
2237 eol_type_t eol_type;
2239 /* Additional ISO2022 information. We define the structure above
2240 because it's also needed by the detection routines. */
2241 struct iso2022_decoder iso2022;
2243 /* Additional information (the state of the running CCL program)
2244 used by the CCL decoder. */
2245 struct ccl_program ccl;
2247 /* counter for UTF-8 or UCS-4 */
2248 unsigned char counter;
2251 unsigned char er_counter;
2252 unsigned char er_buf[16];
2254 unsigned combined_char_count;
2255 Emchar combined_chars[16];
2256 Lisp_Object combining_table;
2258 struct detection_state decst;
2262 extern Lisp_Object Vcharacter_composition_table;
2265 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2267 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2271 for (i = 0; i < str->combined_char_count; i++)
2272 DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2273 str->combined_char_count = 0;
2274 str->combining_table = Qnil;
2277 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2278 unsigned_char_dynarr* dst);
2280 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2281 Emchar character, unsigned_char_dynarr* dst)
2283 if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2284 DECODE_ADD_UCS_CHAR (character, dst);
2285 else if (!CHAR_TABLEP (str->combining_table))
2288 = get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2292 DECODE_ADD_UCS_CHAR (character, dst);
2295 str->combined_chars[0] = character;
2296 str->combined_char_count = 1;
2297 str->combining_table = ret;
2303 = get_char_id_table (XCHAR_TABLE(str->combining_table),
2308 Emchar char2 = XCHARVAL (ret);
2310 get_char_id_table (XCHAR_TABLE(Vcharacter_composition_table),
2314 DECODE_ADD_UCS_CHAR (char2, dst);
2315 str->combined_char_count = 0;
2316 str->combining_table = Qnil;
2320 str->combined_chars[0] = char2;
2321 str->combined_char_count = 1;
2322 str->combining_table = ret;
2325 else if (CHAR_TABLEP (ret))
2327 str->combined_chars[str->combined_char_count++] = character;
2328 str->combining_table = ret;
2332 COMPOSE_FLUSH_CHARS (str, dst);
2333 DECODE_ADD_UCS_CHAR (character, dst);
2337 #else /* not UTF2000 */
2338 #define COMPOSE_FLUSH_CHARS(str, dst)
2339 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2340 #endif /* UTF2000 */
2342 static ssize_t decoding_reader (Lstream *stream,
2343 unsigned char *data, size_t size);
2344 static ssize_t decoding_writer (Lstream *stream,
2345 const unsigned char *data, size_t size);
2346 static int decoding_rewinder (Lstream *stream);
2347 static int decoding_seekable_p (Lstream *stream);
2348 static int decoding_flusher (Lstream *stream);
2349 static int decoding_closer (Lstream *stream);
2351 static Lisp_Object decoding_marker (Lisp_Object stream);
2353 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2354 sizeof (struct decoding_stream));
2357 decoding_marker (Lisp_Object stream)
2359 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2360 Lisp_Object str_obj;
2362 /* We do not need to mark the coding systems or charsets stored
2363 within the stream because they are stored in a global list
2364 and automatically marked. */
2366 XSETLSTREAM (str_obj, str);
2367 mark_object (str_obj);
2368 if (str->imp->marker)
2369 return (str->imp->marker) (str_obj);
2374 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2375 so we read data from the other end, decode it, and store it into DATA. */
2378 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2380 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2381 unsigned char *orig_data = data;
2383 int error_occurred = 0;
2385 /* We need to interface to mule_decode(), which expects to take some
2386 amount of data and store the result into a Dynarr. We have
2387 mule_decode() store into str->runoff, and take data from there
2390 /* We loop until we have enough data, reading chunks from the other
2391 end and decoding it. */
2394 /* Take data from the runoff if we can. Make sure to take at
2395 most SIZE bytes, and delete the data from the runoff. */
2396 if (Dynarr_length (str->runoff) > 0)
2398 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2399 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2400 Dynarr_delete_many (str->runoff, 0, chunk);
2406 break; /* No more room for data */
2408 if (str->flags & CODING_STATE_END)
2409 /* This means that on the previous iteration, we hit the EOF on
2410 the other end. We loop once more so that mule_decode() can
2411 output any final stuff it may be holding, or any "go back
2412 to a sane state" escape sequences. (This latter makes sense
2413 during encoding.) */
2416 /* Exhausted the runoff, so get some more. DATA has at least
2417 SIZE bytes left of storage in it, so it's OK to read directly
2418 into it. (We'll be overwriting above, after we've decoded it
2419 into the runoff.) */
2420 read_size = Lstream_read (str->other_end, data, size);
2427 /* There might be some more end data produced in the translation.
2428 See the comment above. */
2429 str->flags |= CODING_STATE_END;
2430 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2433 if (data - orig_data == 0)
2434 return error_occurred ? -1 : 0;
2436 return data - orig_data;
2440 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2442 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2445 /* Decode all our data into the runoff, and then attempt to write
2446 it all out to the other end. Remove whatever chunk we succeeded
2448 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2449 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2450 Dynarr_length (str->runoff));
2452 Dynarr_delete_many (str->runoff, 0, retval);
2453 /* Do NOT return retval. The return value indicates how much
2454 of the incoming data was written, not how many bytes were
2460 reset_decoding_stream (struct decoding_stream *str)
2463 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2465 Lisp_Object coding_system;
2466 XSETCODING_SYSTEM (coding_system, str->codesys);
2467 reset_iso2022 (coding_system, &str->iso2022);
2469 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2471 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2476 str->er_counter = 0;
2477 str->combined_char_count = 0;
2478 str->combining_table = Qnil;
2480 str->flags = str->cpos = 0;
2484 decoding_rewinder (Lstream *stream)
2486 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2487 reset_decoding_stream (str);
2488 Dynarr_reset (str->runoff);
2489 return Lstream_rewind (str->other_end);
2493 decoding_seekable_p (Lstream *stream)
2495 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2496 return Lstream_seekable_p (str->other_end);
2500 decoding_flusher (Lstream *stream)
2502 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2503 return Lstream_flush (str->other_end);
2507 decoding_closer (Lstream *stream)
2509 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2510 if (stream->flags & LSTREAM_FL_WRITE)
2512 str->flags |= CODING_STATE_END;
2513 decoding_writer (stream, 0, 0);
2515 Dynarr_free (str->runoff);
2517 #ifdef ENABLE_COMPOSITE_CHARS
2518 if (str->iso2022.composite_chars)
2519 Dynarr_free (str->iso2022.composite_chars);
2522 return Lstream_close (str->other_end);
2526 decoding_stream_coding_system (Lstream *stream)
2528 Lisp_Object coding_system;
2529 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2531 XSETCODING_SYSTEM (coding_system, str->codesys);
2532 return subsidiary_coding_system (coding_system, str->eol_type);
2536 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2538 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2539 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2541 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2542 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2543 reset_decoding_stream (str);
2546 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2547 stream for writing, no automatic code detection will be performed.
2548 The reason for this is that automatic code detection requires a
2549 seekable input. Things will also fail if you open a decoding
2550 stream for reading using a non-fully-specified coding system and
2551 a non-seekable input stream. */
2554 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2557 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2558 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2562 str->other_end = stream;
2563 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2564 str->eol_type = EOL_AUTODETECT;
2565 if (!strcmp (mode, "r")
2566 && Lstream_seekable_p (stream))
2567 /* We can determine the coding system now. */
2568 determine_real_coding_system (stream, &codesys, &str->eol_type);
2569 set_decoding_stream_coding_system (lstr, codesys);
2570 str->decst.eol_type = str->eol_type;
2571 str->decst.mask = ~0;
2572 XSETLSTREAM (obj, lstr);
2577 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2579 return make_decoding_stream_1 (stream, codesys, "r");
2583 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2585 return make_decoding_stream_1 (stream, codesys, "w");
2588 /* Note: the decode_coding_* functions all take the same
2589 arguments as mule_decode(), which is to say some SRC data of
2590 size N, which is to be stored into dynamic array DST.
2591 DECODING is the stream within which the decoding is
2592 taking place, but no data is actually read from or
2593 written to that stream; that is handled in decoding_reader()
2594 or decoding_writer(). This allows the same functions to
2595 be used for both reading and writing. */
2598 mule_decode (Lstream *decoding, const Extbyte *src,
2599 unsigned_char_dynarr *dst, size_t n)
2601 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2603 /* If necessary, do encoding-detection now. We do this when
2604 we're a writing stream or a non-seekable reading stream,
2605 meaning that we can't just process the whole input,
2606 rewind, and start over. */
2608 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2609 str->eol_type == EOL_AUTODETECT)
2611 Lisp_Object codesys;
2613 XSETCODING_SYSTEM (codesys, str->codesys);
2614 detect_coding_type (&str->decst, src, n,
2615 CODING_SYSTEM_TYPE (str->codesys) !=
2616 CODESYS_AUTODETECT);
2617 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2618 str->decst.mask != ~0)
2619 /* #### This is cheesy. What we really ought to do is
2620 buffer up a certain amount of data so as to get a
2621 less random result. */
2622 codesys = coding_system_from_mask (str->decst.mask);
2623 str->eol_type = str->decst.eol_type;
2624 if (XCODING_SYSTEM (codesys) != str->codesys)
2626 /* Preserve the CODING_STATE_END flag in case it was set.
2627 If we erase it, bad things might happen. */
2628 int was_end = str->flags & CODING_STATE_END;
2629 set_decoding_stream_coding_system (decoding, codesys);
2631 str->flags |= CODING_STATE_END;
2635 switch (CODING_SYSTEM_TYPE (str->codesys))
2638 case CODESYS_INTERNAL:
2639 Dynarr_add_many (dst, src, n);
2642 case CODESYS_AUTODETECT:
2643 /* If we got this far and still haven't decided on the coding
2644 system, then do no conversion. */
2645 case CODESYS_NO_CONVERSION:
2646 decode_coding_no_conversion (decoding, src, dst, n);
2649 case CODESYS_SHIFT_JIS:
2650 decode_coding_sjis (decoding, src, dst, n);
2653 decode_coding_big5 (decoding, src, dst, n);
2656 decode_coding_ucs4 (decoding, src, dst, n);
2659 decode_coding_utf8 (decoding, src, dst, n);
2662 str->ccl.last_block = str->flags & CODING_STATE_END;
2663 /* When applying ccl program to stream, MUST NOT set NULL
2665 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2666 dst, n, 0, CCL_MODE_DECODING);
2668 case CODESYS_ISO2022:
2669 decode_coding_iso2022 (decoding, src, dst, n);
2677 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2678 Decode the text between START and END which is encoded in CODING-SYSTEM.
2679 This is useful if you've read in encoded text from a file without decoding
2680 it (e.g. you read in a JIS-formatted file but used the `binary' or
2681 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2682 Return length of decoded text.
2683 BUFFER defaults to the current buffer if unspecified.
2685 (start, end, coding_system, buffer))
2688 struct buffer *buf = decode_buffer (buffer, 0);
2689 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2690 Lstream *istr, *ostr;
2691 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2693 get_buffer_range_char (buf, start, end, &b, &e, 0);
2695 barf_if_buffer_read_only (buf, b, e);
2697 coding_system = Fget_coding_system (coding_system);
2698 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2699 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2700 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2702 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2703 Fget_coding_system (Qbinary));
2704 istr = XLSTREAM (instream);
2705 ostr = XLSTREAM (outstream);
2706 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2708 /* The chain of streams looks like this:
2710 [BUFFER] <----- send through
2711 ------> [ENCODE AS BINARY]
2712 ------> [DECODE AS SPECIFIED]
2718 char tempbuf[1024]; /* some random amount */
2719 Bufpos newpos, even_newer_pos;
2720 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2721 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2725 newpos = lisp_buffer_stream_startpos (istr);
2726 Lstream_write (ostr, tempbuf, size_in_bytes);
2727 even_newer_pos = lisp_buffer_stream_startpos (istr);
2728 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2731 Lstream_close (istr);
2732 Lstream_close (ostr);
2734 Lstream_delete (istr);
2735 Lstream_delete (ostr);
2736 Lstream_delete (XLSTREAM (de_outstream));
2737 Lstream_delete (XLSTREAM (lb_outstream));
2742 /************************************************************************/
2743 /* Converting to an external encoding ("encoding") */
2744 /************************************************************************/
2746 /* An encoding stream is an output stream. When you create the
2747 stream, you specify the coding system that governs the encoding
2748 and another stream that the resulting encoded data is to be
2749 sent to, and then start sending data to it. */
2751 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2753 struct encoding_stream
2755 /* Coding system that governs the conversion. */
2756 Lisp_Coding_System *codesys;
2758 /* Stream that we read the encoded data from or
2759 write the decoded data to. */
2762 /* If we are reading, then we can return only a fixed amount of
2763 data, so if the conversion resulted in too much data, we store it
2764 here for retrieval the next time around. */
2765 unsigned_char_dynarr *runoff;
2767 /* FLAGS holds flags indicating the current state of the encoding.
2768 Some of these flags are dependent on the coding system. */
2771 /* CH holds a partially built-up character. Since we only deal
2772 with one- and two-byte characters at the moment, we only use
2773 this to store the first byte of a two-byte character. */
2776 /* Additional information used by the ISO2022 encoder. */
2779 /* CHARSET holds the character sets currently assigned to the G0
2780 through G3 registers. It is initialized from the array
2781 INITIAL_CHARSET in CODESYS. */
2782 Lisp_Object charset[4];
2784 /* Which registers are currently invoked into the left (GL) and
2785 right (GR) halves of the 8-bit encoding space? */
2786 int register_left, register_right;
2788 /* Whether we need to explicitly designate the charset in the
2789 G? register before using it. It is initialized from the
2790 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2791 unsigned char force_charset_on_output[4];
2793 /* Other state variables that need to be preserved across
2795 Lisp_Object current_charset;
2797 int current_char_boundary;
2800 void (*encode_char) (struct encoding_stream *str, Emchar c,
2801 unsigned_char_dynarr *dst, unsigned int *flags);
2802 void (*finish) (struct encoding_stream *str,
2803 unsigned_char_dynarr *dst, unsigned int *flags);
2805 /* Additional information (the state of the running CCL program)
2806 used by the CCL encoder. */
2807 struct ccl_program ccl;
2811 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2812 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2814 static int encoding_rewinder (Lstream *stream);
2815 static int encoding_seekable_p (Lstream *stream);
2816 static int encoding_flusher (Lstream *stream);
2817 static int encoding_closer (Lstream *stream);
2819 static Lisp_Object encoding_marker (Lisp_Object stream);
2821 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2822 sizeof (struct encoding_stream));
2825 encoding_marker (Lisp_Object stream)
2827 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2828 Lisp_Object str_obj;
2830 /* We do not need to mark the coding systems or charsets stored
2831 within the stream because they are stored in a global list
2832 and automatically marked. */
2834 XSETLSTREAM (str_obj, str);
2835 mark_object (str_obj);
2836 if (str->imp->marker)
2837 return (str->imp->marker) (str_obj);
2842 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2843 so we read data from the other end, encode it, and store it into DATA. */
2846 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2848 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2849 unsigned char *orig_data = data;
2851 int error_occurred = 0;
2853 /* We need to interface to mule_encode(), which expects to take some
2854 amount of data and store the result into a Dynarr. We have
2855 mule_encode() store into str->runoff, and take data from there
2858 /* We loop until we have enough data, reading chunks from the other
2859 end and encoding it. */
2862 /* Take data from the runoff if we can. Make sure to take at
2863 most SIZE bytes, and delete the data from the runoff. */
2864 if (Dynarr_length (str->runoff) > 0)
2866 int chunk = min ((int) size, Dynarr_length (str->runoff));
2867 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2868 Dynarr_delete_many (str->runoff, 0, chunk);
2874 break; /* No more room for data */
2876 if (str->flags & CODING_STATE_END)
2877 /* This means that on the previous iteration, we hit the EOF on
2878 the other end. We loop once more so that mule_encode() can
2879 output any final stuff it may be holding, or any "go back
2880 to a sane state" escape sequences. (This latter makes sense
2881 during encoding.) */
2884 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2885 left of storage in it, so it's OK to read directly into it.
2886 (We'll be overwriting above, after we've encoded it into the
2888 read_size = Lstream_read (str->other_end, data, size);
2895 /* There might be some more end data produced in the translation.
2896 See the comment above. */
2897 str->flags |= CODING_STATE_END;
2898 mule_encode (stream, data, str->runoff, read_size);
2901 if (data == orig_data)
2902 return error_occurred ? -1 : 0;
2904 return data - orig_data;
2908 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2910 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2913 /* Encode all our data into the runoff, and then attempt to write
2914 it all out to the other end. Remove whatever chunk we succeeded
2916 mule_encode (stream, data, str->runoff, size);
2917 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2918 Dynarr_length (str->runoff));
2920 Dynarr_delete_many (str->runoff, 0, retval);
2921 /* Do NOT return retval. The return value indicates how much
2922 of the incoming data was written, not how many bytes were
2928 reset_encoding_stream (struct encoding_stream *str)
2931 switch (CODING_SYSTEM_TYPE (str->codesys))
2933 case CODESYS_ISO2022:
2937 str->encode_char = &char_encode_iso2022;
2938 str->finish = &char_finish_iso2022;
2939 for (i = 0; i < 4; i++)
2941 str->iso2022.charset[i] =
2942 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2943 str->iso2022.force_charset_on_output[i] =
2944 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2946 str->iso2022.register_left = 0;
2947 str->iso2022.register_right = 1;
2948 str->iso2022.current_charset = Qnil;
2949 str->iso2022.current_half = 0;
2953 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2956 str->encode_char = &char_encode_utf8;
2957 str->finish = &char_finish_utf8;
2960 str->encode_char = &char_encode_ucs4;
2961 str->finish = &char_finish_ucs4;
2963 case CODESYS_SHIFT_JIS:
2964 str->encode_char = &char_encode_shift_jis;
2965 str->finish = &char_finish_shift_jis;
2968 str->encode_char = &char_encode_big5;
2969 str->finish = &char_finish_big5;
2975 str->iso2022.current_char_boundary = 0;
2976 str->flags = str->ch = 0;
2980 encoding_rewinder (Lstream *stream)
2982 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2983 reset_encoding_stream (str);
2984 Dynarr_reset (str->runoff);
2985 return Lstream_rewind (str->other_end);
2989 encoding_seekable_p (Lstream *stream)
2991 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2992 return Lstream_seekable_p (str->other_end);
2996 encoding_flusher (Lstream *stream)
2998 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2999 return Lstream_flush (str->other_end);
3003 encoding_closer (Lstream *stream)
3005 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3006 if (stream->flags & LSTREAM_FL_WRITE)
3008 str->flags |= CODING_STATE_END;
3009 encoding_writer (stream, 0, 0);
3011 Dynarr_free (str->runoff);
3012 return Lstream_close (str->other_end);
3016 encoding_stream_coding_system (Lstream *stream)
3018 Lisp_Object coding_system;
3019 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3021 XSETCODING_SYSTEM (coding_system, str->codesys);
3022 return coding_system;
3026 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3028 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3029 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3031 reset_encoding_stream (str);
3035 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3038 Lstream *lstr = Lstream_new (lstream_encoding, mode);
3039 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3043 str->runoff = Dynarr_new (unsigned_char);
3044 str->other_end = stream;
3045 set_encoding_stream_coding_system (lstr, codesys);
3046 XSETLSTREAM (obj, lstr);
3051 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3053 return make_encoding_stream_1 (stream, codesys, "r");
3057 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3059 return make_encoding_stream_1 (stream, codesys, "w");
3062 /* Convert N bytes of internally-formatted data stored in SRC to an
3063 external format, according to the encoding stream ENCODING.
3064 Store the encoded data into DST. */
3067 mule_encode (Lstream *encoding, const Bufbyte *src,
3068 unsigned_char_dynarr *dst, size_t n)
3070 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3072 switch (CODING_SYSTEM_TYPE (str->codesys))
3075 case CODESYS_INTERNAL:
3076 Dynarr_add_many (dst, src, n);
3079 case CODESYS_AUTODETECT:
3080 /* If we got this far and still haven't decided on the coding
3081 system, then do no conversion. */
3082 case CODESYS_NO_CONVERSION:
3083 encode_coding_no_conversion (encoding, src, dst, n);
3087 str->ccl.last_block = str->flags & CODING_STATE_END;
3088 /* When applying ccl program to stream, MUST NOT set NULL
3090 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3091 dst, n, 0, CCL_MODE_ENCODING);
3095 text_encode_generic (encoding, src, dst, n);
3099 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3100 Encode the text between START and END using CODING-SYSTEM.
3101 This will, for example, convert Japanese characters into stuff such as
3102 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
3103 text. BUFFER defaults to the current buffer if unspecified.
3105 (start, end, coding_system, buffer))
3108 struct buffer *buf = decode_buffer (buffer, 0);
3109 Lisp_Object instream, lb_outstream, de_outstream, outstream;
3110 Lstream *istr, *ostr;
3111 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3113 get_buffer_range_char (buf, start, end, &b, &e, 0);
3115 barf_if_buffer_read_only (buf, b, e);
3117 coding_system = Fget_coding_system (coding_system);
3118 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
3119 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3120 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3121 Fget_coding_system (Qbinary));
3122 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3124 istr = XLSTREAM (instream);
3125 ostr = XLSTREAM (outstream);
3126 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3127 /* The chain of streams looks like this:
3129 [BUFFER] <----- send through
3130 ------> [ENCODE AS SPECIFIED]
3131 ------> [DECODE AS BINARY]
3136 char tempbuf[1024]; /* some random amount */
3137 Bufpos newpos, even_newer_pos;
3138 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3139 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3143 newpos = lisp_buffer_stream_startpos (istr);
3144 Lstream_write (ostr, tempbuf, size_in_bytes);
3145 even_newer_pos = lisp_buffer_stream_startpos (istr);
3146 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3152 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3153 Lstream_close (istr);
3154 Lstream_close (ostr);
3156 Lstream_delete (istr);
3157 Lstream_delete (ostr);
3158 Lstream_delete (XLSTREAM (de_outstream));
3159 Lstream_delete (XLSTREAM (lb_outstream));
3160 return make_int (retlen);
3167 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3168 unsigned_char_dynarr *dst, size_t n)
3171 unsigned char char_boundary;
3172 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3173 unsigned int flags = str->flags;
3174 Emchar ch = str->ch;
3176 char_boundary = str->iso2022.current_char_boundary;
3182 if (char_boundary == 0)
3210 (*str->encode_char) (str, c, dst, &flags);
3212 else if (char_boundary == 1)
3214 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3220 ch = (ch << 6) | (c & 0x3f);
3225 if ((char_boundary == 0) && (flags & CODING_STATE_END))
3227 (*str->finish) (str, dst, &flags);
3232 str->iso2022.current_char_boundary = char_boundary;
3236 /************************************************************************/
3237 /* Shift-JIS methods */
3238 /************************************************************************/
3240 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3241 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3242 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3243 encoded by "position-code + 0x80". A character of JISX0208
3244 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3245 position-codes are divided and shifted so that it fit in the range
3248 --- CODE RANGE of Shift-JIS ---
3249 (character set) (range)
3251 JISX0201-Kana 0xA0 .. 0xDF
3252 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
3253 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3254 -------------------------------
3258 /* Is this the first byte of a Shift-JIS two-byte char? */
3260 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3261 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3263 /* Is this the second byte of a Shift-JIS two-byte char? */
3265 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3266 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3268 #define BYTE_SJIS_KATAKANA_P(c) \
3269 ((c) >= 0xA1 && (c) <= 0xDF)
3272 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3276 unsigned char c = *(unsigned char *)src++;
3277 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3279 if (st->shift_jis.in_second_byte)
3281 st->shift_jis.in_second_byte = 0;
3285 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3286 st->shift_jis.in_second_byte = 1;
3288 return CODING_CATEGORY_SHIFT_JIS_MASK;
3291 /* Convert Shift-JIS data to internal format. */
3294 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3295 unsigned_char_dynarr *dst, size_t n)
3297 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3298 unsigned int flags = str->flags;
3299 unsigned int cpos = str->cpos;
3300 eol_type_t eol_type = str->eol_type;
3304 unsigned char c = *(unsigned char *)src++;
3308 /* Previous character was first byte of Shift-JIS Kanji char. */
3309 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3311 unsigned char e1, e2;
3313 DECODE_SJIS (cpos, c, e1, e2);
3315 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3319 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3320 Dynarr_add (dst, e1);
3321 Dynarr_add (dst, e2);
3326 DECODE_ADD_BINARY_CHAR (cpos, dst);
3327 DECODE_ADD_BINARY_CHAR (c, dst);
3333 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3334 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3336 else if (BYTE_SJIS_KATAKANA_P (c))
3339 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3342 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3343 Dynarr_add (dst, c);
3348 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3352 DECODE_ADD_BINARY_CHAR (c, dst);
3354 label_continue_loop:;
3357 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3363 /* Convert internal character representation to Shift_JIS. */
3366 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3367 unsigned_char_dynarr *dst, unsigned int *flags)
3369 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3373 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3374 Dynarr_add (dst, '\r');
3375 if (eol_type != EOL_CR)
3376 Dynarr_add (dst, ch);
3380 unsigned int s1, s2;
3382 int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3384 if (code_point >= 0)
3385 Dynarr_add (dst, code_point);
3386 else if ((code_point
3387 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3390 ENCODE_SJIS ((code_point >> 8) | 0x80,
3391 (code_point & 0xFF) | 0x80, s1, s2);
3392 Dynarr_add (dst, s1);
3393 Dynarr_add (dst, s2);
3395 else if ((code_point
3396 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3398 Dynarr_add (dst, code_point | 0x80);
3399 else if ((code_point
3400 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3403 ENCODE_SJIS ((code_point >> 8) | 0x80,
3404 (code_point & 0xFF) | 0x80, s1, s2);
3405 Dynarr_add (dst, s1);
3406 Dynarr_add (dst, s2);
3408 else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3410 Dynarr_add (dst, code_point);
3412 Dynarr_add (dst, '?');
3414 Lisp_Object charset;
3415 unsigned int c1, c2;
3417 BREAKUP_CHAR (ch, charset, c1, c2);
3419 if (EQ(charset, Vcharset_katakana_jisx0201))
3421 Dynarr_add (dst, c1 | 0x80);
3425 Dynarr_add (dst, c1);
3427 else if (EQ(charset, Vcharset_japanese_jisx0208))
3429 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3430 Dynarr_add (dst, s1);
3431 Dynarr_add (dst, s2);
3434 Dynarr_add (dst, '?');
3440 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3441 unsigned int *flags)
3445 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3446 Decode a JISX0208 character of Shift-JIS coding-system.
3447 CODE is the character code in Shift-JIS as a cons of type bytes.
3448 Return the corresponding character.
3452 unsigned char c1, c2, s1, s2;
3455 CHECK_INT (XCAR (code));
3456 CHECK_INT (XCDR (code));
3457 s1 = XINT (XCAR (code));
3458 s2 = XINT (XCDR (code));
3459 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3460 BYTE_SJIS_TWO_BYTE_2_P (s2))
3462 DECODE_SJIS (s1, s2, c1, c2);
3463 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3464 c1 & 0x7F, c2 & 0x7F));
3470 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3471 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3472 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3476 Lisp_Object charset;
3479 CHECK_CHAR_COERCE_INT (character);
3480 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3481 if (EQ (charset, Vcharset_japanese_jisx0208))
3483 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3484 return Fcons (make_int (s1), make_int (s2));
3491 /************************************************************************/
3493 /************************************************************************/
3495 /* BIG5 is a coding system encoding two character sets: ASCII and
3496 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3497 character set and is encoded in two-byte.
3499 --- CODE RANGE of BIG5 ---
3500 (character set) (range)
3502 Big5 (1st byte) 0xA1 .. 0xFE
3503 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3504 --------------------------
3506 Since the number of characters in Big5 is larger than maximum
3507 characters in Emacs' charset (96x96), it can't be handled as one
3508 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3509 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3510 contains frequently used characters and the latter contains less
3511 frequently used characters. */
3514 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3515 ((c) >= 0x81 && (c) <= 0xFE)
3517 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3518 ((c) >= 0xA1 && (c) <= 0xFE)
3521 /* Is this the second byte of a Shift-JIS two-byte char? */
3523 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3524 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3526 /* Number of Big5 characters which have the same code in 1st byte. */
3528 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3530 /* Code conversion macros. These are macros because they are used in
3531 inner loops during code conversion.
3533 Note that temporary variables in macros introduce the classic
3534 dynamic-scoping problems with variable names. We use capital-
3535 lettered variables in the assumption that XEmacs does not use
3536 capital letters in variables except in a very formalized way
3539 /* Convert Big5 code (b1, b2) into its internal string representation
3542 /* There is a much simpler way to split the Big5 charset into two.
3543 For the moment I'm going to leave the algorithm as-is because it
3544 claims to separate out the most-used characters into a single
3545 charset, which perhaps will lead to optimizations in various
3548 The way the algorithm works is something like this:
3550 Big5 can be viewed as a 94x157 charset, where the row is
3551 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3552 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3553 the split between low and high column numbers is apparently
3554 meaningless; ascending rows produce less and less frequent chars.
3555 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3556 the first charset, and the upper half (0xC9 .. 0xFE) to the
3557 second. To do the conversion, we convert the character into
3558 a single number where 0 .. 156 is the first row, 157 .. 313
3559 is the second, etc. That way, the characters are ordered by
3560 decreasing frequency. Then we just chop the space in two
3561 and coerce the result into a 94x94 space.
3564 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3566 int B1 = b1, B2 = b2; \
3568 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3572 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3576 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3577 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3579 c1 = I / (0xFF - 0xA1) + 0xA1; \
3580 c2 = I % (0xFF - 0xA1) + 0xA1; \
3583 /* Convert the internal string representation of a Big5 character
3584 (lb, c1, c2) into Big5 code (b1, b2). */
3586 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3588 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3590 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3592 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3594 b1 = I / BIG5_SAME_ROW + 0xA1; \
3595 b2 = I % BIG5_SAME_ROW; \
3596 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3600 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3604 unsigned char c = *(unsigned char *)src++;
3605 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3607 || (c >= 0x80 && c <= 0xA0)
3611 if (st->big5.in_second_byte)
3613 st->big5.in_second_byte = 0;
3614 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3624 st->big5.in_second_byte = 1;
3626 return CODING_CATEGORY_BIG5_MASK;
3629 /* Convert Big5 data to internal format. */
3632 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3633 unsigned_char_dynarr *dst, size_t n)
3635 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3636 unsigned int flags = str->flags;
3637 unsigned int cpos = str->cpos;
3638 eol_type_t eol_type = str->eol_type;
3641 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3642 (decoding)->codesys, 1);
3647 unsigned char c = *(unsigned char *)src++;
3650 /* Previous character was first byte of Big5 char. */
3651 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3654 int code_point = (cpos << 8) | c;
3655 Emchar char_id = DECODE_DEFINED_CHAR (ccs, code_point);
3658 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3659 DECODE_ADD_UCS_CHAR (char_id, dst);
3661 unsigned char b1, b2, b3;
3662 DECODE_BIG5 (cpos, c, b1, b2, b3);
3663 Dynarr_add (dst, b1);
3664 Dynarr_add (dst, b2);
3665 Dynarr_add (dst, b3);
3670 DECODE_ADD_BINARY_CHAR (cpos, dst);
3671 DECODE_ADD_BINARY_CHAR (c, dst);
3677 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3678 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3681 DECODE_ADD_BINARY_CHAR (c, dst);
3683 label_continue_loop:;
3686 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3692 /* Convert internally-formatted data to Big5. */
3695 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3696 unsigned_char_dynarr *dst, unsigned int *flags)
3698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3702 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3703 Dynarr_add (dst, '\r');
3704 if (eol_type != EOL_CR)
3705 Dynarr_add (dst, ch);
3712 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3714 if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3715 Dynarr_add (dst, code_point);
3716 else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3718 Dynarr_add (dst, code_point >> 8);
3719 Dynarr_add (dst, code_point & 0xFF);
3721 else if ((code_point
3722 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3724 Dynarr_add (dst, code_point >> 8);
3725 Dynarr_add (dst, code_point & 0xFF);
3727 else if ((code_point
3728 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3731 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3732 + ((code_point & 0xFF) - 33);
3733 unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3734 unsigned char b2 = I % BIG5_SAME_ROW;
3736 b2 += b2 < 0x3F ? 0x40 : 0x62;
3737 Dynarr_add (dst, b1);
3738 Dynarr_add (dst, b2);
3740 else if ((code_point
3741 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3744 = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3745 + ((code_point & 0xFF) - 33);
3746 unsigned char b1, b2;
3748 I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3749 b1 = I / BIG5_SAME_ROW + 0xA1;
3750 b2 = I % BIG5_SAME_ROW;
3751 b2 += b2 < 0x3F ? 0x40 : 0x62;
3752 Dynarr_add (dst, b1);
3753 Dynarr_add (dst, b2);
3756 Dynarr_add (dst, '?');
3763 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3764 unsigned int *flags)
3769 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3770 Decode a Big5 character CODE of BIG5 coding-system.
3771 CODE is the character code in BIG5, a cons of two integers.
3772 Return the corresponding character.
3776 unsigned char c1, c2, b1, b2;
3779 CHECK_INT (XCAR (code));
3780 CHECK_INT (XCDR (code));
3781 b1 = XINT (XCAR (code));
3782 b2 = XINT (XCDR (code));
3783 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3784 BYTE_BIG5_TWO_BYTE_2_P (b2))
3786 Charset_ID leading_byte;
3787 Lisp_Object charset;
3788 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3789 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3790 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3796 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3797 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3798 Return the corresponding character code in Big5.
3802 Lisp_Object charset;
3805 CHECK_CHAR_COERCE_INT (character);
3806 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3807 if (EQ (charset, Vcharset_chinese_big5_1) ||
3808 EQ (charset, Vcharset_chinese_big5_2))
3810 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3812 return Fcons (make_int (b1), make_int (b2));
3819 /************************************************************************/
3821 /************************************************************************/
3824 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3828 unsigned char c = *(unsigned char *)src++;
3829 switch (st->ucs4.in_byte)
3838 st->ucs4.in_byte = 0;
3844 return CODING_CATEGORY_UCS4_MASK;
3848 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3849 unsigned_char_dynarr *dst, size_t n)
3851 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3852 unsigned int flags = str->flags;
3853 unsigned int cpos = str->cpos;
3854 unsigned char counter = str->counter;
3858 unsigned char c = *(unsigned char *)src++;
3866 DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3871 cpos = ( cpos << 8 ) | c;
3875 if (counter & CODING_STATE_END)
3876 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3880 str->counter = counter;
3884 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3885 unsigned_char_dynarr *dst, unsigned int *flags)
3887 Dynarr_add (dst, ch >> 24);
3888 Dynarr_add (dst, ch >> 16);
3889 Dynarr_add (dst, ch >> 8);
3890 Dynarr_add (dst, ch );
3894 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3895 unsigned int *flags)
3900 /************************************************************************/
3902 /************************************************************************/
3905 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3909 unsigned char c = *(unsigned char *)src++;
3910 switch (st->utf8.in_byte)
3913 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3916 st->utf8.in_byte = 5;
3918 st->utf8.in_byte = 4;
3920 st->utf8.in_byte = 3;
3922 st->utf8.in_byte = 2;
3924 st->utf8.in_byte = 1;
3929 if ((c & 0xc0) != 0x80)
3935 return CODING_CATEGORY_UTF8_MASK;
3939 decode_output_utf8_partial_char (unsigned char counter,
3941 unsigned_char_dynarr *dst)
3944 DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3945 else if (counter == 4)
3947 if (cpos < (1 << 6))
3948 DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3951 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3952 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3955 else if (counter == 3)
3957 if (cpos < (1 << 6))
3958 DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3959 else if (cpos < (1 << 12))
3961 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3962 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3966 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3967 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3968 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3971 else if (counter == 2)
3973 if (cpos < (1 << 6))
3974 DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3975 else if (cpos < (1 << 12))
3977 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3978 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3980 else if (cpos < (1 << 18))
3982 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3983 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3984 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3988 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3989 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3990 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3991 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
3996 if (cpos < (1 << 6))
3997 DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3998 else if (cpos < (1 << 12))
4000 DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4001 DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4003 else if (cpos < (1 << 18))
4005 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4006 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4007 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4009 else if (cpos < (1 << 24))
4011 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4012 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4013 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4014 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4018 DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4019 DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4020 DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4021 DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4022 DECODE_ADD_BINARY_CHAR ( ( (cpos &0x3F)|0x80), dst);
4028 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4029 unsigned_char_dynarr *dst, size_t n)
4031 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4032 unsigned int flags = str->flags;
4033 unsigned int cpos = str->cpos;
4034 eol_type_t eol_type = str->eol_type;
4035 unsigned char counter = str->counter;
4036 unsigned char er_counter = str->er_counter;
4040 unsigned char c = *(unsigned char *)src++;
4045 if (!CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4047 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4048 DECODE_ADD_UCS_CHAR (c, dst);
4050 else if (er_counter == 0)
4054 str->er_buf[0] = '&';
4059 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4060 DECODE_ADD_UCS_CHAR (c, dst);
4065 Lisp_Object string = make_string (str->er_buf, er_counter);
4066 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4073 while (!NILP (rest))
4077 if (NILP (ccs = Ffind_charset (ccs)))
4092 pat = concat3 (build_string ("^&"),
4093 pat, build_string ("\\([0-9]+\\)$"));
4096 else if (EQ (ret, Qx))
4098 pat = concat3 (build_string ("^&"),
4100 build_string ("\\([0-9a-f]+\\)$"));
4103 else if (EQ (ret, QX))
4105 pat = concat3 (build_string ("^&"),
4107 build_string ("\\([0-9A-F]+\\)$"));
4113 if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
4116 = XINT (Fstring_to_number
4117 (Fsubstring (string,
4120 Fmatch_end (make_int (1))),
4123 DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
4128 if (!NILP (Fstring_match
4129 (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
4130 string, Qnil, Qnil)))
4133 = XINT (Fstring_to_number
4134 (Fsubstring (string,
4137 Fmatch_end (make_int (1))),
4140 DECODE_ADD_UCS_CHAR (code, dst);
4144 Dynarr_add_many (dst, str->er_buf, er_counter);
4145 Dynarr_add (dst, ';');
4150 else if ( (er_counter >= 16) || (c <= ' ') || (c >= 0x7F) )
4152 Dynarr_add_many (dst, str->er_buf, er_counter);
4154 DECODE_ADD_UCS_CHAR (c, dst);
4157 str->er_buf[er_counter++] = c;
4161 Dynarr_add_many (dst, str->er_buf, er_counter);
4168 else if ( c < 0xF0 )
4173 else if ( c < 0xF8 )
4178 else if ( c < 0xFC )
4190 else if ( (c & 0xC0) == 0x80 )
4192 cpos = ( cpos << 6 ) | ( c & 0x3f );
4195 DECODE_ADD_UCS_CHAR (cpos, dst);
4204 decode_output_utf8_partial_char (counter, cpos, dst);
4205 DECODE_ADD_BINARY_CHAR (c, dst);
4209 label_continue_loop:;
4212 if (flags & CODING_STATE_END)
4216 decode_output_utf8_partial_char (counter, cpos, dst);
4220 else if ( er_counter > 0)
4222 Dynarr_add_many (dst, str->er_buf, er_counter);
4228 str->counter = counter;
4229 str->er_counter = er_counter;
4233 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4234 unsigned_char_dynarr *dst, unsigned int *flags)
4236 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4240 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4241 Dynarr_add (dst, '\r');
4242 if (eol_type != EOL_CR)
4243 Dynarr_add (dst, ch);
4245 else if (ch <= 0x7f)
4247 Dynarr_add (dst, ch);
4251 int code_point = charset_code_point (Vcharset_ucs, ch);
4253 if ( (code_point < 0) || (code_point > 0x10FFFF) )
4255 if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4257 Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4261 int format_columns, idx;
4262 char buf[16], format[16];
4264 while (!NILP (rest))
4268 if (!NILP (ccs = Ffind_charset (ccs)))
4271 = charset_code_point (ccs, ch)) >= 0 )
4276 && ((idx =XSTRING_LENGTH (ret)) <= 6))
4278 strncpy (format, XSTRING_DATA (ret), idx);
4287 format [idx++] = '%';
4288 format_columns = XINT (ret);
4289 if ( (2 <= format_columns)
4290 && (format_columns <= 8) )
4292 format [idx++] = '0';
4293 format [idx++] = '0' + format_columns;
4300 format [idx++] = 'd';
4301 else if (EQ (ret, Qx))
4302 format [idx++] = 'x';
4303 else if (EQ (ret, QX))
4304 format [idx++] = 'X';
4309 sprintf (buf, format, code_point);
4310 Dynarr_add (dst, '&');
4311 Dynarr_add_many (dst, buf, strlen (buf));
4312 Dynarr_add (dst, ';');
4318 sprintf (buf, "&MCS-%08X;", ch);
4319 Dynarr_add_many (dst, buf, strlen (buf));
4325 if (code_point <= 0x7ff)
4327 Dynarr_add (dst, (code_point >> 6) | 0xc0);
4328 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4330 else if (code_point <= 0xffff)
4332 Dynarr_add (dst, (code_point >> 12) | 0xe0);
4333 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4334 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4336 else if (code_point <= 0x1fffff)
4338 Dynarr_add (dst, (code_point >> 18) | 0xf0);
4339 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4340 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4341 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4343 else if (code_point <= 0x3ffffff)
4345 Dynarr_add (dst, (code_point >> 24) | 0xf8);
4346 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4347 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4348 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4349 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4353 Dynarr_add (dst, (code_point >> 30) | 0xfc);
4354 Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4355 Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4356 Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4357 Dynarr_add (dst, ((code_point >> 6) & 0x3f) | 0x80);
4358 Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4364 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4365 unsigned int *flags)
4370 /************************************************************************/
4371 /* ISO2022 methods */
4372 /************************************************************************/
4374 /* The following note describes the coding system ISO2022 briefly.
4375 Since the intention of this note is to help understand the
4376 functions in this file, some parts are NOT ACCURATE or OVERLY
4377 SIMPLIFIED. For thorough understanding, please refer to the
4378 original document of ISO2022.
4380 ISO2022 provides many mechanisms to encode several character sets
4381 in 7-bit and 8-bit environments. For 7-bit environments, all text
4382 is encoded using bytes less than 128. This may make the encoded
4383 text a little bit longer, but the text passes more easily through
4384 several gateways, some of which strip off MSB (Most Signigant Bit).
4386 There are two kinds of character sets: control character set and
4387 graphic character set. The former contains control characters such
4388 as `newline' and `escape' to provide control functions (control
4389 functions are also provided by escape sequences). The latter
4390 contains graphic characters such as 'A' and '-'. Emacs recognizes
4391 two control character sets and many graphic character sets.
4393 Graphic character sets are classified into one of the following
4394 four classes, according to the number of bytes (DIMENSION) and
4395 number of characters in one dimension (CHARS) of the set:
4396 - DIMENSION1_CHARS94
4397 - DIMENSION1_CHARS96
4398 - DIMENSION2_CHARS94
4399 - DIMENSION2_CHARS96
4401 In addition, each character set is assigned an identification tag,
4402 unique for each set, called "final character" (denoted as <F>
4403 hereafter). The <F> of each character set is decided by ECMA(*)
4404 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4405 (0x30..0x3F are for private use only).
4407 Note (*): ECMA = European Computer Manufacturers Association
4409 Here are examples of graphic character set [NAME(<F>)]:
4410 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4411 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4412 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4413 o DIMENSION2_CHARS96 -- none for the moment
4415 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4416 C0 [0x00..0x1F] -- control character plane 0
4417 GL [0x20..0x7F] -- graphic character plane 0
4418 C1 [0x80..0x9F] -- control character plane 1
4419 GR [0xA0..0xFF] -- graphic character plane 1
4421 A control character set is directly designated and invoked to C0 or
4422 C1 by an escape sequence. The most common case is that:
4423 - ISO646's control character set is designated/invoked to C0, and
4424 - ISO6429's control character set is designated/invoked to C1,
4425 and usually these designations/invocations are omitted in encoded
4426 text. In a 7-bit environment, only C0 can be used, and a control
4427 character for C1 is encoded by an appropriate escape sequence to
4428 fit into the environment. All control characters for C1 are
4429 defined to have corresponding escape sequences.
4431 A graphic character set is at first designated to one of four
4432 graphic registers (G0 through G3), then these graphic registers are
4433 invoked to GL or GR. These designations and invocations can be
4434 done independently. The most common case is that G0 is invoked to
4435 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4436 these invocations and designations are omitted in encoded text.
4437 In a 7-bit environment, only GL can be used.
4439 When a graphic character set of CHARS94 is invoked to GL, codes
4440 0x20 and 0x7F of the GL area work as control characters SPACE and
4441 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4444 There are two ways of invocation: locking-shift and single-shift.
4445 With locking-shift, the invocation lasts until the next different
4446 invocation, whereas with single-shift, the invocation affects the
4447 following character only and doesn't affect the locking-shift
4448 state. Invocations are done by the following control characters or
4451 ----------------------------------------------------------------------
4452 abbrev function cntrl escape seq description
4453 ----------------------------------------------------------------------
4454 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4455 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4456 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4457 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4458 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4459 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4460 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4461 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4462 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4463 ----------------------------------------------------------------------
4464 (*) These are not used by any known coding system.
4466 Control characters for these functions are defined by macros
4467 ISO_CODE_XXX in `coding.h'.
4469 Designations are done by the following escape sequences:
4470 ----------------------------------------------------------------------
4471 escape sequence description
4472 ----------------------------------------------------------------------
4473 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4474 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4475 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4476 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4477 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4478 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4479 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4480 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4481 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4482 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4483 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4484 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4485 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4486 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4487 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4488 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4489 ----------------------------------------------------------------------
4491 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4492 of dimension 1, chars 94, and final character <F>, etc...
4494 Note (*): Although these designations are not allowed in ISO2022,
4495 Emacs accepts them on decoding, and produces them on encoding
4496 CHARS96 character sets in a coding system which is characterized as
4497 7-bit environment, non-locking-shift, and non-single-shift.
4499 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4500 '(' can be omitted. We refer to this as "short-form" hereafter.
4502 Now you may notice that there are a lot of ways for encoding the
4503 same multilingual text in ISO2022. Actually, there exist many
4504 coding systems such as Compound Text (used in X11's inter client
4505 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4506 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4507 localized platforms), and all of these are variants of ISO2022.
4509 In addition to the above, Emacs handles two more kinds of escape
4510 sequences: ISO6429's direction specification and Emacs' private
4511 sequence for specifying character composition.
4513 ISO6429's direction specification takes the following form:
4514 o CSI ']' -- end of the current direction
4515 o CSI '0' ']' -- end of the current direction
4516 o CSI '1' ']' -- start of left-to-right text
4517 o CSI '2' ']' -- start of right-to-left text
4518 The control character CSI (0x9B: control sequence introducer) is
4519 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4521 Character composition specification takes the following form:
4522 o ESC '0' -- start character composition
4523 o ESC '1' -- end character composition
4524 Since these are not standard escape sequences of any ISO standard,
4525 their use with these meanings is restricted to Emacs only. */
4528 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4532 for (i = 0; i < 4; i++)
4534 if (!NILP (coding_system))
4536 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4538 iso->charset[i] = Qt;
4539 iso->invalid_designated[i] = 0;
4541 iso->esc = ISO_ESC_NOTHING;
4542 iso->esc_bytes_index = 0;
4543 iso->register_left = 0;
4544 iso->register_right = 1;
4545 iso->switched_dir_and_no_valid_charset_yet = 0;
4546 iso->invalid_switch_dir = 0;
4547 iso->output_direction_sequence = 0;
4548 iso->output_literally = 0;
4549 #ifdef ENABLE_COMPOSITE_CHARS
4550 if (iso->composite_chars)
4551 Dynarr_reset (iso->composite_chars);
4556 fit_to_be_escape_quoted (unsigned char c)
4573 /* Parse one byte of an ISO2022 escape sequence.
4574 If the result is an invalid escape sequence, return 0 and
4575 do not change anything in STR. Otherwise, if the result is
4576 an incomplete escape sequence, update ISO2022.ESC and
4577 ISO2022.ESC_BYTES and return -1. Otherwise, update
4578 all the state variables (but not ISO2022.ESC_BYTES) and
4581 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4582 or invocation of an invalid character set and treat that as
4583 an unrecognized escape sequence. */
4586 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4587 unsigned char c, unsigned int *flags,
4588 int check_invalid_charsets)
4590 /* (1) If we're at the end of a designation sequence, CS is the
4591 charset being designated and REG is the register to designate
4594 (2) If we're at the end of a locking-shift sequence, REG is
4595 the register to invoke and HALF (0 == left, 1 == right) is
4596 the half to invoke it into.
4598 (3) If we're at the end of a single-shift sequence, REG is
4599 the register to invoke. */
4600 Lisp_Object cs = Qnil;
4603 /* NOTE: This code does goto's all over the fucking place.
4604 The reason for this is that we're basically implementing
4605 a state machine here, and hierarchical languages like C
4606 don't really provide a clean way of doing this. */
4608 if (! (*flags & CODING_STATE_ESCAPE))
4609 /* At beginning of escape sequence; we need to reset our
4610 escape-state variables. */
4611 iso->esc = ISO_ESC_NOTHING;
4613 iso->output_literally = 0;
4614 iso->output_direction_sequence = 0;
4618 case ISO_ESC_NOTHING:
4619 iso->esc_bytes_index = 0;
4622 case ISO_CODE_ESC: /* Start escape sequence */
4623 *flags |= CODING_STATE_ESCAPE;
4627 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4628 *flags |= CODING_STATE_ESCAPE;
4629 iso->esc = ISO_ESC_5_11;
4632 case ISO_CODE_SO: /* locking shift 1 */
4635 case ISO_CODE_SI: /* locking shift 0 */
4639 case ISO_CODE_SS2: /* single shift */
4642 case ISO_CODE_SS3: /* single shift */
4646 default: /* Other control characters */
4653 /**** single shift ****/
4655 case 'N': /* single shift 2 */
4658 case 'O': /* single shift 3 */
4662 /**** locking shift ****/
4664 case '~': /* locking shift 1 right */
4667 case 'n': /* locking shift 2 */
4670 case '}': /* locking shift 2 right */
4673 case 'o': /* locking shift 3 */
4676 case '|': /* locking shift 3 right */
4680 #ifdef ENABLE_COMPOSITE_CHARS
4681 /**** composite ****/
4684 iso->esc = ISO_ESC_START_COMPOSITE;
4685 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4686 CODING_STATE_COMPOSITE;
4690 iso->esc = ISO_ESC_END_COMPOSITE;
4691 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4692 ~CODING_STATE_COMPOSITE;
4694 #endif /* ENABLE_COMPOSITE_CHARS */
4696 /**** directionality ****/
4699 iso->esc = ISO_ESC_5_11;
4702 /**** designation ****/
4704 case '$': /* multibyte charset prefix */
4705 iso->esc = ISO_ESC_2_4;
4709 if (0x28 <= c && c <= 0x2F)
4711 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4715 /* This function is called with CODESYS equal to nil when
4716 doing coding-system detection. */
4718 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4719 && fit_to_be_escape_quoted (c))
4721 iso->esc = ISO_ESC_LITERAL;
4722 *flags &= CODING_STATE_ISO2022_LOCK;
4732 /**** directionality ****/
4734 case ISO_ESC_5_11: /* ISO6429 direction control */
4737 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4738 goto directionality;
4740 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4741 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4742 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4746 case ISO_ESC_5_11_0:
4749 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4750 goto directionality;
4754 case ISO_ESC_5_11_1:
4757 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4758 goto directionality;
4762 case ISO_ESC_5_11_2:
4765 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4766 goto directionality;
4771 iso->esc = ISO_ESC_DIRECTIONALITY;
4772 /* Various junk here to attempt to preserve the direction sequences
4773 literally in the text if they would otherwise be swallowed due
4774 to invalid designations that don't show up as actual charset
4775 changes in the text. */
4776 if (iso->invalid_switch_dir)
4778 /* We already inserted a direction switch literally into the
4779 text. We assume (#### this may not be right) that the
4780 next direction switch is the one going the other way,
4781 and we need to output that literally as well. */
4782 iso->output_literally = 1;
4783 iso->invalid_switch_dir = 0;
4789 /* If we are in the thrall of an invalid designation,
4790 then stick the directionality sequence literally into the
4791 output stream so it ends up in the original text again. */
4792 for (jj = 0; jj < 4; jj++)
4793 if (iso->invalid_designated[jj])
4797 iso->output_literally = 1;
4798 iso->invalid_switch_dir = 1;
4801 /* Indicate that we haven't yet seen a valid designation,
4802 so that if a switch-dir is directly followed by an
4803 invalid designation, both get inserted literally. */
4804 iso->switched_dir_and_no_valid_charset_yet = 1;
4809 /**** designation ****/
4812 if (0x28 <= c && c <= 0x2F)
4814 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4817 if (0x40 <= c && c <= 0x42)
4820 cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4821 *flags & CODING_STATE_R2L ?
4822 CHARSET_RIGHT_TO_LEFT :
4823 CHARSET_LEFT_TO_RIGHT);
4834 if (c < '0' || c > '~')
4835 return 0; /* bad final byte */
4837 if (iso->esc >= ISO_ESC_2_8 &&
4838 iso->esc <= ISO_ESC_2_15)
4840 chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4841 single = 1; /* single-byte */
4842 reg = (iso->esc - ISO_ESC_2_8) & 3;
4844 else if (iso->esc >= ISO_ESC_2_4_8 &&
4845 iso->esc <= ISO_ESC_2_4_15)
4847 chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4848 single = -1; /* multi-byte */
4849 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4853 /* Can this ever be reached? -slb */
4857 cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4858 *flags & CODING_STATE_R2L ?
4859 CHARSET_RIGHT_TO_LEFT :
4860 CHARSET_LEFT_TO_RIGHT);
4866 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4870 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4871 /* can't invoke something that ain't there. */
4873 iso->esc = ISO_ESC_SINGLE_SHIFT;
4874 *flags &= CODING_STATE_ISO2022_LOCK;
4876 *flags |= CODING_STATE_SS2;
4878 *flags |= CODING_STATE_SS3;
4882 if (check_invalid_charsets &&
4883 !CHARSETP (iso->charset[reg]))
4884 /* can't invoke something that ain't there. */
4887 iso->register_right = reg;
4889 iso->register_left = reg;
4890 *flags &= CODING_STATE_ISO2022_LOCK;
4891 iso->esc = ISO_ESC_LOCKING_SHIFT;
4895 if (NILP (cs) && check_invalid_charsets)
4897 iso->invalid_designated[reg] = 1;
4898 iso->charset[reg] = Vcharset_ascii;
4899 iso->esc = ISO_ESC_DESIGNATE;
4900 *flags &= CODING_STATE_ISO2022_LOCK;
4901 iso->output_literally = 1;
4902 if (iso->switched_dir_and_no_valid_charset_yet)
4904 /* We encountered a switch-direction followed by an
4905 invalid designation. Ensure that the switch-direction
4906 gets outputted; otherwise it will probably get eaten
4907 when the text is written out again. */
4908 iso->switched_dir_and_no_valid_charset_yet = 0;
4909 iso->output_direction_sequence = 1;
4910 /* And make sure that the switch-dir going the other
4911 way gets outputted, as well. */
4912 iso->invalid_switch_dir = 1;
4916 /* This function is called with CODESYS equal to nil when
4917 doing coding-system detection. */
4918 if (!NILP (codesys))
4920 charset_conversion_spec_dynarr *dyn =
4921 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4927 for (i = 0; i < Dynarr_length (dyn); i++)
4929 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4930 if (EQ (cs, spec->from_charset))
4931 cs = spec->to_charset;
4936 iso->charset[reg] = cs;
4937 iso->esc = ISO_ESC_DESIGNATE;
4938 *flags &= CODING_STATE_ISO2022_LOCK;
4939 if (iso->invalid_designated[reg])
4941 iso->invalid_designated[reg] = 0;
4942 iso->output_literally = 1;
4944 if (iso->switched_dir_and_no_valid_charset_yet)
4945 iso->switched_dir_and_no_valid_charset_yet = 0;
4950 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4954 /* #### There are serious deficiencies in the recognition mechanism
4955 here. This needs to be much smarter if it's going to cut it.
4956 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4957 it should be detected as Latin-1.
4958 All the ISO2022 stuff in this file should be synced up with the
4959 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4960 Perhaps we should wait till R2L works in FSF Emacs? */
4962 if (!st->iso2022.initted)
4964 reset_iso2022 (Qnil, &st->iso2022.iso);
4965 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4966 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4967 CODING_CATEGORY_ISO_8_1_MASK |
4968 CODING_CATEGORY_ISO_8_2_MASK |
4969 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4970 st->iso2022.flags = 0;
4971 st->iso2022.high_byte_count = 0;
4972 st->iso2022.saw_single_shift = 0;
4973 st->iso2022.initted = 1;
4976 mask = st->iso2022.mask;
4980 unsigned char c = *(unsigned char *)src++;
4983 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4984 st->iso2022.high_byte_count++;
4988 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4990 if (st->iso2022.high_byte_count & 1)
4991 /* odd number of high bytes; assume not iso-8-2 */
4992 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4994 st->iso2022.high_byte_count = 0;
4995 st->iso2022.saw_single_shift = 0;
4997 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4999 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5000 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5001 { /* control chars */
5004 /* Allow and ignore control characters that you might
5005 reasonably see in a text file */
5010 case 8: /* backspace */
5011 case 11: /* vertical tab */
5012 case 12: /* form feed */
5013 case 26: /* MS-DOS C-z junk */
5014 case 31: /* '^_' -- for info */
5015 goto label_continue_loop;
5022 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5025 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5026 &st->iso2022.flags, 0))
5028 switch (st->iso2022.iso.esc)
5030 case ISO_ESC_DESIGNATE:
5031 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5032 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5034 case ISO_ESC_LOCKING_SHIFT:
5035 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5036 goto ran_out_of_chars;
5037 case ISO_ESC_SINGLE_SHIFT:
5038 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5039 st->iso2022.saw_single_shift = 1;
5048 goto ran_out_of_chars;
5051 label_continue_loop:;
5060 postprocess_iso2022_mask (int mask)
5062 /* #### kind of cheesy */
5063 /* If seven-bit ISO is allowed, then assume that the encoding is
5064 entirely seven-bit and turn off the eight-bit ones. */
5065 if (mask & CODING_CATEGORY_ISO_7_MASK)
5066 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5067 CODING_CATEGORY_ISO_8_1_MASK |
5068 CODING_CATEGORY_ISO_8_2_MASK);
5072 /* If FLAGS is a null pointer or specifies right-to-left motion,
5073 output a switch-dir-to-left-to-right sequence to DST.
5074 Also update FLAGS if it is not a null pointer.
5075 If INTERNAL_P is set, we are outputting in internal format and
5076 need to handle the CSI differently. */
5079 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5080 unsigned_char_dynarr *dst,
5081 unsigned int *flags,
5084 if (!flags || (*flags & CODING_STATE_R2L))
5086 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5088 Dynarr_add (dst, ISO_CODE_ESC);
5089 Dynarr_add (dst, '[');
5091 else if (internal_p)
5092 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5094 Dynarr_add (dst, ISO_CODE_CSI);
5095 Dynarr_add (dst, '0');
5096 Dynarr_add (dst, ']');
5098 *flags &= ~CODING_STATE_R2L;
5102 /* If FLAGS is a null pointer or specifies a direction different from
5103 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5104 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5105 sequence to DST. Also update FLAGS if it is not a null pointer.
5106 If INTERNAL_P is set, we are outputting in internal format and
5107 need to handle the CSI differently. */
5110 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5111 unsigned_char_dynarr *dst, unsigned int *flags,
5114 if ((!flags || (*flags & CODING_STATE_R2L)) &&
5115 direction == CHARSET_LEFT_TO_RIGHT)
5116 restore_left_to_right_direction (codesys, dst, flags, internal_p);
5117 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5118 && (!flags || !(*flags & CODING_STATE_R2L)) &&
5119 direction == CHARSET_RIGHT_TO_LEFT)
5121 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5123 Dynarr_add (dst, ISO_CODE_ESC);
5124 Dynarr_add (dst, '[');
5126 else if (internal_p)
5127 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5129 Dynarr_add (dst, ISO_CODE_CSI);
5130 Dynarr_add (dst, '2');
5131 Dynarr_add (dst, ']');
5133 *flags |= CODING_STATE_R2L;
5137 /* Convert ISO2022-format data to internal format. */
5140 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5141 unsigned_char_dynarr *dst, size_t n)
5143 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5144 unsigned int flags = str->flags;
5145 unsigned int cpos = str->cpos;
5146 unsigned char counter = str->counter;
5147 eol_type_t eol_type = str->eol_type;
5148 #ifdef ENABLE_COMPOSITE_CHARS
5149 unsigned_char_dynarr *real_dst = dst;
5151 Lisp_Object coding_system;
5153 XSETCODING_SYSTEM (coding_system, str->codesys);
5155 #ifdef ENABLE_COMPOSITE_CHARS
5156 if (flags & CODING_STATE_COMPOSITE)
5157 dst = str->iso2022.composite_chars;
5158 #endif /* ENABLE_COMPOSITE_CHARS */
5162 unsigned char c = *(unsigned char *)src++;
5163 if (flags & CODING_STATE_ESCAPE)
5164 { /* Within ESC sequence */
5165 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5170 switch (str->iso2022.esc)
5172 #ifdef ENABLE_COMPOSITE_CHARS
5173 case ISO_ESC_START_COMPOSITE:
5174 if (str->iso2022.composite_chars)
5175 Dynarr_reset (str->iso2022.composite_chars);
5177 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5178 dst = str->iso2022.composite_chars;
5180 case ISO_ESC_END_COMPOSITE:
5182 Bufbyte comstr[MAX_EMCHAR_LEN];
5184 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5185 Dynarr_length (dst));
5187 len = set_charptr_emchar (comstr, emch);
5188 Dynarr_add_many (dst, comstr, len);
5191 #endif /* ENABLE_COMPOSITE_CHARS */
5193 case ISO_ESC_LITERAL:
5194 COMPOSE_FLUSH_CHARS (str, dst);
5195 DECODE_ADD_BINARY_CHAR (c, dst);
5199 /* Everything else handled already */
5204 /* Attempted error recovery. */
5205 if (str->iso2022.output_direction_sequence)
5206 ensure_correct_direction (flags & CODING_STATE_R2L ?
5207 CHARSET_RIGHT_TO_LEFT :
5208 CHARSET_LEFT_TO_RIGHT,
5209 str->codesys, dst, 0, 1);
5210 /* More error recovery. */
5211 if (!retval || str->iso2022.output_literally)
5213 /* Output the (possibly invalid) sequence */
5215 COMPOSE_FLUSH_CHARS (str, dst);
5216 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5217 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5218 flags &= CODING_STATE_ISO2022_LOCK;
5220 n++, src--;/* Repeat the loop with the same character. */
5223 /* No sense in reprocessing the final byte of the
5224 escape sequence; it could mess things up anyway.
5226 COMPOSE_FLUSH_CHARS (str, dst);
5227 DECODE_ADD_BINARY_CHAR (c, dst);
5233 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5234 { /* Control characters */
5236 /***** Error-handling *****/
5238 /* If we were in the middle of a character, dump out the
5239 partial character. */
5242 COMPOSE_FLUSH_CHARS (str, dst);
5246 DECODE_ADD_BINARY_CHAR
5247 ((unsigned char)(cpos >> (counter * 8)), dst);
5252 /* If we just saw a single-shift character, dump it out.
5253 This may dump out the wrong sort of single-shift character,
5254 but least it will give an indication that something went
5256 if (flags & CODING_STATE_SS2)
5258 COMPOSE_FLUSH_CHARS (str, dst);
5259 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5260 flags &= ~CODING_STATE_SS2;
5262 if (flags & CODING_STATE_SS3)
5264 COMPOSE_FLUSH_CHARS (str, dst);
5265 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5266 flags &= ~CODING_STATE_SS3;
5269 /***** Now handle the control characters. *****/
5275 COMPOSE_FLUSH_CHARS (str, dst);
5276 if (eol_type == EOL_CR)
5277 Dynarr_add (dst, '\n');
5278 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5279 Dynarr_add (dst, c);
5281 flags |= CODING_STATE_CR;
5282 goto label_continue_loop;
5284 else if (flags & CODING_STATE_CR)
5285 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5287 Dynarr_add (dst, '\r');
5288 flags &= ~CODING_STATE_CR;
5291 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5294 flags &= CODING_STATE_ISO2022_LOCK;
5296 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5298 COMPOSE_FLUSH_CHARS (str, dst);
5299 DECODE_ADD_BINARY_CHAR (c, dst);
5303 { /* Graphic characters */
5304 Lisp_Object charset;
5313 COMPOSE_FLUSH_CHARS (str, dst);
5314 if (eol_type == EOL_CR)
5315 Dynarr_add (dst, '\n');
5316 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5317 Dynarr_add (dst, c);
5319 flags |= CODING_STATE_CR;
5320 goto label_continue_loop;
5322 else if (flags & CODING_STATE_CR)
5323 { /* eol_type == CODING_SYSTEM_EOL_CRLF */
5325 Dynarr_add (dst, '\r');
5326 flags &= ~CODING_STATE_CR;
5329 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5332 /* Now determine the charset. */
5333 reg = ((flags & CODING_STATE_SS2) ? 2
5334 : (flags & CODING_STATE_SS3) ? 3
5335 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5336 : str->iso2022.register_left);
5337 charset = str->iso2022.charset[reg];
5339 /* Error checking: */
5340 if (! CHARSETP (charset)
5341 || str->iso2022.invalid_designated[reg]
5342 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5343 && XCHARSET_CHARS (charset) == 94))
5344 /* Mrmph. We are trying to invoke a register that has no
5345 or an invalid charset in it, or trying to add a character
5346 outside the range of the charset. Insert that char literally
5347 to preserve it for the output. */
5349 COMPOSE_FLUSH_CHARS (str, dst);
5353 DECODE_ADD_BINARY_CHAR
5354 ((unsigned char)(cpos >> (counter * 8)), dst);
5357 DECODE_ADD_BINARY_CHAR (c, dst);
5362 /* Things are probably hunky-dorey. */
5364 /* Fetch reverse charset, maybe. */
5365 if (((flags & CODING_STATE_R2L) &&
5366 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5368 (!(flags & CODING_STATE_R2L) &&
5369 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5371 Lisp_Object new_charset =
5372 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5373 if (!NILP (new_charset))
5374 charset = new_charset;
5379 if (XCHARSET_DIMENSION (charset) == counter)
5381 COMPOSE_ADD_CHAR (str,
5382 DECODE_CHAR (charset,
5383 ((cpos & 0x7F7F7F) << 8)
5390 cpos = (cpos << 8) | c;
5392 lb = XCHARSET_LEADING_BYTE (charset);
5393 switch (XCHARSET_REP_BYTES (charset))
5396 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5397 Dynarr_add (dst, c & 0x7F);
5400 case 2: /* one-byte official */
5401 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5402 Dynarr_add (dst, lb);
5403 Dynarr_add (dst, c | 0x80);
5406 case 3: /* one-byte private or two-byte official */
5407 if (XCHARSET_PRIVATE_P (charset))
5409 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5410 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5411 Dynarr_add (dst, lb);
5412 Dynarr_add (dst, c | 0x80);
5418 Dynarr_add (dst, lb);
5419 Dynarr_add (dst, ch | 0x80);
5420 Dynarr_add (dst, c | 0x80);
5428 default: /* two-byte private */
5431 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5432 Dynarr_add (dst, lb);
5433 Dynarr_add (dst, ch | 0x80);
5434 Dynarr_add (dst, c | 0x80);
5444 flags &= CODING_STATE_ISO2022_LOCK;
5447 label_continue_loop:;
5450 if (flags & CODING_STATE_END)
5452 COMPOSE_FLUSH_CHARS (str, dst);
5453 DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5457 str->counter = counter;
5461 /***** ISO2022 encoder *****/
5463 /* Designate CHARSET into register REG. */
5466 iso2022_designate (Lisp_Object charset, unsigned char reg,
5467 struct encoding_stream *str, unsigned_char_dynarr *dst)
5469 static const char inter94[] = "()*+";
5470 static const char inter96[] = ",-./";
5471 unsigned short chars;
5472 unsigned char dimension;
5473 unsigned char final;
5474 Lisp_Object old_charset = str->iso2022.charset[reg];
5476 str->iso2022.charset[reg] = charset;
5477 if (!CHARSETP (charset))
5478 /* charset might be an initial nil or t. */
5480 chars = XCHARSET_CHARS (charset);
5481 dimension = XCHARSET_DIMENSION (charset);
5482 final = XCHARSET_FINAL (charset);
5483 if (!str->iso2022.force_charset_on_output[reg] &&
5484 CHARSETP (old_charset) &&
5485 XCHARSET_CHARS (old_charset) == chars &&
5486 XCHARSET_DIMENSION (old_charset) == dimension &&
5487 XCHARSET_FINAL (old_charset) == final)
5490 str->iso2022.force_charset_on_output[reg] = 0;
5493 charset_conversion_spec_dynarr *dyn =
5494 str->codesys->iso2022.output_conv;
5500 for (i = 0; i < Dynarr_length (dyn); i++)
5502 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5503 if (EQ (charset, spec->from_charset))
5504 charset = spec->to_charset;
5509 Dynarr_add (dst, ISO_CODE_ESC);
5514 Dynarr_add (dst, inter94[reg]);
5517 Dynarr_add (dst, '$');
5519 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5522 Dynarr_add (dst, inter94[reg]);
5527 Dynarr_add (dst, inter96[reg]);
5530 Dynarr_add (dst, '$');
5531 Dynarr_add (dst, inter96[reg]);
5535 Dynarr_add (dst, final);
5539 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5541 if (str->iso2022.register_left != 0)
5543 Dynarr_add (dst, ISO_CODE_SI);
5544 str->iso2022.register_left = 0;
5549 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5551 if (str->iso2022.register_left != 1)
5553 Dynarr_add (dst, ISO_CODE_SO);
5554 str->iso2022.register_left = 1;
5559 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5560 unsigned_char_dynarr *dst, unsigned int *flags)
5562 unsigned char charmask;
5563 Lisp_Coding_System* codesys = str->codesys;
5564 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5566 Lisp_Object charset = str->iso2022.current_charset;
5567 int half = str->iso2022.current_half;
5568 int code_point = -1;
5572 restore_left_to_right_direction (codesys, dst, flags, 0);
5574 /* Make sure G0 contains ASCII */
5575 if ((ch > ' ' && ch < ISO_CODE_DEL)
5576 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5578 ensure_normal_shift (str, dst);
5579 iso2022_designate (Vcharset_ascii, 0, str, dst);
5582 /* If necessary, restore everything to the default state
5584 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5586 restore_left_to_right_direction (codesys, dst, flags, 0);
5588 ensure_normal_shift (str, dst);
5590 for (i = 0; i < 4; i++)
5592 Lisp_Object initial_charset =
5593 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5594 iso2022_designate (initial_charset, i, str, dst);
5599 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5600 Dynarr_add (dst, '\r');
5601 if (eol_type != EOL_CR)
5602 Dynarr_add (dst, ch);
5606 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5607 && fit_to_be_escape_quoted (ch))
5608 Dynarr_add (dst, ISO_CODE_ESC);
5609 Dynarr_add (dst, ch);
5612 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5614 charmask = (half == 0 ? 0x00 : 0x80);
5616 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5617 && fit_to_be_escape_quoted (ch))
5618 Dynarr_add (dst, ISO_CODE_ESC);
5619 /* you asked for it ... */
5620 Dynarr_add (dst, ch);
5626 /* Now determine which register to use. */
5628 for (i = 0; i < 4; i++)
5630 if ((CHARSETP (charset = str->iso2022.charset[i])
5631 && ((code_point = charset_code_point (charset, ch)) >= 0))
5635 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5636 && ((code_point = charset_code_point (charset, ch)) >= 0)))
5644 Lisp_Object original_default_coded_charset_priority_list
5645 = Vdefault_coded_charset_priority_list;
5647 while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5649 code_point = ENCODE_CHAR (ch, charset);
5650 if (XCHARSET_FINAL (charset))
5652 Vdefault_coded_charset_priority_list
5653 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5654 Vdefault_coded_charset_priority_list));
5656 code_point = ENCODE_CHAR (ch, charset);
5657 if (!XCHARSET_FINAL (charset))
5659 charset = Vcharset_ascii;
5663 Vdefault_coded_charset_priority_list
5664 = original_default_coded_charset_priority_list;
5666 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5667 codesys, dst, flags, 0);
5671 if (XCHARSET_GRAPHIC (charset) != 0)
5673 if (!NILP (str->iso2022.charset[1]) &&
5674 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5675 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5677 else if (!NILP (str->iso2022.charset[2]))
5679 else if (!NILP (str->iso2022.charset[3]))
5688 iso2022_designate (charset, reg, str, dst);
5690 /* Now invoke that register. */
5694 ensure_normal_shift (str, dst);
5698 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5700 ensure_shift_out (str, dst);
5707 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5709 Dynarr_add (dst, ISO_CODE_ESC);
5710 Dynarr_add (dst, 'N');
5715 Dynarr_add (dst, ISO_CODE_SS2);
5720 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5722 Dynarr_add (dst, ISO_CODE_ESC);
5723 Dynarr_add (dst, 'O');
5728 Dynarr_add (dst, ISO_CODE_SS3);
5736 charmask = (half == 0 ? 0x00 : 0x80);
5738 switch (XCHARSET_DIMENSION (charset))
5741 Dynarr_add (dst, (code_point & 0xFF) | charmask);
5744 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5745 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5748 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5749 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5750 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5753 Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5754 Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5755 Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5756 Dynarr_add (dst, ( code_point & 0xFF) | charmask);
5762 str->iso2022.current_charset = charset;
5763 str->iso2022.current_half = half;
5767 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5768 unsigned int *flags)
5770 Lisp_Coding_System* codesys = str->codesys;
5773 restore_left_to_right_direction (codesys, dst, flags, 0);
5774 ensure_normal_shift (str, dst);
5775 for (i = 0; i < 4; i++)
5777 Lisp_Object initial_charset
5778 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5779 iso2022_designate (initial_charset, i, str, dst);
5784 /************************************************************************/
5785 /* No-conversion methods */
5786 /************************************************************************/
5788 /* This is used when reading in "binary" files -- i.e. files that may
5789 contain all 256 possible byte values and that are not to be
5790 interpreted as being in any particular decoding. */
5792 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5793 unsigned_char_dynarr *dst, size_t n)
5795 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5796 unsigned int flags = str->flags;
5797 unsigned int cpos = str->cpos;
5798 eol_type_t eol_type = str->eol_type;
5802 unsigned char c = *(unsigned char *)src++;
5804 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5805 DECODE_ADD_BINARY_CHAR (c, dst);
5806 label_continue_loop:;
5809 DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5816 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5817 unsigned_char_dynarr *dst, size_t n)
5820 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5821 unsigned int flags = str->flags;
5822 unsigned int ch = str->ch;
5823 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5825 unsigned char char_boundary = str->iso2022.current_char_boundary;
5832 if (char_boundary == 0)
5838 else if ( c >= 0xf8 )
5843 else if ( c >= 0xf0 )
5848 else if ( c >= 0xe0 )
5853 else if ( c >= 0xc0 )
5863 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5864 Dynarr_add (dst, '\r');
5865 if (eol_type != EOL_CR)
5866 Dynarr_add (dst, c);
5869 Dynarr_add (dst, c);
5872 else if (char_boundary == 1)
5874 ch = ( ch << 6 ) | ( c & 0x3f );
5875 Dynarr_add (dst, ch & 0xff);
5880 ch = ( ch << 6 ) | ( c & 0x3f );
5883 #else /* not UTF2000 */
5886 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5887 Dynarr_add (dst, '\r');
5888 if (eol_type != EOL_CR)
5889 Dynarr_add (dst, '\n');
5892 else if (BYTE_ASCII_P (c))
5895 Dynarr_add (dst, c);
5897 else if (BUFBYTE_LEADING_BYTE_P (c))
5900 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5901 c == LEADING_BYTE_CONTROL_1)
5904 Dynarr_add (dst, '~'); /* untranslatable character */
5908 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5909 Dynarr_add (dst, c);
5910 else if (ch == LEADING_BYTE_CONTROL_1)
5913 Dynarr_add (dst, c - 0x20);
5915 /* else it should be the second or third byte of an
5916 untranslatable character, so ignore it */
5919 #endif /* not UTF2000 */
5925 str->iso2022.current_char_boundary = char_boundary;
5931 /************************************************************************/
5932 /* Initialization */
5933 /************************************************************************/
5936 syms_of_file_coding (void)
5938 INIT_LRECORD_IMPLEMENTATION (coding_system);
5940 deferror (&Qcoding_system_error, "coding-system-error",
5941 "Coding-system error", Qio_error);
5943 DEFSUBR (Fcoding_system_p);
5944 DEFSUBR (Ffind_coding_system);
5945 DEFSUBR (Fget_coding_system);
5946 DEFSUBR (Fcoding_system_list);
5947 DEFSUBR (Fcoding_system_name);
5948 DEFSUBR (Fmake_coding_system);
5949 DEFSUBR (Fcopy_coding_system);
5950 DEFSUBR (Fcoding_system_canonical_name_p);
5951 DEFSUBR (Fcoding_system_alias_p);
5952 DEFSUBR (Fcoding_system_aliasee);
5953 DEFSUBR (Fdefine_coding_system_alias);
5954 DEFSUBR (Fsubsidiary_coding_system);
5956 DEFSUBR (Fcoding_system_type);
5957 DEFSUBR (Fcoding_system_doc_string);
5959 DEFSUBR (Fcoding_system_charset);
5961 DEFSUBR (Fcoding_system_property);
5963 DEFSUBR (Fcoding_category_list);
5964 DEFSUBR (Fset_coding_priority_list);
5965 DEFSUBR (Fcoding_priority_list);
5966 DEFSUBR (Fset_coding_category_system);
5967 DEFSUBR (Fcoding_category_system);
5969 DEFSUBR (Fdetect_coding_region);
5970 DEFSUBR (Fdecode_coding_region);
5971 DEFSUBR (Fencode_coding_region);
5973 DEFSUBR (Fdecode_shift_jis_char);
5974 DEFSUBR (Fencode_shift_jis_char);
5975 DEFSUBR (Fdecode_big5_char);
5976 DEFSUBR (Fencode_big5_char);
5978 defsymbol (&Qcoding_systemp, "coding-system-p");
5979 defsymbol (&Qno_conversion, "no-conversion");
5980 defsymbol (&Qraw_text, "raw-text");
5982 defsymbol (&Qbig5, "big5");
5983 defsymbol (&Qshift_jis, "shift-jis");
5984 defsymbol (&Qucs4, "ucs-4");
5985 defsymbol (&Qutf8, "utf-8");
5986 defsymbol (&Qccl, "ccl");
5987 defsymbol (&Qiso2022, "iso2022");
5989 defsymbol (&Qmnemonic, "mnemonic");
5990 defsymbol (&Qeol_type, "eol-type");
5991 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5992 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5994 defsymbol (&Qcr, "cr");
5995 defsymbol (&Qlf, "lf");
5996 defsymbol (&Qcrlf, "crlf");
5997 defsymbol (&Qeol_cr, "eol-cr");
5998 defsymbol (&Qeol_lf, "eol-lf");
5999 defsymbol (&Qeol_crlf, "eol-crlf");
6001 defsymbol (&Qcharset_g0, "charset-g0");
6002 defsymbol (&Qcharset_g1, "charset-g1");
6003 defsymbol (&Qcharset_g2, "charset-g2");
6004 defsymbol (&Qcharset_g3, "charset-g3");
6005 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6006 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6007 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6008 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6009 defsymbol (&Qno_iso6429, "no-iso6429");
6010 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6011 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6013 defsymbol (&Qshort, "short");
6014 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6015 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6016 defsymbol (&Qseven, "seven");
6017 defsymbol (&Qlock_shift, "lock-shift");
6018 defsymbol (&Qescape_quoted, "escape-quoted");
6021 defsymbol (&Qdisable_composition, "disable-composition");
6022 defsymbol (&Quse_entity_reference, "use-entity-reference");
6023 defsymbol (&Qd, "d");
6024 defsymbol (&Qx, "x");
6025 defsymbol (&QX, "X");
6027 defsymbol (&Qencode, "encode");
6028 defsymbol (&Qdecode, "decode");
6031 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6033 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6035 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6037 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6039 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6041 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6043 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6045 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6047 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6050 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6055 lstream_type_create_file_coding (void)
6057 LSTREAM_HAS_METHOD (decoding, reader);
6058 LSTREAM_HAS_METHOD (decoding, writer);
6059 LSTREAM_HAS_METHOD (decoding, rewinder);
6060 LSTREAM_HAS_METHOD (decoding, seekable_p);
6061 LSTREAM_HAS_METHOD (decoding, flusher);
6062 LSTREAM_HAS_METHOD (decoding, closer);
6063 LSTREAM_HAS_METHOD (decoding, marker);
6065 LSTREAM_HAS_METHOD (encoding, reader);
6066 LSTREAM_HAS_METHOD (encoding, writer);
6067 LSTREAM_HAS_METHOD (encoding, rewinder);
6068 LSTREAM_HAS_METHOD (encoding, seekable_p);
6069 LSTREAM_HAS_METHOD (encoding, flusher);
6070 LSTREAM_HAS_METHOD (encoding, closer);
6071 LSTREAM_HAS_METHOD (encoding, marker);
6075 vars_of_file_coding (void)
6079 fcd = xnew (struct file_coding_dump);
6080 dump_add_root_struct_ptr (&fcd, &fcd_description);
6082 /* Initialize to something reasonable ... */
6083 for (i = 0; i < CODING_CATEGORY_LAST; i++)
6085 fcd->coding_category_system[i] = Qnil;
6086 fcd->coding_category_by_priority[i] = i;
6089 Fprovide (intern ("file-coding"));
6091 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6092 Coding system used for TTY keyboard input.
6093 Not used under a windowing system.
6095 Vkeyboard_coding_system = Qnil;
6097 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6098 Coding system used for TTY display output.
6099 Not used under a windowing system.
6101 Vterminal_coding_system = Qnil;
6103 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6104 Overriding coding system used when reading from a file or process.
6105 You should bind this variable with `let', but do not set it globally.
6106 If this is non-nil, it specifies the coding system that will be used
6107 to decode input on read operations, such as from a file or process.
6108 It overrides `buffer-file-coding-system-for-read',
6109 `insert-file-contents-pre-hook', etc. Use those variables instead of
6110 this one for permanent changes to the environment. */ );
6111 Vcoding_system_for_read = Qnil;
6113 DEFVAR_LISP ("coding-system-for-write",
6114 &Vcoding_system_for_write /*
6115 Overriding coding system used when writing to a file or process.
6116 You should bind this variable with `let', but do not set it globally.
6117 If this is non-nil, it specifies the coding system that will be used
6118 to encode output for write operations, such as to a file or process.
6119 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6120 Use those variables instead of this one for permanent changes to the
6122 Vcoding_system_for_write = Qnil;
6124 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6125 Coding system used to convert pathnames when accessing files.
6127 Vfile_name_coding_system = Qnil;
6129 DEFVAR_LISP ("coded-charset-entity-reference-alist",
6130 &Vcoded_charset_entity_reference_alist /*
6131 Alist of coded-charset vs corresponding entity-reference.
6132 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6133 CCS is coded-charset.
6134 CODE-COLUMNS is columns of code-point of entity-reference.
6135 CODE-TYPE is format type of code-point of entity-reference.
6136 `d' means decimal value and `x' means hexadecimal value.
6138 Vcoded_charset_entity_reference_alist = Qnil;
6140 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6141 Non-nil means the buffer contents are regarded as multi-byte form
6142 of characters, not a binary code. This affects the display, file I/O,
6143 and behaviors of various editing commands.
6145 Setting this to nil does not do anything.
6147 enable_multibyte_characters = 1;
6151 complex_vars_of_file_coding (void)
6153 staticpro (&Vcoding_system_hash_table);
6154 Vcoding_system_hash_table =
6155 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6157 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6158 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6160 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
6162 struct codesys_prop csp; \
6164 csp.prop_type = (Prop_Type); \
6165 Dynarr_add (the_codesys_prop_dynarr, csp); \
6168 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
6169 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
6170 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
6171 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
6172 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
6173 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
6174 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
6176 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6177 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6178 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6179 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6180 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6181 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6182 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6183 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6184 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6185 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6186 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6187 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6188 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6189 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6190 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6191 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6192 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6194 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
6195 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
6197 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qdisable_composition);
6198 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Quse_entity_reference);
6201 /* Need to create this here or we're really screwed. */
6203 (Qraw_text, Qno_conversion,
6204 build_string ("Raw text, which means it converts only line-break-codes."),
6205 list2 (Qmnemonic, build_string ("Raw")));
6208 (Qbinary, Qno_conversion,
6209 build_string ("Binary, which means it does not convert anything."),
6210 list4 (Qeol_type, Qlf,
6211 Qmnemonic, build_string ("Binary")));
6216 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6217 list2 (Qmnemonic, build_string ("UTF8")));
6220 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6222 Fdefine_coding_system_alias (Qfile_name, Qbinary);
6224 Fdefine_coding_system_alias (Qterminal, Qbinary);
6225 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6227 /* Need this for bootstrapping */
6228 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6229 Fget_coding_system (Qraw_text);
6232 fcd->coding_category_system[CODING_CATEGORY_UTF8]
6233 = Fget_coding_system (Qutf8);
6236 #if defined(MULE) && !defined(UTF2000)
6240 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6241 fcd->ucs_to_mule_table[i] = Qnil;
6243 staticpro (&mule_to_ucs_table);
6244 mule_to_ucs_table = Fmake_char_table(Qgeneric);
6245 #endif /* defined(MULE) && !defined(UTF2000) */