1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const Extbyte *src, Lstream_data_count n);
180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
181 unsigned_char_dynarr *dst, Lstream_data_count n);
182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
183 unsigned_char_dynarr *dst, Lstream_data_count n);
184 static int detect_coding_big5 (struct detection_state *st,
185 const Extbyte *src, Lstream_data_count n);
186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
187 unsigned_char_dynarr *dst, Lstream_data_count n);
188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
189 unsigned_char_dynarr *dst, Lstream_data_count n);
190 static int detect_coding_ucs4 (struct detection_state *st,
191 const Extbyte *src, Lstream_data_count n);
192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
193 unsigned_char_dynarr *dst, Lstream_data_count n);
194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
195 unsigned_char_dynarr *dst, Lstream_data_count n);
196 static int detect_coding_utf8 (struct detection_state *st,
197 const Extbyte *src, Lstream_data_count n);
198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
199 unsigned_char_dynarr *dst, Lstream_data_count n);
200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
201 unsigned_char_dynarr *dst, Lstream_data_count n);
202 static int postprocess_iso2022_mask (int mask);
203 static void reset_iso2022 (Lisp_Object coding_system,
204 struct iso2022_decoder *iso);
205 static int detect_coding_iso2022 (struct detection_state *st,
206 const Extbyte *src, Lstream_data_count n);
207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
208 unsigned_char_dynarr *dst, Lstream_data_count n);
209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
210 unsigned_char_dynarr *dst, Lstream_data_count n);
212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
213 unsigned_char_dynarr *dst, Lstream_data_count n);
214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
215 unsigned_char_dynarr *dst, Lstream_data_count n);
216 static void mule_decode (Lstream *decoding, const Extbyte *src,
217 unsigned_char_dynarr *dst, Lstream_data_count n);
218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
219 unsigned_char_dynarr *dst, Lstream_data_count n);
221 typedef struct codesys_prop codesys_prop;
230 Dynarr_declare (codesys_prop);
231 } codesys_prop_dynarr;
233 static const struct lrecord_description codesys_prop_description_1[] = {
234 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
238 static const struct struct_description codesys_prop_description = {
239 sizeof (codesys_prop),
240 codesys_prop_description_1
243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
244 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
248 static const struct struct_description codesys_prop_dynarr_description = {
249 sizeof (codesys_prop_dynarr),
250 codesys_prop_dynarr_description_1
253 codesys_prop_dynarr *the_codesys_prop_dynarr;
255 enum codesys_prop_enum
258 CODESYS_PROP_ISO2022,
263 /************************************************************************/
264 /* Coding system functions */
265 /************************************************************************/
267 static Lisp_Object mark_coding_system (Lisp_Object);
268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
269 static void finalize_coding_system (void *header, int for_disksave);
272 static const struct lrecord_description ccs_description_1[] = {
273 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
274 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
278 static const struct struct_description ccs_description = {
279 sizeof (charset_conversion_spec),
283 static const struct lrecord_description ccsd_description_1[] = {
284 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
288 static const struct struct_description ccsd_description = {
289 sizeof (charset_conversion_spec_dynarr),
294 static const struct lrecord_description coding_system_description[] = {
295 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
296 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
297 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
298 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
299 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
300 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
301 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
302 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
304 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
305 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
306 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
307 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
308 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
314 mark_coding_system, print_coding_system,
315 finalize_coding_system,
316 0, 0, coding_system_description,
320 mark_coding_system (Lisp_Object obj)
322 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
324 mark_object (CODING_SYSTEM_NAME (codesys));
325 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
326 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
327 mark_object (CODING_SYSTEM_EOL_LF (codesys));
328 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
329 mark_object (CODING_SYSTEM_EOL_CR (codesys));
331 switch (CODING_SYSTEM_TYPE (codesys))
335 case CODESYS_ISO2022:
336 for (i = 0; i < 4; i++)
337 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
338 if (codesys->iso2022.input_conv)
340 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
342 struct charset_conversion_spec *ccs =
343 Dynarr_atp (codesys->iso2022.input_conv, i);
344 mark_object (ccs->from_charset);
345 mark_object (ccs->to_charset);
348 if (codesys->iso2022.output_conv)
350 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
352 struct charset_conversion_spec *ccs =
353 Dynarr_atp (codesys->iso2022.output_conv, i);
354 mark_object (ccs->from_charset);
355 mark_object (ccs->to_charset);
361 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
362 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
369 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
370 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
377 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
379 error ("printing unreadable object #<coding_system 0x%x>",
382 write_c_string ("#<coding_system ", printcharfun);
383 print_internal (c->name, printcharfun, 1);
384 write_c_string (">", printcharfun);
388 finalize_coding_system (void *header, int for_disksave)
390 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
391 /* Since coding systems never go away, this function is not
392 necessary. But it would be necessary if we changed things
393 so that coding systems could go away. */
394 if (!for_disksave) /* see comment in lstream.c */
396 switch (CODING_SYSTEM_TYPE (c))
399 case CODESYS_ISO2022:
400 if (c->iso2022.input_conv)
402 Dynarr_free (c->iso2022.input_conv);
403 c->iso2022.input_conv = 0;
405 if (c->iso2022.output_conv)
407 Dynarr_free (c->iso2022.output_conv);
408 c->iso2022.output_conv = 0;
419 symbol_to_eol_type (Lisp_Object symbol)
421 CHECK_SYMBOL (symbol);
422 if (NILP (symbol)) return EOL_AUTODETECT;
423 if (EQ (symbol, Qlf)) return EOL_LF;
424 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
425 if (EQ (symbol, Qcr)) return EOL_CR;
427 signal_simple_error ("Unrecognized eol type", symbol);
428 return EOL_AUTODETECT; /* not reached */
432 eol_type_to_symbol (eol_type_t type)
437 case EOL_LF: return Qlf;
438 case EOL_CRLF: return Qcrlf;
439 case EOL_CR: return Qcr;
440 case EOL_AUTODETECT: return Qnil;
445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
447 Lisp_Object codesys_obj;
448 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
449 char *codesys_name = (char *) alloca (len + 7);
451 char *codesys_mnemonic=0;
453 Lisp_Object codesys_name_sym, sub_codesys_obj;
457 XSETCODING_SYSTEM (codesys_obj, codesys);
459 memcpy (codesys_name,
460 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
462 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
464 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
465 codesys_mnemonic = (char *) alloca (mlen + 7);
466 memcpy (codesys_mnemonic,
467 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
471 strcpy (codesys_name + len, "-" op_sys); \
473 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
474 codesys_name_sym = intern (codesys_name); \
475 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
476 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
478 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
479 build_string (codesys_mnemonic); \
480 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
483 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
484 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
485 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
489 Return t if OBJECT is a coding system.
490 A coding system is an object that defines how text containing multiple
491 character sets is encoded into a stream of (typically 8-bit) bytes.
492 The coding system is used to decode the stream into a series of
493 characters (which may be from multiple charsets) when the text is read
494 from a file or process, and is used to encode the text back into the
495 same format when it is written out to a file or process.
497 For example, many ISO2022-compliant coding systems (such as Compound
498 Text, which is used for inter-client data under the X Window System)
499 use escape sequences to switch between different charsets -- Japanese
500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
502 `make-coding-system' for more information.
504 Coding systems are normally identified using a symbol, and the
505 symbol is accepted in place of the actual coding system object whenever
506 a coding system is called for. (This is similar to how faces work.)
510 return CODING_SYSTEMP (object) ? Qt : Qnil;
513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
514 Retrieve the coding system of the given name.
516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
517 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
518 If there is no such coding system, nil is returned. Otherwise the
519 associated coding system object is returned.
521 (coding_system_or_name))
523 if (NILP (coding_system_or_name))
524 coding_system_or_name = Qbinary;
525 else if (CODING_SYSTEMP (coding_system_or_name))
526 return coding_system_or_name;
528 CHECK_SYMBOL (coding_system_or_name);
532 coding_system_or_name =
533 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
535 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
536 return coding_system_or_name;
540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
541 Retrieve the coding system of the given name.
542 Same as `find-coding-system' except that if there is no such
543 coding system, an error is signaled instead of returning nil.
547 Lisp_Object coding_system = Ffind_coding_system (name);
549 if (NILP (coding_system))
550 signal_simple_error ("No such coding system", name);
551 return coding_system;
554 /* We store the coding systems in hash tables with the names as the key and the
555 actual coding system object as the value. Occasionally we need to use them
556 in a list format. These routines provide us with that. */
557 struct coding_system_list_closure
559 Lisp_Object *coding_system_list;
563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
564 void *coding_system_list_closure)
566 /* This function can GC */
567 struct coding_system_list_closure *cscl =
568 (struct coding_system_list_closure *) coding_system_list_closure;
569 Lisp_Object *coding_system_list = cscl->coding_system_list;
571 *coding_system_list = Fcons (key, *coding_system_list);
575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
576 Return a list of the names of all defined coding systems.
580 Lisp_Object coding_system_list = Qnil;
582 struct coding_system_list_closure coding_system_list_closure;
584 GCPRO1 (coding_system_list);
585 coding_system_list_closure.coding_system_list = &coding_system_list;
586 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
587 &coding_system_list_closure);
590 return coding_system_list;
593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
594 Return the name of the given coding system.
598 coding_system = Fget_coding_system (coding_system);
599 return XCODING_SYSTEM_NAME (coding_system);
602 static Lisp_Coding_System *
603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
605 Lisp_Coding_System *codesys =
606 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
608 zero_lcrecord (codesys);
609 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
610 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
611 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
612 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
613 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
614 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
615 CODING_SYSTEM_TYPE (codesys) = type;
616 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
618 if (type == CODESYS_ISO2022)
621 for (i = 0; i < 4; i++)
622 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
624 else if (type == CODESYS_CCL)
626 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
627 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
630 CODING_SYSTEM_NAME (codesys) = name;
636 /* Given a list of charset conversion specs as specified in a Lisp
637 program, parse it into STORE_HERE. */
640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
641 Lisp_Object spec_list)
645 EXTERNAL_LIST_LOOP (rest, spec_list)
647 Lisp_Object car = XCAR (rest);
648 Lisp_Object from, to;
649 struct charset_conversion_spec spec;
651 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
652 signal_simple_error ("Invalid charset conversion spec", car);
653 from = Fget_charset (XCAR (car));
654 to = Fget_charset (XCAR (XCDR (car)));
655 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
656 signal_simple_error_2
657 ("Attempted conversion between different charset types",
659 spec.from_charset = from;
660 spec.to_charset = to;
662 Dynarr_add (store_here, spec);
666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
667 specs, return the equivalent as the Lisp programmer would see it.
669 If LOAD_HERE is 0, return Qnil. */
672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
679 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
681 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
682 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
685 return Fnreverse (result);
690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
691 Register symbol NAME as a coding system.
693 TYPE describes the conversion method used and should be one of
696 Automatic conversion. XEmacs attempts to detect the coding system
699 No conversion. Use this for binary files and such. On output,
700 graphic characters that are not in ASCII or Latin-1 will be
701 replaced by a ?. (For a no-conversion-encoded buffer, these
702 characters will only be present if you explicitly insert them.)
704 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
706 ISO 10646 UCS-4 encoding.
708 ISO 10646 UTF-8 encoding.
710 Any ISO2022-compliant encoding. Among other things, this includes
711 JIS (the Japanese encoding commonly used for e-mail), EUC (the
712 standard Unix encoding for Japanese and other languages), and
713 Compound Text (the encoding used in X11). You can specify more
714 specific information about the conversion with the PROPS argument.
716 Big5 (the encoding commonly used for Taiwanese).
718 The conversion is performed using a user-written pseudo-code
719 program. CCL (Code Conversion Language) is the name of this
722 Write out or read in the raw contents of the memory representing
723 the buffer's text. This is primarily useful for debugging
724 purposes, and is only enabled when XEmacs has been compiled with
725 DEBUG_XEMACS defined (via the --debug configure option).
726 WARNING: Reading in a file using 'internal conversion can result
727 in an internal inconsistency in the memory representing a
728 buffer's text, which will produce unpredictable results and may
729 cause XEmacs to crash. Under normal circumstances you should
730 never use 'internal conversion.
732 DOC-STRING is a string describing the coding system.
734 PROPS is a property list, describing the specific nature of the
735 character set. Recognized properties are:
738 String to be displayed in the modeline when this coding system is
742 End-of-line conversion to be used. It should be one of
745 Automatically detect the end-of-line type (LF, CRLF,
746 or CR). Also generate subsidiary coding systems named
747 `NAME-unix', `NAME-dos', and `NAME-mac', that are
748 identical to this coding system but have an EOL-TYPE
749 value of 'lf, 'crlf, and 'cr, respectively.
751 The end of a line is marked externally using ASCII LF.
752 Since this is also the way that XEmacs represents an
753 end-of-line internally, specifying this option results
754 in no end-of-line conversion. This is the standard
755 format for Unix text files.
757 The end of a line is marked externally using ASCII
758 CRLF. This is the standard format for MS-DOS text
761 The end of a line is marked externally using ASCII CR.
762 This is the standard format for Macintosh text files.
764 Automatically detect the end-of-line type but do not
765 generate subsidiary coding systems. (This value is
766 converted to nil when stored internally, and
767 `coding-system-property' will return nil.)
769 'post-read-conversion
770 Function called after a file has been read in, to perform the
771 decoding. Called with two arguments, START and END, denoting
772 a region of the current buffer to be decoded.
774 'pre-write-conversion
775 Function called before a file is written out, to perform the
776 encoding. Called with two arguments, START and END, denoting
777 a region of the current buffer to be encoded.
780 The following additional properties are recognized if TYPE is 'iso2022:
786 The character set initially designated to the G0 - G3 registers.
787 The value should be one of
789 -- A charset object (designate that character set)
790 -- nil (do not ever use this register)
791 -- t (no character set is initially designated to
792 the register, but may be later on; this automatically
793 sets the corresponding `force-g*-on-output' property)
799 If non-nil, send an explicit designation sequence on output before
800 using the specified register.
803 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
804 "ESC $ B" on output in place of the full designation sequences
805 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
808 If non-nil, don't designate ASCII to G0 at each end of line on output.
809 Setting this to non-nil also suppresses other state-resetting that
810 normally happens at the end of a line.
813 If non-nil, don't designate ASCII to G0 before control chars on output.
816 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
820 If non-nil, use locking-shift (SO/SI) instead of single-shift
821 or designation by escape sequence.
824 If non-nil, don't use ISO6429's direction specification.
827 If non-nil, literal control characters that are the same as
828 the beginning of a recognized ISO2022 or ISO6429 escape sequence
829 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
830 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
831 so that they can be properly distinguished from an escape sequence.
832 (Note that doing this results in a non-portable encoding.) This
833 encoding flag is used for byte-compiled files. Note that ESC
834 is a good choice for a quoting character because there are no
835 escape sequences whose second byte is a character from the Control-0
836 or Control-1 character sets; this is explicitly disallowed by the
839 'input-charset-conversion
840 A list of conversion specifications, specifying conversion of
841 characters in one charset to another when decoding is performed.
842 Each specification is a list of two elements: the source charset,
843 and the destination charset.
845 'output-charset-conversion
846 A list of conversion specifications, specifying conversion of
847 characters in one charset to another when encoding is performed.
848 The form of each specification is the same as for
849 'input-charset-conversion.
852 The following additional properties are recognized (and required)
856 CCL program used for decoding (converting to internal format).
859 CCL program used for encoding (converting to external format).
861 (name, type, doc_string, props))
863 Lisp_Coding_System *codesys;
864 enum coding_system_type ty;
865 int need_to_setup_eol_systems = 1;
867 /* Convert type to constant */
868 if (NILP (type) || EQ (type, Qundecided))
869 { ty = CODESYS_AUTODETECT; }
871 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
872 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
873 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
874 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
875 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
876 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
878 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
880 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
883 signal_simple_error ("Invalid coding system type", type);
887 codesys = allocate_coding_system (ty, name);
889 if (NILP (doc_string))
890 doc_string = build_string ("");
892 CHECK_STRING (doc_string);
893 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
896 EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
898 if (EQ (key, Qmnemonic))
901 CHECK_STRING (value);
902 CODING_SYSTEM_MNEMONIC (codesys) = value;
905 else if (EQ (key, Qeol_type))
907 need_to_setup_eol_systems = NILP (value);
910 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
913 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
914 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
916 else if (ty == CODESYS_ISO2022)
918 #define FROB_INITIAL_CHARSET(charset_num) \
919 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
920 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
922 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
923 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
924 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
925 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
927 #define FROB_FORCE_CHARSET(charset_num) \
928 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
930 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
931 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
932 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
933 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
935 #define FROB_BOOLEAN_PROPERTY(prop) \
936 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
938 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
939 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
940 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
941 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
942 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
943 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
944 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
946 else if (EQ (key, Qinput_charset_conversion))
948 codesys->iso2022.input_conv =
949 Dynarr_new (charset_conversion_spec);
950 parse_charset_conversion_specs (codesys->iso2022.input_conv,
953 else if (EQ (key, Qoutput_charset_conversion))
955 codesys->iso2022.output_conv =
956 Dynarr_new (charset_conversion_spec);
957 parse_charset_conversion_specs (codesys->iso2022.output_conv,
961 signal_simple_error ("Unrecognized property", key);
963 else if (EQ (type, Qccl))
966 struct ccl_program test_ccl;
969 /* Check key first. */
970 if (EQ (key, Qdecode))
971 suffix = "-ccl-decode";
972 else if (EQ (key, Qencode))
973 suffix = "-ccl-encode";
975 signal_simple_error ("Unrecognized property", key);
977 /* If value is vector, register it as a ccl program
978 associated with an newly created symbol for
979 backward compatibility. */
982 sym = Fintern (concat2 (Fsymbol_name (name),
983 build_string (suffix)),
985 Fregister_ccl_program (sym, value);
989 CHECK_SYMBOL (value);
992 /* check if the given ccl programs are valid. */
993 if (setup_ccl_program (&test_ccl, sym) < 0)
994 signal_simple_error ("Invalid CCL program", value);
996 if (EQ (key, Qdecode))
997 CODING_SYSTEM_CCL_DECODE (codesys) = sym;
998 else if (EQ (key, Qencode))
999 CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1004 signal_simple_error ("Unrecognized property", key);
1008 if (need_to_setup_eol_systems)
1009 setup_eol_coding_systems (codesys);
1012 Lisp_Object codesys_obj;
1013 XSETCODING_SYSTEM (codesys_obj, codesys);
1014 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1024 (old_coding_system, new_name))
1026 Lisp_Object new_coding_system;
1027 old_coding_system = Fget_coding_system (old_coding_system);
1028 new_coding_system = Ffind_coding_system (new_name);
1029 if (NILP (new_coding_system))
1031 XSETCODING_SYSTEM (new_coding_system,
1032 allocate_coding_system
1033 (XCODING_SYSTEM_TYPE (old_coding_system),
1035 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1039 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041 memcpy (((char *) to ) + sizeof (to->header),
1042 ((char *) from) + sizeof (from->header),
1043 sizeof (*from) - sizeof (from->header));
1044 to->name = new_name;
1046 return new_coding_system;
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1054 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1064 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1073 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074 if (SYMBOLP (aliasee))
1077 signal_simple_error ("Symbol is not a coding system alias", alias);
1078 return Qnil; /* To keep the compiler happy */
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1084 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1088 /* A maphash function, for removing dangling coding system aliases. */
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091 Lisp_Object aliasee,
1092 void *dangling_aliases)
1094 if (SYMBOLP (aliasee)
1095 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1097 (*(int *) dangling_aliases)++;
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1124 Lisp_Object real_coding_system, probe;
1126 CHECK_SYMBOL (alias);
1128 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1130 ("Symbol is the canonical name of a coding system and cannot be redefined",
1135 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1137 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1139 Fremhash (alias, Vcoding_system_hash_table);
1141 /* Undefine subsidiary aliases,
1142 presumably created by a previous call to this function */
1143 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1145 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1147 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1149 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1152 /* Undefine dangling coding system aliases. */
1154 int dangling_aliases;
1157 dangling_aliases = 0;
1158 elisp_map_remhash (dangling_coding_system_alias_p,
1159 Vcoding_system_hash_table,
1161 } while (dangling_aliases > 0);
1167 if (CODING_SYSTEMP (aliasee))
1168 aliasee = XCODING_SYSTEM_NAME (aliasee);
1170 /* Checks that aliasee names a coding-system */
1171 real_coding_system = Fget_coding_system (aliasee);
1173 /* Check for coding system alias loops */
1174 if (EQ (alias, aliasee))
1175 alias_loop: signal_simple_error_2
1176 ("Attempt to create a coding system alias loop", alias, aliasee);
1178 for (probe = aliasee;
1180 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1182 if (EQ (probe, alias))
1186 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1188 /* Set up aliases for subsidiaries.
1189 #### There must be a better way to handle subsidiary coding systems. */
1191 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1193 for (i = 0; i < countof (suffixes); i++)
1195 Lisp_Object alias_subsidiary =
1196 append_suffix_to_symbol (alias, suffixes[i]);
1197 Lisp_Object aliasee_subsidiary =
1198 append_suffix_to_symbol (aliasee, suffixes[i]);
1200 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1204 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205 but it doesn't look intentional, so I'd rather return something
1206 meaningful or nothing at all. */
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1213 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214 Lisp_Object new_coding_system;
1216 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217 return coding_system;
1221 case EOL_AUTODETECT: return coding_system;
1222 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1223 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1224 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225 default: abort (); return Qnil;
1228 return NILP (new_coding_system) ? coding_system : new_coding_system;
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1234 (coding_system, eol_type))
1236 coding_system = Fget_coding_system (coding_system);
1238 return subsidiary_coding_system (coding_system,
1239 symbol_to_eol_type (eol_type));
1243 /************************************************************************/
1244 /* Coding system accessors */
1245 /************************************************************************/
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1252 coding_system = Fget_coding_system (coding_system);
1253 return XCODING_SYSTEM_DOC_STRING (coding_system);
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1261 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1264 case CODESYS_AUTODETECT: return Qundecided;
1266 case CODESYS_SHIFT_JIS: return Qshift_jis;
1267 case CODESYS_ISO2022: return Qiso2022;
1268 case CODESYS_BIG5: return Qbig5;
1269 case CODESYS_UCS4: return Qucs4;
1270 case CODESYS_UTF8: return Qutf8;
1271 case CODESYS_CCL: return Qccl;
1273 case CODESYS_NO_CONVERSION: return Qno_conversion;
1275 case CODESYS_INTERNAL: return Qinternal;
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1285 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1287 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1294 (coding_system, gnum))
1296 coding_system = Fget_coding_system (coding_system);
1299 return coding_system_charset (coding_system, XINT (gnum));
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1306 (coding_system, prop))
1309 enum coding_system_type type;
1311 coding_system = Fget_coding_system (coding_system);
1312 CHECK_SYMBOL (prop);
1313 type = XCODING_SYSTEM_TYPE (coding_system);
1315 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1319 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1321 case CODESYS_PROP_ALL_OK:
1324 case CODESYS_PROP_ISO2022:
1325 if (type != CODESYS_ISO2022)
1327 ("Property only valid in ISO2022 coding systems",
1331 case CODESYS_PROP_CCL:
1332 if (type != CODESYS_CCL)
1334 ("Property only valid in CCL coding systems",
1344 signal_simple_error ("Unrecognized property", prop);
1346 if (EQ (prop, Qname))
1347 return XCODING_SYSTEM_NAME (coding_system);
1348 else if (EQ (prop, Qtype))
1349 return Fcoding_system_type (coding_system);
1350 else if (EQ (prop, Qdoc_string))
1351 return XCODING_SYSTEM_DOC_STRING (coding_system);
1352 else if (EQ (prop, Qmnemonic))
1353 return XCODING_SYSTEM_MNEMONIC (coding_system);
1354 else if (EQ (prop, Qeol_type))
1355 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356 else if (EQ (prop, Qeol_lf))
1357 return XCODING_SYSTEM_EOL_LF (coding_system);
1358 else if (EQ (prop, Qeol_crlf))
1359 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360 else if (EQ (prop, Qeol_cr))
1361 return XCODING_SYSTEM_EOL_CR (coding_system);
1362 else if (EQ (prop, Qpost_read_conversion))
1363 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364 else if (EQ (prop, Qpre_write_conversion))
1365 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1367 else if (type == CODESYS_ISO2022)
1369 if (EQ (prop, Qcharset_g0))
1370 return coding_system_charset (coding_system, 0);
1371 else if (EQ (prop, Qcharset_g1))
1372 return coding_system_charset (coding_system, 1);
1373 else if (EQ (prop, Qcharset_g2))
1374 return coding_system_charset (coding_system, 2);
1375 else if (EQ (prop, Qcharset_g3))
1376 return coding_system_charset (coding_system, 3);
1378 #define FORCE_CHARSET(charset_num) \
1379 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380 (coding_system, charset_num) ? Qt : Qnil)
1382 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1387 #define LISP_BOOLEAN(prop) \
1388 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1390 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1391 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1392 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1394 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1395 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1396 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1398 else if (EQ (prop, Qinput_charset_conversion))
1400 unparse_charset_conversion_specs
1401 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402 else if (EQ (prop, Qoutput_charset_conversion))
1404 unparse_charset_conversion_specs
1405 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1409 else if (type == CODESYS_CCL)
1411 if (EQ (prop, Qdecode))
1412 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413 else if (EQ (prop, Qencode))
1414 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1422 return Qnil; /* not reached */
1426 /************************************************************************/
1427 /* Coding category functions */
1428 /************************************************************************/
1431 decode_coding_category (Lisp_Object symbol)
1435 CHECK_SYMBOL (symbol);
1436 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437 if (EQ (coding_category_symbol[i], symbol))
1440 signal_simple_error ("Unrecognized coding category", symbol);
1441 return 0; /* not reached */
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1450 Lisp_Object list = Qnil;
1452 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453 list = Fcons (coding_category_symbol[i], list);
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority. Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1466 int category_to_priority[CODING_CATEGORY_LAST];
1470 /* First generate a list that maps coding categories to priorities. */
1472 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473 category_to_priority[i] = -1;
1475 /* Highest priority comes from the specified list. */
1477 EXTERNAL_LIST_LOOP (rest, list)
1479 int cat = decode_coding_category (XCAR (rest));
1481 if (category_to_priority[cat] >= 0)
1482 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483 category_to_priority[cat] = i++;
1486 /* Now go through the existing categories by priority to retrieve
1487 the categories not yet specified and preserve their priority
1489 for (j = 0; j < CODING_CATEGORY_LAST; j++)
1491 int cat = fcd->coding_category_by_priority[j];
1492 if (category_to_priority[cat] < 0)
1493 category_to_priority[cat] = i++;
1496 /* Now we need to construct the inverse of the mapping we just
1499 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1502 /* Phew! That was confusing. */
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1512 Lisp_Object list = Qnil;
1514 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1523 (coding_category, coding_system))
1525 int cat = decode_coding_category (coding_category);
1527 coding_system = Fget_coding_system (coding_system);
1528 fcd->coding_category_system[cat] = coding_system;
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1537 int cat = decode_coding_category (coding_category);
1538 Lisp_Object sys = fcd->coding_category_system[cat];
1541 return XCODING_SYSTEM_NAME (sys);
1546 /************************************************************************/
1547 /* Detecting the encoding of data */
1548 /************************************************************************/
1550 struct detection_state
1552 eol_type_t eol_type;
1588 struct iso2022_decoder iso;
1590 int high_byte_count;
1591 unsigned int saw_single_shift:1;
1604 acceptable_control_char_p (int c)
1608 /* Allow and ignore control characters that you might
1609 reasonably see in a text file */
1614 case 8: /* backspace */
1615 case 11: /* vertical tab */
1616 case 12: /* form feed */
1617 case 26: /* MS-DOS C-z junk */
1618 case 31: /* '^_' -- for info */
1626 mask_has_at_most_one_bit_p (int mask)
1628 /* Perhaps the only thing useful you learn from intensive Microsoft
1629 technical interviews */
1630 return (mask & (mask - 1)) == 0;
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1635 Lstream_data_count n)
1639 unsigned char c = *(unsigned char *)src++;
1642 if (st->eol.just_saw_cr)
1644 else if (st->eol.seen_anything)
1647 else if (st->eol.just_saw_cr)
1650 st->eol.just_saw_cr = 1;
1652 st->eol.just_saw_cr = 0;
1653 st->eol.seen_anything = 1;
1656 return EOL_AUTODETECT;
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660 Before calling this function for the first type, you must initialize
1661 st->eol_type as appropriate and initialize st->mask to ~0.
1663 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1666 st->mask holds the determined coding category mask, or ~0 if only
1667 ASCII has been seen so far.
1671 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1672 is present in st->mask
1673 1 == definitive answers are here for both st->eol_type and st->mask
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678 Lstream_data_count n, int just_do_eol)
1680 if (st->eol_type == EOL_AUTODETECT)
1681 st->eol_type = detect_eol_type (st, src, n);
1684 return st->eol_type != EOL_AUTODETECT;
1686 if (!st->seen_non_ascii)
1688 for (; n; n--, src++)
1690 unsigned char c = *(unsigned char *) src;
1691 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693 st->seen_non_ascii = 1;
1695 st->shift_jis.mask = ~0;
1699 st->iso2022.mask = ~0;
1709 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714 st->big5.mask = detect_coding_big5 (st, src, n);
1715 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716 st->utf8.mask = detect_coding_utf8 (st, src, n);
1717 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1721 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722 | st->utf8.mask | st->ucs4.mask;
1725 int retval = mask_has_at_most_one_bit_p (st->mask);
1726 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727 return retval && st->eol_type != EOL_AUTODETECT;
1732 coding_system_from_mask (int mask)
1736 /* If the file was entirely or basically ASCII, use the
1737 default value of `buffer-file-coding-system'. */
1738 Lisp_Object retval =
1739 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1742 retval = Ffind_coding_system (retval);
1746 (Qbad_variable, Qwarning,
1747 "Invalid `default-buffer-file-coding-system', set to nil");
1748 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1752 retval = Fget_coding_system (Qraw_text);
1760 mask = postprocess_iso2022_mask (mask);
1762 /* Look through the coding categories by priority and find
1763 the first one that is allowed. */
1764 for (i = 0; i < CODING_CATEGORY_LAST; i++)
1766 cat = fcd->coding_category_by_priority[i];
1767 if ((mask & (1 << cat)) &&
1768 !NILP (fcd->coding_category_system[cat]))
1772 return fcd->coding_category_system[cat];
1774 return Fget_coding_system (Qraw_text);
1778 /* Given a seekable read stream and potential coding system and EOL type
1779 as specified, do any autodetection that is called for. If the
1780 coding system and/or EOL type are not `autodetect', they will be left
1781 alone; but this function will never return an autodetect coding system
1784 This function does not automatically fetch subsidiary coding systems;
1785 that should be unnecessary with the explicit eol-type argument. */
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1788 /* number of leading lines to check for a coding cookie */
1789 #define LINES_TO_CHECK 2
1792 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1793 eol_type_t *eol_type_in_out)
1795 struct detection_state decst;
1797 if (*eol_type_in_out == EOL_AUTODETECT)
1798 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1801 decst.eol_type = *eol_type_in_out;
1804 /* If autodetection is called for, do it now. */
1805 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1806 || *eol_type_in_out == EOL_AUTODETECT)
1809 Lisp_Object coding_system = Qnil;
1811 Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1813 int lines_checked = 0;
1815 /* Look for initial "-*-"; mode line prefix */
1817 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1819 && lines_checked < LINES_TO_CHECK;
1821 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1823 Extbyte *local_vars_beg = p + 3;
1824 /* Look for final "-*-"; mode line suffix */
1825 for (p = local_vars_beg,
1826 scan_end = buf + nread - LENGTH ("-*-");
1828 && lines_checked < LINES_TO_CHECK;
1830 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1832 Extbyte *suffix = p;
1833 /* Look for "coding:" */
1834 for (p = local_vars_beg,
1835 scan_end = suffix - LENGTH ("coding:?");
1838 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839 && (p == local_vars_beg
1840 || (*(p-1) == ' ' ||
1846 p += LENGTH ("coding:");
1847 while (*p == ' ' || *p == '\t') p++;
1849 /* Get coding system name */
1850 save = *suffix; *suffix = '\0';
1851 /* Characters valid in a MIME charset name (rfc 1521),
1852 and in a Lisp symbol name. */
1853 n = strspn ( (char *) p,
1854 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855 "abcdefghijklmnopqrstuvwxyz"
1861 save = p[n]; p[n] = '\0';
1863 Ffind_coding_system (intern ((char *) p));
1870 /* #### file must use standard EOLs or we miss 2d line */
1871 /* #### not to mention this is broken for UTF-16 DOS files */
1872 else if (*p == '\n' || *p == '\r')
1875 /* skip past multibyte (DOS) newline */
1876 if (*p == '\r' && *(p+1) == '\n') p++;
1880 /* #### file must use standard EOLs or we miss 2d line */
1881 /* #### not to mention this is broken for UTF-16 DOS files */
1882 else if (*p == '\n' || *p == '\r')
1885 /* skip past multibyte (DOS) newline */
1886 if (*p == '\r' && *(p+1) == '\n') p++;
1889 if (NILP (coding_system))
1892 if (detect_coding_type (&decst, buf, nread,
1893 XCODING_SYSTEM_TYPE (*codesys_in_out)
1894 != CODESYS_AUTODETECT))
1896 nread = Lstream_read (stream, buf, sizeof (buf));
1902 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1903 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1906 if (detect_coding_type (&decst, buf, nread, 1))
1908 nread = Lstream_read (stream, buf, sizeof (buf));
1914 *eol_type_in_out = decst.eol_type;
1915 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1917 if (NILP (coding_system))
1918 *codesys_in_out = coding_system_from_mask (decst.mask);
1920 *codesys_in_out = coding_system;
1924 /* If we absolutely can't determine the EOL type, just assume LF. */
1925 if (*eol_type_in_out == EOL_AUTODETECT)
1926 *eol_type_in_out = EOL_LF;
1928 Lstream_rewind (stream);
1931 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1932 Detect coding system of the text in the region between START and END.
1933 Return a list of possible coding systems ordered by priority.
1934 If only ASCII characters are found, return 'undecided or one of
1935 its subsidiary coding systems according to a detected end-of-line
1936 type. Optional arg BUFFER defaults to the current buffer.
1938 (start, end, buffer))
1940 Lisp_Object val = Qnil;
1941 struct buffer *buf = decode_buffer (buffer, 0);
1943 Lisp_Object instream, lb_instream;
1944 Lstream *istr, *lb_istr;
1945 struct detection_state decst;
1946 struct gcpro gcpro1, gcpro2;
1948 get_buffer_range_char (buf, start, end, &b, &e, 0);
1949 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1950 lb_istr = XLSTREAM (lb_instream);
1951 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1952 istr = XLSTREAM (instream);
1953 GCPRO2 (instream, lb_instream);
1955 decst.eol_type = EOL_AUTODETECT;
1959 Extbyte random_buffer[4096];
1960 Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1964 if (detect_coding_type (&decst, random_buffer, nread, 0))
1968 if (decst.mask == ~0)
1969 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1977 decst.mask = postprocess_iso2022_mask (decst.mask);
1979 for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1981 int sys = fcd->coding_category_by_priority[i];
1982 if (decst.mask & (1 << sys))
1984 Lisp_Object codesys = fcd->coding_category_system[sys];
1985 if (!NILP (codesys))
1986 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1987 val = Fcons (codesys, val);
1991 Lstream_close (istr);
1993 Lstream_delete (istr);
1994 Lstream_delete (lb_istr);
1999 /************************************************************************/
2000 /* Converting to internal Mule format ("decoding") */
2001 /************************************************************************/
2003 /* A decoding stream is a stream used for decoding text (i.e.
2004 converting from some external format to internal format).
2005 The decoding-stream object keeps track of the actual coding
2006 stream, the stream that is at the other end, and data that
2007 needs to be persistent across the lifetime of the stream. */
2009 /* Handle the EOL stuff related to just-read-in character C.
2010 EOL_TYPE is the EOL type of the coding stream.
2011 FLAGS is the current value of FLAGS in the coding stream, and may
2012 be modified by this macro. (The macro only looks at the
2013 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
2014 bytes are to be written. You need to also define a local goto
2015 label "label_continue_loop" that is at the end of the main
2016 character-reading loop.
2018 If C is a CR character, then this macro handles it entirely and
2019 jumps to label_continue_loop. Otherwise, this macro does not add
2020 anything to DST, and continues normally. You should continue
2021 processing C normally after this macro. */
2023 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2027 if (eol_type == EOL_CR) \
2028 Dynarr_add (dst, '\n'); \
2029 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2030 Dynarr_add (dst, c); \
2032 flags |= CODING_STATE_CR; \
2033 goto label_continue_loop; \
2035 else if (flags & CODING_STATE_CR) \
2036 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2038 Dynarr_add (dst, '\r'); \
2039 flags &= ~CODING_STATE_CR; \
2043 /* C should be a binary character in the range 0 - 255; convert
2044 to internal format and add to Dynarr DST. */
2046 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2048 if (BYTE_ASCII_P (c)) \
2049 Dynarr_add (dst, c); \
2050 else if (BYTE_C1_P (c)) \
2052 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2053 Dynarr_add (dst, c + 0x20); \
2057 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2058 Dynarr_add (dst, c); \
2062 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2066 DECODE_ADD_BINARY_CHAR (ch, dst); \
2071 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2073 if (flags & CODING_STATE_END) \
2075 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2076 if (flags & CODING_STATE_CR) \
2077 Dynarr_add (dst, '\r'); \
2081 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2083 struct decoding_stream
2085 /* Coding system that governs the conversion. */
2086 Lisp_Coding_System *codesys;
2088 /* Stream that we read the encoded data from or
2089 write the decoded data to. */
2092 /* If we are reading, then we can return only a fixed amount of
2093 data, so if the conversion resulted in too much data, we store it
2094 here for retrieval the next time around. */
2095 unsigned_char_dynarr *runoff;
2097 /* FLAGS holds flags indicating the current state of the decoding.
2098 Some of these flags are dependent on the coding system. */
2101 /* CH holds a partially built-up character. Since we only deal
2102 with one- and two-byte characters at the moment, we only use
2103 this to store the first byte of a two-byte character. */
2106 /* EOL_TYPE specifies the type of end-of-line conversion that
2107 currently applies. We need to keep this separate from the
2108 EOL type stored in CODESYS because the latter might indicate
2109 automatic EOL-type detection while the former will always
2110 indicate a particular EOL type. */
2111 eol_type_t eol_type;
2113 /* Additional ISO2022 information. We define the structure above
2114 because it's also needed by the detection routines. */
2115 struct iso2022_decoder iso2022;
2117 /* Additional information (the state of the running CCL program)
2118 used by the CCL decoder. */
2119 struct ccl_program ccl;
2121 /* counter for UTF-8 or UCS-4 */
2122 unsigned char counter;
2124 struct detection_state decst;
2127 static Lstream_data_count decoding_reader (Lstream *stream,
2128 unsigned char *data, Lstream_data_count size);
2129 static Lstream_data_count decoding_writer (Lstream *stream,
2130 const unsigned char *data, Lstream_data_count size);
2131 static int decoding_rewinder (Lstream *stream);
2132 static int decoding_seekable_p (Lstream *stream);
2133 static int decoding_flusher (Lstream *stream);
2134 static int decoding_closer (Lstream *stream);
2136 static Lisp_Object decoding_marker (Lisp_Object stream);
2138 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2139 sizeof (struct decoding_stream));
2142 decoding_marker (Lisp_Object stream)
2144 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2145 Lisp_Object str_obj;
2147 /* We do not need to mark the coding systems or charsets stored
2148 within the stream because they are stored in a global list
2149 and automatically marked. */
2151 XSETLSTREAM (str_obj, str);
2152 mark_object (str_obj);
2153 if (str->imp->marker)
2154 return (str->imp->marker) (str_obj);
2159 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2160 so we read data from the other end, decode it, and store it into DATA. */
2162 static Lstream_data_count
2163 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2165 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2166 unsigned char *orig_data = data;
2167 Lstream_data_count read_size;
2168 int error_occurred = 0;
2170 /* We need to interface to mule_decode(), which expects to take some
2171 amount of data and store the result into a Dynarr. We have
2172 mule_decode() store into str->runoff, and take data from there
2175 /* We loop until we have enough data, reading chunks from the other
2176 end and decoding it. */
2179 /* Take data from the runoff if we can. Make sure to take at
2180 most SIZE bytes, and delete the data from the runoff. */
2181 if (Dynarr_length (str->runoff) > 0)
2183 Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2184 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2185 Dynarr_delete_many (str->runoff, 0, chunk);
2191 break; /* No more room for data */
2193 if (str->flags & CODING_STATE_END)
2194 /* This means that on the previous iteration, we hit the EOF on
2195 the other end. We loop once more so that mule_decode() can
2196 output any final stuff it may be holding, or any "go back
2197 to a sane state" escape sequences. (This latter makes sense
2198 during encoding.) */
2201 /* Exhausted the runoff, so get some more. DATA has at least
2202 SIZE bytes left of storage in it, so it's OK to read directly
2203 into it. (We'll be overwriting above, after we've decoded it
2204 into the runoff.) */
2205 read_size = Lstream_read (str->other_end, data, size);
2212 /* There might be some more end data produced in the translation.
2213 See the comment above. */
2214 str->flags |= CODING_STATE_END;
2215 mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2218 if (data - orig_data == 0)
2219 return error_occurred ? -1 : 0;
2221 return data - orig_data;
2224 static Lstream_data_count
2225 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2227 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2228 Lstream_data_count retval;
2230 /* Decode all our data into the runoff, and then attempt to write
2231 it all out to the other end. Remove whatever chunk we succeeded
2233 mule_decode (stream, (Extbyte *) data, str->runoff, size);
2234 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2235 Dynarr_length (str->runoff));
2237 Dynarr_delete_many (str->runoff, 0, retval);
2238 /* Do NOT return retval. The return value indicates how much
2239 of the incoming data was written, not how many bytes were
2245 reset_decoding_stream (struct decoding_stream *str)
2248 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2250 Lisp_Object coding_system;
2251 XSETCODING_SYSTEM (coding_system, str->codesys);
2252 reset_iso2022 (coding_system, &str->iso2022);
2254 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2256 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2260 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2261 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2264 str->decst.eol_type = EOL_AUTODETECT;
2265 str->decst.mask = ~0;
2267 str->flags = str->ch = 0;
2271 decoding_rewinder (Lstream *stream)
2273 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2274 reset_decoding_stream (str);
2275 Dynarr_reset (str->runoff);
2276 return Lstream_rewind (str->other_end);
2280 decoding_seekable_p (Lstream *stream)
2282 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2283 return Lstream_seekable_p (str->other_end);
2287 decoding_flusher (Lstream *stream)
2289 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2290 return Lstream_flush (str->other_end);
2294 decoding_closer (Lstream *stream)
2296 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2297 if (stream->flags & LSTREAM_FL_WRITE)
2299 str->flags |= CODING_STATE_END;
2300 decoding_writer (stream, 0, 0);
2302 Dynarr_free (str->runoff);
2304 #ifdef ENABLE_COMPOSITE_CHARS
2305 if (str->iso2022.composite_chars)
2306 Dynarr_free (str->iso2022.composite_chars);
2309 return Lstream_close (str->other_end);
2313 decoding_stream_coding_system (Lstream *stream)
2315 Lisp_Object coding_system;
2316 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2318 XSETCODING_SYSTEM (coding_system, str->codesys);
2319 return subsidiary_coding_system (coding_system, str->eol_type);
2323 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2325 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2326 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2328 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2329 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2330 reset_decoding_stream (str);
2333 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2334 stream for writing, no automatic code detection will be performed.
2335 The reason for this is that automatic code detection requires a
2336 seekable input. Things will also fail if you open a decoding
2337 stream for reading using a non-fully-specified coding system and
2338 a non-seekable input stream. */
2341 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2344 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2345 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2349 str->other_end = stream;
2350 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2351 str->eol_type = EOL_AUTODETECT;
2352 if (!strcmp (mode, "r")
2353 && Lstream_seekable_p (stream))
2354 /* We can determine the coding system now. */
2355 determine_real_coding_system (stream, &codesys, &str->eol_type);
2356 set_decoding_stream_coding_system (lstr, codesys);
2357 str->decst.eol_type = str->eol_type;
2358 str->decst.mask = ~0;
2359 XSETLSTREAM (obj, lstr);
2364 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2366 return make_decoding_stream_1 (stream, codesys, "r");
2370 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2372 return make_decoding_stream_1 (stream, codesys, "w");
2375 /* Note: the decode_coding_* functions all take the same
2376 arguments as mule_decode(), which is to say some SRC data of
2377 size N, which is to be stored into dynamic array DST.
2378 DECODING is the stream within which the decoding is
2379 taking place, but no data is actually read from or
2380 written to that stream; that is handled in decoding_reader()
2381 or decoding_writer(). This allows the same functions to
2382 be used for both reading and writing. */
2385 mule_decode (Lstream *decoding, const Extbyte *src,
2386 unsigned_char_dynarr *dst, Lstream_data_count n)
2388 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2390 /* If necessary, do encoding-detection now. We do this when
2391 we're a writing stream or a non-seekable reading stream,
2392 meaning that we can't just process the whole input,
2393 rewind, and start over. */
2395 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2396 str->eol_type == EOL_AUTODETECT)
2398 Lisp_Object codesys;
2400 XSETCODING_SYSTEM (codesys, str->codesys);
2401 detect_coding_type (&str->decst, src, n,
2402 CODING_SYSTEM_TYPE (str->codesys) !=
2403 CODESYS_AUTODETECT);
2404 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2405 str->decst.mask != ~0)
2406 /* #### This is cheesy. What we really ought to do is
2407 buffer up a certain amount of data so as to get a
2408 less random result. */
2409 codesys = coding_system_from_mask (str->decst.mask);
2410 str->eol_type = str->decst.eol_type;
2411 if (XCODING_SYSTEM (codesys) != str->codesys)
2413 /* Preserve the CODING_STATE_END flag in case it was set.
2414 If we erase it, bad things might happen. */
2415 int was_end = str->flags & CODING_STATE_END;
2416 set_decoding_stream_coding_system (decoding, codesys);
2418 str->flags |= CODING_STATE_END;
2422 switch (CODING_SYSTEM_TYPE (str->codesys))
2425 case CODESYS_INTERNAL:
2426 Dynarr_add_many (dst, src, n);
2429 case CODESYS_AUTODETECT:
2430 /* If we got this far and still haven't decided on the coding
2431 system, then do no conversion. */
2432 case CODESYS_NO_CONVERSION:
2433 decode_coding_no_conversion (decoding, src, dst, n);
2436 case CODESYS_SHIFT_JIS:
2437 decode_coding_sjis (decoding, src, dst, n);
2440 decode_coding_big5 (decoding, src, dst, n);
2443 decode_coding_ucs4 (decoding, src, dst, n);
2446 decode_coding_utf8 (decoding, src, dst, n);
2449 str->ccl.last_block = str->flags & CODING_STATE_END;
2450 /* When applying ccl program to stream, MUST NOT set NULL
2452 ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2453 dst, n, 0, CCL_MODE_DECODING);
2455 case CODESYS_ISO2022:
2456 decode_coding_iso2022 (decoding, src, dst, n);
2464 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2465 Decode the text between START and END which is encoded in CODING-SYSTEM.
2466 This is useful if you've read in encoded text from a file without decoding
2467 it (e.g. you read in a JIS-formatted file but used the `binary' or
2468 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2469 Return length of decoded text.
2470 BUFFER defaults to the current buffer if unspecified.
2472 (start, end, coding_system, buffer))
2475 struct buffer *buf = decode_buffer (buffer, 0);
2476 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2477 Lstream *istr, *ostr;
2478 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2480 get_buffer_range_char (buf, start, end, &b, &e, 0);
2482 barf_if_buffer_read_only (buf, b, e);
2484 coding_system = Fget_coding_system (coding_system);
2485 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2486 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2487 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2489 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2490 Fget_coding_system (Qbinary));
2491 istr = XLSTREAM (instream);
2492 ostr = XLSTREAM (outstream);
2493 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2495 /* The chain of streams looks like this:
2497 [BUFFER] <----- send through
2498 ------> [ENCODE AS BINARY]
2499 ------> [DECODE AS SPECIFIED]
2505 char tempbuf[1024]; /* some random amount */
2506 Bufpos newpos, even_newer_pos;
2507 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2508 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2512 newpos = lisp_buffer_stream_startpos (istr);
2513 Lstream_write (ostr, tempbuf, size_in_bytes);
2514 even_newer_pos = lisp_buffer_stream_startpos (istr);
2515 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2518 Lstream_close (istr);
2519 Lstream_close (ostr);
2521 Lstream_delete (istr);
2522 Lstream_delete (ostr);
2523 Lstream_delete (XLSTREAM (de_outstream));
2524 Lstream_delete (XLSTREAM (lb_outstream));
2529 /************************************************************************/
2530 /* Converting to an external encoding ("encoding") */
2531 /************************************************************************/
2533 /* An encoding stream is an output stream. When you create the
2534 stream, you specify the coding system that governs the encoding
2535 and another stream that the resulting encoded data is to be
2536 sent to, and then start sending data to it. */
2538 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2540 struct encoding_stream
2542 /* Coding system that governs the conversion. */
2543 Lisp_Coding_System *codesys;
2545 /* Stream that we read the encoded data from or
2546 write the decoded data to. */
2549 /* If we are reading, then we can return only a fixed amount of
2550 data, so if the conversion resulted in too much data, we store it
2551 here for retrieval the next time around. */
2552 unsigned_char_dynarr *runoff;
2554 /* FLAGS holds flags indicating the current state of the encoding.
2555 Some of these flags are dependent on the coding system. */
2558 /* CH holds a partially built-up character. Since we only deal
2559 with one- and two-byte characters at the moment, we only use
2560 this to store the first byte of a two-byte character. */
2563 /* Additional information used by the ISO2022 encoder. */
2566 /* CHARSET holds the character sets currently assigned to the G0
2567 through G3 registers. It is initialized from the array
2568 INITIAL_CHARSET in CODESYS. */
2569 Lisp_Object charset[4];
2571 /* Which registers are currently invoked into the left (GL) and
2572 right (GR) halves of the 8-bit encoding space? */
2573 int register_left, register_right;
2575 /* Whether we need to explicitly designate the charset in the
2576 G? register before using it. It is initialized from the
2577 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2578 unsigned char force_charset_on_output[4];
2580 /* Other state variables that need to be preserved across
2582 Lisp_Object current_charset;
2584 int current_char_boundary;
2587 /* Additional information (the state of the running CCL program)
2588 used by the CCL encoder. */
2589 struct ccl_program ccl;
2593 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2594 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2595 Lstream_data_count size);
2596 static int encoding_rewinder (Lstream *stream);
2597 static int encoding_seekable_p (Lstream *stream);
2598 static int encoding_flusher (Lstream *stream);
2599 static int encoding_closer (Lstream *stream);
2601 static Lisp_Object encoding_marker (Lisp_Object stream);
2603 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2604 sizeof (struct encoding_stream));
2607 encoding_marker (Lisp_Object stream)
2609 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2610 Lisp_Object str_obj;
2612 /* We do not need to mark the coding systems or charsets stored
2613 within the stream because they are stored in a global list
2614 and automatically marked. */
2616 XSETLSTREAM (str_obj, str);
2617 mark_object (str_obj);
2618 if (str->imp->marker)
2619 return (str->imp->marker) (str_obj);
2624 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2625 so we read data from the other end, encode it, and store it into DATA. */
2627 static Lstream_data_count
2628 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2631 unsigned char *orig_data = data;
2632 Lstream_data_count read_size;
2633 int error_occurred = 0;
2635 /* We need to interface to mule_encode(), which expects to take some
2636 amount of data and store the result into a Dynarr. We have
2637 mule_encode() store into str->runoff, and take data from there
2640 /* We loop until we have enough data, reading chunks from the other
2641 end and encoding it. */
2644 /* Take data from the runoff if we can. Make sure to take at
2645 most SIZE bytes, and delete the data from the runoff. */
2646 if (Dynarr_length (str->runoff) > 0)
2648 int chunk = min ((int) size, Dynarr_length (str->runoff));
2649 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2650 Dynarr_delete_many (str->runoff, 0, chunk);
2656 break; /* No more room for data */
2658 if (str->flags & CODING_STATE_END)
2659 /* This means that on the previous iteration, we hit the EOF on
2660 the other end. We loop once more so that mule_encode() can
2661 output any final stuff it may be holding, or any "go back
2662 to a sane state" escape sequences. (This latter makes sense
2663 during encoding.) */
2666 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2667 left of storage in it, so it's OK to read directly into it.
2668 (We'll be overwriting above, after we've encoded it into the
2670 read_size = Lstream_read (str->other_end, data, size);
2677 /* There might be some more end data produced in the translation.
2678 See the comment above. */
2679 str->flags |= CODING_STATE_END;
2680 mule_encode (stream, data, str->runoff, read_size);
2683 if (data == orig_data)
2684 return error_occurred ? -1 : 0;
2686 return data - orig_data;
2689 static Lstream_data_count
2690 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2692 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2693 Lstream_data_count retval;
2695 /* Encode all our data into the runoff, and then attempt to write
2696 it all out to the other end. Remove whatever chunk we succeeded
2698 mule_encode (stream, data, str->runoff, size);
2699 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2700 Dynarr_length (str->runoff));
2702 Dynarr_delete_many (str->runoff, 0, retval);
2703 /* Do NOT return retval. The return value indicates how much
2704 of the incoming data was written, not how many bytes were
2710 reset_encoding_stream (struct encoding_stream *str)
2713 switch (CODING_SYSTEM_TYPE (str->codesys))
2715 case CODESYS_ISO2022:
2719 for (i = 0; i < 4; i++)
2721 str->iso2022.charset[i] =
2722 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2723 str->iso2022.force_charset_on_output[i] =
2724 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2726 str->iso2022.register_left = 0;
2727 str->iso2022.register_right = 1;
2728 str->iso2022.current_charset = Qnil;
2729 str->iso2022.current_half = 0;
2730 str->iso2022.current_char_boundary = 1;
2734 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2741 str->flags = str->ch = 0;
2745 encoding_rewinder (Lstream *stream)
2747 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2748 reset_encoding_stream (str);
2749 Dynarr_reset (str->runoff);
2750 return Lstream_rewind (str->other_end);
2754 encoding_seekable_p (Lstream *stream)
2756 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2757 return Lstream_seekable_p (str->other_end);
2761 encoding_flusher (Lstream *stream)
2763 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2764 return Lstream_flush (str->other_end);
2768 encoding_closer (Lstream *stream)
2770 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2771 if (stream->flags & LSTREAM_FL_WRITE)
2773 str->flags |= CODING_STATE_END;
2774 encoding_writer (stream, 0, 0);
2776 Dynarr_free (str->runoff);
2777 return Lstream_close (str->other_end);
2781 encoding_stream_coding_system (Lstream *stream)
2783 Lisp_Object coding_system;
2784 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2786 XSETCODING_SYSTEM (coding_system, str->codesys);
2787 return coding_system;
2791 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2793 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2794 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2796 reset_encoding_stream (str);
2800 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2803 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2804 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2808 str->runoff = Dynarr_new (unsigned_char);
2809 str->other_end = stream;
2810 set_encoding_stream_coding_system (lstr, codesys);
2811 XSETLSTREAM (obj, lstr);
2816 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2818 return make_encoding_stream_1 (stream, codesys, "r");
2822 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2824 return make_encoding_stream_1 (stream, codesys, "w");
2827 /* Convert N bytes of internally-formatted data stored in SRC to an
2828 external format, according to the encoding stream ENCODING.
2829 Store the encoded data into DST. */
2832 mule_encode (Lstream *encoding, const Bufbyte *src,
2833 unsigned_char_dynarr *dst, Lstream_data_count n)
2835 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2837 switch (CODING_SYSTEM_TYPE (str->codesys))
2840 case CODESYS_INTERNAL:
2841 Dynarr_add_many (dst, src, n);
2844 case CODESYS_AUTODETECT:
2845 /* If we got this far and still haven't decided on the coding
2846 system, then do no conversion. */
2847 case CODESYS_NO_CONVERSION:
2848 encode_coding_no_conversion (encoding, src, dst, n);
2851 case CODESYS_SHIFT_JIS:
2852 encode_coding_sjis (encoding, src, dst, n);
2855 encode_coding_big5 (encoding, src, dst, n);
2858 encode_coding_ucs4 (encoding, src, dst, n);
2861 encode_coding_utf8 (encoding, src, dst, n);
2864 str->ccl.last_block = str->flags & CODING_STATE_END;
2865 /* When applying ccl program to stream, MUST NOT set NULL
2867 ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2868 dst, n, 0, CCL_MODE_ENCODING);
2870 case CODESYS_ISO2022:
2871 encode_coding_iso2022 (encoding, src, dst, n);
2879 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2880 Encode the text between START and END using CODING-SYSTEM.
2881 This will, for example, convert Japanese characters into stuff such as
2882 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2883 text. BUFFER defaults to the current buffer if unspecified.
2885 (start, end, coding_system, buffer))
2888 struct buffer *buf = decode_buffer (buffer, 0);
2889 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2890 Lstream *istr, *ostr;
2891 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2893 get_buffer_range_char (buf, start, end, &b, &e, 0);
2895 barf_if_buffer_read_only (buf, b, e);
2897 coding_system = Fget_coding_system (coding_system);
2898 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2899 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2900 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2901 Fget_coding_system (Qbinary));
2902 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2904 istr = XLSTREAM (instream);
2905 ostr = XLSTREAM (outstream);
2906 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2907 /* The chain of streams looks like this:
2909 [BUFFER] <----- send through
2910 ------> [ENCODE AS SPECIFIED]
2911 ------> [DECODE AS BINARY]
2916 char tempbuf[1024]; /* some random amount */
2917 Bufpos newpos, even_newer_pos;
2918 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2919 Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2923 newpos = lisp_buffer_stream_startpos (istr);
2924 Lstream_write (ostr, tempbuf, size_in_bytes);
2925 even_newer_pos = lisp_buffer_stream_startpos (istr);
2926 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2932 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2933 Lstream_close (istr);
2934 Lstream_close (ostr);
2936 Lstream_delete (istr);
2937 Lstream_delete (ostr);
2938 Lstream_delete (XLSTREAM (de_outstream));
2939 Lstream_delete (XLSTREAM (lb_outstream));
2940 return make_int (retlen);
2946 /************************************************************************/
2947 /* Shift-JIS methods */
2948 /************************************************************************/
2950 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2951 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2952 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2953 encoded by "position-code + 0x80". A character of JISX0208
2954 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2955 position-codes are divided and shifted so that it fit in the range
2958 --- CODE RANGE of Shift-JIS ---
2959 (character set) (range)
2961 JISX0201-Kana 0xA0 .. 0xDF
2962 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2963 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2964 -------------------------------
2968 /* Is this the first byte of a Shift-JIS two-byte char? */
2970 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2971 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2973 /* Is this the second byte of a Shift-JIS two-byte char? */
2975 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2976 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2978 #define BYTE_SJIS_KATAKANA_P(c) \
2979 ((c) >= 0xA1 && (c) <= 0xDF)
2982 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
2986 unsigned char c = *(unsigned char *)src++;
2987 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2989 if (st->shift_jis.in_second_byte)
2991 st->shift_jis.in_second_byte = 0;
2995 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2996 st->shift_jis.in_second_byte = 1;
2998 return CODING_CATEGORY_SHIFT_JIS_MASK;
3001 /* Convert Shift-JIS data to internal format. */
3004 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3005 unsigned_char_dynarr *dst, Lstream_data_count n)
3007 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3008 unsigned int flags = str->flags;
3009 unsigned int ch = str->ch;
3010 eol_type_t eol_type = str->eol_type;
3014 unsigned char c = *(unsigned char *)src++;
3018 /* Previous character was first byte of Shift-JIS Kanji char. */
3019 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3021 unsigned char e1, e2;
3023 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3024 DECODE_SJIS (ch, c, e1, e2);
3025 Dynarr_add (dst, e1);
3026 Dynarr_add (dst, e2);
3030 DECODE_ADD_BINARY_CHAR (ch, dst);
3031 DECODE_ADD_BINARY_CHAR (c, dst);
3037 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3038 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3040 else if (BYTE_SJIS_KATAKANA_P (c))
3042 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3043 Dynarr_add (dst, c);
3046 DECODE_ADD_BINARY_CHAR (c, dst);
3048 label_continue_loop:;
3051 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3057 /* Convert internally-formatted data to Shift-JIS. */
3060 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3061 unsigned_char_dynarr *dst, Lstream_data_count n)
3063 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3064 unsigned int flags = str->flags;
3065 unsigned int ch = str->ch;
3066 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3073 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3074 Dynarr_add (dst, '\r');
3075 if (eol_type != EOL_CR)
3076 Dynarr_add (dst, '\n');
3079 else if (BYTE_ASCII_P (c))
3081 Dynarr_add (dst, c);
3084 else if (BUFBYTE_LEADING_BYTE_P (c))
3085 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3086 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3087 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3090 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3092 Dynarr_add (dst, c);
3095 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3096 ch == LEADING_BYTE_JAPANESE_JISX0208)
3100 unsigned char j1, j2;
3101 ENCODE_SJIS (ch, c, j1, j2);
3102 Dynarr_add (dst, j1);
3103 Dynarr_add (dst, j2);
3113 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3114 Decode a JISX0208 character of Shift-JIS coding-system.
3115 CODE is the character code in Shift-JIS as a cons of type bytes.
3116 Return the corresponding character.
3120 unsigned char c1, c2, s1, s2;
3123 CHECK_INT (XCAR (code));
3124 CHECK_INT (XCDR (code));
3125 s1 = XINT (XCAR (code));
3126 s2 = XINT (XCDR (code));
3127 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3128 BYTE_SJIS_TWO_BYTE_2_P (s2))
3130 DECODE_SJIS (s1, s2, c1, c2);
3131 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3132 c1 & 0x7F, c2 & 0x7F));
3138 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3139 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3140 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3144 Lisp_Object charset;
3147 CHECK_CHAR_COERCE_INT (character);
3148 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3149 if (EQ (charset, Vcharset_japanese_jisx0208))
3151 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3152 return Fcons (make_int (s1), make_int (s2));
3159 /************************************************************************/
3161 /************************************************************************/
3163 /* BIG5 is a coding system encoding two character sets: ASCII and
3164 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3165 character set and is encoded in two-byte.
3167 --- CODE RANGE of BIG5 ---
3168 (character set) (range)
3170 Big5 (1st byte) 0xA1 .. 0xFE
3171 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3172 --------------------------
3174 Since the number of characters in Big5 is larger than maximum
3175 characters in Emacs' charset (96x96), it can't be handled as one
3176 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3177 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3178 contains frequently used characters and the latter contains less
3179 frequently used characters. */
3181 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3182 ((c) >= 0xA1 && (c) <= 0xFE)
3184 /* Is this the second byte of a Shift-JIS two-byte char? */
3186 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3187 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3189 /* Number of Big5 characters which have the same code in 1st byte. */
3191 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3193 /* Code conversion macros. These are macros because they are used in
3194 inner loops during code conversion.
3196 Note that temporary variables in macros introduce the classic
3197 dynamic-scoping problems with variable names. We use capital-
3198 lettered variables in the assumption that XEmacs does not use
3199 capital letters in variables except in a very formalized way
3202 /* Convert Big5 code (b1, b2) into its internal string representation
3205 /* There is a much simpler way to split the Big5 charset into two.
3206 For the moment I'm going to leave the algorithm as-is because it
3207 claims to separate out the most-used characters into a single
3208 charset, which perhaps will lead to optimizations in various
3211 The way the algorithm works is something like this:
3213 Big5 can be viewed as a 94x157 charset, where the row is
3214 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3215 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3216 the split between low and high column numbers is apparently
3217 meaningless; ascending rows produce less and less frequent chars.
3218 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3219 the first charset, and the upper half (0xC9 .. 0xFE) to the
3220 second. To do the conversion, we convert the character into
3221 a single number where 0 .. 156 is the first row, 157 .. 313
3222 is the second, etc. That way, the characters are ordered by
3223 decreasing frequency. Then we just chop the space in two
3224 and coerce the result into a 94x94 space.
3227 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3229 int B1 = b1, B2 = b2; \
3231 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3235 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3239 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3240 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3242 c1 = I / (0xFF - 0xA1) + 0xA1; \
3243 c2 = I % (0xFF - 0xA1) + 0xA1; \
3246 /* Convert the internal string representation of a Big5 character
3247 (lb, c1, c2) into Big5 code (b1, b2). */
3249 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3251 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3253 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3255 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3257 b1 = I / BIG5_SAME_ROW + 0xA1; \
3258 b2 = I % BIG5_SAME_ROW; \
3259 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3263 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3267 unsigned char c = *(unsigned char *)src++;
3268 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3269 (c >= 0x80 && c <= 0xA0))
3271 if (st->big5.in_second_byte)
3273 st->big5.in_second_byte = 0;
3274 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3278 st->big5.in_second_byte = 1;
3280 return CODING_CATEGORY_BIG5_MASK;
3283 /* Convert Big5 data to internal format. */
3286 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3287 unsigned_char_dynarr *dst, Lstream_data_count n)
3289 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3290 unsigned int flags = str->flags;
3291 unsigned int ch = str->ch;
3292 eol_type_t eol_type = str->eol_type;
3296 unsigned char c = *(unsigned char *)src++;
3299 /* Previous character was first byte of Big5 char. */
3300 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3302 unsigned char b1, b2, b3;
3303 DECODE_BIG5 (ch, c, b1, b2, b3);
3304 Dynarr_add (dst, b1);
3305 Dynarr_add (dst, b2);
3306 Dynarr_add (dst, b3);
3310 DECODE_ADD_BINARY_CHAR (ch, dst);
3311 DECODE_ADD_BINARY_CHAR (c, dst);
3317 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3318 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3321 DECODE_ADD_BINARY_CHAR (c, dst);
3323 label_continue_loop:;
3326 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3332 /* Convert internally-formatted data to Big5. */
3335 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3336 unsigned_char_dynarr *dst, Lstream_data_count n)
3339 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3340 unsigned int flags = str->flags;
3341 unsigned int ch = str->ch;
3342 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3349 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3350 Dynarr_add (dst, '\r');
3351 if (eol_type != EOL_CR)
3352 Dynarr_add (dst, '\n');
3354 else if (BYTE_ASCII_P (c))
3357 Dynarr_add (dst, c);
3359 else if (BUFBYTE_LEADING_BYTE_P (c))
3361 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3362 c == LEADING_BYTE_CHINESE_BIG5_2)
3364 /* A recognized leading byte. */
3366 continue; /* not done with this character. */
3368 /* otherwise just ignore this character. */
3370 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3371 ch == LEADING_BYTE_CHINESE_BIG5_2)
3373 /* Previous char was a recognized leading byte. */
3375 continue; /* not done with this character. */
3379 /* Encountering second byte of a Big5 character. */
3380 unsigned char b1, b2;
3382 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3383 Dynarr_add (dst, b1);
3384 Dynarr_add (dst, b2);
3395 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3396 Decode a Big5 character CODE of BIG5 coding-system.
3397 CODE is the character code in BIG5, a cons of two integers.
3398 Return the corresponding character.
3402 unsigned char c1, c2, b1, b2;
3405 CHECK_INT (XCAR (code));
3406 CHECK_INT (XCDR (code));
3407 b1 = XINT (XCAR (code));
3408 b2 = XINT (XCDR (code));
3409 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3410 BYTE_BIG5_TWO_BYTE_2_P (b2))
3413 Lisp_Object charset;
3414 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3415 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3416 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3422 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3423 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3424 Return the corresponding character code in Big5.
3428 Lisp_Object charset;
3431 CHECK_CHAR_COERCE_INT (character);
3432 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3433 if (EQ (charset, Vcharset_chinese_big5_1) ||
3434 EQ (charset, Vcharset_chinese_big5_2))
3436 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3438 return Fcons (make_int (b1), make_int (b2));
3445 /************************************************************************/
3448 /* UCS-4 character codes are implemented as nonnegative integers. */
3450 /************************************************************************/
3453 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3454 Map UCS-4 code CODE to Mule character CHARACTER.
3456 Return T on success, NIL on failure.
3462 CHECK_CHAR (character);
3463 CHECK_NATNUM (code);
3466 if (c < countof (fcd->ucs_to_mule_table))
3468 fcd->ucs_to_mule_table[c] = character;
3476 ucs_to_char (unsigned long code)
3478 if (code < countof (fcd->ucs_to_mule_table))
3480 return fcd->ucs_to_mule_table[code];
3482 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3487 c = code % (94 * 94);
3489 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3490 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3491 CHARSET_LEFT_TO_RIGHT),
3492 c / 94 + 33, c % 94 + 33));
3498 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3499 Return Mule character corresponding to UCS code CODE (a positive integer).
3503 CHECK_NATNUM (code);
3504 return ucs_to_char (XINT (code));
3507 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3508 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3512 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3513 Fset_char_ucs is more restrictive on index arg, but should
3514 check code arg in a char_table method. */
3515 CHECK_CHAR (character);
3516 CHECK_NATNUM (code);
3517 return Fput_char_table (character, code, mule_to_ucs_table);
3520 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3521 Return the UCS code (a positive integer) corresponding to CHARACTER.
3525 return Fget_char_table (character, mule_to_ucs_table);
3528 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3529 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3530 is not found, instead.
3531 #### do something more appropriate (use blob?)
3532 Danger, Will Robinson! Data loss. Should we signal user? */
3534 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3536 Lisp_Object chr = ucs_to_char (ch);
3540 Bufbyte work[MAX_EMCHAR_LEN];
3545 simple_set_charptr_emchar (work, ch) :
3546 non_ascii_set_charptr_emchar (work, ch);
3547 Dynarr_add_many (dst, work, len);
3551 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3552 Dynarr_add (dst, 34 + 128);
3553 Dynarr_add (dst, 46 + 128);
3557 static unsigned long
3558 mule_char_to_ucs4 (Lisp_Object charset,
3559 unsigned char h, unsigned char l)
3562 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3569 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3570 (XCHARSET_CHARS (charset) == 94) )
3572 unsigned char final = XCHARSET_FINAL (charset);
3574 if ( ('@' <= final) && (final < 0x7f) )
3576 return 0xe00000 + (final - '@') * 94 * 94
3577 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3591 encode_ucs4 (Lisp_Object charset,
3592 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3594 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3595 Dynarr_add (dst, code >> 24);
3596 Dynarr_add (dst, (code >> 16) & 255);
3597 Dynarr_add (dst, (code >> 8) & 255);
3598 Dynarr_add (dst, code & 255);
3602 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3606 unsigned char c = *(unsigned char *)src++;
3607 switch (st->ucs4.in_byte)
3616 st->ucs4.in_byte = 0;
3622 return CODING_CATEGORY_UCS4_MASK;
3626 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3627 unsigned_char_dynarr *dst, Lstream_data_count n)
3629 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3630 unsigned int flags = str->flags;
3631 unsigned int ch = str->ch;
3632 unsigned char counter = str->counter;
3636 unsigned char c = *(unsigned char *)src++;
3644 decode_ucs4 ( ( ch << 8 ) | c, dst);
3649 ch = ( ch << 8 ) | c;
3653 if (counter & CODING_STATE_END)
3654 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3658 str->counter = counter;
3662 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3663 unsigned_char_dynarr *dst, Lstream_data_count n)
3665 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3666 unsigned int flags = str->flags;
3667 unsigned int ch = str->ch;
3668 unsigned char char_boundary = str->iso2022.current_char_boundary;
3669 Lisp_Object charset = str->iso2022.current_charset;
3671 #ifdef ENABLE_COMPOSITE_CHARS
3672 /* flags for handling composite chars. We do a little switcharoo
3673 on the source while we're outputting the composite char. */
3674 unsigned int saved_n = 0;
3675 const unsigned char *saved_src = NULL;
3676 int in_composite = 0;
3683 unsigned char c = *src++;
3685 if (BYTE_ASCII_P (c))
3686 { /* Processing ASCII character */
3688 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3691 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3692 { /* Processing Leading Byte */
3694 charset = CHARSET_BY_LEADING_BYTE (c);
3695 if (LEADING_BYTE_PREFIX_P(c))
3700 { /* Processing Non-ASCII character */
3702 if (EQ (charset, Vcharset_control_1))
3704 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3708 switch (XCHARSET_REP_BYTES (charset))
3711 encode_ucs4 (charset, c, 0, dst);
3714 if (XCHARSET_PRIVATE_P (charset))
3716 encode_ucs4 (charset, c, 0, dst);
3721 #ifdef ENABLE_COMPOSITE_CHARS
3722 if (EQ (charset, Vcharset_composite))
3726 /* #### Bother! We don't know how to
3728 Dynarr_add (dst, '\0');
3729 Dynarr_add (dst, '\0');
3730 Dynarr_add (dst, '\0');
3731 Dynarr_add (dst, '~');
3735 Emchar emch = MAKE_CHAR (Vcharset_composite,
3736 ch & 0x7F, c & 0x7F);
3737 Lisp_Object lstr = composite_char_string (emch);
3741 src = XSTRING_DATA (lstr);
3742 n = XSTRING_LENGTH (lstr);
3746 #endif /* ENABLE_COMPOSITE_CHARS */
3748 encode_ucs4(charset, ch, c, dst);
3761 encode_ucs4 (charset, ch, c, dst);
3777 #ifdef ENABLE_COMPOSITE_CHARS
3783 goto back_to_square_n; /* Wheeeeeeeee ..... */
3785 #endif /* ENABLE_COMPOSITE_CHARS */
3789 str->iso2022.current_char_boundary = char_boundary;
3790 str->iso2022.current_charset = charset;
3792 /* Verbum caro factum est! */
3796 /************************************************************************/
3798 /************************************************************************/
3801 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3805 unsigned char c = *(unsigned char *)src++;
3806 switch (st->utf8.in_byte)
3809 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3812 st->utf8.in_byte = 5;
3814 st->utf8.in_byte = 4;
3816 st->utf8.in_byte = 3;
3818 st->utf8.in_byte = 2;
3820 st->utf8.in_byte = 1;
3825 if ((c & 0xc0) != 0x80)
3831 return CODING_CATEGORY_UTF8_MASK;
3835 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3836 unsigned_char_dynarr *dst, Lstream_data_count n)
3838 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3839 unsigned int flags = str->flags;
3840 unsigned int ch = str->ch;
3841 eol_type_t eol_type = str->eol_type;
3842 unsigned char counter = str->counter;
3846 unsigned char c = *(unsigned char *)src++;
3855 else if ( c >= 0xf8 )
3860 else if ( c >= 0xf0 )
3865 else if ( c >= 0xe0 )
3870 else if ( c >= 0xc0 )
3877 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3878 decode_ucs4 (c, dst);
3882 ch = ( ch << 6 ) | ( c & 0x3f );
3883 decode_ucs4 (ch, dst);
3888 ch = ( ch << 6 ) | ( c & 0x3f );
3891 label_continue_loop:;
3894 if (flags & CODING_STATE_END)
3895 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3899 str->counter = counter;
3903 encode_utf8 (Lisp_Object charset,
3904 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3906 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3909 Dynarr_add (dst, code);
3911 else if ( code <= 0x7ff )
3913 Dynarr_add (dst, (code >> 6) | 0xc0);
3914 Dynarr_add (dst, (code & 0x3f) | 0x80);
3916 else if ( code <= 0xffff )
3918 Dynarr_add (dst, (code >> 12) | 0xe0);
3919 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3920 Dynarr_add (dst, (code & 0x3f) | 0x80);
3922 else if ( code <= 0x1fffff )
3924 Dynarr_add (dst, (code >> 18) | 0xf0);
3925 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3926 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3927 Dynarr_add (dst, (code & 0x3f) | 0x80);
3929 else if ( code <= 0x3ffffff )
3931 Dynarr_add (dst, (code >> 24) | 0xf8);
3932 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3933 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3934 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3935 Dynarr_add (dst, (code & 0x3f) | 0x80);
3939 Dynarr_add (dst, (code >> 30) | 0xfc);
3940 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3941 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3942 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3943 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3944 Dynarr_add (dst, (code & 0x3f) | 0x80);
3949 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
3950 unsigned_char_dynarr *dst, Lstream_data_count n)
3952 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3953 unsigned int flags = str->flags;
3954 unsigned int ch = str->ch;
3955 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3956 unsigned char char_boundary = str->iso2022.current_char_boundary;
3957 Lisp_Object charset = str->iso2022.current_charset;
3959 #ifdef ENABLE_COMPOSITE_CHARS
3960 /* flags for handling composite chars. We do a little switcharoo
3961 on the source while we're outputting the composite char. */
3962 unsigned int saved_n = 0;
3963 const unsigned char *saved_src = NULL;
3964 int in_composite = 0;
3967 #endif /* ENABLE_COMPOSITE_CHARS */
3971 unsigned char c = *src++;
3973 if (BYTE_ASCII_P (c))
3974 { /* Processing ASCII character */
3978 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3979 Dynarr_add (dst, '\r');
3980 if (eol_type != EOL_CR)
3981 Dynarr_add (dst, c);
3984 encode_utf8 (Vcharset_ascii, c, 0, dst);
3987 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3988 { /* Processing Leading Byte */
3990 charset = CHARSET_BY_LEADING_BYTE (c);
3991 if (LEADING_BYTE_PREFIX_P(c))
3996 { /* Processing Non-ASCII character */
3998 if (EQ (charset, Vcharset_control_1))
4000 encode_utf8 (Vcharset_control_1, c, 0, dst);
4004 switch (XCHARSET_REP_BYTES (charset))
4007 encode_utf8 (charset, c, 0, dst);
4010 if (XCHARSET_PRIVATE_P (charset))
4012 encode_utf8 (charset, c, 0, dst);
4017 #ifdef ENABLE_COMPOSITE_CHARS
4018 if (EQ (charset, Vcharset_composite))
4022 /* #### Bother! We don't know how to
4024 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4028 Emchar emch = MAKE_CHAR (Vcharset_composite,
4029 ch & 0x7F, c & 0x7F);
4030 Lisp_Object lstr = composite_char_string (emch);
4034 src = XSTRING_DATA (lstr);
4035 n = XSTRING_LENGTH (lstr);
4039 #endif /* ENABLE_COMPOSITE_CHARS */
4041 encode_utf8 (charset, ch, c, dst);
4054 encode_utf8 (charset, ch, c, dst);
4070 #ifdef ENABLE_COMPOSITE_CHARS
4076 goto back_to_square_n; /* Wheeeeeeeee ..... */
4082 str->iso2022.current_char_boundary = char_boundary;
4083 str->iso2022.current_charset = charset;
4085 /* Verbum caro factum est! */
4089 /************************************************************************/
4090 /* ISO2022 methods */
4091 /************************************************************************/
4093 /* The following note describes the coding system ISO2022 briefly.
4094 Since the intention of this note is to help understand the
4095 functions in this file, some parts are NOT ACCURATE or OVERLY
4096 SIMPLIFIED. For thorough understanding, please refer to the
4097 original document of ISO2022.
4099 ISO2022 provides many mechanisms to encode several character sets
4100 in 7-bit and 8-bit environments. For 7-bit environments, all text
4101 is encoded using bytes less than 128. This may make the encoded
4102 text a little bit longer, but the text passes more easily through
4103 several gateways, some of which strip off MSB (Most Signigant Bit).
4105 There are two kinds of character sets: control character set and
4106 graphic character set. The former contains control characters such
4107 as `newline' and `escape' to provide control functions (control
4108 functions are also provided by escape sequences). The latter
4109 contains graphic characters such as 'A' and '-'. Emacs recognizes
4110 two control character sets and many graphic character sets.
4112 Graphic character sets are classified into one of the following
4113 four classes, according to the number of bytes (DIMENSION) and
4114 number of characters in one dimension (CHARS) of the set:
4115 - DIMENSION1_CHARS94
4116 - DIMENSION1_CHARS96
4117 - DIMENSION2_CHARS94
4118 - DIMENSION2_CHARS96
4120 In addition, each character set is assigned an identification tag,
4121 unique for each set, called "final character" (denoted as <F>
4122 hereafter). The <F> of each character set is decided by ECMA(*)
4123 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4124 (0x30..0x3F are for private use only).
4126 Note (*): ECMA = European Computer Manufacturers Association
4128 Here are examples of graphic character set [NAME(<F>)]:
4129 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4130 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4131 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4132 o DIMENSION2_CHARS96 -- none for the moment
4134 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4135 C0 [0x00..0x1F] -- control character plane 0
4136 GL [0x20..0x7F] -- graphic character plane 0
4137 C1 [0x80..0x9F] -- control character plane 1
4138 GR [0xA0..0xFF] -- graphic character plane 1
4140 A control character set is directly designated and invoked to C0 or
4141 C1 by an escape sequence. The most common case is that:
4142 - ISO646's control character set is designated/invoked to C0, and
4143 - ISO6429's control character set is designated/invoked to C1,
4144 and usually these designations/invocations are omitted in encoded
4145 text. In a 7-bit environment, only C0 can be used, and a control
4146 character for C1 is encoded by an appropriate escape sequence to
4147 fit into the environment. All control characters for C1 are
4148 defined to have corresponding escape sequences.
4150 A graphic character set is at first designated to one of four
4151 graphic registers (G0 through G3), then these graphic registers are
4152 invoked to GL or GR. These designations and invocations can be
4153 done independently. The most common case is that G0 is invoked to
4154 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4155 these invocations and designations are omitted in encoded text.
4156 In a 7-bit environment, only GL can be used.
4158 When a graphic character set of CHARS94 is invoked to GL, codes
4159 0x20 and 0x7F of the GL area work as control characters SPACE and
4160 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4163 There are two ways of invocation: locking-shift and single-shift.
4164 With locking-shift, the invocation lasts until the next different
4165 invocation, whereas with single-shift, the invocation affects the
4166 following character only and doesn't affect the locking-shift
4167 state. Invocations are done by the following control characters or
4170 ----------------------------------------------------------------------
4171 abbrev function cntrl escape seq description
4172 ----------------------------------------------------------------------
4173 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4174 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4175 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4176 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4177 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4178 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4179 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4180 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4181 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4182 ----------------------------------------------------------------------
4183 (*) These are not used by any known coding system.
4185 Control characters for these functions are defined by macros
4186 ISO_CODE_XXX in `coding.h'.
4188 Designations are done by the following escape sequences:
4189 ----------------------------------------------------------------------
4190 escape sequence description
4191 ----------------------------------------------------------------------
4192 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4193 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4194 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4195 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4196 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4197 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4198 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4199 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4200 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4201 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4202 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4203 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4204 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4205 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4206 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4207 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4208 ----------------------------------------------------------------------
4210 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4211 of dimension 1, chars 94, and final character <F>, etc...
4213 Note (*): Although these designations are not allowed in ISO2022,
4214 Emacs accepts them on decoding, and produces them on encoding
4215 CHARS96 character sets in a coding system which is characterized as
4216 7-bit environment, non-locking-shift, and non-single-shift.
4218 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4219 '(' can be omitted. We refer to this as "short-form" hereafter.
4221 Now you may notice that there are a lot of ways for encoding the
4222 same multilingual text in ISO2022. Actually, there exist many
4223 coding systems such as Compound Text (used in X11's inter client
4224 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4225 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4226 localized platforms), and all of these are variants of ISO2022.
4228 In addition to the above, Emacs handles two more kinds of escape
4229 sequences: ISO6429's direction specification and Emacs' private
4230 sequence for specifying character composition.
4232 ISO6429's direction specification takes the following form:
4233 o CSI ']' -- end of the current direction
4234 o CSI '0' ']' -- end of the current direction
4235 o CSI '1' ']' -- start of left-to-right text
4236 o CSI '2' ']' -- start of right-to-left text
4237 The control character CSI (0x9B: control sequence introducer) is
4238 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4240 Character composition specification takes the following form:
4241 o ESC '0' -- start character composition
4242 o ESC '1' -- end character composition
4243 Since these are not standard escape sequences of any ISO standard,
4244 their use with these meanings is restricted to Emacs only. */
4247 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4251 for (i = 0; i < 4; i++)
4253 if (!NILP (coding_system))
4255 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4257 iso->charset[i] = Qt;
4258 iso->invalid_designated[i] = 0;
4260 iso->esc = ISO_ESC_NOTHING;
4261 iso->esc_bytes_index = 0;
4262 iso->register_left = 0;
4263 iso->register_right = 1;
4264 iso->switched_dir_and_no_valid_charset_yet = 0;
4265 iso->invalid_switch_dir = 0;
4266 iso->output_direction_sequence = 0;
4267 iso->output_literally = 0;
4268 #ifdef ENABLE_COMPOSITE_CHARS
4269 if (iso->composite_chars)
4270 Dynarr_reset (iso->composite_chars);
4275 fit_to_be_escape_quoted (unsigned char c)
4292 /* Parse one byte of an ISO2022 escape sequence.
4293 If the result is an invalid escape sequence, return 0 and
4294 do not change anything in STR. Otherwise, if the result is
4295 an incomplete escape sequence, update ISO2022.ESC and
4296 ISO2022.ESC_BYTES and return -1. Otherwise, update
4297 all the state variables (but not ISO2022.ESC_BYTES) and
4300 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4301 or invocation of an invalid character set and treat that as
4302 an unrecognized escape sequence.
4304 ********************************************************************
4306 #### Strategies for error annotation and coding orthogonalization
4308 We really want to separate out a number of things. Conceptually,
4309 there is a nested syntax.
4311 At the top level is the ISO 2022 extension syntax, including charset
4312 designation and invocation, and certain auxiliary controls such as the
4313 ISO 6429 direction specification. These are octet-oriented, with the
4314 single exception (AFAIK) of the "exit Unicode" sequence which uses the
4315 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4316 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
4317 (deprecated) special case in Unicode processing.
4319 The middle layer is ISO 2022 character interpretation. This will depend
4320 on the current state of the ISO 2022 registers, and assembles octets
4321 into the character's internal representation.
4323 The lowest level is translating system control conventions. At present
4324 this is restricted to newline translation, but one could imagine doing
4325 tab conversion or line wrapping here. "Escape from Unicode" processing
4326 would be done at this level.
4328 At each level the parser will verify the syntax. In the case of a
4329 syntax error or warning (such as a redundant escape sequence that affects
4330 no characters), the parser will take some action, typically inserting the
4331 erroneous octets directly into the output and creating an annotation
4332 which can be used by higher level I/O to mark the affected region.
4334 This should make it possible to do something sensible about separating
4335 newline convention processing from character construction, and about
4336 preventing ISO 2022 escape sequences from being recognized
4339 The basic strategy will be to have octet classification tables, and
4340 switch processing according to the table entry.
4342 It's possible that, by doing the processing with tables of functions or
4343 the like, the parser can be used for both detection and translation. */
4346 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4347 unsigned char c, unsigned int *flags,
4348 int check_invalid_charsets)
4350 /* (1) If we're at the end of a designation sequence, CS is the
4351 charset being designated and REG is the register to designate
4354 (2) If we're at the end of a locking-shift sequence, REG is
4355 the register to invoke and HALF (0 == left, 1 == right) is
4356 the half to invoke it into.
4358 (3) If we're at the end of a single-shift sequence, REG is
4359 the register to invoke. */
4360 Lisp_Object cs = Qnil;
4363 /* NOTE: This code does goto's all over the fucking place.
4364 The reason for this is that we're basically implementing
4365 a state machine here, and hierarchical languages like C
4366 don't really provide a clean way of doing this. */
4368 if (! (*flags & CODING_STATE_ESCAPE))
4369 /* At beginning of escape sequence; we need to reset our
4370 escape-state variables. */
4371 iso->esc = ISO_ESC_NOTHING;
4373 iso->output_literally = 0;
4374 iso->output_direction_sequence = 0;
4378 case ISO_ESC_NOTHING:
4379 iso->esc_bytes_index = 0;
4382 case ISO_CODE_ESC: /* Start escape sequence */
4383 *flags |= CODING_STATE_ESCAPE;
4387 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4388 *flags |= CODING_STATE_ESCAPE;
4389 iso->esc = ISO_ESC_5_11;
4392 case ISO_CODE_SO: /* locking shift 1 */
4395 case ISO_CODE_SI: /* locking shift 0 */
4399 case ISO_CODE_SS2: /* single shift */
4402 case ISO_CODE_SS3: /* single shift */
4406 default: /* Other control characters */
4413 /**** single shift ****/
4415 case 'N': /* single shift 2 */
4418 case 'O': /* single shift 3 */
4422 /**** locking shift ****/
4424 case '~': /* locking shift 1 right */
4427 case 'n': /* locking shift 2 */
4430 case '}': /* locking shift 2 right */
4433 case 'o': /* locking shift 3 */
4436 case '|': /* locking shift 3 right */
4440 #ifdef ENABLE_COMPOSITE_CHARS
4441 /**** composite ****/
4444 iso->esc = ISO_ESC_START_COMPOSITE;
4445 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4446 CODING_STATE_COMPOSITE;
4450 iso->esc = ISO_ESC_END_COMPOSITE;
4451 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4452 ~CODING_STATE_COMPOSITE;
4454 #endif /* ENABLE_COMPOSITE_CHARS */
4456 /**** directionality ****/
4459 iso->esc = ISO_ESC_5_11;
4462 /**** designation ****/
4464 case '$': /* multibyte charset prefix */
4465 iso->esc = ISO_ESC_2_4;
4469 if (0x28 <= c && c <= 0x2F)
4471 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4475 /* This function is called with CODESYS equal to nil when
4476 doing coding-system detection. */
4478 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4479 && fit_to_be_escape_quoted (c))
4481 iso->esc = ISO_ESC_LITERAL;
4482 *flags &= CODING_STATE_ISO2022_LOCK;
4492 /**** directionality ****/
4494 case ISO_ESC_5_11: /* ISO6429 direction control */
4497 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4498 goto directionality;
4500 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4501 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4502 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4506 case ISO_ESC_5_11_0:
4509 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4510 goto directionality;
4514 case ISO_ESC_5_11_1:
4517 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4518 goto directionality;
4522 case ISO_ESC_5_11_2:
4525 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4526 goto directionality;
4531 iso->esc = ISO_ESC_DIRECTIONALITY;
4532 /* Various junk here to attempt to preserve the direction sequences
4533 literally in the text if they would otherwise be swallowed due
4534 to invalid designations that don't show up as actual charset
4535 changes in the text. */
4536 if (iso->invalid_switch_dir)
4538 /* We already inserted a direction switch literally into the
4539 text. We assume (#### this may not be right) that the
4540 next direction switch is the one going the other way,
4541 and we need to output that literally as well. */
4542 iso->output_literally = 1;
4543 iso->invalid_switch_dir = 0;
4549 /* If we are in the thrall of an invalid designation,
4550 then stick the directionality sequence literally into the
4551 output stream so it ends up in the original text again. */
4552 for (jj = 0; jj < 4; jj++)
4553 if (iso->invalid_designated[jj])
4557 iso->output_literally = 1;
4558 iso->invalid_switch_dir = 1;
4561 /* Indicate that we haven't yet seen a valid designation,
4562 so that if a switch-dir is directly followed by an
4563 invalid designation, both get inserted literally. */
4564 iso->switched_dir_and_no_valid_charset_yet = 1;
4569 /**** designation ****/
4572 if (0x28 <= c && c <= 0x2F)
4574 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4577 if (0x40 <= c && c <= 0x42)
4579 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4580 *flags & CODING_STATE_R2L ?
4581 CHARSET_RIGHT_TO_LEFT :
4582 CHARSET_LEFT_TO_RIGHT);
4592 if (c < '0' || c > '~')
4593 return 0; /* bad final byte */
4595 if (iso->esc >= ISO_ESC_2_8 &&
4596 iso->esc <= ISO_ESC_2_15)
4598 type = ((iso->esc >= ISO_ESC_2_12) ?
4599 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4600 reg = (iso->esc - ISO_ESC_2_8) & 3;
4602 else if (iso->esc >= ISO_ESC_2_4_8 &&
4603 iso->esc <= ISO_ESC_2_4_15)
4605 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4606 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4607 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4611 /* Can this ever be reached? -slb */
4616 cs = CHARSET_BY_ATTRIBUTES (type, c,
4617 *flags & CODING_STATE_R2L ?
4618 CHARSET_RIGHT_TO_LEFT :
4619 CHARSET_LEFT_TO_RIGHT);
4625 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4629 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4630 /* can't invoke something that ain't there. */
4632 iso->esc = ISO_ESC_SINGLE_SHIFT;
4633 *flags &= CODING_STATE_ISO2022_LOCK;
4635 *flags |= CODING_STATE_SS2;
4637 *flags |= CODING_STATE_SS3;
4641 if (check_invalid_charsets &&
4642 !CHARSETP (iso->charset[reg]))
4643 /* can't invoke something that ain't there. */
4646 iso->register_right = reg;
4648 iso->register_left = reg;
4649 *flags &= CODING_STATE_ISO2022_LOCK;
4650 iso->esc = ISO_ESC_LOCKING_SHIFT;
4654 if (NILP (cs) && check_invalid_charsets)
4656 iso->invalid_designated[reg] = 1;
4657 iso->charset[reg] = Vcharset_ascii;
4658 iso->esc = ISO_ESC_DESIGNATE;
4659 *flags &= CODING_STATE_ISO2022_LOCK;
4660 iso->output_literally = 1;
4661 if (iso->switched_dir_and_no_valid_charset_yet)
4663 /* We encountered a switch-direction followed by an
4664 invalid designation. Ensure that the switch-direction
4665 gets outputted; otherwise it will probably get eaten
4666 when the text is written out again. */
4667 iso->switched_dir_and_no_valid_charset_yet = 0;
4668 iso->output_direction_sequence = 1;
4669 /* And make sure that the switch-dir going the other
4670 way gets outputted, as well. */
4671 iso->invalid_switch_dir = 1;
4675 /* This function is called with CODESYS equal to nil when
4676 doing coding-system detection. */
4677 if (!NILP (codesys))
4679 charset_conversion_spec_dynarr *dyn =
4680 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4686 for (i = 0; i < Dynarr_length (dyn); i++)
4688 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4689 if (EQ (cs, spec->from_charset))
4690 cs = spec->to_charset;
4695 iso->charset[reg] = cs;
4696 iso->esc = ISO_ESC_DESIGNATE;
4697 *flags &= CODING_STATE_ISO2022_LOCK;
4698 if (iso->invalid_designated[reg])
4700 iso->invalid_designated[reg] = 0;
4701 iso->output_literally = 1;
4703 if (iso->switched_dir_and_no_valid_charset_yet)
4704 iso->switched_dir_and_no_valid_charset_yet = 0;
4709 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4713 /* #### There are serious deficiencies in the recognition mechanism
4714 here. This needs to be much smarter if it's going to cut it.
4715 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4716 it should be detected as Latin-1.
4717 All the ISO2022 stuff in this file should be synced up with the
4718 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4719 Perhaps we should wait till R2L works in FSF Emacs? */
4721 if (!st->iso2022.initted)
4723 reset_iso2022 (Qnil, &st->iso2022.iso);
4724 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4725 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4726 CODING_CATEGORY_ISO_8_1_MASK |
4727 CODING_CATEGORY_ISO_8_2_MASK |
4728 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4729 st->iso2022.flags = 0;
4730 st->iso2022.high_byte_count = 0;
4731 st->iso2022.saw_single_shift = 0;
4732 st->iso2022.initted = 1;
4735 mask = st->iso2022.mask;
4739 unsigned char c = *(unsigned char *)src++;
4742 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4743 st->iso2022.high_byte_count++;
4747 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4749 if (st->iso2022.high_byte_count & 1)
4750 /* odd number of high bytes; assume not iso-8-2 */
4751 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4753 st->iso2022.high_byte_count = 0;
4754 st->iso2022.saw_single_shift = 0;
4756 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4758 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4759 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4760 { /* control chars */
4763 /* Allow and ignore control characters that you might
4764 reasonably see in a text file */
4769 case 8: /* backspace */
4770 case 11: /* vertical tab */
4771 case 12: /* form feed */
4772 case 26: /* MS-DOS C-z junk */
4773 case 31: /* '^_' -- for info */
4774 goto label_continue_loop;
4781 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4784 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4785 &st->iso2022.flags, 0))
4787 switch (st->iso2022.iso.esc)
4789 case ISO_ESC_DESIGNATE:
4790 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4791 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4793 case ISO_ESC_LOCKING_SHIFT:
4794 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4795 goto ran_out_of_chars;
4796 case ISO_ESC_SINGLE_SHIFT:
4797 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4798 st->iso2022.saw_single_shift = 1;
4807 goto ran_out_of_chars;
4810 label_continue_loop:;
4819 postprocess_iso2022_mask (int mask)
4821 /* #### kind of cheesy */
4822 /* If seven-bit ISO is allowed, then assume that the encoding is
4823 entirely seven-bit and turn off the eight-bit ones. */
4824 if (mask & CODING_CATEGORY_ISO_7_MASK)
4825 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4826 CODING_CATEGORY_ISO_8_1_MASK |
4827 CODING_CATEGORY_ISO_8_2_MASK);
4831 /* If FLAGS is a null pointer or specifies right-to-left motion,
4832 output a switch-dir-to-left-to-right sequence to DST.
4833 Also update FLAGS if it is not a null pointer.
4834 If INTERNAL_P is set, we are outputting in internal format and
4835 need to handle the CSI differently. */
4838 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4839 unsigned_char_dynarr *dst,
4840 unsigned int *flags,
4843 if (!flags || (*flags & CODING_STATE_R2L))
4845 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4847 Dynarr_add (dst, ISO_CODE_ESC);
4848 Dynarr_add (dst, '[');
4850 else if (internal_p)
4851 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4853 Dynarr_add (dst, ISO_CODE_CSI);
4854 Dynarr_add (dst, '0');
4855 Dynarr_add (dst, ']');
4857 *flags &= ~CODING_STATE_R2L;
4861 /* If FLAGS is a null pointer or specifies a direction different from
4862 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4863 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4864 sequence to DST. Also update FLAGS if it is not a null pointer.
4865 If INTERNAL_P is set, we are outputting in internal format and
4866 need to handle the CSI differently. */
4869 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4870 unsigned_char_dynarr *dst, unsigned int *flags,
4873 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4874 direction == CHARSET_LEFT_TO_RIGHT)
4875 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4876 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4877 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4878 direction == CHARSET_RIGHT_TO_LEFT)
4880 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4882 Dynarr_add (dst, ISO_CODE_ESC);
4883 Dynarr_add (dst, '[');
4885 else if (internal_p)
4886 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4888 Dynarr_add (dst, ISO_CODE_CSI);
4889 Dynarr_add (dst, '2');
4890 Dynarr_add (dst, ']');
4892 *flags |= CODING_STATE_R2L;
4896 /* Convert ISO2022-format data to internal format. */
4899 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4900 unsigned_char_dynarr *dst, Lstream_data_count n)
4902 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4903 unsigned int flags = str->flags;
4904 unsigned int ch = str->ch;
4905 eol_type_t eol_type = str->eol_type;
4906 #ifdef ENABLE_COMPOSITE_CHARS
4907 unsigned_char_dynarr *real_dst = dst;
4909 Lisp_Object coding_system;
4911 XSETCODING_SYSTEM (coding_system, str->codesys);
4913 #ifdef ENABLE_COMPOSITE_CHARS
4914 if (flags & CODING_STATE_COMPOSITE)
4915 dst = str->iso2022.composite_chars;
4916 #endif /* ENABLE_COMPOSITE_CHARS */
4920 unsigned char c = *(unsigned char *)src++;
4921 if (flags & CODING_STATE_ESCAPE)
4922 { /* Within ESC sequence */
4923 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4928 switch (str->iso2022.esc)
4930 #ifdef ENABLE_COMPOSITE_CHARS
4931 case ISO_ESC_START_COMPOSITE:
4932 if (str->iso2022.composite_chars)
4933 Dynarr_reset (str->iso2022.composite_chars);
4935 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4936 dst = str->iso2022.composite_chars;
4938 case ISO_ESC_END_COMPOSITE:
4940 Bufbyte comstr[MAX_EMCHAR_LEN];
4942 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4943 Dynarr_length (dst));
4945 len = set_charptr_emchar (comstr, emch);
4946 Dynarr_add_many (dst, comstr, len);
4949 #endif /* ENABLE_COMPOSITE_CHARS */
4951 case ISO_ESC_LITERAL:
4952 DECODE_ADD_BINARY_CHAR (c, dst);
4956 /* Everything else handled already */
4961 /* Attempted error recovery. */
4962 if (str->iso2022.output_direction_sequence)
4963 ensure_correct_direction (flags & CODING_STATE_R2L ?
4964 CHARSET_RIGHT_TO_LEFT :
4965 CHARSET_LEFT_TO_RIGHT,
4966 str->codesys, dst, 0, 1);
4967 /* More error recovery. */
4968 if (!retval || str->iso2022.output_literally)
4970 /* Output the (possibly invalid) sequence */
4972 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4973 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4974 flags &= CODING_STATE_ISO2022_LOCK;
4976 n++, src--;/* Repeat the loop with the same character. */
4979 /* No sense in reprocessing the final byte of the
4980 escape sequence; it could mess things up anyway.
4982 DECODE_ADD_BINARY_CHAR (c, dst);
4987 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4988 { /* Control characters */
4990 /***** Error-handling *****/
4992 /* If we were in the middle of a character, dump out the
4993 partial character. */
4994 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4996 /* If we just saw a single-shift character, dump it out.
4997 This may dump out the wrong sort of single-shift character,
4998 but least it will give an indication that something went
5000 if (flags & CODING_STATE_SS2)
5002 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5003 flags &= ~CODING_STATE_SS2;
5005 if (flags & CODING_STATE_SS3)
5007 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5008 flags &= ~CODING_STATE_SS3;
5011 /***** Now handle the control characters. *****/
5014 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5016 flags &= CODING_STATE_ISO2022_LOCK;
5018 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5019 DECODE_ADD_BINARY_CHAR (c, dst);
5022 { /* Graphic characters */
5023 Lisp_Object charset;
5027 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5029 /* Now determine the charset. */
5030 reg = ((flags & CODING_STATE_SS2) ? 2
5031 : (flags & CODING_STATE_SS3) ? 3
5032 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5033 : str->iso2022.register_left);
5034 charset = str->iso2022.charset[reg];
5036 /* Error checking: */
5037 if (! CHARSETP (charset)
5038 || str->iso2022.invalid_designated[reg]
5039 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5040 && XCHARSET_CHARS (charset) == 94))
5041 /* Mrmph. We are trying to invoke a register that has no
5042 or an invalid charset in it, or trying to add a character
5043 outside the range of the charset. Insert that char literally
5044 to preserve it for the output. */
5046 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5047 DECODE_ADD_BINARY_CHAR (c, dst);
5052 /* Things are probably hunky-dorey. */
5054 /* Fetch reverse charset, maybe. */
5055 if (((flags & CODING_STATE_R2L) &&
5056 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5058 (!(flags & CODING_STATE_R2L) &&
5059 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5061 Lisp_Object new_charset =
5062 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5063 if (!NILP (new_charset))
5064 charset = new_charset;
5067 lb = XCHARSET_LEADING_BYTE (charset);
5068 switch (XCHARSET_REP_BYTES (charset))
5071 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5072 Dynarr_add (dst, c & 0x7F);
5075 case 2: /* one-byte official */
5076 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5077 Dynarr_add (dst, lb);
5078 Dynarr_add (dst, c | 0x80);
5081 case 3: /* one-byte private or two-byte official */
5082 if (XCHARSET_PRIVATE_P (charset))
5084 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5085 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5086 Dynarr_add (dst, lb);
5087 Dynarr_add (dst, c | 0x80);
5093 Dynarr_add (dst, lb);
5094 Dynarr_add (dst, ch | 0x80);
5095 Dynarr_add (dst, c | 0x80);
5103 default: /* two-byte private */
5106 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5107 Dynarr_add (dst, lb);
5108 Dynarr_add (dst, ch | 0x80);
5109 Dynarr_add (dst, c | 0x80);
5118 flags &= CODING_STATE_ISO2022_LOCK;
5121 label_continue_loop:;
5124 if (flags & CODING_STATE_END)
5125 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5132 /***** ISO2022 encoder *****/
5134 /* Designate CHARSET into register REG. */
5137 iso2022_designate (Lisp_Object charset, unsigned char reg,
5138 struct encoding_stream *str, unsigned_char_dynarr *dst)
5140 static const char inter94[] = "()*+";
5141 static const char inter96[] = ",-./";
5143 unsigned char final;
5144 Lisp_Object old_charset = str->iso2022.charset[reg];
5146 str->iso2022.charset[reg] = charset;
5147 if (!CHARSETP (charset))
5148 /* charset might be an initial nil or t. */
5150 type = XCHARSET_TYPE (charset);
5151 final = XCHARSET_FINAL (charset);
5152 if (!str->iso2022.force_charset_on_output[reg] &&
5153 CHARSETP (old_charset) &&
5154 XCHARSET_TYPE (old_charset) == type &&
5155 XCHARSET_FINAL (old_charset) == final)
5158 str->iso2022.force_charset_on_output[reg] = 0;
5161 charset_conversion_spec_dynarr *dyn =
5162 str->codesys->iso2022.output_conv;
5168 for (i = 0; i < Dynarr_length (dyn); i++)
5170 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5171 if (EQ (charset, spec->from_charset))
5172 charset = spec->to_charset;
5177 Dynarr_add (dst, ISO_CODE_ESC);
5180 case CHARSET_TYPE_94:
5181 Dynarr_add (dst, inter94[reg]);
5183 case CHARSET_TYPE_96:
5184 Dynarr_add (dst, inter96[reg]);
5186 case CHARSET_TYPE_94X94:
5187 Dynarr_add (dst, '$');
5189 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5192 Dynarr_add (dst, inter94[reg]);
5194 case CHARSET_TYPE_96X96:
5195 Dynarr_add (dst, '$');
5196 Dynarr_add (dst, inter96[reg]);
5199 Dynarr_add (dst, final);
5203 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5205 if (str->iso2022.register_left != 0)
5207 Dynarr_add (dst, ISO_CODE_SI);
5208 str->iso2022.register_left = 0;
5213 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5215 if (str->iso2022.register_left != 1)
5217 Dynarr_add (dst, ISO_CODE_SO);
5218 str->iso2022.register_left = 1;
5222 /* Convert internally-formatted data to ISO2022 format. */
5225 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5226 unsigned_char_dynarr *dst, Lstream_data_count n)
5228 unsigned char charmask, c;
5229 unsigned char char_boundary;
5230 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5231 unsigned int flags = str->flags;
5232 unsigned int ch = str->ch;
5233 Lisp_Coding_System *codesys = str->codesys;
5234 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5236 Lisp_Object charset;
5239 #ifdef ENABLE_COMPOSITE_CHARS
5240 /* flags for handling composite chars. We do a little switcharoo
5241 on the source while we're outputting the composite char. */
5242 unsigned int saved_n = 0;
5243 const unsigned char *saved_src = NULL;
5244 int in_composite = 0;
5245 #endif /* ENABLE_COMPOSITE_CHARS */
5247 char_boundary = str->iso2022.current_char_boundary;
5248 charset = str->iso2022.current_charset;
5249 half = str->iso2022.current_half;
5251 #ifdef ENABLE_COMPOSITE_CHARS
5258 if (BYTE_ASCII_P (c))
5259 { /* Processing ASCII character */
5262 restore_left_to_right_direction (codesys, dst, &flags, 0);
5264 /* Make sure G0 contains ASCII */
5265 if ((c > ' ' && c < ISO_CODE_DEL) ||
5266 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5268 ensure_normal_shift (str, dst);
5269 iso2022_designate (Vcharset_ascii, 0, str, dst);
5272 /* If necessary, restore everything to the default state
5275 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5277 restore_left_to_right_direction (codesys, dst, &flags, 0);
5279 ensure_normal_shift (str, dst);
5281 for (i = 0; i < 4; i++)
5283 Lisp_Object initial_charset =
5284 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5285 iso2022_designate (initial_charset, i, str, dst);
5290 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5291 Dynarr_add (dst, '\r');
5292 if (eol_type != EOL_CR)
5293 Dynarr_add (dst, c);
5297 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5298 && fit_to_be_escape_quoted (c))
5299 Dynarr_add (dst, ISO_CODE_ESC);
5300 Dynarr_add (dst, c);
5305 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5306 { /* Processing Leading Byte */
5308 charset = CHARSET_BY_LEADING_BYTE (c);
5309 if (LEADING_BYTE_PREFIX_P(c))
5311 else if (!EQ (charset, Vcharset_control_1)
5312 #ifdef ENABLE_COMPOSITE_CHARS
5313 && !EQ (charset, Vcharset_composite)
5319 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5320 codesys, dst, &flags, 0);
5322 /* Now determine which register to use. */
5324 for (i = 0; i < 4; i++)
5326 if (EQ (charset, str->iso2022.charset[i]) ||
5328 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5337 if (XCHARSET_GRAPHIC (charset) != 0)
5339 if (!NILP (str->iso2022.charset[1]) &&
5340 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5341 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5343 else if (!NILP (str->iso2022.charset[2]))
5345 else if (!NILP (str->iso2022.charset[3]))
5354 iso2022_designate (charset, reg, str, dst);
5356 /* Now invoke that register. */
5360 ensure_normal_shift (str, dst);
5365 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5367 ensure_shift_out (str, dst);
5375 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5377 Dynarr_add (dst, ISO_CODE_ESC);
5378 Dynarr_add (dst, 'N');
5383 Dynarr_add (dst, ISO_CODE_SS2);
5389 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5391 Dynarr_add (dst, ISO_CODE_ESC);
5392 Dynarr_add (dst, 'O');
5397 Dynarr_add (dst, ISO_CODE_SS3);
5409 { /* Processing Non-ASCII character */
5410 charmask = (half == 0 ? 0x7F : 0xFF);
5412 if (EQ (charset, Vcharset_control_1))
5414 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5415 && fit_to_be_escape_quoted (c))
5416 Dynarr_add (dst, ISO_CODE_ESC);
5417 /* you asked for it ... */
5418 Dynarr_add (dst, c - 0x20);
5422 switch (XCHARSET_REP_BYTES (charset))
5425 Dynarr_add (dst, c & charmask);
5428 if (XCHARSET_PRIVATE_P (charset))
5430 Dynarr_add (dst, c & charmask);
5435 #ifdef ENABLE_COMPOSITE_CHARS
5436 if (EQ (charset, Vcharset_composite))
5440 /* #### Bother! We don't know how to
5442 Dynarr_add (dst, '~');
5446 Emchar emch = MAKE_CHAR (Vcharset_composite,
5447 ch & 0x7F, c & 0x7F);
5448 Lisp_Object lstr = composite_char_string (emch);
5452 src = XSTRING_DATA (lstr);
5453 n = XSTRING_LENGTH (lstr);
5454 Dynarr_add (dst, ISO_CODE_ESC);
5455 Dynarr_add (dst, '0'); /* start composing */
5459 #endif /* ENABLE_COMPOSITE_CHARS */
5461 Dynarr_add (dst, ch & charmask);
5462 Dynarr_add (dst, c & charmask);
5475 Dynarr_add (dst, ch & charmask);
5476 Dynarr_add (dst, c & charmask);
5492 #ifdef ENABLE_COMPOSITE_CHARS
5498 Dynarr_add (dst, ISO_CODE_ESC);
5499 Dynarr_add (dst, '1'); /* end composing */
5500 goto back_to_square_n; /* Wheeeeeeeee ..... */
5502 #endif /* ENABLE_COMPOSITE_CHARS */
5504 if (char_boundary && flags & CODING_STATE_END)
5506 restore_left_to_right_direction (codesys, dst, &flags, 0);
5507 ensure_normal_shift (str, dst);
5508 for (i = 0; i < 4; i++)
5510 Lisp_Object initial_charset =
5511 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5512 iso2022_designate (initial_charset, i, str, dst);
5518 str->iso2022.current_char_boundary = char_boundary;
5519 str->iso2022.current_charset = charset;
5520 str->iso2022.current_half = half;
5522 /* Verbum caro factum est! */
5526 /************************************************************************/
5527 /* No-conversion methods */
5528 /************************************************************************/
5530 /* This is used when reading in "binary" files -- i.e. files that may
5531 contain all 256 possible byte values and that are not to be
5532 interpreted as being in any particular decoding. */
5534 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5535 unsigned_char_dynarr *dst, Lstream_data_count n)
5537 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5538 unsigned int flags = str->flags;
5539 unsigned int ch = str->ch;
5540 eol_type_t eol_type = str->eol_type;
5544 unsigned char c = *(unsigned char *)src++;
5546 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5547 DECODE_ADD_BINARY_CHAR (c, dst);
5548 label_continue_loop:;
5551 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5558 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5559 unsigned_char_dynarr *dst, Lstream_data_count n)
5562 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5563 unsigned int flags = str->flags;
5564 unsigned int ch = str->ch;
5565 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5572 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5573 Dynarr_add (dst, '\r');
5574 if (eol_type != EOL_CR)
5575 Dynarr_add (dst, '\n');
5578 else if (BYTE_ASCII_P (c))
5581 Dynarr_add (dst, c);
5583 else if (BUFBYTE_LEADING_BYTE_P (c))
5586 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5587 c == LEADING_BYTE_CONTROL_1)
5590 Dynarr_add (dst, '~'); /* untranslatable character */
5594 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5595 Dynarr_add (dst, c);
5596 else if (ch == LEADING_BYTE_CONTROL_1)
5599 Dynarr_add (dst, c - 0x20);
5601 /* else it should be the second or third byte of an
5602 untranslatable character, so ignore it */
5613 /************************************************************************/
5614 /* Initialization */
5615 /************************************************************************/
5618 syms_of_file_coding (void)
5620 INIT_LRECORD_IMPLEMENTATION (coding_system);
5622 DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
5624 DEFSUBR (Fcoding_system_p);
5625 DEFSUBR (Ffind_coding_system);
5626 DEFSUBR (Fget_coding_system);
5627 DEFSUBR (Fcoding_system_list);
5628 DEFSUBR (Fcoding_system_name);
5629 DEFSUBR (Fmake_coding_system);
5630 DEFSUBR (Fcopy_coding_system);
5631 DEFSUBR (Fcoding_system_canonical_name_p);
5632 DEFSUBR (Fcoding_system_alias_p);
5633 DEFSUBR (Fcoding_system_aliasee);
5634 DEFSUBR (Fdefine_coding_system_alias);
5635 DEFSUBR (Fsubsidiary_coding_system);
5637 DEFSUBR (Fcoding_system_type);
5638 DEFSUBR (Fcoding_system_doc_string);
5640 DEFSUBR (Fcoding_system_charset);
5642 DEFSUBR (Fcoding_system_property);
5644 DEFSUBR (Fcoding_category_list);
5645 DEFSUBR (Fset_coding_priority_list);
5646 DEFSUBR (Fcoding_priority_list);
5647 DEFSUBR (Fset_coding_category_system);
5648 DEFSUBR (Fcoding_category_system);
5650 DEFSUBR (Fdetect_coding_region);
5651 DEFSUBR (Fdecode_coding_region);
5652 DEFSUBR (Fencode_coding_region);
5654 DEFSUBR (Fdecode_shift_jis_char);
5655 DEFSUBR (Fencode_shift_jis_char);
5656 DEFSUBR (Fdecode_big5_char);
5657 DEFSUBR (Fencode_big5_char);
5658 DEFSUBR (Fset_ucs_char);
5659 DEFSUBR (Fucs_char);
5660 DEFSUBR (Fset_char_ucs);
5661 DEFSUBR (Fchar_ucs);
5663 defsymbol (&Qcoding_systemp, "coding-system-p");
5664 defsymbol (&Qno_conversion, "no-conversion");
5665 defsymbol (&Qraw_text, "raw-text");
5667 defsymbol (&Qbig5, "big5");
5668 defsymbol (&Qshift_jis, "shift-jis");
5669 defsymbol (&Qucs4, "ucs-4");
5670 defsymbol (&Qutf8, "utf-8");
5671 defsymbol (&Qccl, "ccl");
5672 defsymbol (&Qiso2022, "iso2022");
5674 defsymbol (&Qmnemonic, "mnemonic");
5675 defsymbol (&Qeol_type, "eol-type");
5676 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5677 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5679 defsymbol (&Qcr, "cr");
5680 defsymbol (&Qlf, "lf");
5681 defsymbol (&Qcrlf, "crlf");
5682 defsymbol (&Qeol_cr, "eol-cr");
5683 defsymbol (&Qeol_lf, "eol-lf");
5684 defsymbol (&Qeol_crlf, "eol-crlf");
5686 defsymbol (&Qcharset_g0, "charset-g0");
5687 defsymbol (&Qcharset_g1, "charset-g1");
5688 defsymbol (&Qcharset_g2, "charset-g2");
5689 defsymbol (&Qcharset_g3, "charset-g3");
5690 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5691 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5692 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5693 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5694 defsymbol (&Qno_iso6429, "no-iso6429");
5695 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5696 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5698 defsymbol (&Qshort, "short");
5699 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5700 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5701 defsymbol (&Qseven, "seven");
5702 defsymbol (&Qlock_shift, "lock-shift");
5703 defsymbol (&Qescape_quoted, "escape-quoted");
5705 defsymbol (&Qencode, "encode");
5706 defsymbol (&Qdecode, "decode");
5709 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5711 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5713 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5715 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5717 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5719 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5721 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5723 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5725 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5728 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5733 lstream_type_create_file_coding (void)
5735 LSTREAM_HAS_METHOD (decoding, reader);
5736 LSTREAM_HAS_METHOD (decoding, writer);
5737 LSTREAM_HAS_METHOD (decoding, rewinder);
5738 LSTREAM_HAS_METHOD (decoding, seekable_p);
5739 LSTREAM_HAS_METHOD (decoding, flusher);
5740 LSTREAM_HAS_METHOD (decoding, closer);
5741 LSTREAM_HAS_METHOD (decoding, marker);
5743 LSTREAM_HAS_METHOD (encoding, reader);
5744 LSTREAM_HAS_METHOD (encoding, writer);
5745 LSTREAM_HAS_METHOD (encoding, rewinder);
5746 LSTREAM_HAS_METHOD (encoding, seekable_p);
5747 LSTREAM_HAS_METHOD (encoding, flusher);
5748 LSTREAM_HAS_METHOD (encoding, closer);
5749 LSTREAM_HAS_METHOD (encoding, marker);
5753 vars_of_file_coding (void)
5757 fcd = xnew (struct file_coding_dump);
5758 dump_add_root_struct_ptr (&fcd, &fcd_description);
5760 /* Initialize to something reasonable ... */
5761 for (i = 0; i < CODING_CATEGORY_LAST; i++)
5763 fcd->coding_category_system[i] = Qnil;
5764 fcd->coding_category_by_priority[i] = i;
5767 Fprovide (intern ("file-coding"));
5769 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5770 Coding system used for TTY keyboard input.
5771 Not used under a windowing system.
5773 Vkeyboard_coding_system = Qnil;
5775 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5776 Coding system used for TTY display output.
5777 Not used under a windowing system.
5779 Vterminal_coding_system = Qnil;
5781 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5782 Overriding coding system used when reading from a file or process.
5783 You should bind this variable with `let', but do not set it globally.
5784 If this is non-nil, it specifies the coding system that will be used
5785 to decode input on read operations, such as from a file or process.
5786 It overrides `buffer-file-coding-system-for-read',
5787 `insert-file-contents-pre-hook', etc. Use those variables instead of
5788 this one for permanent changes to the environment. */ );
5789 Vcoding_system_for_read = Qnil;
5791 DEFVAR_LISP ("coding-system-for-write",
5792 &Vcoding_system_for_write /*
5793 Overriding coding system used when writing to a file or process.
5794 You should bind this variable with `let', but do not set it globally.
5795 If this is non-nil, it specifies the coding system that will be used
5796 to encode output for write operations, such as to a file or process.
5797 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5798 Use those variables instead of this one for permanent changes to the
5800 Vcoding_system_for_write = Qnil;
5802 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5803 Coding system used to convert pathnames when accessing files.
5805 Vfile_name_coding_system = Qnil;
5807 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5808 Non-nil means the buffer contents are regarded as multi-byte form
5809 of characters, not a binary code. This affects the display, file I/O,
5810 and behaviors of various editing commands.
5812 Setting this to nil does not do anything.
5814 enable_multibyte_characters = 1;
5818 complex_vars_of_file_coding (void)
5820 staticpro (&Vcoding_system_hash_table);
5821 Vcoding_system_hash_table =
5822 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5824 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5825 dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5827 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5829 struct codesys_prop csp; \
5831 csp.prop_type = (Prop_Type); \
5832 Dynarr_add (the_codesys_prop_dynarr, csp); \
5835 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5836 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5837 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5838 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5839 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5840 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5841 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5843 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5844 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5845 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5846 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5847 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5848 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5849 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5850 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5851 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5852 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5853 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5854 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5855 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5856 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5857 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5858 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5859 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5861 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5862 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5864 /* Need to create this here or we're really screwed. */
5866 (Qraw_text, Qno_conversion,
5867 build_string ("Raw text, which means it converts only line-break-codes."),
5868 list2 (Qmnemonic, build_string ("Raw")));
5871 (Qbinary, Qno_conversion,
5872 build_string ("Binary, which means it does not convert anything."),
5873 list4 (Qeol_type, Qlf,
5874 Qmnemonic, build_string ("Binary")));
5876 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5878 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5880 Fdefine_coding_system_alias (Qterminal, Qbinary);
5881 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5883 /* Need this for bootstrapping */
5884 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5885 Fget_coding_system (Qraw_text);
5891 for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5892 fcd->ucs_to_mule_table[i] = Qnil;
5894 staticpro (&mule_to_ucs_table);
5895 mule_to_ucs_table = Fmake_char_table(Qgeneric);