1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
52 struct file_coding_dump {
53 /* Coding system currently associated with each coding category. */
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
56 /* Table of all coding categories in decreasing order of priority.
57 This describes a permutation of the possible coding categories. */
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
60 Lisp_Object ucs_to_mule_table[65536];
63 static const struct lrecord_description fcd_description_1[] = {
64 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
65 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
69 static const struct struct_description fcd_description = {
70 sizeof(struct file_coding_dump),
74 Lisp_Object mule_to_ucs_table;
76 Lisp_Object Qcoding_systemp;
78 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
79 /* Qinternal in general.c */
81 Lisp_Object Qmnemonic, Qeol_type;
82 Lisp_Object Qcr, Qcrlf, Qlf;
83 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
84 Lisp_Object Qpost_read_conversion;
85 Lisp_Object Qpre_write_conversion;
88 Lisp_Object Qucs4, Qutf8;
89 Lisp_Object Qbig5, Qshift_jis;
90 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
91 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
92 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
93 Lisp_Object Qno_iso6429;
94 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
95 Lisp_Object Qctext, Qescape_quoted;
96 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
98 Lisp_Object Qencode, Qdecode;
100 Lisp_Object Vcoding_system_hash_table;
102 int enable_multibyte_characters;
105 /* Additional information used by the ISO2022 decoder and detector. */
106 struct iso2022_decoder
108 /* CHARSET holds the character sets currently assigned to the G0
109 through G3 variables. It is initialized from the array
110 INITIAL_CHARSET in CODESYS. */
111 Lisp_Object charset[4];
113 /* Which registers are currently invoked into the left (GL) and
114 right (GR) halves of the 8-bit encoding space? */
115 int register_left, register_right;
117 /* ISO_ESC holds a value indicating part of an escape sequence
118 that has already been seen. */
119 enum iso_esc_flag esc;
121 /* This records the bytes we've seen so far in an escape sequence,
122 in case the sequence is invalid (we spit out the bytes unchanged). */
123 unsigned char esc_bytes[8];
125 /* Index for next byte to store in ISO escape sequence. */
128 #ifdef ENABLE_COMPOSITE_CHARS
129 /* Stuff seen so far when composing a string. */
130 unsigned_char_dynarr *composite_chars;
133 /* If we saw an invalid designation sequence for a particular
134 register, we flag it here and switch to ASCII. The next time we
135 see a valid designation for this register, we turn off the flag
136 and do the designation normally, but pretend the sequence was
137 invalid. The effect of all this is that (most of the time) the
138 escape sequences for both the switch to the unknown charset, and
139 the switch back to the known charset, get inserted literally into
140 the buffer and saved out as such. The hope is that we can
141 preserve the escape sequences so that the resulting written out
142 file makes sense. If we don't do any of this, the designation
143 to the invalid charset will be preserved but that switch back
144 to the known charset will probably get eaten because it was
145 the same charset that was already present in the register. */
146 unsigned char invalid_designated[4];
148 /* We try to do similar things as above for direction-switching
149 sequences. If we encountered a direction switch while an
150 invalid designation was present, or an invalid designation
151 just after a direction switch (i.e. no valid designation
152 encountered yet), we insert the direction-switch escape
153 sequence literally into the output stream, and later on
154 insert the corresponding direction-restoring escape sequence
156 unsigned int switched_dir_and_no_valid_charset_yet :1;
157 unsigned int invalid_switch_dir :1;
159 /* Tells the decoder to output the escape sequence literally
160 even though it was valid. Used in the games we play to
161 avoid lossage when we encounter invalid designations. */
162 unsigned int output_literally :1;
163 /* We encountered a direction switch followed by an invalid
164 designation. We didn't output the direction switch
165 literally because we didn't know about the invalid designation;
166 but we have to do so now. */
167 unsigned int output_direction_sequence :1;
170 EXFUN (Fcopy_coding_system, 2);
172 struct detection_state;
173 static int detect_coding_sjis (struct detection_state *st,
174 CONST unsigned char *src,
176 static void decode_coding_sjis (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst,
180 static void encode_coding_sjis (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst,
184 static int detect_coding_big5 (struct detection_state *st,
185 CONST unsigned char *src,
187 static void decode_coding_big5 (Lstream *decoding,
188 CONST unsigned char *src,
189 unsigned_char_dynarr *dst, unsigned int n);
190 static void encode_coding_big5 (Lstream *encoding,
191 CONST unsigned char *src,
192 unsigned_char_dynarr *dst, unsigned int n);
193 static int detect_coding_ucs4 (struct detection_state *st,
194 CONST unsigned char *src,
196 static void decode_coding_ucs4 (Lstream *decoding,
197 CONST unsigned char *src,
198 unsigned_char_dynarr *dst, unsigned int n);
199 static void encode_coding_ucs4 (Lstream *encoding,
200 CONST unsigned char *src,
201 unsigned_char_dynarr *dst, unsigned int n);
202 static int detect_coding_utf8 (struct detection_state *st,
203 CONST unsigned char *src,
205 static void decode_coding_utf8 (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst, unsigned int n);
208 static void encode_coding_utf8 (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst, unsigned int n);
211 static int postprocess_iso2022_mask (int mask);
212 static void reset_iso2022 (Lisp_Object coding_system,
213 struct iso2022_decoder *iso);
214 static int detect_coding_iso2022 (struct detection_state *st,
215 CONST unsigned char *src,
217 static void decode_coding_iso2022 (Lstream *decoding,
218 CONST unsigned char *src,
219 unsigned_char_dynarr *dst, unsigned int n);
220 static void encode_coding_iso2022 (Lstream *encoding,
221 CONST unsigned char *src,
222 unsigned_char_dynarr *dst, unsigned int n);
224 static void decode_coding_no_conversion (Lstream *decoding,
225 CONST unsigned char *src,
226 unsigned_char_dynarr *dst,
228 static void encode_coding_no_conversion (Lstream *encoding,
229 CONST unsigned char *src,
230 unsigned_char_dynarr *dst,
232 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
233 unsigned_char_dynarr *dst, unsigned int n);
234 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
235 unsigned_char_dynarr *dst, unsigned int n);
237 typedef struct codesys_prop codesys_prop;
246 Dynarr_declare (codesys_prop);
247 } codesys_prop_dynarr;
249 static const struct lrecord_description codesys_prop_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
254 static const struct struct_description codesys_prop_description = {
255 sizeof(codesys_prop),
256 codesys_prop_description_1
259 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
260 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
264 static const struct struct_description codesys_prop_dynarr_description = {
265 sizeof(codesys_prop_dynarr),
266 codesys_prop_dynarr_description_1
269 codesys_prop_dynarr *the_codesys_prop_dynarr;
271 enum codesys_prop_enum
274 CODESYS_PROP_ISO2022,
279 /************************************************************************/
280 /* Coding system functions */
281 /************************************************************************/
283 static Lisp_Object mark_coding_system (Lisp_Object);
284 static void print_coding_system (Lisp_Object, Lisp_Object, int);
285 static void finalize_coding_system (void *header, int for_disksave);
288 static const struct lrecord_description ccs_description_1[] = {
289 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
293 static const struct struct_description ccs_description = {
294 sizeof(charset_conversion_spec),
298 static const struct lrecord_description ccsd_description_1[] = {
299 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
303 static const struct struct_description ccsd_description = {
304 sizeof(charset_conversion_spec_dynarr),
309 static const struct lrecord_description coding_system_description[] = {
310 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
311 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
312 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
314 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
315 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
316 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
317 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
322 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
323 mark_coding_system, print_coding_system,
324 finalize_coding_system,
325 0, 0, coding_system_description,
326 struct Lisp_Coding_System);
329 mark_coding_system (Lisp_Object obj)
331 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
333 mark_object (CODING_SYSTEM_NAME (codesys));
334 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
335 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
336 mark_object (CODING_SYSTEM_EOL_LF (codesys));
337 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
338 mark_object (CODING_SYSTEM_EOL_CR (codesys));
340 switch (CODING_SYSTEM_TYPE (codesys))
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
347 if (codesys->iso2022.input_conv)
349 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
351 struct charset_conversion_spec *ccs =
352 Dynarr_atp (codesys->iso2022.input_conv, i);
353 mark_object (ccs->from_charset);
354 mark_object (ccs->to_charset);
357 if (codesys->iso2022.output_conv)
359 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
361 struct charset_conversion_spec *ccs =
362 Dynarr_atp (codesys->iso2022.output_conv, i);
363 mark_object (ccs->from_charset);
364 mark_object (ccs->to_charset);
370 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
371 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
378 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
379 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
383 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
386 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
388 error ("printing unreadable object #<coding_system 0x%x>",
391 write_c_string ("#<coding_system ", printcharfun);
392 print_internal (c->name, printcharfun, 1);
393 write_c_string (">", printcharfun);
397 finalize_coding_system (void *header, int for_disksave)
399 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
400 /* Since coding systems never go away, this function is not
401 necessary. But it would be necessary if we changed things
402 so that coding systems could go away. */
403 if (!for_disksave) /* see comment in lstream.c */
405 switch (CODING_SYSTEM_TYPE (c))
408 case CODESYS_ISO2022:
409 if (c->iso2022.input_conv)
411 Dynarr_free (c->iso2022.input_conv);
412 c->iso2022.input_conv = 0;
414 if (c->iso2022.output_conv)
416 Dynarr_free (c->iso2022.output_conv);
417 c->iso2022.output_conv = 0;
428 symbol_to_eol_type (Lisp_Object symbol)
430 CHECK_SYMBOL (symbol);
431 if (NILP (symbol)) return EOL_AUTODETECT;
432 if (EQ (symbol, Qlf)) return EOL_LF;
433 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
434 if (EQ (symbol, Qcr)) return EOL_CR;
436 signal_simple_error ("Unrecognized eol type", symbol);
437 return EOL_AUTODETECT; /* not reached */
441 eol_type_to_symbol (eol_type_t type)
446 case EOL_LF: return Qlf;
447 case EOL_CRLF: return Qcrlf;
448 case EOL_CR: return Qcr;
449 case EOL_AUTODETECT: return Qnil;
454 setup_eol_coding_systems (Lisp_Coding_System *codesys)
456 Lisp_Object codesys_obj;
457 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
458 char *codesys_name = (char *) alloca (len + 7);
460 char *codesys_mnemonic=0;
462 Lisp_Object codesys_name_sym, sub_codesys_obj;
466 XSETCODING_SYSTEM (codesys_obj, codesys);
468 memcpy (codesys_name,
469 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
471 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
473 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
474 codesys_mnemonic = (char *) alloca (mlen + 7);
475 memcpy (codesys_mnemonic,
476 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
479 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
480 strcpy (codesys_name + len, "-" op_sys); \
482 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
483 codesys_name_sym = intern (codesys_name); \
484 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
485 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
487 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
488 build_string (codesys_mnemonic); \
489 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
492 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
493 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
494 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
497 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
498 Return t if OBJECT is a coding system.
499 A coding system is an object that defines how text containing multiple
500 character sets is encoded into a stream of (typically 8-bit) bytes.
501 The coding system is used to decode the stream into a series of
502 characters (which may be from multiple charsets) when the text is read
503 from a file or process, and is used to encode the text back into the
504 same format when it is written out to a file or process.
506 For example, many ISO2022-compliant coding systems (such as Compound
507 Text, which is used for inter-client data under the X Window System)
508 use escape sequences to switch between different charsets -- Japanese
509 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
510 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
511 `make-coding-system' for more information.
513 Coding systems are normally identified using a symbol, and the
514 symbol is accepted in place of the actual coding system object whenever
515 a coding system is called for. (This is similar to how faces work.)
519 return CODING_SYSTEMP (object) ? Qt : Qnil;
522 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
523 Retrieve the coding system of the given name.
525 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
526 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
527 If there is no such coding system, nil is returned. Otherwise the
528 associated coding system object is returned.
530 (coding_system_or_name))
532 if (CODING_SYSTEMP (coding_system_or_name))
533 return coding_system_or_name;
535 if (NILP (coding_system_or_name))
536 coding_system_or_name = Qbinary;
538 CHECK_SYMBOL (coding_system_or_name);
540 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
543 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
545 Same as `find-coding-system' except that if there is no such
546 coding system, an error is signaled instead of returning nil.
550 Lisp_Object coding_system = Ffind_coding_system (name);
552 if (NILP (coding_system))
553 signal_simple_error ("No such coding system", name);
554 return coding_system;
557 /* We store the coding systems in hash tables with the names as the key and the
558 actual coding system object as the value. Occasionally we need to use them
559 in a list format. These routines provide us with that. */
560 struct coding_system_list_closure
562 Lisp_Object *coding_system_list;
566 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
567 void *coding_system_list_closure)
569 /* This function can GC */
570 struct coding_system_list_closure *cscl =
571 (struct coding_system_list_closure *) coding_system_list_closure;
572 Lisp_Object *coding_system_list = cscl->coding_system_list;
574 *coding_system_list = Fcons (key, *coding_system_list);
578 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
579 Return a list of the names of all defined coding systems.
583 Lisp_Object coding_system_list = Qnil;
585 struct coding_system_list_closure coding_system_list_closure;
587 GCPRO1 (coding_system_list);
588 coding_system_list_closure.coding_system_list = &coding_system_list;
589 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
590 &coding_system_list_closure);
593 return coding_system_list;
596 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
597 Return the name of the given coding system.
601 coding_system = Fget_coding_system (coding_system);
602 return XCODING_SYSTEM_NAME (coding_system);
605 static Lisp_Coding_System *
606 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
608 Lisp_Coding_System *codesys =
609 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
611 zero_lcrecord (codesys);
612 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
613 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
614 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
615 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
616 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
617 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
618 CODING_SYSTEM_TYPE (codesys) = type;
619 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
621 if (type == CODESYS_ISO2022)
624 for (i = 0; i < 4; i++)
625 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
627 else if (type == CODESYS_CCL)
629 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
630 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
633 CODING_SYSTEM_NAME (codesys) = name;
639 /* Given a list of charset conversion specs as specified in a Lisp
640 program, parse it into STORE_HERE. */
643 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
644 Lisp_Object spec_list)
648 EXTERNAL_LIST_LOOP (rest, spec_list)
650 Lisp_Object car = XCAR (rest);
651 Lisp_Object from, to;
652 struct charset_conversion_spec spec;
654 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
655 signal_simple_error ("Invalid charset conversion spec", car);
656 from = Fget_charset (XCAR (car));
657 to = Fget_charset (XCAR (XCDR (car)));
658 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
659 signal_simple_error_2
660 ("Attempted conversion between different charset types",
662 spec.from_charset = from;
663 spec.to_charset = to;
665 Dynarr_add (store_here, spec);
669 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
670 specs, return the equivalent as the Lisp programmer would see it.
672 If LOAD_HERE is 0, return Qnil. */
675 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
682 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
684 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
685 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
688 return Fnreverse (result);
693 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
694 Register symbol NAME as a coding system.
696 TYPE describes the conversion method used and should be one of
699 Automatic conversion. XEmacs attempts to detect the coding system
702 No conversion. Use this for binary files and such. On output,
703 graphic characters that are not in ASCII or Latin-1 will be
704 replaced by a ?. (For a no-conversion-encoded buffer, these
705 characters will only be present if you explicitly insert them.)
707 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
709 ISO 10646 UCS-4 encoding.
711 ISO 10646 UTF-8 encoding.
713 Any ISO2022-compliant encoding. Among other things, this includes
714 JIS (the Japanese encoding commonly used for e-mail), EUC (the
715 standard Unix encoding for Japanese and other languages), and
716 Compound Text (the encoding used in X11). You can specify more
717 specific information about the conversion with the FLAGS argument.
719 Big5 (the encoding commonly used for Taiwanese).
721 The conversion is performed using a user-written pseudo-code
722 program. CCL (Code Conversion Language) is the name of this
725 Write out or read in the raw contents of the memory representing
726 the buffer's text. This is primarily useful for debugging
727 purposes, and is only enabled when XEmacs has been compiled with
728 DEBUG_XEMACS defined (via the --debug configure option).
729 WARNING: Reading in a file using 'internal conversion can result
730 in an internal inconsistency in the memory representing a
731 buffer's text, which will produce unpredictable results and may
732 cause XEmacs to crash. Under normal circumstances you should
733 never use 'internal conversion.
735 DOC-STRING is a string describing the coding system.
737 PROPS is a property list, describing the specific nature of the
738 character set. Recognized properties are:
741 String to be displayed in the modeline when this coding system is
745 End-of-line conversion to be used. It should be one of
748 Automatically detect the end-of-line type (LF, CRLF,
749 or CR). Also generate subsidiary coding systems named
750 `NAME-unix', `NAME-dos', and `NAME-mac', that are
751 identical to this coding system but have an EOL-TYPE
752 value of 'lf, 'crlf, and 'cr, respectively.
754 The end of a line is marked externally using ASCII LF.
755 Since this is also the way that XEmacs represents an
756 end-of-line internally, specifying this option results
757 in no end-of-line conversion. This is the standard
758 format for Unix text files.
760 The end of a line is marked externally using ASCII
761 CRLF. This is the standard format for MS-DOS text
764 The end of a line is marked externally using ASCII CR.
765 This is the standard format for Macintosh text files.
767 Automatically detect the end-of-line type but do not
768 generate subsidiary coding systems. (This value is
769 converted to nil when stored internally, and
770 `coding-system-property' will return nil.)
772 'post-read-conversion
773 Function called after a file has been read in, to perform the
774 decoding. Called with two arguments, BEG and END, denoting
775 a region of the current buffer to be decoded.
777 'pre-write-conversion
778 Function called before a file is written out, to perform the
779 encoding. Called with two arguments, BEG and END, denoting
780 a region of the current buffer to be encoded.
783 The following additional properties are recognized if TYPE is 'iso2022:
789 The character set initially designated to the G0 - G3 registers.
790 The value should be one of
792 -- A charset object (designate that character set)
793 -- nil (do not ever use this register)
794 -- t (no character set is initially designated to
795 the register, but may be later on; this automatically
796 sets the corresponding `force-g*-on-output' property)
802 If non-nil, send an explicit designation sequence on output before
803 using the specified register.
806 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
807 "ESC $ B" on output in place of the full designation sequences
808 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
811 If non-nil, don't designate ASCII to G0 at each end of line on output.
812 Setting this to non-nil also suppresses other state-resetting that
813 normally happens at the end of a line.
816 If non-nil, don't designate ASCII to G0 before control chars on output.
819 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
823 If non-nil, use locking-shift (SO/SI) instead of single-shift
824 or designation by escape sequence.
827 If non-nil, don't use ISO6429's direction specification.
830 If non-nil, literal control characters that are the same as
831 the beginning of a recognized ISO2022 or ISO6429 escape sequence
832 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
833 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
834 so that they can be properly distinguished from an escape sequence.
835 (Note that doing this results in a non-portable encoding.) This
836 encoding flag is used for byte-compiled files. Note that ESC
837 is a good choice for a quoting character because there are no
838 escape sequences whose second byte is a character from the Control-0
839 or Control-1 character sets; this is explicitly disallowed by the
842 'input-charset-conversion
843 A list of conversion specifications, specifying conversion of
844 characters in one charset to another when decoding is performed.
845 Each specification is a list of two elements: the source charset,
846 and the destination charset.
848 'output-charset-conversion
849 A list of conversion specifications, specifying conversion of
850 characters in one charset to another when encoding is performed.
851 The form of each specification is the same as for
852 'input-charset-conversion.
855 The following additional properties are recognized (and required)
859 CCL program used for decoding (converting to internal format).
862 CCL program used for encoding (converting to external format).
864 (name, type, doc_string, props))
866 Lisp_Coding_System *codesys;
867 Lisp_Object rest, key, value;
868 enum coding_system_type ty;
869 int need_to_setup_eol_systems = 1;
871 /* Convert type to constant */
872 if (NILP (type) || EQ (type, Qundecided))
873 { ty = CODESYS_AUTODETECT; }
875 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
876 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
877 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
878 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
879 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
880 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
882 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
884 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
887 signal_simple_error ("Invalid coding system type", type);
891 codesys = allocate_coding_system (ty, name);
893 if (NILP (doc_string))
894 doc_string = build_string ("");
896 CHECK_STRING (doc_string);
897 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
899 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
901 if (EQ (key, Qmnemonic))
904 CHECK_STRING (value);
905 CODING_SYSTEM_MNEMONIC (codesys) = value;
908 else if (EQ (key, Qeol_type))
910 need_to_setup_eol_systems = NILP (value);
913 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
916 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
917 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
919 else if (ty == CODESYS_ISO2022)
921 #define FROB_INITIAL_CHARSET(charset_num) \
922 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
923 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
925 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
926 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
927 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
928 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
930 #define FROB_FORCE_CHARSET(charset_num) \
931 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
933 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
934 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
935 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
936 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
938 #define FROB_BOOLEAN_PROPERTY(prop) \
939 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
941 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
942 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
943 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
944 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
945 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
946 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
947 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
949 else if (EQ (key, Qinput_charset_conversion))
951 codesys->iso2022.input_conv =
952 Dynarr_new (charset_conversion_spec);
953 parse_charset_conversion_specs (codesys->iso2022.input_conv,
956 else if (EQ (key, Qoutput_charset_conversion))
958 codesys->iso2022.output_conv =
959 Dynarr_new (charset_conversion_spec);
960 parse_charset_conversion_specs (codesys->iso2022.output_conv,
964 signal_simple_error ("Unrecognized property", key);
966 else if (EQ (type, Qccl))
968 if (EQ (key, Qdecode))
970 CHECK_VECTOR (value);
971 CODING_SYSTEM_CCL_DECODE (codesys) = value;
973 else if (EQ (key, Qencode))
975 CHECK_VECTOR (value);
976 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
979 signal_simple_error ("Unrecognized property", key);
983 signal_simple_error ("Unrecognized property", key);
986 if (need_to_setup_eol_systems)
987 setup_eol_coding_systems (codesys);
990 Lisp_Object codesys_obj;
991 XSETCODING_SYSTEM (codesys_obj, codesys);
992 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
997 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
998 Copy OLD-CODING-SYSTEM to NEW-NAME.
999 If NEW-NAME does not name an existing coding system, a new one will
1002 (old_coding_system, new_name))
1004 Lisp_Object new_coding_system;
1005 old_coding_system = Fget_coding_system (old_coding_system);
1006 new_coding_system = Ffind_coding_system (new_name);
1007 if (NILP (new_coding_system))
1009 XSETCODING_SYSTEM (new_coding_system,
1010 allocate_coding_system
1011 (XCODING_SYSTEM_TYPE (old_coding_system),
1013 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1017 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1018 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1019 memcpy (((char *) to ) + sizeof (to->header),
1020 ((char *) from) + sizeof (from->header),
1021 sizeof (*from) - sizeof (from->header));
1022 to->name = new_name;
1024 return new_coding_system;
1027 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1028 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1030 (alias, coding_system))
1032 CHECK_SYMBOL (alias);
1033 if (!NILP (Ffind_coding_system (alias)))
1034 signal_simple_error ("Symbol already names a coding system", alias);
1035 coding_system = Fget_coding_system (coding_system);
1036 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1038 /* Set up aliases for subsidiaries. */
1039 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1042 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1043 #define FROB(type, name) \
1045 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1046 if (!NILP (subsidiary)) \
1047 Fdefine_coding_system_alias \
1048 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1051 FROB (CRLF, "-dos");
1055 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1056 but it doesn't look intentional, so I'd rather return something
1057 meaningful or nothing at all. */
1062 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1064 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1065 Lisp_Object new_coding_system;
1067 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1068 return coding_system;
1072 case EOL_AUTODETECT: return coding_system;
1073 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1074 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1075 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1079 return NILP (new_coding_system) ? coding_system : new_coding_system;
1082 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1083 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1085 (coding_system, eol_type))
1087 coding_system = Fget_coding_system (coding_system);
1089 return subsidiary_coding_system (coding_system,
1090 symbol_to_eol_type (eol_type));
1094 /************************************************************************/
1095 /* Coding system accessors */
1096 /************************************************************************/
1098 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1099 Return the doc string for CODING-SYSTEM.
1103 coding_system = Fget_coding_system (coding_system);
1104 return XCODING_SYSTEM_DOC_STRING (coding_system);
1107 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1108 Return the type of CODING-SYSTEM.
1112 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1115 case CODESYS_AUTODETECT: return Qundecided;
1117 case CODESYS_SHIFT_JIS: return Qshift_jis;
1118 case CODESYS_ISO2022: return Qiso2022;
1119 case CODESYS_BIG5: return Qbig5;
1120 case CODESYS_UCS4: return Qucs4;
1121 case CODESYS_UTF8: return Qutf8;
1122 case CODESYS_CCL: return Qccl;
1124 case CODESYS_NO_CONVERSION: return Qno_conversion;
1126 case CODESYS_INTERNAL: return Qinternal;
1133 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1136 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1138 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1141 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1142 Return initial charset of CODING-SYSTEM designated to GNUM.
1145 (coding_system, gnum))
1147 coding_system = Fget_coding_system (coding_system);
1150 return coding_system_charset (coding_system, XINT (gnum));
1154 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1155 Return the PROP property of CODING-SYSTEM.
1157 (coding_system, prop))
1160 enum coding_system_type type;
1162 coding_system = Fget_coding_system (coding_system);
1163 CHECK_SYMBOL (prop);
1164 type = XCODING_SYSTEM_TYPE (coding_system);
1166 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1167 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1170 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1172 case CODESYS_PROP_ALL_OK:
1175 case CODESYS_PROP_ISO2022:
1176 if (type != CODESYS_ISO2022)
1178 ("Property only valid in ISO2022 coding systems",
1182 case CODESYS_PROP_CCL:
1183 if (type != CODESYS_CCL)
1185 ("Property only valid in CCL coding systems",
1195 signal_simple_error ("Unrecognized property", prop);
1197 if (EQ (prop, Qname))
1198 return XCODING_SYSTEM_NAME (coding_system);
1199 else if (EQ (prop, Qtype))
1200 return Fcoding_system_type (coding_system);
1201 else if (EQ (prop, Qdoc_string))
1202 return XCODING_SYSTEM_DOC_STRING (coding_system);
1203 else if (EQ (prop, Qmnemonic))
1204 return XCODING_SYSTEM_MNEMONIC (coding_system);
1205 else if (EQ (prop, Qeol_type))
1206 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1207 else if (EQ (prop, Qeol_lf))
1208 return XCODING_SYSTEM_EOL_LF (coding_system);
1209 else if (EQ (prop, Qeol_crlf))
1210 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1211 else if (EQ (prop, Qeol_cr))
1212 return XCODING_SYSTEM_EOL_CR (coding_system);
1213 else if (EQ (prop, Qpost_read_conversion))
1214 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1215 else if (EQ (prop, Qpre_write_conversion))
1216 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1218 else if (type == CODESYS_ISO2022)
1220 if (EQ (prop, Qcharset_g0))
1221 return coding_system_charset (coding_system, 0);
1222 else if (EQ (prop, Qcharset_g1))
1223 return coding_system_charset (coding_system, 1);
1224 else if (EQ (prop, Qcharset_g2))
1225 return coding_system_charset (coding_system, 2);
1226 else if (EQ (prop, Qcharset_g3))
1227 return coding_system_charset (coding_system, 3);
1229 #define FORCE_CHARSET(charset_num) \
1230 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1231 (coding_system, charset_num) ? Qt : Qnil)
1233 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1234 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1235 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1236 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1238 #define LISP_BOOLEAN(prop) \
1239 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1241 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1242 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1243 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1244 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1245 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1246 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1247 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1249 else if (EQ (prop, Qinput_charset_conversion))
1251 unparse_charset_conversion_specs
1252 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1253 else if (EQ (prop, Qoutput_charset_conversion))
1255 unparse_charset_conversion_specs
1256 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1260 else if (type == CODESYS_CCL)
1262 if (EQ (prop, Qdecode))
1263 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1264 else if (EQ (prop, Qencode))
1265 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1273 return Qnil; /* not reached */
1277 /************************************************************************/
1278 /* Coding category functions */
1279 /************************************************************************/
1282 decode_coding_category (Lisp_Object symbol)
1286 CHECK_SYMBOL (symbol);
1287 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1288 if (EQ (coding_category_symbol[i], symbol))
1291 signal_simple_error ("Unrecognized coding category", symbol);
1292 return 0; /* not reached */
1295 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1296 Return a list of all recognized coding categories.
1301 Lisp_Object list = Qnil;
1303 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1304 list = Fcons (coding_category_symbol[i], list);
1308 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1309 Change the priority order of the coding categories.
1310 LIST should be list of coding categories, in descending order of
1311 priority. Unspecified coding categories will be lower in priority
1312 than all specified ones, in the same relative order they were in
1317 int category_to_priority[CODING_CATEGORY_LAST + 1];
1321 /* First generate a list that maps coding categories to priorities. */
1323 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324 category_to_priority[i] = -1;
1326 /* Highest priority comes from the specified list. */
1328 EXTERNAL_LIST_LOOP (rest, list)
1330 int cat = decode_coding_category (XCAR (rest));
1332 if (category_to_priority[cat] >= 0)
1333 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1334 category_to_priority[cat] = i++;
1337 /* Now go through the existing categories by priority to retrieve
1338 the categories not yet specified and preserve their priority
1340 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1342 int cat = fcd->coding_category_by_priority[j];
1343 if (category_to_priority[cat] < 0)
1344 category_to_priority[cat] = i++;
1347 /* Now we need to construct the inverse of the mapping we just
1350 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1351 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1353 /* Phew! That was confusing. */
1357 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1358 Return a list of coding categories in descending order of priority.
1363 Lisp_Object list = Qnil;
1365 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1366 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1371 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1372 Change the coding system associated with a coding category.
1374 (coding_category, coding_system))
1376 int cat = decode_coding_category (coding_category);
1378 coding_system = Fget_coding_system (coding_system);
1379 fcd->coding_category_system[cat] = coding_system;
1383 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1384 Return the coding system associated with a coding category.
1388 int cat = decode_coding_category (coding_category);
1389 Lisp_Object sys = fcd->coding_category_system[cat];
1392 return XCODING_SYSTEM_NAME (sys);
1397 /************************************************************************/
1398 /* Detecting the encoding of data */
1399 /************************************************************************/
1401 struct detection_state
1403 eol_type_t eol_type;
1439 struct iso2022_decoder iso;
1441 int high_byte_count;
1442 unsigned int saw_single_shift:1;
1455 acceptable_control_char_p (int c)
1459 /* Allow and ignore control characters that you might
1460 reasonably see in a text file */
1465 case 8: /* backspace */
1466 case 11: /* vertical tab */
1467 case 12: /* form feed */
1468 case 26: /* MS-DOS C-z junk */
1469 case 31: /* '^_' -- for info */
1477 mask_has_at_most_one_bit_p (int mask)
1479 /* Perhaps the only thing useful you learn from intensive Microsoft
1480 technical interviews */
1481 return (mask & (mask - 1)) == 0;
1485 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1495 if (st->eol.just_saw_cr)
1497 else if (st->eol.seen_anything)
1500 else if (st->eol.just_saw_cr)
1503 st->eol.just_saw_cr = 1;
1505 st->eol.just_saw_cr = 0;
1506 st->eol.seen_anything = 1;
1509 return EOL_AUTODETECT;
1512 /* Attempt to determine the encoding and EOL type of the given text.
1513 Before calling this function for the first type, you must initialize
1514 st->eol_type as appropriate and initialize st->mask to ~0.
1516 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1519 st->mask holds the determined coding category mask, or ~0 if only
1520 ASCII has been seen so far.
1524 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1525 is present in st->mask
1526 1 == definitive answers are here for both st->eol_type and st->mask
1530 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1531 unsigned int n, int just_do_eol)
1535 if (st->eol_type == EOL_AUTODETECT)
1536 st->eol_type = detect_eol_type (st, src, n);
1539 return st->eol_type != EOL_AUTODETECT;
1541 if (!st->seen_non_ascii)
1543 for (; n; n--, src++)
1546 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1548 st->seen_non_ascii = 1;
1550 st->shift_jis.mask = ~0;
1554 st->iso2022.mask = ~0;
1564 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1565 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1566 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1567 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1568 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1569 st->big5.mask = detect_coding_big5 (st, src, n);
1570 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1571 st->utf8.mask = detect_coding_utf8 (st, src, n);
1572 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1573 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1576 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1577 | st->utf8.mask | st->ucs4.mask;
1580 int retval = mask_has_at_most_one_bit_p (st->mask);
1581 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1582 return retval && st->eol_type != EOL_AUTODETECT;
1587 coding_system_from_mask (int mask)
1591 /* If the file was entirely or basically ASCII, use the
1592 default value of `buffer-file-coding-system'. */
1593 Lisp_Object retval =
1594 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1597 retval = Ffind_coding_system (retval);
1601 (Qbad_variable, Qwarning,
1602 "Invalid `default-buffer-file-coding-system', set to nil");
1603 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1607 retval = Fget_coding_system (Qraw_text);
1615 mask = postprocess_iso2022_mask (mask);
1617 /* Look through the coding categories by priority and find
1618 the first one that is allowed. */
1619 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1621 cat = fcd->coding_category_by_priority[i];
1622 if ((mask & (1 << cat)) &&
1623 !NILP (fcd->coding_category_system[cat]))
1627 return fcd->coding_category_system[cat];
1629 return Fget_coding_system (Qraw_text);
1633 /* Given a seekable read stream and potential coding system and EOL type
1634 as specified, do any autodetection that is called for. If the
1635 coding system and/or EOL type are not `autodetect', they will be left
1636 alone; but this function will never return an autodetect coding system
1639 This function does not automatically fetch subsidiary coding systems;
1640 that should be unnecessary with the explicit eol-type argument. */
1642 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1645 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1646 eol_type_t *eol_type_in_out)
1648 struct detection_state decst;
1650 if (*eol_type_in_out == EOL_AUTODETECT)
1651 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1654 decst.eol_type = *eol_type_in_out;
1657 /* If autodetection is called for, do it now. */
1658 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1659 || *eol_type_in_out == EOL_AUTODETECT)
1662 Lisp_Object coding_system = Qnil;
1664 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1667 /* Look for initial "-*-"; mode line prefix */
1669 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1674 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1676 Extbyte *local_vars_beg = p + 3;
1677 /* Look for final "-*-"; mode line suffix */
1678 for (p = local_vars_beg,
1679 scan_end = buf + nread - LENGTH ("-*-");
1684 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1686 Extbyte *suffix = p;
1687 /* Look for "coding:" */
1688 for (p = local_vars_beg,
1689 scan_end = suffix - LENGTH ("coding:?");
1692 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1693 && (p == local_vars_beg
1694 || (*(p-1) == ' ' ||
1700 p += LENGTH ("coding:");
1701 while (*p == ' ' || *p == '\t') p++;
1703 /* Get coding system name */
1704 save = *suffix; *suffix = '\0';
1705 /* Characters valid in a MIME charset name (rfc 1521),
1706 and in a Lisp symbol name. */
1707 n = strspn ( (char *) p,
1708 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1709 "abcdefghijklmnopqrstuvwxyz"
1715 save = p[n]; p[n] = '\0';
1717 Ffind_coding_system (intern ((char *) p));
1727 if (NILP (coding_system))
1730 if (detect_coding_type (&decst, buf, nread,
1731 XCODING_SYSTEM_TYPE (*codesys_in_out)
1732 != CODESYS_AUTODETECT))
1734 nread = Lstream_read (stream, buf, sizeof (buf));
1740 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1741 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1744 if (detect_coding_type (&decst, buf, nread, 1))
1746 nread = Lstream_read (stream, buf, sizeof (buf));
1752 *eol_type_in_out = decst.eol_type;
1753 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1755 if (NILP (coding_system))
1756 *codesys_in_out = coding_system_from_mask (decst.mask);
1758 *codesys_in_out = coding_system;
1762 /* If we absolutely can't determine the EOL type, just assume LF. */
1763 if (*eol_type_in_out == EOL_AUTODETECT)
1764 *eol_type_in_out = EOL_LF;
1766 Lstream_rewind (stream);
1769 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1770 Detect coding system of the text in the region between START and END.
1771 Returned a list of possible coding systems ordered by priority.
1772 If only ASCII characters are found, it returns 'undecided or one of
1773 its subsidiary coding systems according to a detected end-of-line
1774 type. Optional arg BUFFER defaults to the current buffer.
1776 (start, end, buffer))
1778 Lisp_Object val = Qnil;
1779 struct buffer *buf = decode_buffer (buffer, 0);
1781 Lisp_Object instream, lb_instream;
1782 Lstream *istr, *lb_istr;
1783 struct detection_state decst;
1784 struct gcpro gcpro1, gcpro2;
1786 get_buffer_range_char (buf, start, end, &b, &e, 0);
1787 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1788 lb_istr = XLSTREAM (lb_instream);
1789 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1790 istr = XLSTREAM (instream);
1791 GCPRO2 (instream, lb_instream);
1793 decst.eol_type = EOL_AUTODETECT;
1797 unsigned char random_buffer[4096];
1798 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1802 if (detect_coding_type (&decst, random_buffer, nread, 0))
1806 if (decst.mask == ~0)
1807 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1815 decst.mask = postprocess_iso2022_mask (decst.mask);
1817 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1819 int sys = fcd->coding_category_by_priority[i];
1820 if (decst.mask & (1 << sys))
1822 Lisp_Object codesys = fcd->coding_category_system[sys];
1823 if (!NILP (codesys))
1824 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1825 val = Fcons (codesys, val);
1829 Lstream_close (istr);
1831 Lstream_delete (istr);
1832 Lstream_delete (lb_istr);
1837 /************************************************************************/
1838 /* Converting to internal Mule format ("decoding") */
1839 /************************************************************************/
1841 /* A decoding stream is a stream used for decoding text (i.e.
1842 converting from some external format to internal format).
1843 The decoding-stream object keeps track of the actual coding
1844 stream, the stream that is at the other end, and data that
1845 needs to be persistent across the lifetime of the stream. */
1847 /* Handle the EOL stuff related to just-read-in character C.
1848 EOL_TYPE is the EOL type of the coding stream.
1849 FLAGS is the current value of FLAGS in the coding stream, and may
1850 be modified by this macro. (The macro only looks at the
1851 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1852 bytes are to be written. You need to also define a local goto
1853 label "label_continue_loop" that is at the end of the main
1854 character-reading loop.
1856 If C is a CR character, then this macro handles it entirely and
1857 jumps to label_continue_loop. Otherwise, this macro does not add
1858 anything to DST, and continues normally. You should continue
1859 processing C normally after this macro. */
1861 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1865 if (eol_type == EOL_CR) \
1866 Dynarr_add (dst, '\n'); \
1867 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1868 Dynarr_add (dst, c); \
1870 flags |= CODING_STATE_CR; \
1871 goto label_continue_loop; \
1873 else if (flags & CODING_STATE_CR) \
1874 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1876 Dynarr_add (dst, '\r'); \
1877 flags &= ~CODING_STATE_CR; \
1881 /* C should be a binary character in the range 0 - 255; convert
1882 to internal format and add to Dynarr DST. */
1884 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1886 if (BYTE_ASCII_P (c)) \
1887 Dynarr_add (dst, c); \
1888 else if (BYTE_C1_P (c)) \
1890 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1891 Dynarr_add (dst, c + 0x20); \
1895 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1896 Dynarr_add (dst, c); \
1900 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1904 DECODE_ADD_BINARY_CHAR (ch, dst); \
1909 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1911 if (flags & CODING_STATE_END) \
1913 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1914 if (flags & CODING_STATE_CR) \
1915 Dynarr_add (dst, '\r'); \
1919 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1921 struct decoding_stream
1923 /* Coding system that governs the conversion. */
1924 Lisp_Coding_System *codesys;
1926 /* Stream that we read the encoded data from or
1927 write the decoded data to. */
1930 /* If we are reading, then we can return only a fixed amount of
1931 data, so if the conversion resulted in too much data, we store it
1932 here for retrieval the next time around. */
1933 unsigned_char_dynarr *runoff;
1935 /* FLAGS holds flags indicating the current state of the decoding.
1936 Some of these flags are dependent on the coding system. */
1939 /* CH holds a partially built-up character. Since we only deal
1940 with one- and two-byte characters at the moment, we only use
1941 this to store the first byte of a two-byte character. */
1944 /* EOL_TYPE specifies the type of end-of-line conversion that
1945 currently applies. We need to keep this separate from the
1946 EOL type stored in CODESYS because the latter might indicate
1947 automatic EOL-type detection while the former will always
1948 indicate a particular EOL type. */
1949 eol_type_t eol_type;
1951 /* Additional ISO2022 information. We define the structure above
1952 because it's also needed by the detection routines. */
1953 struct iso2022_decoder iso2022;
1955 /* Additional information (the state of the running CCL program)
1956 used by the CCL decoder. */
1957 struct ccl_program ccl;
1959 /* counter for UTF-8 or UCS-4 */
1960 unsigned char counter;
1962 struct detection_state decst;
1965 static ssize_t decoding_reader (Lstream *stream,
1966 unsigned char *data, size_t size);
1967 static ssize_t decoding_writer (Lstream *stream,
1968 CONST unsigned char *data, size_t size);
1969 static int decoding_rewinder (Lstream *stream);
1970 static int decoding_seekable_p (Lstream *stream);
1971 static int decoding_flusher (Lstream *stream);
1972 static int decoding_closer (Lstream *stream);
1974 static Lisp_Object decoding_marker (Lisp_Object stream);
1976 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1977 sizeof (struct decoding_stream));
1980 decoding_marker (Lisp_Object stream)
1982 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1983 Lisp_Object str_obj;
1985 /* We do not need to mark the coding systems or charsets stored
1986 within the stream because they are stored in a global list
1987 and automatically marked. */
1989 XSETLSTREAM (str_obj, str);
1990 mark_object (str_obj);
1991 if (str->imp->marker)
1992 return (str->imp->marker) (str_obj);
1997 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1998 so we read data from the other end, decode it, and store it into DATA. */
2001 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2003 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2004 unsigned char *orig_data = data;
2006 int error_occurred = 0;
2008 /* We need to interface to mule_decode(), which expects to take some
2009 amount of data and store the result into a Dynarr. We have
2010 mule_decode() store into str->runoff, and take data from there
2013 /* We loop until we have enough data, reading chunks from the other
2014 end and decoding it. */
2017 /* Take data from the runoff if we can. Make sure to take at
2018 most SIZE bytes, and delete the data from the runoff. */
2019 if (Dynarr_length (str->runoff) > 0)
2021 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2022 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2023 Dynarr_delete_many (str->runoff, 0, chunk);
2029 break; /* No more room for data */
2031 if (str->flags & CODING_STATE_END)
2032 /* This means that on the previous iteration, we hit the EOF on
2033 the other end. We loop once more so that mule_decode() can
2034 output any final stuff it may be holding, or any "go back
2035 to a sane state" escape sequences. (This latter makes sense
2036 during encoding.) */
2039 /* Exhausted the runoff, so get some more. DATA has at least
2040 SIZE bytes left of storage in it, so it's OK to read directly
2041 into it. (We'll be overwriting above, after we've decoded it
2042 into the runoff.) */
2043 read_size = Lstream_read (str->other_end, data, size);
2050 /* There might be some more end data produced in the translation.
2051 See the comment above. */
2052 str->flags |= CODING_STATE_END;
2053 mule_decode (stream, data, str->runoff, read_size);
2056 if (data - orig_data == 0)
2057 return error_occurred ? -1 : 0;
2059 return data - orig_data;
2063 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2065 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2068 /* Decode all our data into the runoff, and then attempt to write
2069 it all out to the other end. Remove whatever chunk we succeeded
2071 mule_decode (stream, data, str->runoff, size);
2072 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2073 Dynarr_length (str->runoff));
2075 Dynarr_delete_many (str->runoff, 0, retval);
2076 /* Do NOT return retval. The return value indicates how much
2077 of the incoming data was written, not how many bytes were
2083 reset_decoding_stream (struct decoding_stream *str)
2086 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2088 Lisp_Object coding_system;
2089 XSETCODING_SYSTEM (coding_system, str->codesys);
2090 reset_iso2022 (coding_system, &str->iso2022);
2092 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2094 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2098 str->flags = str->ch = 0;
2102 decoding_rewinder (Lstream *stream)
2104 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2105 reset_decoding_stream (str);
2106 Dynarr_reset (str->runoff);
2107 return Lstream_rewind (str->other_end);
2111 decoding_seekable_p (Lstream *stream)
2113 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2114 return Lstream_seekable_p (str->other_end);
2118 decoding_flusher (Lstream *stream)
2120 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2121 return Lstream_flush (str->other_end);
2125 decoding_closer (Lstream *stream)
2127 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2128 if (stream->flags & LSTREAM_FL_WRITE)
2130 str->flags |= CODING_STATE_END;
2131 decoding_writer (stream, 0, 0);
2133 Dynarr_free (str->runoff);
2135 #ifdef ENABLE_COMPOSITE_CHARS
2136 if (str->iso2022.composite_chars)
2137 Dynarr_free (str->iso2022.composite_chars);
2140 return Lstream_close (str->other_end);
2144 decoding_stream_coding_system (Lstream *stream)
2146 Lisp_Object coding_system;
2147 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2149 XSETCODING_SYSTEM (coding_system, str->codesys);
2150 return subsidiary_coding_system (coding_system, str->eol_type);
2154 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2156 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2157 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2159 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2160 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2161 reset_decoding_stream (str);
2164 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2165 stream for writing, no automatic code detection will be performed.
2166 The reason for this is that automatic code detection requires a
2167 seekable input. Things will also fail if you open a decoding
2168 stream for reading using a non-fully-specified coding system and
2169 a non-seekable input stream. */
2172 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2175 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2176 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2180 str->other_end = stream;
2181 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2182 str->eol_type = EOL_AUTODETECT;
2183 if (!strcmp (mode, "r")
2184 && Lstream_seekable_p (stream))
2185 /* We can determine the coding system now. */
2186 determine_real_coding_system (stream, &codesys, &str->eol_type);
2187 set_decoding_stream_coding_system (lstr, codesys);
2188 str->decst.eol_type = str->eol_type;
2189 str->decst.mask = ~0;
2190 XSETLSTREAM (obj, lstr);
2195 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2197 return make_decoding_stream_1 (stream, codesys, "r");
2201 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2203 return make_decoding_stream_1 (stream, codesys, "w");
2206 /* Note: the decode_coding_* functions all take the same
2207 arguments as mule_decode(), which is to say some SRC data of
2208 size N, which is to be stored into dynamic array DST.
2209 DECODING is the stream within which the decoding is
2210 taking place, but no data is actually read from or
2211 written to that stream; that is handled in decoding_reader()
2212 or decoding_writer(). This allows the same functions to
2213 be used for both reading and writing. */
2216 mule_decode (Lstream *decoding, CONST unsigned char *src,
2217 unsigned_char_dynarr *dst, unsigned int n)
2219 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2221 /* If necessary, do encoding-detection now. We do this when
2222 we're a writing stream or a non-seekable reading stream,
2223 meaning that we can't just process the whole input,
2224 rewind, and start over. */
2226 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2227 str->eol_type == EOL_AUTODETECT)
2229 Lisp_Object codesys;
2231 XSETCODING_SYSTEM (codesys, str->codesys);
2232 detect_coding_type (&str->decst, src, n,
2233 CODING_SYSTEM_TYPE (str->codesys) !=
2234 CODESYS_AUTODETECT);
2235 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2236 str->decst.mask != ~0)
2237 /* #### This is cheesy. What we really ought to do is
2238 buffer up a certain amount of data so as to get a
2239 less random result. */
2240 codesys = coding_system_from_mask (str->decst.mask);
2241 str->eol_type = str->decst.eol_type;
2242 if (XCODING_SYSTEM (codesys) != str->codesys)
2244 /* Preserve the CODING_STATE_END flag in case it was set.
2245 If we erase it, bad things might happen. */
2246 int was_end = str->flags & CODING_STATE_END;
2247 set_decoding_stream_coding_system (decoding, codesys);
2249 str->flags |= CODING_STATE_END;
2253 switch (CODING_SYSTEM_TYPE (str->codesys))
2256 case CODESYS_INTERNAL:
2257 Dynarr_add_many (dst, src, n);
2260 case CODESYS_AUTODETECT:
2261 /* If we got this far and still haven't decided on the coding
2262 system, then do no conversion. */
2263 case CODESYS_NO_CONVERSION:
2264 decode_coding_no_conversion (decoding, src, dst, n);
2267 case CODESYS_SHIFT_JIS:
2268 decode_coding_sjis (decoding, src, dst, n);
2271 decode_coding_big5 (decoding, src, dst, n);
2274 decode_coding_ucs4 (decoding, src, dst, n);
2277 decode_coding_utf8 (decoding, src, dst, n);
2280 str->ccl.last_block = str->flags & CODING_STATE_END;
2281 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2283 case CODESYS_ISO2022:
2284 decode_coding_iso2022 (decoding, src, dst, n);
2292 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2293 Decode the text between START and END which is encoded in CODING-SYSTEM.
2294 This is useful if you've read in encoded text from a file without decoding
2295 it (e.g. you read in a JIS-formatted file but used the `binary' or
2296 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2297 Return length of decoded text.
2298 BUFFER defaults to the current buffer if unspecified.
2300 (start, end, coding_system, buffer))
2303 struct buffer *buf = decode_buffer (buffer, 0);
2304 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2305 Lstream *istr, *ostr;
2306 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2308 get_buffer_range_char (buf, start, end, &b, &e, 0);
2310 barf_if_buffer_read_only (buf, b, e);
2312 coding_system = Fget_coding_system (coding_system);
2313 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2314 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2315 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2317 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2318 Fget_coding_system (Qbinary));
2319 istr = XLSTREAM (instream);
2320 ostr = XLSTREAM (outstream);
2321 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2323 /* The chain of streams looks like this:
2325 [BUFFER] <----- send through
2326 ------> [ENCODE AS BINARY]
2327 ------> [DECODE AS SPECIFIED]
2333 char tempbuf[1024]; /* some random amount */
2334 Bufpos newpos, even_newer_pos;
2335 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2336 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2340 newpos = lisp_buffer_stream_startpos (istr);
2341 Lstream_write (ostr, tempbuf, size_in_bytes);
2342 even_newer_pos = lisp_buffer_stream_startpos (istr);
2343 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2346 Lstream_close (istr);
2347 Lstream_close (ostr);
2349 Lstream_delete (istr);
2350 Lstream_delete (ostr);
2351 Lstream_delete (XLSTREAM (de_outstream));
2352 Lstream_delete (XLSTREAM (lb_outstream));
2357 /************************************************************************/
2358 /* Converting to an external encoding ("encoding") */
2359 /************************************************************************/
2361 /* An encoding stream is an output stream. When you create the
2362 stream, you specify the coding system that governs the encoding
2363 and another stream that the resulting encoded data is to be
2364 sent to, and then start sending data to it. */
2366 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2368 struct encoding_stream
2370 /* Coding system that governs the conversion. */
2371 Lisp_Coding_System *codesys;
2373 /* Stream that we read the encoded data from or
2374 write the decoded data to. */
2377 /* If we are reading, then we can return only a fixed amount of
2378 data, so if the conversion resulted in too much data, we store it
2379 here for retrieval the next time around. */
2380 unsigned_char_dynarr *runoff;
2382 /* FLAGS holds flags indicating the current state of the encoding.
2383 Some of these flags are dependent on the coding system. */
2386 /* CH holds a partially built-up character. Since we only deal
2387 with one- and two-byte characters at the moment, we only use
2388 this to store the first byte of a two-byte character. */
2391 /* Additional information used by the ISO2022 encoder. */
2394 /* CHARSET holds the character sets currently assigned to the G0
2395 through G3 registers. It is initialized from the array
2396 INITIAL_CHARSET in CODESYS. */
2397 Lisp_Object charset[4];
2399 /* Which registers are currently invoked into the left (GL) and
2400 right (GR) halves of the 8-bit encoding space? */
2401 int register_left, register_right;
2403 /* Whether we need to explicitly designate the charset in the
2404 G? register before using it. It is initialized from the
2405 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2406 unsigned char force_charset_on_output[4];
2408 /* Other state variables that need to be preserved across
2410 Lisp_Object current_charset;
2412 int current_char_boundary;
2415 /* Additional information (the state of the running CCL program)
2416 used by the CCL encoder. */
2417 struct ccl_program ccl;
2421 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2422 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2424 static int encoding_rewinder (Lstream *stream);
2425 static int encoding_seekable_p (Lstream *stream);
2426 static int encoding_flusher (Lstream *stream);
2427 static int encoding_closer (Lstream *stream);
2429 static Lisp_Object encoding_marker (Lisp_Object stream);
2431 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2432 sizeof (struct encoding_stream));
2435 encoding_marker (Lisp_Object stream)
2437 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2438 Lisp_Object str_obj;
2440 /* We do not need to mark the coding systems or charsets stored
2441 within the stream because they are stored in a global list
2442 and automatically marked. */
2444 XSETLSTREAM (str_obj, str);
2445 mark_object (str_obj);
2446 if (str->imp->marker)
2447 return (str->imp->marker) (str_obj);
2452 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2453 so we read data from the other end, encode it, and store it into DATA. */
2456 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2458 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2459 unsigned char *orig_data = data;
2461 int error_occurred = 0;
2463 /* We need to interface to mule_encode(), which expects to take some
2464 amount of data and store the result into a Dynarr. We have
2465 mule_encode() store into str->runoff, and take data from there
2468 /* We loop until we have enough data, reading chunks from the other
2469 end and encoding it. */
2472 /* Take data from the runoff if we can. Make sure to take at
2473 most SIZE bytes, and delete the data from the runoff. */
2474 if (Dynarr_length (str->runoff) > 0)
2476 int chunk = min ((int) size, Dynarr_length (str->runoff));
2477 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2478 Dynarr_delete_many (str->runoff, 0, chunk);
2484 break; /* No more room for data */
2486 if (str->flags & CODING_STATE_END)
2487 /* This means that on the previous iteration, we hit the EOF on
2488 the other end. We loop once more so that mule_encode() can
2489 output any final stuff it may be holding, or any "go back
2490 to a sane state" escape sequences. (This latter makes sense
2491 during encoding.) */
2494 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2495 left of storage in it, so it's OK to read directly into it.
2496 (We'll be overwriting above, after we've encoded it into the
2498 read_size = Lstream_read (str->other_end, data, size);
2505 /* There might be some more end data produced in the translation.
2506 See the comment above. */
2507 str->flags |= CODING_STATE_END;
2508 mule_encode (stream, data, str->runoff, read_size);
2511 if (data == orig_data)
2512 return error_occurred ? -1 : 0;
2514 return data - orig_data;
2518 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2520 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2523 /* Encode all our data into the runoff, and then attempt to write
2524 it all out to the other end. Remove whatever chunk we succeeded
2526 mule_encode (stream, data, str->runoff, size);
2527 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2528 Dynarr_length (str->runoff));
2530 Dynarr_delete_many (str->runoff, 0, retval);
2531 /* Do NOT return retval. The return value indicates how much
2532 of the incoming data was written, not how many bytes were
2538 reset_encoding_stream (struct encoding_stream *str)
2541 switch (CODING_SYSTEM_TYPE (str->codesys))
2543 case CODESYS_ISO2022:
2547 for (i = 0; i < 4; i++)
2549 str->iso2022.charset[i] =
2550 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2551 str->iso2022.force_charset_on_output[i] =
2552 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2554 str->iso2022.register_left = 0;
2555 str->iso2022.register_right = 1;
2556 str->iso2022.current_charset = Qnil;
2557 str->iso2022.current_half = 0;
2558 str->iso2022.current_char_boundary = 1;
2562 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2569 str->flags = str->ch = 0;
2573 encoding_rewinder (Lstream *stream)
2575 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2576 reset_encoding_stream (str);
2577 Dynarr_reset (str->runoff);
2578 return Lstream_rewind (str->other_end);
2582 encoding_seekable_p (Lstream *stream)
2584 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2585 return Lstream_seekable_p (str->other_end);
2589 encoding_flusher (Lstream *stream)
2591 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2592 return Lstream_flush (str->other_end);
2596 encoding_closer (Lstream *stream)
2598 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599 if (stream->flags & LSTREAM_FL_WRITE)
2601 str->flags |= CODING_STATE_END;
2602 encoding_writer (stream, 0, 0);
2604 Dynarr_free (str->runoff);
2605 return Lstream_close (str->other_end);
2609 encoding_stream_coding_system (Lstream *stream)
2611 Lisp_Object coding_system;
2612 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2614 XSETCODING_SYSTEM (coding_system, str->codesys);
2615 return coding_system;
2619 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2621 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2622 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2624 reset_encoding_stream (str);
2628 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2631 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2632 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2636 str->runoff = Dynarr_new (unsigned_char);
2637 str->other_end = stream;
2638 set_encoding_stream_coding_system (lstr, codesys);
2639 XSETLSTREAM (obj, lstr);
2644 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2646 return make_encoding_stream_1 (stream, codesys, "r");
2650 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2652 return make_encoding_stream_1 (stream, codesys, "w");
2655 /* Convert N bytes of internally-formatted data stored in SRC to an
2656 external format, according to the encoding stream ENCODING.
2657 Store the encoded data into DST. */
2660 mule_encode (Lstream *encoding, CONST unsigned char *src,
2661 unsigned_char_dynarr *dst, unsigned int n)
2663 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2665 switch (CODING_SYSTEM_TYPE (str->codesys))
2668 case CODESYS_INTERNAL:
2669 Dynarr_add_many (dst, src, n);
2672 case CODESYS_AUTODETECT:
2673 /* If we got this far and still haven't decided on the coding
2674 system, then do no conversion. */
2675 case CODESYS_NO_CONVERSION:
2676 encode_coding_no_conversion (encoding, src, dst, n);
2679 case CODESYS_SHIFT_JIS:
2680 encode_coding_sjis (encoding, src, dst, n);
2683 encode_coding_big5 (encoding, src, dst, n);
2686 encode_coding_ucs4 (encoding, src, dst, n);
2689 encode_coding_utf8 (encoding, src, dst, n);
2692 str->ccl.last_block = str->flags & CODING_STATE_END;
2693 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2695 case CODESYS_ISO2022:
2696 encode_coding_iso2022 (encoding, src, dst, n);
2704 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2705 Encode the text between START and END using CODING-SYSTEM.
2706 This will, for example, convert Japanese characters into stuff such as
2707 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2708 text. BUFFER defaults to the current buffer if unspecified.
2710 (start, end, coding_system, buffer))
2713 struct buffer *buf = decode_buffer (buffer, 0);
2714 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2715 Lstream *istr, *ostr;
2716 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2718 get_buffer_range_char (buf, start, end, &b, &e, 0);
2720 barf_if_buffer_read_only (buf, b, e);
2722 coding_system = Fget_coding_system (coding_system);
2723 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2724 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2725 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2726 Fget_coding_system (Qbinary));
2727 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2729 istr = XLSTREAM (instream);
2730 ostr = XLSTREAM (outstream);
2731 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2732 /* The chain of streams looks like this:
2734 [BUFFER] <----- send through
2735 ------> [ENCODE AS SPECIFIED]
2736 ------> [DECODE AS BINARY]
2741 char tempbuf[1024]; /* some random amount */
2742 Bufpos newpos, even_newer_pos;
2743 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2744 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2748 newpos = lisp_buffer_stream_startpos (istr);
2749 Lstream_write (ostr, tempbuf, size_in_bytes);
2750 even_newer_pos = lisp_buffer_stream_startpos (istr);
2751 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2757 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2758 Lstream_close (istr);
2759 Lstream_close (ostr);
2761 Lstream_delete (istr);
2762 Lstream_delete (ostr);
2763 Lstream_delete (XLSTREAM (de_outstream));
2764 Lstream_delete (XLSTREAM (lb_outstream));
2765 return make_int (retlen);
2771 /************************************************************************/
2772 /* Shift-JIS methods */
2773 /************************************************************************/
2775 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2776 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2777 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2778 encoded by "position-code + 0x80". A character of JISX0208
2779 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2780 position-codes are divided and shifted so that it fit in the range
2783 --- CODE RANGE of Shift-JIS ---
2784 (character set) (range)
2786 JISX0201-Kana 0xA0 .. 0xDF
2787 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2788 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2789 -------------------------------
2793 /* Is this the first byte of a Shift-JIS two-byte char? */
2795 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2796 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2798 /* Is this the second byte of a Shift-JIS two-byte char? */
2800 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2801 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2803 #define BYTE_SJIS_KATAKANA_P(c) \
2804 ((c) >= 0xA1 && (c) <= 0xDF)
2807 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2815 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2817 if (st->shift_jis.in_second_byte)
2819 st->shift_jis.in_second_byte = 0;
2823 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2824 st->shift_jis.in_second_byte = 1;
2826 return CODING_CATEGORY_SHIFT_JIS_MASK;
2829 /* Convert Shift-JIS data to internal format. */
2832 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2833 unsigned_char_dynarr *dst, unsigned int n)
2836 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2837 unsigned int flags = str->flags;
2838 unsigned int ch = str->ch;
2839 eol_type_t eol_type = str->eol_type;
2847 /* Previous character was first byte of Shift-JIS Kanji char. */
2848 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2850 unsigned char e1, e2;
2852 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2853 DECODE_SJIS (ch, c, e1, e2);
2854 Dynarr_add (dst, e1);
2855 Dynarr_add (dst, e2);
2859 DECODE_ADD_BINARY_CHAR (ch, dst);
2860 DECODE_ADD_BINARY_CHAR (c, dst);
2866 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2867 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2869 else if (BYTE_SJIS_KATAKANA_P (c))
2871 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2872 Dynarr_add (dst, c);
2875 DECODE_ADD_BINARY_CHAR (c, dst);
2877 label_continue_loop:;
2880 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2886 /* Convert internally-formatted data to Shift-JIS. */
2889 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2890 unsigned_char_dynarr *dst, unsigned int n)
2893 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2894 unsigned int flags = str->flags;
2895 unsigned int ch = str->ch;
2896 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2903 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2904 Dynarr_add (dst, '\r');
2905 if (eol_type != EOL_CR)
2906 Dynarr_add (dst, '\n');
2909 else if (BYTE_ASCII_P (c))
2911 Dynarr_add (dst, c);
2914 else if (BUFBYTE_LEADING_BYTE_P (c))
2915 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2916 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2917 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2920 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2922 Dynarr_add (dst, c);
2925 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2926 ch == LEADING_BYTE_JAPANESE_JISX0208)
2930 unsigned char j1, j2;
2931 ENCODE_SJIS (ch, c, j1, j2);
2932 Dynarr_add (dst, j1);
2933 Dynarr_add (dst, j2);
2943 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2944 Decode a JISX0208 character of Shift-JIS coding-system.
2945 CODE is the character code in Shift-JIS as a cons of type bytes.
2946 Return the corresponding character.
2950 unsigned char c1, c2, s1, s2;
2953 CHECK_INT (XCAR (code));
2954 CHECK_INT (XCDR (code));
2955 s1 = XINT (XCAR (code));
2956 s2 = XINT (XCDR (code));
2957 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2958 BYTE_SJIS_TWO_BYTE_2_P (s2))
2960 DECODE_SJIS (s1, s2, c1, c2);
2961 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2962 c1 & 0x7F, c2 & 0x7F));
2968 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2969 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2970 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2974 Lisp_Object charset;
2977 CHECK_CHAR_COERCE_INT (ch);
2978 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2979 if (EQ (charset, Vcharset_japanese_jisx0208))
2981 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2982 return Fcons (make_int (s1), make_int (s2));
2989 /************************************************************************/
2991 /************************************************************************/
2993 /* BIG5 is a coding system encoding two character sets: ASCII and
2994 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2995 character set and is encoded in two-byte.
2997 --- CODE RANGE of BIG5 ---
2998 (character set) (range)
3000 Big5 (1st byte) 0xA1 .. 0xFE
3001 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3002 --------------------------
3004 Since the number of characters in Big5 is larger than maximum
3005 characters in Emacs' charset (96x96), it can't be handled as one
3006 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3007 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3008 contains frequently used characters and the latter contains less
3009 frequently used characters. */
3011 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3012 ((c) >= 0xA1 && (c) <= 0xFE)
3014 /* Is this the second byte of a Shift-JIS two-byte char? */
3016 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3017 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3019 /* Number of Big5 characters which have the same code in 1st byte. */
3021 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3023 /* Code conversion macros. These are macros because they are used in
3024 inner loops during code conversion.
3026 Note that temporary variables in macros introduce the classic
3027 dynamic-scoping problems with variable names. We use capital-
3028 lettered variables in the assumption that XEmacs does not use
3029 capital letters in variables except in a very formalized way
3032 /* Convert Big5 code (b1, b2) into its internal string representation
3035 /* There is a much simpler way to split the Big5 charset into two.
3036 For the moment I'm going to leave the algorithm as-is because it
3037 claims to separate out the most-used characters into a single
3038 charset, which perhaps will lead to optimizations in various
3041 The way the algorithm works is something like this:
3043 Big5 can be viewed as a 94x157 charset, where the row is
3044 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3045 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3046 the split between low and high column numbers is apparently
3047 meaningless; ascending rows produce less and less frequent chars.
3048 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3049 the first charset, and the upper half (0xC9 .. 0xFE) to the
3050 second. To do the conversion, we convert the character into
3051 a single number where 0 .. 156 is the first row, 157 .. 313
3052 is the second, etc. That way, the characters are ordered by
3053 decreasing frequency. Then we just chop the space in two
3054 and coerce the result into a 94x94 space.
3057 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3059 int B1 = b1, B2 = b2; \
3061 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3065 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3069 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3070 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3072 c1 = I / (0xFF - 0xA1) + 0xA1; \
3073 c2 = I % (0xFF - 0xA1) + 0xA1; \
3076 /* Convert the internal string representation of a Big5 character
3077 (lb, c1, c2) into Big5 code (b1, b2). */
3079 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3081 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3083 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3085 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3087 b1 = I / BIG5_SAME_ROW + 0xA1; \
3088 b2 = I % BIG5_SAME_ROW; \
3089 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3093 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3101 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3102 (c >= 0x80 && c <= 0xA0))
3104 if (st->big5.in_second_byte)
3106 st->big5.in_second_byte = 0;
3107 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3111 st->big5.in_second_byte = 1;
3113 return CODING_CATEGORY_BIG5_MASK;
3116 /* Convert Big5 data to internal format. */
3119 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3120 unsigned_char_dynarr *dst, unsigned int n)
3123 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3124 unsigned int flags = str->flags;
3125 unsigned int ch = str->ch;
3126 eol_type_t eol_type = str->eol_type;
3133 /* Previous character was first byte of Big5 char. */
3134 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3136 unsigned char b1, b2, b3;
3137 DECODE_BIG5 (ch, c, b1, b2, b3);
3138 Dynarr_add (dst, b1);
3139 Dynarr_add (dst, b2);
3140 Dynarr_add (dst, b3);
3144 DECODE_ADD_BINARY_CHAR (ch, dst);
3145 DECODE_ADD_BINARY_CHAR (c, dst);
3151 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3152 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3155 DECODE_ADD_BINARY_CHAR (c, dst);
3157 label_continue_loop:;
3160 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3166 /* Convert internally-formatted data to Big5. */
3169 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3170 unsigned_char_dynarr *dst, unsigned int n)
3173 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3174 unsigned int flags = str->flags;
3175 unsigned int ch = str->ch;
3176 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3183 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3184 Dynarr_add (dst, '\r');
3185 if (eol_type != EOL_CR)
3186 Dynarr_add (dst, '\n');
3188 else if (BYTE_ASCII_P (c))
3191 Dynarr_add (dst, c);
3193 else if (BUFBYTE_LEADING_BYTE_P (c))
3195 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3196 c == LEADING_BYTE_CHINESE_BIG5_2)
3198 /* A recognized leading byte. */
3200 continue; /* not done with this character. */
3202 /* otherwise just ignore this character. */
3204 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3205 ch == LEADING_BYTE_CHINESE_BIG5_2)
3207 /* Previous char was a recognized leading byte. */
3209 continue; /* not done with this character. */
3213 /* Encountering second byte of a Big5 character. */
3214 unsigned char b1, b2;
3216 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3217 Dynarr_add (dst, b1);
3218 Dynarr_add (dst, b2);
3229 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3230 Decode a Big5 character CODE of BIG5 coding-system.
3231 CODE is the character code in BIG5, a cons of two integers.
3232 Return the corresponding character.
3236 unsigned char c1, c2, b1, b2;
3239 CHECK_INT (XCAR (code));
3240 CHECK_INT (XCDR (code));
3241 b1 = XINT (XCAR (code));
3242 b2 = XINT (XCDR (code));
3243 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3244 BYTE_BIG5_TWO_BYTE_2_P (b2))
3247 Lisp_Object charset;
3248 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3249 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3250 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3256 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3257 Encode the Big5 character CH to BIG5 coding-system.
3258 Return the corresponding character code in Big5.
3262 Lisp_Object charset;
3265 CHECK_CHAR_COERCE_INT (ch);
3266 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3267 if (EQ (charset, Vcharset_chinese_big5_1) ||
3268 EQ (charset, Vcharset_chinese_big5_2))
3270 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3272 return Fcons (make_int (b1), make_int (b2));
3279 /************************************************************************/
3282 /* UCS-4 character codes are implemented as nonnegative integers. */
3284 /************************************************************************/
3287 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3288 Map UCS-4 code CODE to Mule character CHARACTER.
3290 Return T on success, NIL on failure.
3296 CHECK_CHAR (character);
3300 if (c < sizeof (fcd->ucs_to_mule_table))
3302 fcd->ucs_to_mule_table[c] = character;
3310 ucs_to_char (unsigned long code)
3312 if (code < sizeof (fcd->ucs_to_mule_table))
3314 return fcd->ucs_to_mule_table[code];
3316 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3321 c = code % (94 * 94);
3323 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3324 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3325 CHARSET_LEFT_TO_RIGHT),
3326 c / 94 + 33, c % 94 + 33));
3332 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3333 Return Mule character corresponding to UCS code CODE (a positive integer).
3337 CHECK_NATNUM (code);
3338 return ucs_to_char (XINT (code));
3341 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3342 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3346 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3347 Fset_char_ucs is more restrictive on index arg, but should
3348 check code arg in a char_table method. */
3349 CHECK_CHAR (character);
3350 CHECK_NATNUM (code);
3351 return Fput_char_table (character, code, mule_to_ucs_table);
3354 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3355 Return the UCS code (a positive integer) corresponding to CHARACTER.
3359 return Fget_char_table (character, mule_to_ucs_table);
3362 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3363 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3364 is not found, instead.
3365 #### do something more appropriate (use blob?)
3366 Danger, Will Robinson! Data loss. Should we signal user? */
3368 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3370 Lisp_Object chr = ucs_to_char (ch);
3374 Bufbyte work[MAX_EMCHAR_LEN];
3379 simple_set_charptr_emchar (work, ch) :
3380 non_ascii_set_charptr_emchar (work, ch);
3381 Dynarr_add_many (dst, work, len);
3385 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3386 Dynarr_add (dst, 34 + 128);
3387 Dynarr_add (dst, 46 + 128);
3391 static unsigned long
3392 mule_char_to_ucs4 (Lisp_Object charset,
3393 unsigned char h, unsigned char l)
3396 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3403 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3404 (XCHARSET_CHARS (charset) == 94) )
3406 unsigned char final = XCHARSET_FINAL (charset);
3408 if ( ('@' <= final) && (final < 0x7f) )
3410 return 0xe00000 + (final - '@') * 94 * 94
3411 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3425 encode_ucs4 (Lisp_Object charset,
3426 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3428 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3429 Dynarr_add (dst, code >> 24);
3430 Dynarr_add (dst, (code >> 16) & 255);
3431 Dynarr_add (dst, (code >> 8) & 255);
3432 Dynarr_add (dst, code & 255);
3436 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3442 switch (st->ucs4.in_byte)
3451 st->ucs4.in_byte = 0;
3457 return CODING_CATEGORY_UCS4_MASK;
3461 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3462 unsigned_char_dynarr *dst, unsigned int n)
3464 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3465 unsigned int flags = str->flags;
3466 unsigned int ch = str->ch;
3467 unsigned char counter = str->counter;
3471 unsigned char c = *src++;
3479 decode_ucs4 ( ( ch << 8 ) | c, dst);
3484 ch = ( ch << 8 ) | c;
3488 if (counter & CODING_STATE_END)
3489 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3493 str->counter = counter;
3497 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3498 unsigned_char_dynarr *dst, unsigned int n)
3500 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3501 unsigned int flags = str->flags;
3502 unsigned int ch = str->ch;
3503 unsigned char char_boundary = str->iso2022.current_char_boundary;
3504 Lisp_Object charset = str->iso2022.current_charset;
3506 #ifdef ENABLE_COMPOSITE_CHARS
3507 /* flags for handling composite chars. We do a little switcharoo
3508 on the source while we're outputting the composite char. */
3509 unsigned int saved_n = 0;
3510 CONST unsigned char *saved_src = NULL;
3511 int in_composite = 0;
3518 unsigned char c = *src++;
3520 if (BYTE_ASCII_P (c))
3521 { /* Processing ASCII character */
3523 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3526 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3527 { /* Processing Leading Byte */
3529 charset = CHARSET_BY_LEADING_BYTE (c);
3530 if (LEADING_BYTE_PREFIX_P(c))
3535 { /* Processing Non-ASCII character */
3537 if (EQ (charset, Vcharset_control_1))
3539 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3543 switch (XCHARSET_REP_BYTES (charset))
3546 encode_ucs4 (charset, c, 0, dst);
3549 if (XCHARSET_PRIVATE_P (charset))
3551 encode_ucs4 (charset, c, 0, dst);
3556 #ifdef ENABLE_COMPOSITE_CHARS
3557 if (EQ (charset, Vcharset_composite))
3561 /* #### Bother! We don't know how to
3563 Dynarr_add (dst, 0);
3564 Dynarr_add (dst, 0);
3565 Dynarr_add (dst, 0);
3566 Dynarr_add (dst, '~');
3570 Emchar emch = MAKE_CHAR (Vcharset_composite,
3571 ch & 0x7F, c & 0x7F);
3572 Lisp_Object lstr = composite_char_string (emch);
3576 src = XSTRING_DATA (lstr);
3577 n = XSTRING_LENGTH (lstr);
3581 #endif /* ENABLE_COMPOSITE_CHARS */
3583 encode_ucs4(charset, ch, c, dst);
3596 encode_ucs4 (charset, ch, c, dst);
3612 #ifdef ENABLE_COMPOSITE_CHARS
3618 goto back_to_square_n; /* Wheeeeeeeee ..... */
3620 #endif /* ENABLE_COMPOSITE_CHARS */
3624 str->iso2022.current_char_boundary = char_boundary;
3625 str->iso2022.current_charset = charset;
3627 /* Verbum caro factum est! */
3631 /************************************************************************/
3633 /************************************************************************/
3636 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3641 unsigned char c = *src++;
3642 switch (st->utf8.in_byte)
3645 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3648 st->utf8.in_byte = 5;
3650 st->utf8.in_byte = 4;
3652 st->utf8.in_byte = 3;
3654 st->utf8.in_byte = 2;
3656 st->utf8.in_byte = 1;
3661 if ((c & 0xc0) != 0x80)
3667 return CODING_CATEGORY_UTF8_MASK;
3671 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3672 unsigned_char_dynarr *dst, unsigned int n)
3674 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3675 unsigned int flags = str->flags;
3676 unsigned int ch = str->ch;
3677 eol_type_t eol_type = str->eol_type;
3678 unsigned char counter = str->counter;
3682 unsigned char c = *src++;
3691 else if ( c >= 0xf8 )
3696 else if ( c >= 0xf0 )
3701 else if ( c >= 0xe0 )
3706 else if ( c >= 0xc0 )
3713 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3714 decode_ucs4 (c, dst);
3718 ch = ( ch << 6 ) | ( c & 0x3f );
3719 decode_ucs4 (ch, dst);
3724 ch = ( ch << 6 ) | ( c & 0x3f );
3727 label_continue_loop:;
3730 if (flags & CODING_STATE_END)
3731 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3735 str->counter = counter;
3739 encode_utf8 (Lisp_Object charset,
3740 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3742 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3745 Dynarr_add (dst, code);
3747 else if ( code <= 0x7ff )
3749 Dynarr_add (dst, (code >> 6) | 0xc0);
3750 Dynarr_add (dst, (code & 0x3f) | 0x80);
3752 else if ( code <= 0xffff )
3754 Dynarr_add (dst, (code >> 12) | 0xe0);
3755 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3756 Dynarr_add (dst, (code & 0x3f) | 0x80);
3758 else if ( code <= 0x1fffff )
3760 Dynarr_add (dst, (code >> 18) | 0xf0);
3761 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3762 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3763 Dynarr_add (dst, (code & 0x3f) | 0x80);
3765 else if ( code <= 0x3ffffff )
3767 Dynarr_add (dst, (code >> 24) | 0xf8);
3768 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3769 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3770 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3771 Dynarr_add (dst, (code & 0x3f) | 0x80);
3775 Dynarr_add (dst, (code >> 30) | 0xfc);
3776 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3777 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3778 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3779 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3780 Dynarr_add (dst, (code & 0x3f) | 0x80);
3785 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3786 unsigned_char_dynarr *dst, unsigned int n)
3788 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3789 unsigned int flags = str->flags;
3790 unsigned int ch = str->ch;
3791 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3792 unsigned char char_boundary = str->iso2022.current_char_boundary;
3793 Lisp_Object charset = str->iso2022.current_charset;
3795 #ifdef ENABLE_COMPOSITE_CHARS
3796 /* flags for handling composite chars. We do a little switcharoo
3797 on the source while we're outputting the composite char. */
3798 unsigned int saved_n = 0;
3799 CONST unsigned char *saved_src = NULL;
3800 int in_composite = 0;
3803 #endif /* ENABLE_COMPOSITE_CHARS */
3807 unsigned char c = *src++;
3809 if (BYTE_ASCII_P (c))
3810 { /* Processing ASCII character */
3814 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3815 Dynarr_add (dst, '\r');
3816 if (eol_type != EOL_CR)
3817 Dynarr_add (dst, c);
3820 encode_utf8 (Vcharset_ascii, c, 0, dst);
3823 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3824 { /* Processing Leading Byte */
3826 charset = CHARSET_BY_LEADING_BYTE (c);
3827 if (LEADING_BYTE_PREFIX_P(c))
3832 { /* Processing Non-ASCII character */
3834 if (EQ (charset, Vcharset_control_1))
3836 encode_utf8 (Vcharset_control_1, c, 0, dst);
3840 switch (XCHARSET_REP_BYTES (charset))
3843 encode_utf8 (charset, c, 0, dst);
3846 if (XCHARSET_PRIVATE_P (charset))
3848 encode_utf8 (charset, c, 0, dst);
3853 #ifdef ENABLE_COMPOSITE_CHARS
3854 if (EQ (charset, Vcharset_composite))
3858 /* #### Bother! We don't know how to
3860 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3864 Emchar emch = MAKE_CHAR (Vcharset_composite,
3865 ch & 0x7F, c & 0x7F);
3866 Lisp_Object lstr = composite_char_string (emch);
3870 src = XSTRING_DATA (lstr);
3871 n = XSTRING_LENGTH (lstr);
3875 #endif /* ENABLE_COMPOSITE_CHARS */
3877 encode_utf8 (charset, ch, c, dst);
3890 encode_utf8 (charset, ch, c, dst);
3906 #ifdef ENABLE_COMPOSITE_CHARS
3912 goto back_to_square_n; /* Wheeeeeeeee ..... */
3918 str->iso2022.current_char_boundary = char_boundary;
3919 str->iso2022.current_charset = charset;
3921 /* Verbum caro factum est! */
3925 /************************************************************************/
3926 /* ISO2022 methods */
3927 /************************************************************************/
3929 /* The following note describes the coding system ISO2022 briefly.
3930 Since the intention of this note is to help understand the
3931 functions in this file, some parts are NOT ACCURATE or OVERLY
3932 SIMPLIFIED. For thorough understanding, please refer to the
3933 original document of ISO2022.
3935 ISO2022 provides many mechanisms to encode several character sets
3936 in 7-bit and 8-bit environments. For 7-bit environments, all text
3937 is encoded using bytes less than 128. This may make the encoded
3938 text a little bit longer, but the text passes more easily through
3939 several gateways, some of which strip off MSB (Most Signigant Bit).
3941 There are two kinds of character sets: control character set and
3942 graphic character set. The former contains control characters such
3943 as `newline' and `escape' to provide control functions (control
3944 functions are also provided by escape sequences). The latter
3945 contains graphic characters such as 'A' and '-'. Emacs recognizes
3946 two control character sets and many graphic character sets.
3948 Graphic character sets are classified into one of the following
3949 four classes, according to the number of bytes (DIMENSION) and
3950 number of characters in one dimension (CHARS) of the set:
3951 - DIMENSION1_CHARS94
3952 - DIMENSION1_CHARS96
3953 - DIMENSION2_CHARS94
3954 - DIMENSION2_CHARS96
3956 In addition, each character set is assigned an identification tag,
3957 unique for each set, called "final character" (denoted as <F>
3958 hereafter). The <F> of each character set is decided by ECMA(*)
3959 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3960 (0x30..0x3F are for private use only).
3962 Note (*): ECMA = European Computer Manufacturers Association
3964 Here are examples of graphic character set [NAME(<F>)]:
3965 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3966 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3967 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3968 o DIMENSION2_CHARS96 -- none for the moment
3970 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3971 C0 [0x00..0x1F] -- control character plane 0
3972 GL [0x20..0x7F] -- graphic character plane 0
3973 C1 [0x80..0x9F] -- control character plane 1
3974 GR [0xA0..0xFF] -- graphic character plane 1
3976 A control character set is directly designated and invoked to C0 or
3977 C1 by an escape sequence. The most common case is that:
3978 - ISO646's control character set is designated/invoked to C0, and
3979 - ISO6429's control character set is designated/invoked to C1,
3980 and usually these designations/invocations are omitted in encoded
3981 text. In a 7-bit environment, only C0 can be used, and a control
3982 character for C1 is encoded by an appropriate escape sequence to
3983 fit into the environment. All control characters for C1 are
3984 defined to have corresponding escape sequences.
3986 A graphic character set is at first designated to one of four
3987 graphic registers (G0 through G3), then these graphic registers are
3988 invoked to GL or GR. These designations and invocations can be
3989 done independently. The most common case is that G0 is invoked to
3990 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3991 these invocations and designations are omitted in encoded text.
3992 In a 7-bit environment, only GL can be used.
3994 When a graphic character set of CHARS94 is invoked to GL, codes
3995 0x20 and 0x7F of the GL area work as control characters SPACE and
3996 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3999 There are two ways of invocation: locking-shift and single-shift.
4000 With locking-shift, the invocation lasts until the next different
4001 invocation, whereas with single-shift, the invocation affects the
4002 following character only and doesn't affect the locking-shift
4003 state. Invocations are done by the following control characters or
4006 ----------------------------------------------------------------------
4007 abbrev function cntrl escape seq description
4008 ----------------------------------------------------------------------
4009 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4010 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4011 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4012 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4013 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4014 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4015 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4016 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4017 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4018 ----------------------------------------------------------------------
4019 (*) These are not used by any known coding system.
4021 Control characters for these functions are defined by macros
4022 ISO_CODE_XXX in `coding.h'.
4024 Designations are done by the following escape sequences:
4025 ----------------------------------------------------------------------
4026 escape sequence description
4027 ----------------------------------------------------------------------
4028 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4029 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4030 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4031 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4032 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4033 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4034 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4035 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4036 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4037 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4038 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4039 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4040 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4041 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4042 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4043 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4044 ----------------------------------------------------------------------
4046 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4047 of dimension 1, chars 94, and final character <F>, etc...
4049 Note (*): Although these designations are not allowed in ISO2022,
4050 Emacs accepts them on decoding, and produces them on encoding
4051 CHARS96 character sets in a coding system which is characterized as
4052 7-bit environment, non-locking-shift, and non-single-shift.
4054 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4055 '(' can be omitted. We refer to this as "short-form" hereafter.
4057 Now you may notice that there are a lot of ways for encoding the
4058 same multilingual text in ISO2022. Actually, there exist many
4059 coding systems such as Compound Text (used in X11's inter client
4060 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4061 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4062 localized platforms), and all of these are variants of ISO2022.
4064 In addition to the above, Emacs handles two more kinds of escape
4065 sequences: ISO6429's direction specification and Emacs' private
4066 sequence for specifying character composition.
4068 ISO6429's direction specification takes the following form:
4069 o CSI ']' -- end of the current direction
4070 o CSI '0' ']' -- end of the current direction
4071 o CSI '1' ']' -- start of left-to-right text
4072 o CSI '2' ']' -- start of right-to-left text
4073 The control character CSI (0x9B: control sequence introducer) is
4074 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4076 Character composition specification takes the following form:
4077 o ESC '0' -- start character composition
4078 o ESC '1' -- end character composition
4079 Since these are not standard escape sequences of any ISO standard,
4080 their use with these meanings is restricted to Emacs only. */
4083 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4087 for (i = 0; i < 4; i++)
4089 if (!NILP (coding_system))
4091 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4093 iso->charset[i] = Qt;
4094 iso->invalid_designated[i] = 0;
4096 iso->esc = ISO_ESC_NOTHING;
4097 iso->esc_bytes_index = 0;
4098 iso->register_left = 0;
4099 iso->register_right = 1;
4100 iso->switched_dir_and_no_valid_charset_yet = 0;
4101 iso->invalid_switch_dir = 0;
4102 iso->output_direction_sequence = 0;
4103 iso->output_literally = 0;
4104 #ifdef ENABLE_COMPOSITE_CHARS
4105 if (iso->composite_chars)
4106 Dynarr_reset (iso->composite_chars);
4111 fit_to_be_escape_quoted (unsigned char c)
4128 /* Parse one byte of an ISO2022 escape sequence.
4129 If the result is an invalid escape sequence, return 0 and
4130 do not change anything in STR. Otherwise, if the result is
4131 an incomplete escape sequence, update ISO2022.ESC and
4132 ISO2022.ESC_BYTES and return -1. Otherwise, update
4133 all the state variables (but not ISO2022.ESC_BYTES) and
4136 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4137 or invocation of an invalid character set and treat that as
4138 an unrecognized escape sequence. */
4141 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4142 unsigned char c, unsigned int *flags,
4143 int check_invalid_charsets)
4145 /* (1) If we're at the end of a designation sequence, CS is the
4146 charset being designated and REG is the register to designate
4149 (2) If we're at the end of a locking-shift sequence, REG is
4150 the register to invoke and HALF (0 == left, 1 == right) is
4151 the half to invoke it into.
4153 (3) If we're at the end of a single-shift sequence, REG is
4154 the register to invoke. */
4155 Lisp_Object cs = Qnil;
4158 /* NOTE: This code does goto's all over the fucking place.
4159 The reason for this is that we're basically implementing
4160 a state machine here, and hierarchical languages like C
4161 don't really provide a clean way of doing this. */
4163 if (! (*flags & CODING_STATE_ESCAPE))
4164 /* At beginning of escape sequence; we need to reset our
4165 escape-state variables. */
4166 iso->esc = ISO_ESC_NOTHING;
4168 iso->output_literally = 0;
4169 iso->output_direction_sequence = 0;
4173 case ISO_ESC_NOTHING:
4174 iso->esc_bytes_index = 0;
4177 case ISO_CODE_ESC: /* Start escape sequence */
4178 *flags |= CODING_STATE_ESCAPE;
4182 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4183 *flags |= CODING_STATE_ESCAPE;
4184 iso->esc = ISO_ESC_5_11;
4187 case ISO_CODE_SO: /* locking shift 1 */
4190 case ISO_CODE_SI: /* locking shift 0 */
4194 case ISO_CODE_SS2: /* single shift */
4197 case ISO_CODE_SS3: /* single shift */
4201 default: /* Other control characters */
4208 /**** single shift ****/
4210 case 'N': /* single shift 2 */
4213 case 'O': /* single shift 3 */
4217 /**** locking shift ****/
4219 case '~': /* locking shift 1 right */
4222 case 'n': /* locking shift 2 */
4225 case '}': /* locking shift 2 right */
4228 case 'o': /* locking shift 3 */
4231 case '|': /* locking shift 3 right */
4235 #ifdef ENABLE_COMPOSITE_CHARS
4236 /**** composite ****/
4239 iso->esc = ISO_ESC_START_COMPOSITE;
4240 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4241 CODING_STATE_COMPOSITE;
4245 iso->esc = ISO_ESC_END_COMPOSITE;
4246 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4247 ~CODING_STATE_COMPOSITE;
4249 #endif /* ENABLE_COMPOSITE_CHARS */
4251 /**** directionality ****/
4254 iso->esc = ISO_ESC_5_11;
4257 /**** designation ****/
4259 case '$': /* multibyte charset prefix */
4260 iso->esc = ISO_ESC_2_4;
4264 if (0x28 <= c && c <= 0x2F)
4266 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4270 /* This function is called with CODESYS equal to nil when
4271 doing coding-system detection. */
4273 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4274 && fit_to_be_escape_quoted (c))
4276 iso->esc = ISO_ESC_LITERAL;
4277 *flags &= CODING_STATE_ISO2022_LOCK;
4287 /**** directionality ****/
4289 case ISO_ESC_5_11: /* ISO6429 direction control */
4292 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4293 goto directionality;
4295 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4296 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4297 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4301 case ISO_ESC_5_11_0:
4304 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4305 goto directionality;
4309 case ISO_ESC_5_11_1:
4312 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4313 goto directionality;
4317 case ISO_ESC_5_11_2:
4320 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4321 goto directionality;
4326 iso->esc = ISO_ESC_DIRECTIONALITY;
4327 /* Various junk here to attempt to preserve the direction sequences
4328 literally in the text if they would otherwise be swallowed due
4329 to invalid designations that don't show up as actual charset
4330 changes in the text. */
4331 if (iso->invalid_switch_dir)
4333 /* We already inserted a direction switch literally into the
4334 text. We assume (#### this may not be right) that the
4335 next direction switch is the one going the other way,
4336 and we need to output that literally as well. */
4337 iso->output_literally = 1;
4338 iso->invalid_switch_dir = 0;
4344 /* If we are in the thrall of an invalid designation,
4345 then stick the directionality sequence literally into the
4346 output stream so it ends up in the original text again. */
4347 for (jj = 0; jj < 4; jj++)
4348 if (iso->invalid_designated[jj])
4352 iso->output_literally = 1;
4353 iso->invalid_switch_dir = 1;
4356 /* Indicate that we haven't yet seen a valid designation,
4357 so that if a switch-dir is directly followed by an
4358 invalid designation, both get inserted literally. */
4359 iso->switched_dir_and_no_valid_charset_yet = 1;
4364 /**** designation ****/
4367 if (0x28 <= c && c <= 0x2F)
4369 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4372 if (0x40 <= c && c <= 0x42)
4374 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4375 *flags & CODING_STATE_R2L ?
4376 CHARSET_RIGHT_TO_LEFT :
4377 CHARSET_LEFT_TO_RIGHT);
4387 if (c < '0' || c > '~')
4388 return 0; /* bad final byte */
4390 if (iso->esc >= ISO_ESC_2_8 &&
4391 iso->esc <= ISO_ESC_2_15)
4393 type = ((iso->esc >= ISO_ESC_2_12) ?
4394 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4395 reg = (iso->esc - ISO_ESC_2_8) & 3;
4397 else if (iso->esc >= ISO_ESC_2_4_8 &&
4398 iso->esc <= ISO_ESC_2_4_15)
4400 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4401 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4402 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4406 /* Can this ever be reached? -slb */
4410 cs = CHARSET_BY_ATTRIBUTES (type, c,
4411 *flags & CODING_STATE_R2L ?
4412 CHARSET_RIGHT_TO_LEFT :
4413 CHARSET_LEFT_TO_RIGHT);
4419 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4423 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4424 /* can't invoke something that ain't there. */
4426 iso->esc = ISO_ESC_SINGLE_SHIFT;
4427 *flags &= CODING_STATE_ISO2022_LOCK;
4429 *flags |= CODING_STATE_SS2;
4431 *flags |= CODING_STATE_SS3;
4435 if (check_invalid_charsets &&
4436 !CHARSETP (iso->charset[reg]))
4437 /* can't invoke something that ain't there. */
4440 iso->register_right = reg;
4442 iso->register_left = reg;
4443 *flags &= CODING_STATE_ISO2022_LOCK;
4444 iso->esc = ISO_ESC_LOCKING_SHIFT;
4448 if (NILP (cs) && check_invalid_charsets)
4450 iso->invalid_designated[reg] = 1;
4451 iso->charset[reg] = Vcharset_ascii;
4452 iso->esc = ISO_ESC_DESIGNATE;
4453 *flags &= CODING_STATE_ISO2022_LOCK;
4454 iso->output_literally = 1;
4455 if (iso->switched_dir_and_no_valid_charset_yet)
4457 /* We encountered a switch-direction followed by an
4458 invalid designation. Ensure that the switch-direction
4459 gets outputted; otherwise it will probably get eaten
4460 when the text is written out again. */
4461 iso->switched_dir_and_no_valid_charset_yet = 0;
4462 iso->output_direction_sequence = 1;
4463 /* And make sure that the switch-dir going the other
4464 way gets outputted, as well. */
4465 iso->invalid_switch_dir = 1;
4469 /* This function is called with CODESYS equal to nil when
4470 doing coding-system detection. */
4471 if (!NILP (codesys))
4473 charset_conversion_spec_dynarr *dyn =
4474 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4480 for (i = 0; i < Dynarr_length (dyn); i++)
4482 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4483 if (EQ (cs, spec->from_charset))
4484 cs = spec->to_charset;
4489 iso->charset[reg] = cs;
4490 iso->esc = ISO_ESC_DESIGNATE;
4491 *flags &= CODING_STATE_ISO2022_LOCK;
4492 if (iso->invalid_designated[reg])
4494 iso->invalid_designated[reg] = 0;
4495 iso->output_literally = 1;
4497 if (iso->switched_dir_and_no_valid_charset_yet)
4498 iso->switched_dir_and_no_valid_charset_yet = 0;
4503 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4508 /* #### There are serious deficiencies in the recognition mechanism
4509 here. This needs to be much smarter if it's going to cut it.
4510 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4511 it should be detected as Latin-1.
4512 All the ISO2022 stuff in this file should be synced up with the
4513 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4514 Perhaps we should wait till R2L works in FSF Emacs? */
4516 if (!st->iso2022.initted)
4518 reset_iso2022 (Qnil, &st->iso2022.iso);
4519 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4520 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4521 CODING_CATEGORY_ISO_8_1_MASK |
4522 CODING_CATEGORY_ISO_8_2_MASK |
4523 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4524 st->iso2022.flags = 0;
4525 st->iso2022.high_byte_count = 0;
4526 st->iso2022.saw_single_shift = 0;
4527 st->iso2022.initted = 1;
4530 mask = st->iso2022.mask;
4537 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4538 st->iso2022.high_byte_count++;
4542 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4544 if (st->iso2022.high_byte_count & 1)
4545 /* odd number of high bytes; assume not iso-8-2 */
4546 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4548 st->iso2022.high_byte_count = 0;
4549 st->iso2022.saw_single_shift = 0;
4551 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4553 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4554 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4555 { /* control chars */
4558 /* Allow and ignore control characters that you might
4559 reasonably see in a text file */
4564 case 8: /* backspace */
4565 case 11: /* vertical tab */
4566 case 12: /* form feed */
4567 case 26: /* MS-DOS C-z junk */
4568 case 31: /* '^_' -- for info */
4569 goto label_continue_loop;
4576 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4579 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4580 &st->iso2022.flags, 0))
4582 switch (st->iso2022.iso.esc)
4584 case ISO_ESC_DESIGNATE:
4585 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4586 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4588 case ISO_ESC_LOCKING_SHIFT:
4589 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4590 goto ran_out_of_chars;
4591 case ISO_ESC_SINGLE_SHIFT:
4592 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4593 st->iso2022.saw_single_shift = 1;
4602 goto ran_out_of_chars;
4605 label_continue_loop:;
4614 postprocess_iso2022_mask (int mask)
4616 /* #### kind of cheesy */
4617 /* If seven-bit ISO is allowed, then assume that the encoding is
4618 entirely seven-bit and turn off the eight-bit ones. */
4619 if (mask & CODING_CATEGORY_ISO_7_MASK)
4620 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4621 CODING_CATEGORY_ISO_8_1_MASK |
4622 CODING_CATEGORY_ISO_8_2_MASK);
4626 /* If FLAGS is a null pointer or specifies right-to-left motion,
4627 output a switch-dir-to-left-to-right sequence to DST.
4628 Also update FLAGS if it is not a null pointer.
4629 If INTERNAL_P is set, we are outputting in internal format and
4630 need to handle the CSI differently. */
4633 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4634 unsigned_char_dynarr *dst,
4635 unsigned int *flags,
4638 if (!flags || (*flags & CODING_STATE_R2L))
4640 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4642 Dynarr_add (dst, ISO_CODE_ESC);
4643 Dynarr_add (dst, '[');
4645 else if (internal_p)
4646 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4648 Dynarr_add (dst, ISO_CODE_CSI);
4649 Dynarr_add (dst, '0');
4650 Dynarr_add (dst, ']');
4652 *flags &= ~CODING_STATE_R2L;
4656 /* If FLAGS is a null pointer or specifies a direction different from
4657 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4658 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4659 sequence to DST. Also update FLAGS if it is not a null pointer.
4660 If INTERNAL_P is set, we are outputting in internal format and
4661 need to handle the CSI differently. */
4664 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4665 unsigned_char_dynarr *dst, unsigned int *flags,
4668 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4669 direction == CHARSET_LEFT_TO_RIGHT)
4670 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4671 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4672 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4673 direction == CHARSET_RIGHT_TO_LEFT)
4675 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4677 Dynarr_add (dst, ISO_CODE_ESC);
4678 Dynarr_add (dst, '[');
4680 else if (internal_p)
4681 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4683 Dynarr_add (dst, ISO_CODE_CSI);
4684 Dynarr_add (dst, '2');
4685 Dynarr_add (dst, ']');
4687 *flags |= CODING_STATE_R2L;
4691 /* Convert ISO2022-format data to internal format. */
4694 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4695 unsigned_char_dynarr *dst, unsigned int n)
4697 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4698 unsigned int flags = str->flags;
4699 unsigned int ch = str->ch;
4700 eol_type_t eol_type = str->eol_type;
4701 #ifdef ENABLE_COMPOSITE_CHARS
4702 unsigned_char_dynarr *real_dst = dst;
4704 Lisp_Object coding_system;
4706 XSETCODING_SYSTEM (coding_system, str->codesys);
4708 #ifdef ENABLE_COMPOSITE_CHARS
4709 if (flags & CODING_STATE_COMPOSITE)
4710 dst = str->iso2022.composite_chars;
4711 #endif /* ENABLE_COMPOSITE_CHARS */
4715 unsigned char c = *src++;
4716 if (flags & CODING_STATE_ESCAPE)
4717 { /* Within ESC sequence */
4718 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4723 switch (str->iso2022.esc)
4725 #ifdef ENABLE_COMPOSITE_CHARS
4726 case ISO_ESC_START_COMPOSITE:
4727 if (str->iso2022.composite_chars)
4728 Dynarr_reset (str->iso2022.composite_chars);
4730 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4731 dst = str->iso2022.composite_chars;
4733 case ISO_ESC_END_COMPOSITE:
4735 Bufbyte comstr[MAX_EMCHAR_LEN];
4737 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4738 Dynarr_length (dst));
4740 len = set_charptr_emchar (comstr, emch);
4741 Dynarr_add_many (dst, comstr, len);
4744 #endif /* ENABLE_COMPOSITE_CHARS */
4746 case ISO_ESC_LITERAL:
4747 DECODE_ADD_BINARY_CHAR (c, dst);
4751 /* Everything else handled already */
4756 /* Attempted error recovery. */
4757 if (str->iso2022.output_direction_sequence)
4758 ensure_correct_direction (flags & CODING_STATE_R2L ?
4759 CHARSET_RIGHT_TO_LEFT :
4760 CHARSET_LEFT_TO_RIGHT,
4761 str->codesys, dst, 0, 1);
4762 /* More error recovery. */
4763 if (!retval || str->iso2022.output_literally)
4765 /* Output the (possibly invalid) sequence */
4767 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4768 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4769 flags &= CODING_STATE_ISO2022_LOCK;
4771 n++, src--;/* Repeat the loop with the same character. */
4774 /* No sense in reprocessing the final byte of the
4775 escape sequence; it could mess things up anyway.
4777 DECODE_ADD_BINARY_CHAR (c, dst);
4782 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4783 { /* Control characters */
4785 /***** Error-handling *****/
4787 /* If we were in the middle of a character, dump out the
4788 partial character. */
4789 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4791 /* If we just saw a single-shift character, dump it out.
4792 This may dump out the wrong sort of single-shift character,
4793 but least it will give an indication that something went
4795 if (flags & CODING_STATE_SS2)
4797 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4798 flags &= ~CODING_STATE_SS2;
4800 if (flags & CODING_STATE_SS3)
4802 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4803 flags &= ~CODING_STATE_SS3;
4806 /***** Now handle the control characters. *****/
4809 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4811 flags &= CODING_STATE_ISO2022_LOCK;
4813 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4814 DECODE_ADD_BINARY_CHAR (c, dst);
4817 { /* Graphic characters */
4818 Lisp_Object charset;
4822 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4824 /* Now determine the charset. */
4825 reg = ((flags & CODING_STATE_SS2) ? 2
4826 : (flags & CODING_STATE_SS3) ? 3
4827 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4828 : str->iso2022.register_left);
4829 charset = str->iso2022.charset[reg];
4831 /* Error checking: */
4832 if (! CHARSETP (charset)
4833 || str->iso2022.invalid_designated[reg]
4834 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4835 && XCHARSET_CHARS (charset) == 94))
4836 /* Mrmph. We are trying to invoke a register that has no
4837 or an invalid charset in it, or trying to add a character
4838 outside the range of the charset. Insert that char literally
4839 to preserve it for the output. */
4841 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4842 DECODE_ADD_BINARY_CHAR (c, dst);
4847 /* Things are probably hunky-dorey. */
4849 /* Fetch reverse charset, maybe. */
4850 if (((flags & CODING_STATE_R2L) &&
4851 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4853 (!(flags & CODING_STATE_R2L) &&
4854 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4856 Lisp_Object new_charset =
4857 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4858 if (!NILP (new_charset))
4859 charset = new_charset;
4862 lb = XCHARSET_LEADING_BYTE (charset);
4863 switch (XCHARSET_REP_BYTES (charset))
4866 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4867 Dynarr_add (dst, c & 0x7F);
4870 case 2: /* one-byte official */
4871 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4872 Dynarr_add (dst, lb);
4873 Dynarr_add (dst, c | 0x80);
4876 case 3: /* one-byte private or two-byte official */
4877 if (XCHARSET_PRIVATE_P (charset))
4879 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4880 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4881 Dynarr_add (dst, lb);
4882 Dynarr_add (dst, c | 0x80);
4888 Dynarr_add (dst, lb);
4889 Dynarr_add (dst, ch | 0x80);
4890 Dynarr_add (dst, c | 0x80);
4898 default: /* two-byte private */
4901 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4902 Dynarr_add (dst, lb);
4903 Dynarr_add (dst, ch | 0x80);
4904 Dynarr_add (dst, c | 0x80);
4913 flags &= CODING_STATE_ISO2022_LOCK;
4916 label_continue_loop:;
4919 if (flags & CODING_STATE_END)
4920 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4927 /***** ISO2022 encoder *****/
4929 /* Designate CHARSET into register REG. */
4932 iso2022_designate (Lisp_Object charset, unsigned char reg,
4933 struct encoding_stream *str, unsigned_char_dynarr *dst)
4935 static CONST char inter94[] = "()*+";
4936 static CONST char inter96[] = ",-./";
4938 unsigned char final;
4939 Lisp_Object old_charset = str->iso2022.charset[reg];
4941 str->iso2022.charset[reg] = charset;
4942 if (!CHARSETP (charset))
4943 /* charset might be an initial nil or t. */
4945 type = XCHARSET_TYPE (charset);
4946 final = XCHARSET_FINAL (charset);
4947 if (!str->iso2022.force_charset_on_output[reg] &&
4948 CHARSETP (old_charset) &&
4949 XCHARSET_TYPE (old_charset) == type &&
4950 XCHARSET_FINAL (old_charset) == final)
4953 str->iso2022.force_charset_on_output[reg] = 0;
4956 charset_conversion_spec_dynarr *dyn =
4957 str->codesys->iso2022.output_conv;
4963 for (i = 0; i < Dynarr_length (dyn); i++)
4965 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4966 if (EQ (charset, spec->from_charset))
4967 charset = spec->to_charset;
4972 Dynarr_add (dst, ISO_CODE_ESC);
4975 case CHARSET_TYPE_94:
4976 Dynarr_add (dst, inter94[reg]);
4978 case CHARSET_TYPE_96:
4979 Dynarr_add (dst, inter96[reg]);
4981 case CHARSET_TYPE_94X94:
4982 Dynarr_add (dst, '$');
4984 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4987 Dynarr_add (dst, inter94[reg]);
4989 case CHARSET_TYPE_96X96:
4990 Dynarr_add (dst, '$');
4991 Dynarr_add (dst, inter96[reg]);
4994 Dynarr_add (dst, final);
4998 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5000 if (str->iso2022.register_left != 0)
5002 Dynarr_add (dst, ISO_CODE_SI);
5003 str->iso2022.register_left = 0;
5008 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5010 if (str->iso2022.register_left != 1)
5012 Dynarr_add (dst, ISO_CODE_SO);
5013 str->iso2022.register_left = 1;
5017 /* Convert internally-formatted data to ISO2022 format. */
5020 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5021 unsigned_char_dynarr *dst, unsigned int n)
5023 unsigned char charmask, c;
5024 unsigned char char_boundary;
5025 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5026 unsigned int flags = str->flags;
5027 unsigned int ch = str->ch;
5028 Lisp_Coding_System *codesys = str->codesys;
5029 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5031 Lisp_Object charset;
5034 #ifdef ENABLE_COMPOSITE_CHARS
5035 /* flags for handling composite chars. We do a little switcharoo
5036 on the source while we're outputting the composite char. */
5037 unsigned int saved_n = 0;
5038 CONST unsigned char *saved_src = NULL;
5039 int in_composite = 0;
5040 #endif /* ENABLE_COMPOSITE_CHARS */
5042 char_boundary = str->iso2022.current_char_boundary;
5043 charset = str->iso2022.current_charset;
5044 half = str->iso2022.current_half;
5046 #ifdef ENABLE_COMPOSITE_CHARS
5053 if (BYTE_ASCII_P (c))
5054 { /* Processing ASCII character */
5057 restore_left_to_right_direction (codesys, dst, &flags, 0);
5059 /* Make sure G0 contains ASCII */
5060 if ((c > ' ' && c < ISO_CODE_DEL) ||
5061 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5063 ensure_normal_shift (str, dst);
5064 iso2022_designate (Vcharset_ascii, 0, str, dst);
5067 /* If necessary, restore everything to the default state
5070 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5072 restore_left_to_right_direction (codesys, dst, &flags, 0);
5074 ensure_normal_shift (str, dst);
5076 for (i = 0; i < 4; i++)
5078 Lisp_Object initial_charset =
5079 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5080 iso2022_designate (initial_charset, i, str, dst);
5085 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5086 Dynarr_add (dst, '\r');
5087 if (eol_type != EOL_CR)
5088 Dynarr_add (dst, c);
5092 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5093 && fit_to_be_escape_quoted (c))
5094 Dynarr_add (dst, ISO_CODE_ESC);
5095 Dynarr_add (dst, c);
5100 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5101 { /* Processing Leading Byte */
5103 charset = CHARSET_BY_LEADING_BYTE (c);
5104 if (LEADING_BYTE_PREFIX_P(c))
5106 else if (!EQ (charset, Vcharset_control_1)
5107 #ifdef ENABLE_COMPOSITE_CHARS
5108 && !EQ (charset, Vcharset_composite)
5114 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5115 codesys, dst, &flags, 0);
5117 /* Now determine which register to use. */
5119 for (i = 0; i < 4; i++)
5121 if (EQ (charset, str->iso2022.charset[i]) ||
5123 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5132 if (XCHARSET_GRAPHIC (charset) != 0)
5134 if (!NILP (str->iso2022.charset[1]) &&
5135 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5136 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5138 else if (!NILP (str->iso2022.charset[2]))
5140 else if (!NILP (str->iso2022.charset[3]))
5149 iso2022_designate (charset, reg, str, dst);
5151 /* Now invoke that register. */
5155 ensure_normal_shift (str, dst);
5160 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5162 ensure_shift_out (str, dst);
5170 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5172 Dynarr_add (dst, ISO_CODE_ESC);
5173 Dynarr_add (dst, 'N');
5178 Dynarr_add (dst, ISO_CODE_SS2);
5184 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5186 Dynarr_add (dst, ISO_CODE_ESC);
5187 Dynarr_add (dst, 'O');
5192 Dynarr_add (dst, ISO_CODE_SS3);
5204 { /* Processing Non-ASCII character */
5205 charmask = (half == 0 ? 0x7F : 0xFF);
5207 if (EQ (charset, Vcharset_control_1))
5209 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5210 && fit_to_be_escape_quoted (c))
5211 Dynarr_add (dst, ISO_CODE_ESC);
5212 /* you asked for it ... */
5213 Dynarr_add (dst, c - 0x20);
5217 switch (XCHARSET_REP_BYTES (charset))
5220 Dynarr_add (dst, c & charmask);
5223 if (XCHARSET_PRIVATE_P (charset))
5225 Dynarr_add (dst, c & charmask);
5230 #ifdef ENABLE_COMPOSITE_CHARS
5231 if (EQ (charset, Vcharset_composite))
5235 /* #### Bother! We don't know how to
5237 Dynarr_add (dst, '~');
5241 Emchar emch = MAKE_CHAR (Vcharset_composite,
5242 ch & 0x7F, c & 0x7F);
5243 Lisp_Object lstr = composite_char_string (emch);
5247 src = XSTRING_DATA (lstr);
5248 n = XSTRING_LENGTH (lstr);
5249 Dynarr_add (dst, ISO_CODE_ESC);
5250 Dynarr_add (dst, '0'); /* start composing */
5254 #endif /* ENABLE_COMPOSITE_CHARS */
5256 Dynarr_add (dst, ch & charmask);
5257 Dynarr_add (dst, c & charmask);
5270 Dynarr_add (dst, ch & charmask);
5271 Dynarr_add (dst, c & charmask);
5287 #ifdef ENABLE_COMPOSITE_CHARS
5293 Dynarr_add (dst, ISO_CODE_ESC);
5294 Dynarr_add (dst, '1'); /* end composing */
5295 goto back_to_square_n; /* Wheeeeeeeee ..... */
5297 #endif /* ENABLE_COMPOSITE_CHARS */
5299 if (char_boundary && flags & CODING_STATE_END)
5301 restore_left_to_right_direction (codesys, dst, &flags, 0);
5302 ensure_normal_shift (str, dst);
5303 for (i = 0; i < 4; i++)
5305 Lisp_Object initial_charset =
5306 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5307 iso2022_designate (initial_charset, i, str, dst);
5313 str->iso2022.current_char_boundary = char_boundary;
5314 str->iso2022.current_charset = charset;
5315 str->iso2022.current_half = half;
5317 /* Verbum caro factum est! */
5321 /************************************************************************/
5322 /* No-conversion methods */
5323 /************************************************************************/
5325 /* This is used when reading in "binary" files -- i.e. files that may
5326 contain all 256 possible byte values and that are not to be
5327 interpreted as being in any particular decoding. */
5329 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5330 unsigned_char_dynarr *dst, unsigned int n)
5333 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5334 unsigned int flags = str->flags;
5335 unsigned int ch = str->ch;
5336 eol_type_t eol_type = str->eol_type;
5342 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5343 DECODE_ADD_BINARY_CHAR (c, dst);
5344 label_continue_loop:;
5347 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5354 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5355 unsigned_char_dynarr *dst, unsigned int n)
5358 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5359 unsigned int flags = str->flags;
5360 unsigned int ch = str->ch;
5361 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5368 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5369 Dynarr_add (dst, '\r');
5370 if (eol_type != EOL_CR)
5371 Dynarr_add (dst, '\n');
5374 else if (BYTE_ASCII_P (c))
5377 Dynarr_add (dst, c);
5379 else if (BUFBYTE_LEADING_BYTE_P (c))
5382 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5383 c == LEADING_BYTE_CONTROL_1)
5386 Dynarr_add (dst, '~'); /* untranslatable character */
5390 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5391 Dynarr_add (dst, c);
5392 else if (ch == LEADING_BYTE_CONTROL_1)
5395 Dynarr_add (dst, c - 0x20);
5397 /* else it should be the second or third byte of an
5398 untranslatable character, so ignore it */
5408 /************************************************************************/
5409 /* Simple internal/external functions */
5410 /************************************************************************/
5412 static Extbyte_dynarr *conversion_out_dynarr;
5413 static Bufbyte_dynarr *conversion_in_dynarr;
5415 /* Determine coding system from coding format */
5417 /* #### not correct for all values of `fmt'! */
5419 external_data_format_to_coding_system (enum external_data_format fmt)
5423 case FORMAT_FILENAME:
5424 case FORMAT_TERMINAL:
5425 if (EQ (Vfile_name_coding_system, Qnil) ||
5426 EQ (Vfile_name_coding_system, Qbinary))
5429 return Fget_coding_system (Vfile_name_coding_system);
5432 return Fget_coding_system (Qctext);
5440 convert_to_external_format (CONST Bufbyte *ptr,
5443 enum external_data_format fmt)
5445 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5447 if (!conversion_out_dynarr)
5448 conversion_out_dynarr = Dynarr_new (Extbyte);
5450 Dynarr_reset (conversion_out_dynarr);
5452 if (NILP (coding_system))
5454 CONST Bufbyte *end = ptr + len;
5459 (BYTE_ASCII_P (*ptr)) ? *ptr :
5460 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5461 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5464 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5468 #ifdef ERROR_CHECK_BUFPOS
5469 assert (ptr == end);
5474 Lisp_Object instream, outstream, da_outstream;
5475 Lstream *istr, *ostr;
5476 struct gcpro gcpro1, gcpro2, gcpro3;
5477 char tempbuf[1024]; /* some random amount */
5479 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5480 da_outstream = make_dynarr_output_stream
5481 ((unsigned_char_dynarr *) conversion_out_dynarr);
5483 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5484 istr = XLSTREAM (instream);
5485 ostr = XLSTREAM (outstream);
5486 GCPRO3 (instream, outstream, da_outstream);
5489 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5492 Lstream_write (ostr, tempbuf, size_in_bytes);
5494 Lstream_close (istr);
5495 Lstream_close (ostr);
5497 Lstream_delete (istr);
5498 Lstream_delete (ostr);
5499 Lstream_delete (XLSTREAM (da_outstream));
5502 *len_out = Dynarr_length (conversion_out_dynarr);
5503 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5504 return Dynarr_atp (conversion_out_dynarr, 0);
5508 convert_from_external_format (CONST Extbyte *ptr,
5511 enum external_data_format fmt)
5513 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5515 if (!conversion_in_dynarr)
5516 conversion_in_dynarr = Dynarr_new (Bufbyte);
5518 Dynarr_reset (conversion_in_dynarr);
5520 if (NILP (coding_system))
5522 CONST Extbyte *end = ptr + len;
5523 for (; ptr < end; ptr++)
5526 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5531 Lisp_Object instream, outstream, da_outstream;
5532 Lstream *istr, *ostr;
5533 struct gcpro gcpro1, gcpro2, gcpro3;
5534 char tempbuf[1024]; /* some random amount */
5536 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5537 da_outstream = make_dynarr_output_stream
5538 ((unsigned_char_dynarr *) conversion_in_dynarr);
5540 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5541 istr = XLSTREAM (instream);
5542 ostr = XLSTREAM (outstream);
5543 GCPRO3 (instream, outstream, da_outstream);
5546 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5549 Lstream_write (ostr, tempbuf, size_in_bytes);
5551 Lstream_close (istr);
5552 Lstream_close (ostr);
5554 Lstream_delete (istr);
5555 Lstream_delete (ostr);
5556 Lstream_delete (XLSTREAM (da_outstream));
5559 *len_out = Dynarr_length (conversion_in_dynarr);
5560 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5561 return Dynarr_atp (conversion_in_dynarr, 0);
5565 /************************************************************************/
5566 /* Initialization */
5567 /************************************************************************/
5570 syms_of_file_coding (void)
5572 deferror (&Qcoding_system_error, "coding-system-error",
5573 "Coding-system error", Qio_error);
5575 DEFSUBR (Fcoding_system_p);
5576 DEFSUBR (Ffind_coding_system);
5577 DEFSUBR (Fget_coding_system);
5578 DEFSUBR (Fcoding_system_list);
5579 DEFSUBR (Fcoding_system_name);
5580 DEFSUBR (Fmake_coding_system);
5581 DEFSUBR (Fcopy_coding_system);
5582 DEFSUBR (Fdefine_coding_system_alias);
5583 DEFSUBR (Fsubsidiary_coding_system);
5585 DEFSUBR (Fcoding_system_type);
5586 DEFSUBR (Fcoding_system_doc_string);
5588 DEFSUBR (Fcoding_system_charset);
5590 DEFSUBR (Fcoding_system_property);
5592 DEFSUBR (Fcoding_category_list);
5593 DEFSUBR (Fset_coding_priority_list);
5594 DEFSUBR (Fcoding_priority_list);
5595 DEFSUBR (Fset_coding_category_system);
5596 DEFSUBR (Fcoding_category_system);
5598 DEFSUBR (Fdetect_coding_region);
5599 DEFSUBR (Fdecode_coding_region);
5600 DEFSUBR (Fencode_coding_region);
5602 DEFSUBR (Fdecode_shift_jis_char);
5603 DEFSUBR (Fencode_shift_jis_char);
5604 DEFSUBR (Fdecode_big5_char);
5605 DEFSUBR (Fencode_big5_char);
5606 DEFSUBR (Fset_ucs_char);
5607 DEFSUBR (Fucs_char);
5608 DEFSUBR (Fset_char_ucs);
5609 DEFSUBR (Fchar_ucs);
5611 defsymbol (&Qcoding_systemp, "coding-system-p");
5612 defsymbol (&Qno_conversion, "no-conversion");
5613 defsymbol (&Qraw_text, "raw-text");
5615 defsymbol (&Qbig5, "big5");
5616 defsymbol (&Qshift_jis, "shift-jis");
5617 defsymbol (&Qucs4, "ucs-4");
5618 defsymbol (&Qutf8, "utf-8");
5619 defsymbol (&Qccl, "ccl");
5620 defsymbol (&Qiso2022, "iso2022");
5622 defsymbol (&Qmnemonic, "mnemonic");
5623 defsymbol (&Qeol_type, "eol-type");
5624 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5625 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5627 defsymbol (&Qcr, "cr");
5628 defsymbol (&Qlf, "lf");
5629 defsymbol (&Qcrlf, "crlf");
5630 defsymbol (&Qeol_cr, "eol-cr");
5631 defsymbol (&Qeol_lf, "eol-lf");
5632 defsymbol (&Qeol_crlf, "eol-crlf");
5634 defsymbol (&Qcharset_g0, "charset-g0");
5635 defsymbol (&Qcharset_g1, "charset-g1");
5636 defsymbol (&Qcharset_g2, "charset-g2");
5637 defsymbol (&Qcharset_g3, "charset-g3");
5638 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5639 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5640 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5641 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5642 defsymbol (&Qno_iso6429, "no-iso6429");
5643 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5644 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5646 defsymbol (&Qshort, "short");
5647 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5648 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5649 defsymbol (&Qseven, "seven");
5650 defsymbol (&Qlock_shift, "lock-shift");
5651 defsymbol (&Qescape_quoted, "escape-quoted");
5653 defsymbol (&Qencode, "encode");
5654 defsymbol (&Qdecode, "decode");
5657 defsymbol (&Qctext, "ctext");
5658 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5660 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5662 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5664 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5666 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5668 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5670 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5672 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5674 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5677 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5682 lstream_type_create_file_coding (void)
5684 LSTREAM_HAS_METHOD (decoding, reader);
5685 LSTREAM_HAS_METHOD (decoding, writer);
5686 LSTREAM_HAS_METHOD (decoding, rewinder);
5687 LSTREAM_HAS_METHOD (decoding, seekable_p);
5688 LSTREAM_HAS_METHOD (decoding, flusher);
5689 LSTREAM_HAS_METHOD (decoding, closer);
5690 LSTREAM_HAS_METHOD (decoding, marker);
5692 LSTREAM_HAS_METHOD (encoding, reader);
5693 LSTREAM_HAS_METHOD (encoding, writer);
5694 LSTREAM_HAS_METHOD (encoding, rewinder);
5695 LSTREAM_HAS_METHOD (encoding, seekable_p);
5696 LSTREAM_HAS_METHOD (encoding, flusher);
5697 LSTREAM_HAS_METHOD (encoding, closer);
5698 LSTREAM_HAS_METHOD (encoding, marker);
5702 vars_of_file_coding (void)
5706 fcd = xnew (struct file_coding_dump);
5707 dumpstruct (&fcd, &fcd_description);
5709 /* Initialize to something reasonable ... */
5710 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5712 fcd->coding_category_system[i] = Qnil;
5713 fcd->coding_category_by_priority[i] = i;
5716 Fprovide (intern ("file-coding"));
5718 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5719 Coding system used for TTY keyboard input.
5720 Not used under a windowing system.
5722 Vkeyboard_coding_system = Qnil;
5724 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5725 Coding system used for TTY display output.
5726 Not used under a windowing system.
5728 Vterminal_coding_system = Qnil;
5730 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5731 Overriding coding system used when writing a file or process.
5732 You should *bind* this, not set it. If this is non-nil, it specifies
5733 the coding system that will be used when a file or process is read
5734 in, and overrides `buffer-file-coding-system-for-read',
5735 `insert-file-contents-pre-hook', etc. Use those variables instead of
5736 this one for permanent changes to the environment.
5738 Vcoding_system_for_read = Qnil;
5740 DEFVAR_LISP ("coding-system-for-write",
5741 &Vcoding_system_for_write /*
5742 Overriding coding system used when writing a file or process.
5743 You should *bind* this, not set it. If this is non-nil, it specifies
5744 the coding system that will be used when a file or process is wrote
5745 in, and overrides `buffer-file-coding-system',
5746 `write-region-pre-hook', etc. Use those variables instead of this one
5747 for permanent changes to the environment.
5749 Vcoding_system_for_write = Qnil;
5751 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5752 Coding system used to convert pathnames when accessing files.
5754 Vfile_name_coding_system = Qnil;
5756 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5757 Non-nil means the buffer contents are regarded as multi-byte form
5758 of characters, not a binary code. This affects the display, file I/O,
5759 and behaviors of various editing commands.
5761 Setting this to nil does not do anything.
5763 enable_multibyte_characters = 1;
5767 complex_vars_of_file_coding (void)
5769 staticpro (&Vcoding_system_hash_table);
5770 Vcoding_system_hash_table =
5771 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5773 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5774 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5776 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5778 struct codesys_prop csp; \
5780 csp.prop_type = (Prop_Type); \
5781 Dynarr_add (the_codesys_prop_dynarr, csp); \
5784 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5785 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5786 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5787 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5788 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5789 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5790 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5792 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5799 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5800 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5801 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5802 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5803 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5804 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5805 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5806 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5807 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5808 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5810 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5811 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5813 /* Need to create this here or we're really screwed. */
5815 (Qraw_text, Qno_conversion,
5816 build_string ("Raw text, which means it converts only line-break-codes."),
5817 list2 (Qmnemonic, build_string ("Raw")));
5820 (Qbinary, Qno_conversion,
5821 build_string ("Binary, which means it does not convert anything."),
5822 list4 (Qeol_type, Qlf,
5823 Qmnemonic, build_string ("Binary")));
5825 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5827 /* Need this for bootstrapping */
5828 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5829 Fget_coding_system (Qraw_text);
5835 for (i = 0; i < 65536; i++)
5836 fcd->ucs_to_mule_table[i] = Qnil;
5838 staticpro (&mule_to_ucs_table);
5839 mule_to_ucs_table = Fmake_char_table(Qgeneric);