1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
154 static int detect_coding_sjis (struct detection_state *st,
155 CONST unsigned char *src,
157 static void decode_coding_sjis (Lstream *decoding,
158 CONST unsigned char *src,
159 unsigned_char_dynarr *dst,
161 static void encode_coding_sjis (Lstream *encoding,
162 CONST unsigned char *src,
163 unsigned_char_dynarr *dst,
165 static int detect_coding_big5 (struct detection_state *st,
166 CONST unsigned char *src,
168 static void decode_coding_big5 (Lstream *decoding,
169 CONST unsigned char *src,
170 unsigned_char_dynarr *dst, unsigned int n);
171 static void encode_coding_big5 (Lstream *encoding,
172 CONST unsigned char *src,
173 unsigned_char_dynarr *dst, unsigned int n);
174 static int detect_coding_ucs4 (struct detection_state *st,
175 CONST unsigned char *src,
177 static void decode_coding_ucs4 (Lstream *decoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static void encode_coding_ucs4 (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst, unsigned int n);
183 static int detect_coding_utf8 (struct detection_state *st,
184 CONST unsigned char *src,
186 static void decode_coding_utf8 (Lstream *decoding,
187 CONST unsigned char *src,
188 unsigned_char_dynarr *dst, unsigned int n);
189 static void encode_coding_utf8 (Lstream *encoding,
190 CONST unsigned char *src,
191 unsigned_char_dynarr *dst, unsigned int n);
192 static int postprocess_iso2022_mask (int mask);
193 static void reset_iso2022 (Lisp_Object coding_system,
194 struct iso2022_decoder *iso);
195 static int detect_coding_iso2022 (struct detection_state *st,
196 CONST unsigned char *src,
198 static void decode_coding_iso2022 (Lstream *decoding,
199 CONST unsigned char *src,
200 unsigned_char_dynarr *dst, unsigned int n);
201 static void encode_coding_iso2022 (Lstream *encoding,
202 CONST unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
205 static void decode_coding_no_conversion (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst,
209 static void encode_coding_no_conversion (Lstream *encoding,
210 CONST unsigned char *src,
211 unsigned_char_dynarr *dst,
213 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
214 unsigned_char_dynarr *dst, unsigned int n);
215 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
216 unsigned_char_dynarr *dst, unsigned int n);
218 typedef struct codesys_prop codesys_prop;
227 Dynarr_declare (codesys_prop);
228 } codesys_prop_dynarr;
230 codesys_prop_dynarr *the_codesys_prop_dynarr;
232 enum codesys_prop_enum
235 CODESYS_PROP_ISO2022,
240 /************************************************************************/
241 /* Coding system functions */
242 /************************************************************************/
244 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
245 static void print_coding_system (Lisp_Object, Lisp_Object, int);
246 static void finalize_coding_system (void *header, int for_disksave);
249 static const struct lrecord_description ccs_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
254 static const struct struct_description ccs_description = {
255 sizeof(charset_conversion_spec),
259 static const struct lrecord_description ccsd_description_1[] = {
260 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
264 static const struct struct_description ccsd_description = {
265 sizeof(charset_conversion_spec_dynarr),
270 static const struct lrecord_description coding_system_description[] = {
271 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
272 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
273 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
275 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
276 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
277 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
278 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
283 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
284 mark_coding_system, print_coding_system,
285 finalize_coding_system,
286 0, 0, coding_system_description,
287 struct Lisp_Coding_System);
290 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
292 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
294 markobj (CODING_SYSTEM_NAME (codesys));
295 markobj (CODING_SYSTEM_DOC_STRING (codesys));
296 markobj (CODING_SYSTEM_MNEMONIC (codesys));
297 markobj (CODING_SYSTEM_EOL_LF (codesys));
298 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
299 markobj (CODING_SYSTEM_EOL_CR (codesys));
301 switch (CODING_SYSTEM_TYPE (codesys))
305 case CODESYS_ISO2022:
306 for (i = 0; i < 4; i++)
307 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
308 if (codesys->iso2022.input_conv)
310 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
312 struct charset_conversion_spec *ccs =
313 Dynarr_atp (codesys->iso2022.input_conv, i);
314 markobj (ccs->from_charset);
315 markobj (ccs->to_charset);
318 if (codesys->iso2022.output_conv)
320 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
322 struct charset_conversion_spec *ccs =
323 Dynarr_atp (codesys->iso2022.output_conv, i);
324 markobj (ccs->from_charset);
325 markobj (ccs->to_charset);
331 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
332 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
339 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
340 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
344 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
347 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
349 error ("printing unreadable object #<coding_system 0x%x>",
352 write_c_string ("#<coding_system ", printcharfun);
353 print_internal (c->name, printcharfun, 1);
354 write_c_string (">", printcharfun);
358 finalize_coding_system (void *header, int for_disksave)
360 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
361 /* Since coding systems never go away, this function is not
362 necessary. But it would be necessary if we changed things
363 so that coding systems could go away. */
364 if (!for_disksave) /* see comment in lstream.c */
366 switch (CODING_SYSTEM_TYPE (c))
369 case CODESYS_ISO2022:
370 if (c->iso2022.input_conv)
372 Dynarr_free (c->iso2022.input_conv);
373 c->iso2022.input_conv = 0;
375 if (c->iso2022.output_conv)
377 Dynarr_free (c->iso2022.output_conv);
378 c->iso2022.output_conv = 0;
389 symbol_to_eol_type (Lisp_Object symbol)
391 CHECK_SYMBOL (symbol);
392 if (NILP (symbol)) return EOL_AUTODETECT;
393 if (EQ (symbol, Qlf)) return EOL_LF;
394 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
395 if (EQ (symbol, Qcr)) return EOL_CR;
397 signal_simple_error ("Unrecognized eol type", symbol);
398 return EOL_AUTODETECT; /* not reached */
402 eol_type_to_symbol (enum eol_type type)
407 case EOL_LF: return Qlf;
408 case EOL_CRLF: return Qcrlf;
409 case EOL_CR: return Qcr;
410 case EOL_AUTODETECT: return Qnil;
415 setup_eol_coding_systems (Lisp_Coding_System *codesys)
417 Lisp_Object codesys_obj;
418 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
419 char *codesys_name = (char *) alloca (len + 7);
421 char *codesys_mnemonic=0;
423 Lisp_Object codesys_name_sym, sub_codesys_obj;
427 XSETCODING_SYSTEM (codesys_obj, codesys);
429 memcpy (codesys_name,
430 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
432 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
434 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
435 codesys_mnemonic = (char *) alloca (mlen + 7);
436 memcpy (codesys_mnemonic,
437 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
440 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
441 strcpy (codesys_name + len, "-" op_sys); \
443 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
444 codesys_name_sym = intern (codesys_name); \
445 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
446 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
448 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
449 build_string (codesys_mnemonic); \
450 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
453 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
454 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
455 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
458 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
459 Return t if OBJECT is a coding system.
460 A coding system is an object that defines how text containing multiple
461 character sets is encoded into a stream of (typically 8-bit) bytes.
462 The coding system is used to decode the stream into a series of
463 characters (which may be from multiple charsets) when the text is read
464 from a file or process, and is used to encode the text back into the
465 same format when it is written out to a file or process.
467 For example, many ISO2022-compliant coding systems (such as Compound
468 Text, which is used for inter-client data under the X Window System)
469 use escape sequences to switch between different charsets -- Japanese
470 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
471 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
472 `make-coding-system' for more information.
474 Coding systems are normally identified using a symbol, and the
475 symbol is accepted in place of the actual coding system object whenever
476 a coding system is called for. (This is similar to how faces work.)
480 return CODING_SYSTEMP (object) ? Qt : Qnil;
483 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
484 Retrieve the coding system of the given name.
486 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
487 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
488 If there is no such coding system, nil is returned. Otherwise the
489 associated coding system object is returned.
491 (coding_system_or_name))
493 if (CODING_SYSTEMP (coding_system_or_name))
494 return coding_system_or_name;
496 if (NILP (coding_system_or_name))
497 coding_system_or_name = Qbinary;
499 CHECK_SYMBOL (coding_system_or_name);
501 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
504 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
505 Retrieve the coding system of the given name.
506 Same as `find-coding-system' except that if there is no such
507 coding system, an error is signaled instead of returning nil.
511 Lisp_Object coding_system = Ffind_coding_system (name);
513 if (NILP (coding_system))
514 signal_simple_error ("No such coding system", name);
515 return coding_system;
518 /* We store the coding systems in hash tables with the names as the key and the
519 actual coding system object as the value. Occasionally we need to use them
520 in a list format. These routines provide us with that. */
521 struct coding_system_list_closure
523 Lisp_Object *coding_system_list;
527 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
528 void *coding_system_list_closure)
530 /* This function can GC */
531 struct coding_system_list_closure *cscl =
532 (struct coding_system_list_closure *) coding_system_list_closure;
533 Lisp_Object *coding_system_list = cscl->coding_system_list;
535 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
536 *coding_system_list);
540 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
541 Return a list of the names of all defined coding systems.
545 Lisp_Object coding_system_list = Qnil;
547 struct coding_system_list_closure coding_system_list_closure;
549 GCPRO1 (coding_system_list);
550 coding_system_list_closure.coding_system_list = &coding_system_list;
551 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
552 &coding_system_list_closure);
555 return coding_system_list;
558 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
559 Return the name of the given coding system.
563 coding_system = Fget_coding_system (coding_system);
564 return XCODING_SYSTEM_NAME (coding_system);
567 static Lisp_Coding_System *
568 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
570 Lisp_Coding_System *codesys =
571 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
573 zero_lcrecord (codesys);
574 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
575 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
576 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
577 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
578 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
579 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
580 CODING_SYSTEM_TYPE (codesys) = type;
581 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
583 if (type == CODESYS_ISO2022)
586 for (i = 0; i < 4; i++)
587 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
589 else if (type == CODESYS_CCL)
591 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
592 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
595 CODING_SYSTEM_NAME (codesys) = name;
601 /* Given a list of charset conversion specs as specified in a Lisp
602 program, parse it into STORE_HERE. */
605 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
606 Lisp_Object spec_list)
610 EXTERNAL_LIST_LOOP (rest, spec_list)
612 Lisp_Object car = XCAR (rest);
613 Lisp_Object from, to;
614 struct charset_conversion_spec spec;
616 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
617 signal_simple_error ("Invalid charset conversion spec", car);
618 from = Fget_charset (XCAR (car));
619 to = Fget_charset (XCAR (XCDR (car)));
620 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
621 signal_simple_error_2
622 ("Attempted conversion between different charset types",
624 spec.from_charset = from;
625 spec.to_charset = to;
627 Dynarr_add (store_here, spec);
631 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
632 specs, return the equivalent as the Lisp programmer would see it.
634 If LOAD_HERE is 0, return Qnil. */
637 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
644 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
646 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
647 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
650 return Fnreverse (result);
655 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
656 Register symbol NAME as a coding system.
658 TYPE describes the conversion method used and should be one of
661 Automatic conversion. XEmacs attempts to detect the coding system
664 No conversion. Use this for binary files and such. On output,
665 graphic characters that are not in ASCII or Latin-1 will be
666 replaced by a ?. (For a no-conversion-encoded buffer, these
667 characters will only be present if you explicitly insert them.)
669 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
671 ISO 10646 UCS-4 encoding.
673 ISO 10646 UTF-8 encoding.
675 Any ISO2022-compliant encoding. Among other things, this includes
676 JIS (the Japanese encoding commonly used for e-mail), EUC (the
677 standard Unix encoding for Japanese and other languages), and
678 Compound Text (the encoding used in X11). You can specify more
679 specific information about the conversion with the FLAGS argument.
681 Big5 (the encoding commonly used for Taiwanese).
683 The conversion is performed using a user-written pseudo-code
684 program. CCL (Code Conversion Language) is the name of this
687 Write out or read in the raw contents of the memory representing
688 the buffer's text. This is primarily useful for debugging
689 purposes, and is only enabled when XEmacs has been compiled with
690 DEBUG_XEMACS defined (via the --debug configure option).
691 WARNING: Reading in a file using 'internal conversion can result
692 in an internal inconsistency in the memory representing a
693 buffer's text, which will produce unpredictable results and may
694 cause XEmacs to crash. Under normal circumstances you should
695 never use 'internal conversion.
697 DOC-STRING is a string describing the coding system.
699 PROPS is a property list, describing the specific nature of the
700 character set. Recognized properties are:
703 String to be displayed in the modeline when this coding system is
707 End-of-line conversion to be used. It should be one of
710 Automatically detect the end-of-line type (LF, CRLF,
711 or CR). Also generate subsidiary coding systems named
712 `NAME-unix', `NAME-dos', and `NAME-mac', that are
713 identical to this coding system but have an EOL-TYPE
714 value of 'lf, 'crlf, and 'cr, respectively.
716 The end of a line is marked externally using ASCII LF.
717 Since this is also the way that XEmacs represents an
718 end-of-line internally, specifying this option results
719 in no end-of-line conversion. This is the standard
720 format for Unix text files.
722 The end of a line is marked externally using ASCII
723 CRLF. This is the standard format for MS-DOS text
726 The end of a line is marked externally using ASCII CR.
727 This is the standard format for Macintosh text files.
729 Automatically detect the end-of-line type but do not
730 generate subsidiary coding systems. (This value is
731 converted to nil when stored internally, and
732 `coding-system-property' will return nil.)
734 'post-read-conversion
735 Function called after a file has been read in, to perform the
736 decoding. Called with two arguments, BEG and END, denoting
737 a region of the current buffer to be decoded.
739 'pre-write-conversion
740 Function called before a file is written out, to perform the
741 encoding. Called with two arguments, BEG and END, denoting
742 a region of the current buffer to be encoded.
745 The following additional properties are recognized if TYPE is 'iso2022:
751 The character set initially designated to the G0 - G3 registers.
752 The value should be one of
754 -- A charset object (designate that character set)
755 -- nil (do not ever use this register)
756 -- t (no character set is initially designated to
757 the register, but may be later on; this automatically
758 sets the corresponding `force-g*-on-output' property)
764 If non-nil, send an explicit designation sequence on output before
765 using the specified register.
768 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
769 "ESC $ B" on output in place of the full designation sequences
770 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
773 If non-nil, don't designate ASCII to G0 at each end of line on output.
774 Setting this to non-nil also suppresses other state-resetting that
775 normally happens at the end of a line.
778 If non-nil, don't designate ASCII to G0 before control chars on output.
781 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
785 If non-nil, use locking-shift (SO/SI) instead of single-shift
786 or designation by escape sequence.
789 If non-nil, don't use ISO6429's direction specification.
792 If non-nil, literal control characters that are the same as
793 the beginning of a recognized ISO2022 or ISO6429 escape sequence
794 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
795 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
796 so that they can be properly distinguished from an escape sequence.
797 (Note that doing this results in a non-portable encoding.) This
798 encoding flag is used for byte-compiled files. Note that ESC
799 is a good choice for a quoting character because there are no
800 escape sequences whose second byte is a character from the Control-0
801 or Control-1 character sets; this is explicitly disallowed by the
804 'input-charset-conversion
805 A list of conversion specifications, specifying conversion of
806 characters in one charset to another when decoding is performed.
807 Each specification is a list of two elements: the source charset,
808 and the destination charset.
810 'output-charset-conversion
811 A list of conversion specifications, specifying conversion of
812 characters in one charset to another when encoding is performed.
813 The form of each specification is the same as for
814 'input-charset-conversion.
817 The following additional properties are recognized (and required)
821 CCL program used for decoding (converting to internal format).
824 CCL program used for encoding (converting to external format).
826 (name, type, doc_string, props))
828 Lisp_Coding_System *codesys;
829 Lisp_Object rest, key, value;
830 enum coding_system_type ty;
831 int need_to_setup_eol_systems = 1;
833 /* Convert type to constant */
834 if (NILP (type) || EQ (type, Qundecided))
835 { ty = CODESYS_AUTODETECT; }
837 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
838 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
839 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
840 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
841 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
842 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
844 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
846 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
849 signal_simple_error ("Invalid coding system type", type);
853 codesys = allocate_coding_system (ty, name);
855 if (NILP (doc_string))
856 doc_string = build_string ("");
858 CHECK_STRING (doc_string);
859 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
861 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
863 if (EQ (key, Qmnemonic))
866 CHECK_STRING (value);
867 CODING_SYSTEM_MNEMONIC (codesys) = value;
870 else if (EQ (key, Qeol_type))
872 need_to_setup_eol_systems = NILP (value);
875 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
878 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
879 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
881 else if (ty == CODESYS_ISO2022)
883 #define FROB_INITIAL_CHARSET(charset_num) \
884 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
885 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
887 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
888 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
889 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
890 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
892 #define FROB_FORCE_CHARSET(charset_num) \
893 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
895 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
896 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
897 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
898 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
900 #define FROB_BOOLEAN_PROPERTY(prop) \
901 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
903 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
904 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
905 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
906 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
907 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
908 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
909 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
911 else if (EQ (key, Qinput_charset_conversion))
913 codesys->iso2022.input_conv =
914 Dynarr_new (charset_conversion_spec);
915 parse_charset_conversion_specs (codesys->iso2022.input_conv,
918 else if (EQ (key, Qoutput_charset_conversion))
920 codesys->iso2022.output_conv =
921 Dynarr_new (charset_conversion_spec);
922 parse_charset_conversion_specs (codesys->iso2022.output_conv,
926 signal_simple_error ("Unrecognized property", key);
928 else if (EQ (type, Qccl))
930 if (EQ (key, Qdecode))
932 CHECK_VECTOR (value);
933 CODING_SYSTEM_CCL_DECODE (codesys) = value;
935 else if (EQ (key, Qencode))
937 CHECK_VECTOR (value);
938 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
941 signal_simple_error ("Unrecognized property", key);
945 signal_simple_error ("Unrecognized property", key);
948 if (need_to_setup_eol_systems)
949 setup_eol_coding_systems (codesys);
952 Lisp_Object codesys_obj;
953 XSETCODING_SYSTEM (codesys_obj, codesys);
954 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
959 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
960 Copy OLD-CODING-SYSTEM to NEW-NAME.
961 If NEW-NAME does not name an existing coding system, a new one will
964 (old_coding_system, new_name))
966 Lisp_Object new_coding_system;
967 old_coding_system = Fget_coding_system (old_coding_system);
968 new_coding_system = Ffind_coding_system (new_name);
969 if (NILP (new_coding_system))
971 XSETCODING_SYSTEM (new_coding_system,
972 allocate_coding_system
973 (XCODING_SYSTEM_TYPE (old_coding_system),
975 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
979 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
980 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
981 memcpy (((char *) to ) + sizeof (to->header),
982 ((char *) from) + sizeof (from->header),
983 sizeof (*from) - sizeof (from->header));
986 return new_coding_system;
989 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
990 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
992 (alias, coding_system))
994 CHECK_SYMBOL (alias);
995 if (!NILP (Ffind_coding_system (alias)))
996 signal_simple_error ("Symbol already names a coding system", alias);
997 coding_system = Fget_coding_system (coding_system);
998 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1000 /* Set up aliases for subsidiaries. */
1001 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1004 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1005 #define FROB(type, name) \
1007 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1008 if (!NILP (subsidiary)) \
1009 Fdefine_coding_system_alias \
1010 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1013 FROB (CRLF, "-dos");
1017 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1018 but it doesn't look intentional, so I'd rather return something
1019 meaningful or nothing at all. */
1024 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1026 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1027 Lisp_Object new_coding_system;
1029 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1030 return coding_system;
1034 case EOL_AUTODETECT: return coding_system;
1035 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1036 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1037 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1041 return NILP (new_coding_system) ? coding_system : new_coding_system;
1044 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1045 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1047 (coding_system, eol_type))
1049 coding_system = Fget_coding_system (coding_system);
1051 return subsidiary_coding_system (coding_system,
1052 symbol_to_eol_type (eol_type));
1056 /************************************************************************/
1057 /* Coding system accessors */
1058 /************************************************************************/
1060 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1061 Return the doc string for CODING-SYSTEM.
1065 coding_system = Fget_coding_system (coding_system);
1066 return XCODING_SYSTEM_DOC_STRING (coding_system);
1069 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1070 Return the type of CODING-SYSTEM.
1074 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1077 case CODESYS_AUTODETECT: return Qundecided;
1079 case CODESYS_SHIFT_JIS: return Qshift_jis;
1080 case CODESYS_ISO2022: return Qiso2022;
1081 case CODESYS_BIG5: return Qbig5;
1082 case CODESYS_UCS4: return Qucs4;
1083 case CODESYS_UTF8: return Qutf8;
1084 case CODESYS_CCL: return Qccl;
1086 case CODESYS_NO_CONVERSION: return Qno_conversion;
1088 case CODESYS_INTERNAL: return Qinternal;
1095 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1098 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1100 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1103 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1104 Return initial charset of CODING-SYSTEM designated to GNUM.
1107 (coding_system, gnum))
1109 coding_system = Fget_coding_system (coding_system);
1112 return coding_system_charset (coding_system, XINT (gnum));
1116 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1117 Return the PROP property of CODING-SYSTEM.
1119 (coding_system, prop))
1122 enum coding_system_type type;
1124 coding_system = Fget_coding_system (coding_system);
1125 CHECK_SYMBOL (prop);
1126 type = XCODING_SYSTEM_TYPE (coding_system);
1128 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1129 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1132 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1134 case CODESYS_PROP_ALL_OK:
1137 case CODESYS_PROP_ISO2022:
1138 if (type != CODESYS_ISO2022)
1140 ("Property only valid in ISO2022 coding systems",
1144 case CODESYS_PROP_CCL:
1145 if (type != CODESYS_CCL)
1147 ("Property only valid in CCL coding systems",
1157 signal_simple_error ("Unrecognized property", prop);
1159 if (EQ (prop, Qname))
1160 return XCODING_SYSTEM_NAME (coding_system);
1161 else if (EQ (prop, Qtype))
1162 return Fcoding_system_type (coding_system);
1163 else if (EQ (prop, Qdoc_string))
1164 return XCODING_SYSTEM_DOC_STRING (coding_system);
1165 else if (EQ (prop, Qmnemonic))
1166 return XCODING_SYSTEM_MNEMONIC (coding_system);
1167 else if (EQ (prop, Qeol_type))
1168 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1169 else if (EQ (prop, Qeol_lf))
1170 return XCODING_SYSTEM_EOL_LF (coding_system);
1171 else if (EQ (prop, Qeol_crlf))
1172 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1173 else if (EQ (prop, Qeol_cr))
1174 return XCODING_SYSTEM_EOL_CR (coding_system);
1175 else if (EQ (prop, Qpost_read_conversion))
1176 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1177 else if (EQ (prop, Qpre_write_conversion))
1178 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1180 else if (type == CODESYS_ISO2022)
1182 if (EQ (prop, Qcharset_g0))
1183 return coding_system_charset (coding_system, 0);
1184 else if (EQ (prop, Qcharset_g1))
1185 return coding_system_charset (coding_system, 1);
1186 else if (EQ (prop, Qcharset_g2))
1187 return coding_system_charset (coding_system, 2);
1188 else if (EQ (prop, Qcharset_g3))
1189 return coding_system_charset (coding_system, 3);
1191 #define FORCE_CHARSET(charset_num) \
1192 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1193 (coding_system, charset_num) ? Qt : Qnil)
1195 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1196 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1197 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1198 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1200 #define LISP_BOOLEAN(prop) \
1201 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1203 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1204 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1205 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1206 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1207 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1208 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1209 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1211 else if (EQ (prop, Qinput_charset_conversion))
1213 unparse_charset_conversion_specs
1214 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1215 else if (EQ (prop, Qoutput_charset_conversion))
1217 unparse_charset_conversion_specs
1218 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1222 else if (type == CODESYS_CCL)
1224 if (EQ (prop, Qdecode))
1225 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1226 else if (EQ (prop, Qencode))
1227 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1235 return Qnil; /* not reached */
1239 /************************************************************************/
1240 /* Coding category functions */
1241 /************************************************************************/
1244 decode_coding_category (Lisp_Object symbol)
1248 CHECK_SYMBOL (symbol);
1249 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1250 if (EQ (coding_category_symbol[i], symbol))
1253 signal_simple_error ("Unrecognized coding category", symbol);
1254 return 0; /* not reached */
1257 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1258 Return a list of all recognized coding categories.
1263 Lisp_Object list = Qnil;
1265 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1266 list = Fcons (coding_category_symbol[i], list);
1270 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1271 Change the priority order of the coding categories.
1272 LIST should be list of coding categories, in descending order of
1273 priority. Unspecified coding categories will be lower in priority
1274 than all specified ones, in the same relative order they were in
1279 int category_to_priority[CODING_CATEGORY_LAST + 1];
1283 /* First generate a list that maps coding categories to priorities. */
1285 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1286 category_to_priority[i] = -1;
1288 /* Highest priority comes from the specified list. */
1290 EXTERNAL_LIST_LOOP (rest, list)
1292 int cat = decode_coding_category (XCAR (rest));
1294 if (category_to_priority[cat] >= 0)
1295 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1296 category_to_priority[cat] = i++;
1299 /* Now go through the existing categories by priority to retrieve
1300 the categories not yet specified and preserve their priority
1302 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1304 int cat = coding_category_by_priority[j];
1305 if (category_to_priority[cat] < 0)
1306 category_to_priority[cat] = i++;
1309 /* Now we need to construct the inverse of the mapping we just
1312 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1313 coding_category_by_priority[category_to_priority[i]] = i;
1315 /* Phew! That was confusing. */
1319 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1320 Return a list of coding categories in descending order of priority.
1325 Lisp_Object list = Qnil;
1327 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1328 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1333 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1334 Change the coding system associated with a coding category.
1336 (coding_category, coding_system))
1338 int cat = decode_coding_category (coding_category);
1340 coding_system = Fget_coding_system (coding_system);
1341 coding_category_system[cat] = coding_system;
1345 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1346 Return the coding system associated with a coding category.
1350 int cat = decode_coding_category (coding_category);
1351 Lisp_Object sys = coding_category_system[cat];
1354 return XCODING_SYSTEM_NAME (sys);
1359 /************************************************************************/
1360 /* Detecting the encoding of data */
1361 /************************************************************************/
1363 struct detection_state
1365 enum eol_type eol_type;
1401 struct iso2022_decoder iso;
1403 int high_byte_count;
1404 unsigned int saw_single_shift:1;
1417 acceptable_control_char_p (int c)
1421 /* Allow and ignore control characters that you might
1422 reasonably see in a text file */
1427 case 8: /* backspace */
1428 case 11: /* vertical tab */
1429 case 12: /* form feed */
1430 case 26: /* MS-DOS C-z junk */
1431 case 31: /* '^_' -- for info */
1439 mask_has_at_most_one_bit_p (int mask)
1441 /* Perhaps the only thing useful you learn from intensive Microsoft
1442 technical interviews */
1443 return (mask & (mask - 1)) == 0;
1446 static enum eol_type
1447 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1456 st->eol.just_saw_cr = 1;
1461 if (st->eol.just_saw_cr)
1463 else if (st->eol.seen_anything)
1466 else if (st->eol.just_saw_cr)
1468 st->eol.just_saw_cr = 0;
1470 st->eol.seen_anything = 1;
1473 return EOL_AUTODETECT;
1476 /* Attempt to determine the encoding and EOL type of the given text.
1477 Before calling this function for the first type, you must initialize
1478 st->eol_type as appropriate and initialize st->mask to ~0.
1480 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1483 st->mask holds the determined coding category mask, or ~0 if only
1484 ASCII has been seen so far.
1488 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1489 is present in st->mask
1490 1 == definitive answers are here for both st->eol_type and st->mask
1494 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1495 unsigned int n, int just_do_eol)
1499 if (st->eol_type == EOL_AUTODETECT)
1500 st->eol_type = detect_eol_type (st, src, n);
1503 return st->eol_type != EOL_AUTODETECT;
1505 if (!st->seen_non_ascii)
1507 for (; n; n--, src++)
1510 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1512 st->seen_non_ascii = 1;
1514 st->shift_jis.mask = ~0;
1518 st->iso2022.mask = ~0;
1528 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1529 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1530 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1531 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1532 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1533 st->big5.mask = detect_coding_big5 (st, src, n);
1534 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1535 st->utf8.mask = detect_coding_utf8 (st, src, n);
1536 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1537 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1540 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1541 | st->utf8.mask | st->ucs4.mask;
1544 int retval = mask_has_at_most_one_bit_p (st->mask);
1545 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1546 return retval && st->eol_type != EOL_AUTODETECT;
1551 coding_system_from_mask (int mask)
1555 /* If the file was entirely or basically ASCII, use the
1556 default value of `buffer-file-coding-system'. */
1557 Lisp_Object retval =
1558 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1561 retval = Ffind_coding_system (retval);
1565 (Qbad_variable, Qwarning,
1566 "Invalid `default-buffer-file-coding-system', set to nil");
1567 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1571 retval = Fget_coding_system (Qraw_text);
1579 mask = postprocess_iso2022_mask (mask);
1581 /* Look through the coding categories by priority and find
1582 the first one that is allowed. */
1583 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1585 cat = coding_category_by_priority[i];
1586 if ((mask & (1 << cat)) &&
1587 !NILP (coding_category_system[cat]))
1591 return coding_category_system[cat];
1593 return Fget_coding_system (Qraw_text);
1597 /* Given a seekable read stream and potential coding system and EOL type
1598 as specified, do any autodetection that is called for. If the
1599 coding system and/or EOL type are not autodetect, they will be left
1600 alone; but this function will never return an autodetect coding system
1603 This function does not automatically fetch subsidiary coding systems;
1604 that should be unnecessary with the explicit eol-type argument. */
1607 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1608 enum eol_type *eol_type_in_out)
1610 struct detection_state decst;
1612 if (*eol_type_in_out == EOL_AUTODETECT)
1613 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1616 decst.eol_type = *eol_type_in_out;
1619 /* If autodetection is called for, do it now. */
1620 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1621 *eol_type_in_out == EOL_AUTODETECT)
1626 unsigned char random_buffer[4096];
1629 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1632 if (detect_coding_type (&decst, random_buffer, nread,
1633 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1634 CODESYS_AUTODETECT))
1638 *eol_type_in_out = decst.eol_type;
1639 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1640 *codesys_in_out = coding_system_from_mask (decst.mask);
1643 /* If we absolutely can't determine the EOL type, just assume LF. */
1644 if (*eol_type_in_out == EOL_AUTODETECT)
1645 *eol_type_in_out = EOL_LF;
1647 Lstream_rewind (stream);
1650 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1651 Detect coding system of the text in the region between START and END.
1652 Returned a list of possible coding systems ordered by priority.
1653 If only ASCII characters are found, it returns 'undecided or one of
1654 its subsidiary coding systems according to a detected end-of-line
1655 type. Optional arg BUFFER defaults to the current buffer.
1657 (start, end, buffer))
1659 Lisp_Object val = Qnil;
1660 struct buffer *buf = decode_buffer (buffer, 0);
1662 Lisp_Object instream, lb_instream;
1663 Lstream *istr, *lb_istr;
1664 struct detection_state decst;
1665 struct gcpro gcpro1, gcpro2;
1667 get_buffer_range_char (buf, start, end, &b, &e, 0);
1668 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1669 lb_istr = XLSTREAM (lb_instream);
1670 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1671 istr = XLSTREAM (instream);
1672 GCPRO2 (instream, lb_instream);
1674 decst.eol_type = EOL_AUTODETECT;
1678 unsigned char random_buffer[4096];
1679 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1683 if (detect_coding_type (&decst, random_buffer, nread, 0))
1687 if (decst.mask == ~0)
1688 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1696 decst.mask = postprocess_iso2022_mask (decst.mask);
1698 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1700 int sys = coding_category_by_priority[i];
1701 if (decst.mask & (1 << sys))
1703 Lisp_Object codesys = coding_category_system[sys];
1704 if (!NILP (codesys))
1705 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1706 val = Fcons (codesys, val);
1710 Lstream_close (istr);
1712 Lstream_delete (istr);
1713 Lstream_delete (lb_istr);
1718 /************************************************************************/
1719 /* Converting to internal Mule format ("decoding") */
1720 /************************************************************************/
1722 /* A decoding stream is a stream used for decoding text (i.e.
1723 converting from some external format to internal format).
1724 The decoding-stream object keeps track of the actual coding
1725 stream, the stream that is at the other end, and data that
1726 needs to be persistent across the lifetime of the stream. */
1728 /* Handle the EOL stuff related to just-read-in character C.
1729 EOL_TYPE is the EOL type of the coding stream.
1730 FLAGS is the current value of FLAGS in the coding stream, and may
1731 be modified by this macro. (The macro only looks at the
1732 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1733 bytes are to be written. You need to also define a local goto
1734 label "label_continue_loop" that is at the end of the main
1735 character-reading loop.
1737 If C is a CR character, then this macro handles it entirely and
1738 jumps to label_continue_loop. Otherwise, this macro does not add
1739 anything to DST, and continues normally. You should continue
1740 processing C normally after this macro. */
1742 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1746 if (eol_type == EOL_CR) \
1747 Dynarr_add (dst, '\n'); \
1748 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1749 Dynarr_add (dst, c); \
1751 flags |= CODING_STATE_CR; \
1752 goto label_continue_loop; \
1754 else if (flags & CODING_STATE_CR) \
1755 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1757 Dynarr_add (dst, '\r'); \
1758 flags &= ~CODING_STATE_CR; \
1762 /* C should be a binary character in the range 0 - 255; convert
1763 to internal format and add to Dynarr DST. */
1765 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1767 if (BYTE_ASCII_P (c)) \
1768 Dynarr_add (dst, c); \
1769 else if (BYTE_C1_P (c)) \
1771 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1772 Dynarr_add (dst, c + 0x20); \
1776 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1777 Dynarr_add (dst, c); \
1781 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1785 DECODE_ADD_BINARY_CHAR (ch, dst); \
1790 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1792 if (flags & CODING_STATE_END) \
1794 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1795 if (flags & CODING_STATE_CR) \
1796 Dynarr_add (dst, '\r'); \
1800 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1802 struct decoding_stream
1804 /* Coding system that governs the conversion. */
1805 Lisp_Coding_System *codesys;
1807 /* Stream that we read the encoded data from or
1808 write the decoded data to. */
1811 /* If we are reading, then we can return only a fixed amount of
1812 data, so if the conversion resulted in too much data, we store it
1813 here for retrieval the next time around. */
1814 unsigned_char_dynarr *runoff;
1816 /* FLAGS holds flags indicating the current state of the decoding.
1817 Some of these flags are dependent on the coding system. */
1820 /* CH holds a partially built-up character. Since we only deal
1821 with one- and two-byte characters at the moment, we only use
1822 this to store the first byte of a two-byte character. */
1825 /* EOL_TYPE specifies the type of end-of-line conversion that
1826 currently applies. We need to keep this separate from the
1827 EOL type stored in CODESYS because the latter might indicate
1828 automatic EOL-type detection while the former will always
1829 indicate a particular EOL type. */
1830 enum eol_type eol_type;
1832 /* Additional ISO2022 information. We define the structure above
1833 because it's also needed by the detection routines. */
1834 struct iso2022_decoder iso2022;
1836 /* Additional information (the state of the running CCL program)
1837 used by the CCL decoder. */
1838 struct ccl_program ccl;
1840 struct detection_state decst;
1843 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1844 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1845 static int decoding_rewinder (Lstream *stream);
1846 static int decoding_seekable_p (Lstream *stream);
1847 static int decoding_flusher (Lstream *stream);
1848 static int decoding_closer (Lstream *stream);
1850 static Lisp_Object decoding_marker (Lisp_Object stream,
1851 void (*markobj) (Lisp_Object));
1853 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1854 sizeof (struct decoding_stream));
1857 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1859 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1860 Lisp_Object str_obj;
1862 /* We do not need to mark the coding systems or charsets stored
1863 within the stream because they are stored in a global list
1864 and automatically marked. */
1866 XSETLSTREAM (str_obj, str);
1868 if (str->imp->marker)
1869 return (str->imp->marker) (str_obj, markobj);
1874 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1875 so we read data from the other end, decode it, and store it into DATA. */
1878 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1880 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1881 unsigned char *orig_data = data;
1883 int error_occurred = 0;
1885 /* We need to interface to mule_decode(), which expects to take some
1886 amount of data and store the result into a Dynarr. We have
1887 mule_decode() store into str->runoff, and take data from there
1890 /* We loop until we have enough data, reading chunks from the other
1891 end and decoding it. */
1894 /* Take data from the runoff if we can. Make sure to take at
1895 most SIZE bytes, and delete the data from the runoff. */
1896 if (Dynarr_length (str->runoff) > 0)
1898 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
1899 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1900 Dynarr_delete_many (str->runoff, 0, chunk);
1906 break; /* No more room for data */
1908 if (str->flags & CODING_STATE_END)
1909 /* This means that on the previous iteration, we hit the EOF on
1910 the other end. We loop once more so that mule_decode() can
1911 output any final stuff it may be holding, or any "go back
1912 to a sane state" escape sequences. (This latter makes sense
1913 during encoding.) */
1916 /* Exhausted the runoff, so get some more. DATA has at least
1917 SIZE bytes left of storage in it, so it's OK to read directly
1918 into it. (We'll be overwriting above, after we've decoded it
1919 into the runoff.) */
1920 read_size = Lstream_read (str->other_end, data, size);
1927 /* There might be some more end data produced in the translation.
1928 See the comment above. */
1929 str->flags |= CODING_STATE_END;
1930 mule_decode (stream, data, str->runoff, read_size);
1933 if (data - orig_data == 0)
1934 return error_occurred ? -1 : 0;
1936 return data - orig_data;
1940 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
1942 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1945 /* Decode all our data into the runoff, and then attempt to write
1946 it all out to the other end. Remove whatever chunk we succeeded
1948 mule_decode (stream, data, str->runoff, size);
1949 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1950 Dynarr_length (str->runoff));
1952 Dynarr_delete_many (str->runoff, 0, retval);
1953 /* Do NOT return retval. The return value indicates how much
1954 of the incoming data was written, not how many bytes were
1960 reset_decoding_stream (struct decoding_stream *str)
1963 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1965 Lisp_Object coding_system;
1966 XSETCODING_SYSTEM (coding_system, str->codesys);
1967 reset_iso2022 (coding_system, &str->iso2022);
1969 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1971 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1974 str->flags = str->ch = 0;
1978 decoding_rewinder (Lstream *stream)
1980 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1981 reset_decoding_stream (str);
1982 Dynarr_reset (str->runoff);
1983 return Lstream_rewind (str->other_end);
1987 decoding_seekable_p (Lstream *stream)
1989 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1990 return Lstream_seekable_p (str->other_end);
1994 decoding_flusher (Lstream *stream)
1996 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1997 return Lstream_flush (str->other_end);
2001 decoding_closer (Lstream *stream)
2003 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2004 if (stream->flags & LSTREAM_FL_WRITE)
2006 str->flags |= CODING_STATE_END;
2007 decoding_writer (stream, 0, 0);
2009 Dynarr_free (str->runoff);
2011 #ifdef ENABLE_COMPOSITE_CHARS
2012 if (str->iso2022.composite_chars)
2013 Dynarr_free (str->iso2022.composite_chars);
2016 return Lstream_close (str->other_end);
2020 decoding_stream_coding_system (Lstream *stream)
2022 Lisp_Object coding_system;
2023 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2025 XSETCODING_SYSTEM (coding_system, str->codesys);
2026 return subsidiary_coding_system (coding_system, str->eol_type);
2030 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2032 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2033 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2035 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2036 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2037 reset_decoding_stream (str);
2040 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2041 stream for writing, no automatic code detection will be performed.
2042 The reason for this is that automatic code detection requires a
2043 seekable input. Things will also fail if you open a decoding
2044 stream for reading using a non-fully-specified coding system and
2045 a non-seekable input stream. */
2048 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2051 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2052 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2056 str->other_end = stream;
2057 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2058 str->eol_type = EOL_AUTODETECT;
2059 if (!strcmp (mode, "r")
2060 && Lstream_seekable_p (stream))
2061 /* We can determine the coding system now. */
2062 determine_real_coding_system (stream, &codesys, &str->eol_type);
2063 set_decoding_stream_coding_system (lstr, codesys);
2064 str->decst.eol_type = str->eol_type;
2065 str->decst.mask = ~0;
2066 XSETLSTREAM (obj, lstr);
2071 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2073 return make_decoding_stream_1 (stream, codesys, "r");
2077 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2079 return make_decoding_stream_1 (stream, codesys, "w");
2082 /* Note: the decode_coding_* functions all take the same
2083 arguments as mule_decode(), which is to say some SRC data of
2084 size N, which is to be stored into dynamic array DST.
2085 DECODING is the stream within which the decoding is
2086 taking place, but no data is actually read from or
2087 written to that stream; that is handled in decoding_reader()
2088 or decoding_writer(). This allows the same functions to
2089 be used for both reading and writing. */
2092 mule_decode (Lstream *decoding, CONST unsigned char *src,
2093 unsigned_char_dynarr *dst, unsigned int n)
2095 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2097 /* If necessary, do encoding-detection now. We do this when
2098 we're a writing stream or a non-seekable reading stream,
2099 meaning that we can't just process the whole input,
2100 rewind, and start over. */
2102 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2103 str->eol_type == EOL_AUTODETECT)
2105 Lisp_Object codesys;
2107 XSETCODING_SYSTEM (codesys, str->codesys);
2108 detect_coding_type (&str->decst, src, n,
2109 CODING_SYSTEM_TYPE (str->codesys) !=
2110 CODESYS_AUTODETECT);
2111 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2112 str->decst.mask != ~0)
2113 /* #### This is cheesy. What we really ought to do is
2114 buffer up a certain amount of data so as to get a
2115 less random result. */
2116 codesys = coding_system_from_mask (str->decst.mask);
2117 str->eol_type = str->decst.eol_type;
2118 if (XCODING_SYSTEM (codesys) != str->codesys)
2120 /* Preserve the CODING_STATE_END flag in case it was set.
2121 If we erase it, bad things might happen. */
2122 int was_end = str->flags & CODING_STATE_END;
2123 set_decoding_stream_coding_system (decoding, codesys);
2125 str->flags |= CODING_STATE_END;
2129 switch (CODING_SYSTEM_TYPE (str->codesys))
2132 case CODESYS_INTERNAL:
2133 Dynarr_add_many (dst, src, n);
2136 case CODESYS_AUTODETECT:
2137 /* If we got this far and still haven't decided on the coding
2138 system, then do no conversion. */
2139 case CODESYS_NO_CONVERSION:
2140 decode_coding_no_conversion (decoding, src, dst, n);
2143 case CODESYS_SHIFT_JIS:
2144 decode_coding_sjis (decoding, src, dst, n);
2147 decode_coding_big5 (decoding, src, dst, n);
2150 decode_coding_ucs4 (decoding, src, dst, n);
2153 decode_coding_utf8 (decoding, src, dst, n);
2156 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2158 case CODESYS_ISO2022:
2159 decode_coding_iso2022 (decoding, src, dst, n);
2167 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2168 Decode the text between START and END which is encoded in CODING-SYSTEM.
2169 This is useful if you've read in encoded text from a file without decoding
2170 it (e.g. you read in a JIS-formatted file but used the `binary' or
2171 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2172 Return length of decoded text.
2173 BUFFER defaults to the current buffer if unspecified.
2175 (start, end, coding_system, buffer))
2178 struct buffer *buf = decode_buffer (buffer, 0);
2179 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2180 Lstream *istr, *ostr;
2181 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2183 get_buffer_range_char (buf, start, end, &b, &e, 0);
2185 barf_if_buffer_read_only (buf, b, e);
2187 coding_system = Fget_coding_system (coding_system);
2188 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2189 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2190 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2192 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2193 Fget_coding_system (Qbinary));
2194 istr = XLSTREAM (instream);
2195 ostr = XLSTREAM (outstream);
2196 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2198 /* The chain of streams looks like this:
2200 [BUFFER] <----- send through
2201 ------> [ENCODE AS BINARY]
2202 ------> [DECODE AS SPECIFIED]
2208 char tempbuf[1024]; /* some random amount */
2209 Bufpos newpos, even_newer_pos;
2210 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2211 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2215 newpos = lisp_buffer_stream_startpos (istr);
2216 Lstream_write (ostr, tempbuf, size_in_bytes);
2217 even_newer_pos = lisp_buffer_stream_startpos (istr);
2218 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2221 Lstream_close (istr);
2222 Lstream_close (ostr);
2224 Lstream_delete (istr);
2225 Lstream_delete (ostr);
2226 Lstream_delete (XLSTREAM (de_outstream));
2227 Lstream_delete (XLSTREAM (lb_outstream));
2232 /************************************************************************/
2233 /* Converting to an external encoding ("encoding") */
2234 /************************************************************************/
2236 /* An encoding stream is an output stream. When you create the
2237 stream, you specify the coding system that governs the encoding
2238 and another stream that the resulting encoded data is to be
2239 sent to, and then start sending data to it. */
2241 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2243 struct encoding_stream
2245 /* Coding system that governs the conversion. */
2246 Lisp_Coding_System *codesys;
2248 /* Stream that we read the encoded data from or
2249 write the decoded data to. */
2252 /* If we are reading, then we can return only a fixed amount of
2253 data, so if the conversion resulted in too much data, we store it
2254 here for retrieval the next time around. */
2255 unsigned_char_dynarr *runoff;
2257 /* FLAGS holds flags indicating the current state of the encoding.
2258 Some of these flags are dependent on the coding system. */
2261 /* CH holds a partially built-up character. Since we only deal
2262 with one- and two-byte characters at the moment, we only use
2263 this to store the first byte of a two-byte character. */
2266 /* Additional information used by the ISO2022 encoder. */
2269 /* CHARSET holds the character sets currently assigned to the G0
2270 through G3 registers. It is initialized from the array
2271 INITIAL_CHARSET in CODESYS. */
2272 Lisp_Object charset[4];
2274 /* Which registers are currently invoked into the left (GL) and
2275 right (GR) halves of the 8-bit encoding space? */
2276 int register_left, register_right;
2278 /* Whether we need to explicitly designate the charset in the
2279 G? register before using it. It is initialized from the
2280 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2281 unsigned char force_charset_on_output[4];
2283 /* Other state variables that need to be preserved across
2285 Lisp_Object current_charset;
2287 int current_char_boundary;
2290 /* Additional information (the state of the running CCL program)
2291 used by the CCL encoder. */
2292 struct ccl_program ccl;
2296 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2297 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2299 static int encoding_rewinder (Lstream *stream);
2300 static int encoding_seekable_p (Lstream *stream);
2301 static int encoding_flusher (Lstream *stream);
2302 static int encoding_closer (Lstream *stream);
2304 static Lisp_Object encoding_marker (Lisp_Object stream,
2305 void (*markobj) (Lisp_Object));
2307 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2308 sizeof (struct encoding_stream));
2311 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2313 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2314 Lisp_Object str_obj;
2316 /* We do not need to mark the coding systems or charsets stored
2317 within the stream because they are stored in a global list
2318 and automatically marked. */
2320 XSETLSTREAM (str_obj, str);
2322 if (str->imp->marker)
2323 return (str->imp->marker) (str_obj, markobj);
2328 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2329 so we read data from the other end, encode it, and store it into DATA. */
2332 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2334 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2335 unsigned char *orig_data = data;
2337 int error_occurred = 0;
2339 /* We need to interface to mule_encode(), which expects to take some
2340 amount of data and store the result into a Dynarr. We have
2341 mule_encode() store into str->runoff, and take data from there
2344 /* We loop until we have enough data, reading chunks from the other
2345 end and encoding it. */
2348 /* Take data from the runoff if we can. Make sure to take at
2349 most SIZE bytes, and delete the data from the runoff. */
2350 if (Dynarr_length (str->runoff) > 0)
2352 int chunk = min ((int) size, Dynarr_length (str->runoff));
2353 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2354 Dynarr_delete_many (str->runoff, 0, chunk);
2360 break; /* No more room for data */
2362 if (str->flags & CODING_STATE_END)
2363 /* This means that on the previous iteration, we hit the EOF on
2364 the other end. We loop once more so that mule_encode() can
2365 output any final stuff it may be holding, or any "go back
2366 to a sane state" escape sequences. (This latter makes sense
2367 during encoding.) */
2370 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2371 left of storage in it, so it's OK to read directly into it.
2372 (We'll be overwriting above, after we've encoded it into the
2374 read_size = Lstream_read (str->other_end, data, size);
2381 /* There might be some more end data produced in the translation.
2382 See the comment above. */
2383 str->flags |= CODING_STATE_END;
2384 mule_encode (stream, data, str->runoff, read_size);
2387 if (data == orig_data)
2388 return error_occurred ? -1 : 0;
2390 return data - orig_data;
2394 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2396 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2399 /* Encode all our data into the runoff, and then attempt to write
2400 it all out to the other end. Remove whatever chunk we succeeded
2402 mule_encode (stream, data, str->runoff, size);
2403 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2404 Dynarr_length (str->runoff));
2406 Dynarr_delete_many (str->runoff, 0, retval);
2407 /* Do NOT return retval. The return value indicates how much
2408 of the incoming data was written, not how many bytes were
2414 reset_encoding_stream (struct encoding_stream *str)
2417 switch (CODING_SYSTEM_TYPE (str->codesys))
2419 case CODESYS_ISO2022:
2423 for (i = 0; i < 4; i++)
2425 str->iso2022.charset[i] =
2426 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2427 str->iso2022.force_charset_on_output[i] =
2428 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2430 str->iso2022.register_left = 0;
2431 str->iso2022.register_right = 1;
2432 str->iso2022.current_charset = Qnil;
2433 str->iso2022.current_half = 0;
2434 str->iso2022.current_char_boundary = 1;
2438 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2445 str->flags = str->ch = 0;
2449 encoding_rewinder (Lstream *stream)
2451 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2452 reset_encoding_stream (str);
2453 Dynarr_reset (str->runoff);
2454 return Lstream_rewind (str->other_end);
2458 encoding_seekable_p (Lstream *stream)
2460 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2461 return Lstream_seekable_p (str->other_end);
2465 encoding_flusher (Lstream *stream)
2467 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2468 return Lstream_flush (str->other_end);
2472 encoding_closer (Lstream *stream)
2474 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2475 if (stream->flags & LSTREAM_FL_WRITE)
2477 str->flags |= CODING_STATE_END;
2478 encoding_writer (stream, 0, 0);
2480 Dynarr_free (str->runoff);
2481 return Lstream_close (str->other_end);
2485 encoding_stream_coding_system (Lstream *stream)
2487 Lisp_Object coding_system;
2488 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2490 XSETCODING_SYSTEM (coding_system, str->codesys);
2491 return coding_system;
2495 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2497 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2498 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2500 reset_encoding_stream (str);
2504 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2507 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2508 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2512 str->runoff = Dynarr_new (unsigned_char);
2513 str->other_end = stream;
2514 set_encoding_stream_coding_system (lstr, codesys);
2515 XSETLSTREAM (obj, lstr);
2520 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2522 return make_encoding_stream_1 (stream, codesys, "r");
2526 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2528 return make_encoding_stream_1 (stream, codesys, "w");
2531 /* Convert N bytes of internally-formatted data stored in SRC to an
2532 external format, according to the encoding stream ENCODING.
2533 Store the encoded data into DST. */
2536 mule_encode (Lstream *encoding, CONST unsigned char *src,
2537 unsigned_char_dynarr *dst, unsigned int n)
2539 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2541 switch (CODING_SYSTEM_TYPE (str->codesys))
2544 case CODESYS_INTERNAL:
2545 Dynarr_add_many (dst, src, n);
2548 case CODESYS_AUTODETECT:
2549 /* If we got this far and still haven't decided on the coding
2550 system, then do no conversion. */
2551 case CODESYS_NO_CONVERSION:
2552 encode_coding_no_conversion (encoding, src, dst, n);
2555 case CODESYS_SHIFT_JIS:
2556 encode_coding_sjis (encoding, src, dst, n);
2559 encode_coding_big5 (encoding, src, dst, n);
2562 encode_coding_ucs4 (encoding, src, dst, n);
2565 encode_coding_utf8 (encoding, src, dst, n);
2568 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2570 case CODESYS_ISO2022:
2571 encode_coding_iso2022 (encoding, src, dst, n);
2579 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2580 Encode the text between START and END using CODING-SYSTEM.
2581 This will, for example, convert Japanese characters into stuff such as
2582 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2583 text. BUFFER defaults to the current buffer if unspecified.
2585 (start, end, coding_system, buffer))
2588 struct buffer *buf = decode_buffer (buffer, 0);
2589 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2590 Lstream *istr, *ostr;
2591 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2593 get_buffer_range_char (buf, start, end, &b, &e, 0);
2595 barf_if_buffer_read_only (buf, b, e);
2597 coding_system = Fget_coding_system (coding_system);
2598 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2599 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2600 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2601 Fget_coding_system (Qbinary));
2602 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2604 istr = XLSTREAM (instream);
2605 ostr = XLSTREAM (outstream);
2606 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2607 /* The chain of streams looks like this:
2609 [BUFFER] <----- send through
2610 ------> [ENCODE AS SPECIFIED]
2611 ------> [DECODE AS BINARY]
2616 char tempbuf[1024]; /* some random amount */
2617 Bufpos newpos, even_newer_pos;
2618 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2619 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2623 newpos = lisp_buffer_stream_startpos (istr);
2624 Lstream_write (ostr, tempbuf, size_in_bytes);
2625 even_newer_pos = lisp_buffer_stream_startpos (istr);
2626 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2632 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2633 Lstream_close (istr);
2634 Lstream_close (ostr);
2636 Lstream_delete (istr);
2637 Lstream_delete (ostr);
2638 Lstream_delete (XLSTREAM (de_outstream));
2639 Lstream_delete (XLSTREAM (lb_outstream));
2640 return make_int (retlen);
2646 /************************************************************************/
2647 /* Shift-JIS methods */
2648 /************************************************************************/
2650 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2651 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2652 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2653 encoded by "position-code + 0x80". A character of JISX0208
2654 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2655 position-codes are divided and shifted so that it fit in the range
2658 --- CODE RANGE of Shift-JIS ---
2659 (character set) (range)
2661 JISX0201-Kana 0xA0 .. 0xDF
2662 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2663 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2664 -------------------------------
2668 /* Is this the first byte of a Shift-JIS two-byte char? */
2670 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2671 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2673 /* Is this the second byte of a Shift-JIS two-byte char? */
2675 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2676 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2678 #define BYTE_SJIS_KATAKANA_P(c) \
2679 ((c) >= 0xA1 && (c) <= 0xDF)
2682 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2690 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2692 if (st->shift_jis.in_second_byte)
2694 st->shift_jis.in_second_byte = 0;
2698 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2699 st->shift_jis.in_second_byte = 1;
2701 return CODING_CATEGORY_SHIFT_JIS_MASK;
2704 /* Convert Shift-JIS data to internal format. */
2707 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2708 unsigned_char_dynarr *dst, unsigned int n)
2711 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2712 unsigned int flags = str->flags;
2713 unsigned int ch = str->ch;
2714 eol_type_t eol_type = str->eol_type;
2722 /* Previous character was first byte of Shift-JIS Kanji char. */
2723 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2725 unsigned char e1, e2;
2727 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2728 DECODE_SJIS (ch, c, e1, e2);
2729 Dynarr_add (dst, e1);
2730 Dynarr_add (dst, e2);
2734 DECODE_ADD_BINARY_CHAR (ch, dst);
2735 DECODE_ADD_BINARY_CHAR (c, dst);
2741 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2742 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2744 else if (BYTE_SJIS_KATAKANA_P (c))
2746 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2747 Dynarr_add (dst, c);
2750 DECODE_ADD_BINARY_CHAR (c, dst);
2752 label_continue_loop:;
2755 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2761 /* Convert internally-formatted data to Shift-JIS. */
2764 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2765 unsigned_char_dynarr *dst, unsigned int n)
2768 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2769 unsigned int flags = str->flags;
2770 unsigned int ch = str->ch;
2771 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2778 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2779 Dynarr_add (dst, '\r');
2780 if (eol_type != EOL_CR)
2781 Dynarr_add (dst, '\n');
2784 else if (BYTE_ASCII_P (c))
2786 Dynarr_add (dst, c);
2789 else if (BUFBYTE_LEADING_BYTE_P (c))
2790 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2791 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2792 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2795 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2797 Dynarr_add (dst, c);
2800 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2801 ch == LEADING_BYTE_JAPANESE_JISX0208)
2805 unsigned char j1, j2;
2806 ENCODE_SJIS (ch, c, j1, j2);
2807 Dynarr_add (dst, j1);
2808 Dynarr_add (dst, j2);
2818 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2819 Decode a JISX0208 character of Shift-JIS coding-system.
2820 CODE is the character code in Shift-JIS as a cons of type bytes.
2821 Return the corresponding character.
2825 unsigned char c1, c2, s1, s2;
2828 CHECK_INT (XCAR (code));
2829 CHECK_INT (XCDR (code));
2830 s1 = XINT (XCAR (code));
2831 s2 = XINT (XCDR (code));
2832 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2833 BYTE_SJIS_TWO_BYTE_2_P (s2))
2835 DECODE_SJIS (s1, s2, c1, c2);
2836 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2837 c1 & 0x7F, c2 & 0x7F));
2843 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2844 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2845 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2849 Lisp_Object charset;
2852 CHECK_CHAR_COERCE_INT (ch);
2853 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2854 if (EQ (charset, Vcharset_japanese_jisx0208))
2856 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2857 return Fcons (make_int (s1), make_int (s2));
2864 /************************************************************************/
2866 /************************************************************************/
2868 /* BIG5 is a coding system encoding two character sets: ASCII and
2869 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2870 character set and is encoded in two-byte.
2872 --- CODE RANGE of BIG5 ---
2873 (character set) (range)
2875 Big5 (1st byte) 0xA1 .. 0xFE
2876 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2877 --------------------------
2879 Since the number of characters in Big5 is larger than maximum
2880 characters in Emacs' charset (96x96), it can't be handled as one
2881 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2882 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2883 contains frequently used characters and the latter contains less
2884 frequently used characters. */
2886 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2887 ((c) >= 0xA1 && (c) <= 0xFE)
2889 /* Is this the second byte of a Shift-JIS two-byte char? */
2891 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2892 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2894 /* Number of Big5 characters which have the same code in 1st byte. */
2896 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2898 /* Code conversion macros. These are macros because they are used in
2899 inner loops during code conversion.
2901 Note that temporary variables in macros introduce the classic
2902 dynamic-scoping problems with variable names. We use capital-
2903 lettered variables in the assumption that XEmacs does not use
2904 capital letters in variables except in a very formalized way
2907 /* Convert Big5 code (b1, b2) into its internal string representation
2910 /* There is a much simpler way to split the Big5 charset into two.
2911 For the moment I'm going to leave the algorithm as-is because it
2912 claims to separate out the most-used characters into a single
2913 charset, which perhaps will lead to optimizations in various
2916 The way the algorithm works is something like this:
2918 Big5 can be viewed as a 94x157 charset, where the row is
2919 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2920 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2921 the split between low and high column numbers is apparently
2922 meaningless; ascending rows produce less and less frequent chars.
2923 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2924 the first charset, and the upper half (0xC9 .. 0xFE) to the
2925 second. To do the conversion, we convert the character into
2926 a single number where 0 .. 156 is the first row, 157 .. 313
2927 is the second, etc. That way, the characters are ordered by
2928 decreasing frequency. Then we just chop the space in two
2929 and coerce the result into a 94x94 space.
2932 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2934 int B1 = b1, B2 = b2; \
2936 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2940 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2944 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2945 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2947 c1 = I / (0xFF - 0xA1) + 0xA1; \
2948 c2 = I % (0xFF - 0xA1) + 0xA1; \
2951 /* Convert the internal string representation of a Big5 character
2952 (lb, c1, c2) into Big5 code (b1, b2). */
2954 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2956 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2958 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2960 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2962 b1 = I / BIG5_SAME_ROW + 0xA1; \
2963 b2 = I % BIG5_SAME_ROW; \
2964 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2968 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2976 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2977 (c >= 0x80 && c <= 0xA0))
2979 if (st->big5.in_second_byte)
2981 st->big5.in_second_byte = 0;
2982 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2986 st->big5.in_second_byte = 1;
2988 return CODING_CATEGORY_BIG5_MASK;
2991 /* Convert Big5 data to internal format. */
2994 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2995 unsigned_char_dynarr *dst, unsigned int n)
2998 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2999 unsigned int flags = str->flags;
3000 unsigned int ch = str->ch;
3001 eol_type_t eol_type = str->eol_type;
3008 /* Previous character was first byte of Big5 char. */
3009 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3011 unsigned char b1, b2, b3;
3012 DECODE_BIG5 (ch, c, b1, b2, b3);
3013 Dynarr_add (dst, b1);
3014 Dynarr_add (dst, b2);
3015 Dynarr_add (dst, b3);
3019 DECODE_ADD_BINARY_CHAR (ch, dst);
3020 DECODE_ADD_BINARY_CHAR (c, dst);
3026 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3027 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3030 DECODE_ADD_BINARY_CHAR (c, dst);
3032 label_continue_loop:;
3035 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3041 /* Convert internally-formatted data to Big5. */
3044 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3045 unsigned_char_dynarr *dst, unsigned int n)
3048 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3049 unsigned int flags = str->flags;
3050 unsigned int ch = str->ch;
3051 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3058 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3059 Dynarr_add (dst, '\r');
3060 if (eol_type != EOL_CR)
3061 Dynarr_add (dst, '\n');
3063 else if (BYTE_ASCII_P (c))
3066 Dynarr_add (dst, c);
3068 else if (BUFBYTE_LEADING_BYTE_P (c))
3070 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3071 c == LEADING_BYTE_CHINESE_BIG5_2)
3073 /* A recognized leading byte. */
3075 continue; /* not done with this character. */
3077 /* otherwise just ignore this character. */
3079 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3080 ch == LEADING_BYTE_CHINESE_BIG5_2)
3082 /* Previous char was a recognized leading byte. */
3084 continue; /* not done with this character. */
3088 /* Encountering second byte of a Big5 character. */
3089 unsigned char b1, b2;
3091 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3092 Dynarr_add (dst, b1);
3093 Dynarr_add (dst, b2);
3104 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3105 Decode a Big5 character CODE of BIG5 coding-system.
3106 CODE is the character code in BIG5, a cons of two integers.
3107 Return the corresponding character.
3111 unsigned char c1, c2, b1, b2;
3114 CHECK_INT (XCAR (code));
3115 CHECK_INT (XCDR (code));
3116 b1 = XINT (XCAR (code));
3117 b2 = XINT (XCDR (code));
3118 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3119 BYTE_BIG5_TWO_BYTE_2_P (b2))
3122 Lisp_Object charset;
3123 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3124 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3125 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3131 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3132 Encode the Big5 character CH to BIG5 coding-system.
3133 Return the corresponding character code in Big5.
3137 Lisp_Object charset;
3140 CHECK_CHAR_COERCE_INT (ch);
3141 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3142 if (EQ (charset, Vcharset_chinese_big5_1) ||
3143 EQ (charset, Vcharset_chinese_big5_2))
3145 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3147 return Fcons (make_int (b1), make_int (b2));
3154 /************************************************************************/
3157 /* UCS-4 character codes are implemented as nonnegative integers. */
3159 /************************************************************************/
3161 Lisp_Object ucs_to_mule_table[65536];
3162 Lisp_Object mule_to_ucs_table;
3164 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3165 Map UCS-4 code CODE to Mule character CHARACTER.
3167 Return T on success, NIL on failure.
3173 CHECK_CHAR (character);
3177 if (c < sizeof (ucs_to_mule_table))
3179 ucs_to_mule_table[c] = character;
3187 ucs_to_char (unsigned long code)
3189 if (code < sizeof (ucs_to_mule_table))
3191 return ucs_to_mule_table[code];
3193 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3198 c = code % (94 * 94);
3200 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3201 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3202 CHARSET_LEFT_TO_RIGHT),
3203 c / 94 + 33, c % 94 + 33));
3209 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3210 Return Mule character corresponding to UCS code CODE (a positive integer).
3214 CHECK_NATNUM (code);
3215 return ucs_to_char (XINT (code));
3218 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3219 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3223 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3224 Fset_char_ucs is more restrictive on index arg, but should
3225 check code arg in a char_table method. */
3226 CHECK_CHAR (character);
3227 CHECK_NATNUM (code);
3228 return Fput_char_table (character, code, mule_to_ucs_table);
3231 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3232 Return the UCS code (a positive integer) corresponding to CHARACTER.
3236 return Fget_char_table (character, mule_to_ucs_table);
3239 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3240 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3241 is not found, instead.
3242 #### do something more appropriate (use blob?)
3243 Danger, Will Robinson! Data loss. Should we signal user? */
3245 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3247 Lisp_Object chr = ucs_to_char (ch);
3251 Bufbyte work[MAX_EMCHAR_LEN];
3256 simple_set_charptr_emchar (work, ch) :
3257 non_ascii_set_charptr_emchar (work, ch);
3258 Dynarr_add_many (dst, work, len);
3262 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3263 Dynarr_add (dst, 34 + 128);
3264 Dynarr_add (dst, 46 + 128);
3268 static unsigned long
3269 mule_char_to_ucs4 (Lisp_Object charset,
3270 unsigned char h, unsigned char l)
3273 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3280 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3281 (XCHARSET_CHARS (charset) == 94) )
3283 unsigned char final = XCHARSET_FINAL (charset);
3285 if ( ('@' <= final) && (final < 0x7f) )
3287 return 0xe00000 + (final - '@') * 94 * 94
3288 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3302 encode_ucs4 (Lisp_Object charset,
3303 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3305 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3306 Dynarr_add (dst, code >> 24);
3307 Dynarr_add (dst, (code >> 16) & 255);
3308 Dynarr_add (dst, (code >> 8) & 255);
3309 Dynarr_add (dst, code & 255);
3313 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3319 switch (st->ucs4.in_byte)
3328 st->ucs4.in_byte = 0;
3334 return CODING_CATEGORY_UCS4_MASK;
3338 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3339 unsigned_char_dynarr *dst, unsigned int n)
3341 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3342 unsigned int flags = str->flags;
3343 unsigned int ch = str->ch;
3347 unsigned char c = *src++;
3355 decode_ucs4 ( ( ch << 8 ) | c, dst);
3360 ch = ( ch << 8 ) | c;
3364 if (flags & CODING_STATE_END)
3365 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3372 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3373 unsigned_char_dynarr *dst, unsigned int n)
3375 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3376 unsigned int flags = str->flags;
3377 unsigned int ch = str->ch;
3378 unsigned char char_boundary = str->iso2022.current_char_boundary;
3379 Lisp_Object charset = str->iso2022.current_charset;
3381 #ifdef ENABLE_COMPOSITE_CHARS
3382 /* flags for handling composite chars. We do a little switcharoo
3383 on the source while we're outputting the composite char. */
3384 unsigned int saved_n = 0;
3385 CONST unsigned char *saved_src = NULL;
3386 int in_composite = 0;
3393 unsigned char c = *src++;
3395 if (BYTE_ASCII_P (c))
3396 { /* Processing ASCII character */
3398 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3401 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3402 { /* Processing Leading Byte */
3404 charset = CHARSET_BY_LEADING_BYTE (c);
3405 if (LEADING_BYTE_PREFIX_P(c))
3410 { /* Processing Non-ASCII character */
3412 if (EQ (charset, Vcharset_control_1))
3414 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3418 switch (XCHARSET_REP_BYTES (charset))
3421 encode_ucs4 (charset, c, 0, dst);
3424 if (XCHARSET_PRIVATE_P (charset))
3426 encode_ucs4 (charset, c, 0, dst);
3431 #ifdef ENABLE_COMPOSITE_CHARS
3432 if (EQ (charset, Vcharset_composite))
3436 /* #### Bother! We don't know how to
3438 Dynarr_add (dst, 0);
3439 Dynarr_add (dst, 0);
3440 Dynarr_add (dst, 0);
3441 Dynarr_add (dst, '~');
3445 Emchar emch = MAKE_CHAR (Vcharset_composite,
3446 ch & 0x7F, c & 0x7F);
3447 Lisp_Object lstr = composite_char_string (emch);
3451 src = XSTRING_DATA (lstr);
3452 n = XSTRING_LENGTH (lstr);
3456 #endif /* ENABLE_COMPOSITE_CHARS */
3458 encode_ucs4(charset, ch, c, dst);
3471 encode_ucs4 (charset, ch, c, dst);
3487 #ifdef ENABLE_COMPOSITE_CHARS
3493 goto back_to_square_n; /* Wheeeeeeeee ..... */
3495 #endif /* ENABLE_COMPOSITE_CHARS */
3499 str->iso2022.current_char_boundary = char_boundary;
3500 str->iso2022.current_charset = charset;
3502 /* Verbum caro factum est! */
3506 /************************************************************************/
3508 /************************************************************************/
3511 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3516 unsigned char c = *src++;
3517 switch (st->utf8.in_byte)
3520 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3523 st->utf8.in_byte = 5;
3525 st->utf8.in_byte = 4;
3527 st->utf8.in_byte = 3;
3529 st->utf8.in_byte = 2;
3531 st->utf8.in_byte = 1;
3536 if ((c & 0xc0) != 0x80)
3542 return CODING_CATEGORY_UTF8_MASK;
3546 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3547 unsigned_char_dynarr *dst, unsigned int n)
3549 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3550 unsigned int flags = str->flags;
3551 unsigned int ch = str->ch;
3552 eol_type_t eol_type = str->eol_type;
3556 unsigned char c = *src++;
3565 else if ( c >= 0xf8 )
3570 else if ( c >= 0xf0 )
3575 else if ( c >= 0xe0 )
3580 else if ( c >= 0xc0 )
3587 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3588 decode_ucs4 (c, dst);
3592 ch = ( ch << 6 ) | ( c & 0x3f );
3593 decode_ucs4 (ch, dst);
3598 ch = ( ch << 6 ) | ( c & 0x3f );
3601 label_continue_loop:;
3604 if (flags & CODING_STATE_END)
3605 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3612 encode_utf8 (Lisp_Object charset,
3613 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3615 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3618 Dynarr_add (dst, code);
3620 else if ( code <= 0x7ff )
3622 Dynarr_add (dst, (code >> 6) | 0xc0);
3623 Dynarr_add (dst, (code & 0x3f) | 0x80);
3625 else if ( code <= 0xffff )
3627 Dynarr_add (dst, (code >> 12) | 0xe0);
3628 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3629 Dynarr_add (dst, (code & 0x3f) | 0x80);
3631 else if ( code <= 0x1fffff )
3633 Dynarr_add (dst, (code >> 18) | 0xf0);
3634 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3635 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3636 Dynarr_add (dst, (code & 0x3f) | 0x80);
3638 else if ( code <= 0x3ffffff )
3640 Dynarr_add (dst, (code >> 24) | 0xf8);
3641 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3642 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3643 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3644 Dynarr_add (dst, (code & 0x3f) | 0x80);
3648 Dynarr_add (dst, (code >> 30) | 0xfc);
3649 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3650 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3651 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3652 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3653 Dynarr_add (dst, (code & 0x3f) | 0x80);
3658 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3659 unsigned_char_dynarr *dst, unsigned int n)
3661 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3662 unsigned int flags = str->flags;
3663 unsigned int ch = str->ch;
3664 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3665 unsigned char char_boundary = str->iso2022.current_char_boundary;
3666 Lisp_Object charset = str->iso2022.current_charset;
3668 #ifdef ENABLE_COMPOSITE_CHARS
3669 /* flags for handling composite chars. We do a little switcharoo
3670 on the source while we're outputting the composite char. */
3671 unsigned int saved_n = 0;
3672 CONST unsigned char *saved_src = NULL;
3673 int in_composite = 0;
3676 #endif /* ENABLE_COMPOSITE_CHARS */
3680 unsigned char c = *src++;
3682 if (BYTE_ASCII_P (c))
3683 { /* Processing ASCII character */
3687 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3688 Dynarr_add (dst, '\r');
3689 if (eol_type != EOL_CR)
3690 Dynarr_add (dst, c);
3693 encode_utf8 (Vcharset_ascii, c, 0, dst);
3696 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3697 { /* Processing Leading Byte */
3699 charset = CHARSET_BY_LEADING_BYTE (c);
3700 if (LEADING_BYTE_PREFIX_P(c))
3705 { /* Processing Non-ASCII character */
3707 if (EQ (charset, Vcharset_control_1))
3709 encode_utf8 (Vcharset_control_1, c, 0, dst);
3713 switch (XCHARSET_REP_BYTES (charset))
3716 encode_utf8 (charset, c, 0, dst);
3719 if (XCHARSET_PRIVATE_P (charset))
3721 encode_utf8 (charset, c, 0, dst);
3726 #ifdef ENABLE_COMPOSITE_CHARS
3727 if (EQ (charset, Vcharset_composite))
3731 /* #### Bother! We don't know how to
3733 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3737 Emchar emch = MAKE_CHAR (Vcharset_composite,
3738 ch & 0x7F, c & 0x7F);
3739 Lisp_Object lstr = composite_char_string (emch);
3743 src = XSTRING_DATA (lstr);
3744 n = XSTRING_LENGTH (lstr);
3748 #endif /* ENABLE_COMPOSITE_CHARS */
3750 encode_utf8 (charset, ch, c, dst);
3763 encode_utf8 (charset, ch, c, dst);
3779 #ifdef ENABLE_COMPOSITE_CHARS
3785 goto back_to_square_n; /* Wheeeeeeeee ..... */
3791 str->iso2022.current_char_boundary = char_boundary;
3792 str->iso2022.current_charset = charset;
3794 /* Verbum caro factum est! */
3798 /************************************************************************/
3799 /* ISO2022 methods */
3800 /************************************************************************/
3802 /* The following note describes the coding system ISO2022 briefly.
3803 Since the intention of this note is to help understand the
3804 functions in this file, some parts are NOT ACCURATE or OVERLY
3805 SIMPLIFIED. For thorough understanding, please refer to the
3806 original document of ISO2022.
3808 ISO2022 provides many mechanisms to encode several character sets
3809 in 7-bit and 8-bit environments. For 7-bit environments, all text
3810 is encoded using bytes less than 128. This may make the encoded
3811 text a little bit longer, but the text passes more easily through
3812 several gateways, some of which strip off MSB (Most Signigant Bit).
3814 There are two kinds of character sets: control character set and
3815 graphic character set. The former contains control characters such
3816 as `newline' and `escape' to provide control functions (control
3817 functions are also provided by escape sequences). The latter
3818 contains graphic characters such as 'A' and '-'. Emacs recognizes
3819 two control character sets and many graphic character sets.
3821 Graphic character sets are classified into one of the following
3822 four classes, according to the number of bytes (DIMENSION) and
3823 number of characters in one dimension (CHARS) of the set:
3824 - DIMENSION1_CHARS94
3825 - DIMENSION1_CHARS96
3826 - DIMENSION2_CHARS94
3827 - DIMENSION2_CHARS96
3829 In addition, each character set is assigned an identification tag,
3830 unique for each set, called "final character" (denoted as <F>
3831 hereafter). The <F> of each character set is decided by ECMA(*)
3832 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3833 (0x30..0x3F are for private use only).
3835 Note (*): ECMA = European Computer Manufacturers Association
3837 Here are examples of graphic character set [NAME(<F>)]:
3838 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3839 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3840 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3841 o DIMENSION2_CHARS96 -- none for the moment
3843 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3844 C0 [0x00..0x1F] -- control character plane 0
3845 GL [0x20..0x7F] -- graphic character plane 0
3846 C1 [0x80..0x9F] -- control character plane 1
3847 GR [0xA0..0xFF] -- graphic character plane 1
3849 A control character set is directly designated and invoked to C0 or
3850 C1 by an escape sequence. The most common case is that:
3851 - ISO646's control character set is designated/invoked to C0, and
3852 - ISO6429's control character set is designated/invoked to C1,
3853 and usually these designations/invocations are omitted in encoded
3854 text. In a 7-bit environment, only C0 can be used, and a control
3855 character for C1 is encoded by an appropriate escape sequence to
3856 fit into the environment. All control characters for C1 are
3857 defined to have corresponding escape sequences.
3859 A graphic character set is at first designated to one of four
3860 graphic registers (G0 through G3), then these graphic registers are
3861 invoked to GL or GR. These designations and invocations can be
3862 done independently. The most common case is that G0 is invoked to
3863 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3864 these invocations and designations are omitted in encoded text.
3865 In a 7-bit environment, only GL can be used.
3867 When a graphic character set of CHARS94 is invoked to GL, codes
3868 0x20 and 0x7F of the GL area work as control characters SPACE and
3869 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3872 There are two ways of invocation: locking-shift and single-shift.
3873 With locking-shift, the invocation lasts until the next different
3874 invocation, whereas with single-shift, the invocation affects the
3875 following character only and doesn't affect the locking-shift
3876 state. Invocations are done by the following control characters or
3879 ----------------------------------------------------------------------
3880 abbrev function cntrl escape seq description
3881 ----------------------------------------------------------------------
3882 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3883 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3884 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3885 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3886 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3887 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3888 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3889 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3890 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3891 ----------------------------------------------------------------------
3892 (*) These are not used by any known coding system.
3894 Control characters for these functions are defined by macros
3895 ISO_CODE_XXX in `coding.h'.
3897 Designations are done by the following escape sequences:
3898 ----------------------------------------------------------------------
3899 escape sequence description
3900 ----------------------------------------------------------------------
3901 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3902 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3903 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3904 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3905 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3906 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3907 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3908 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3909 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3910 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3911 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3912 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3913 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3914 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3915 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3916 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3917 ----------------------------------------------------------------------
3919 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3920 of dimension 1, chars 94, and final character <F>, etc...
3922 Note (*): Although these designations are not allowed in ISO2022,
3923 Emacs accepts them on decoding, and produces them on encoding
3924 CHARS96 character sets in a coding system which is characterized as
3925 7-bit environment, non-locking-shift, and non-single-shift.
3927 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3928 '(' can be omitted. We refer to this as "short-form" hereafter.
3930 Now you may notice that there are a lot of ways for encoding the
3931 same multilingual text in ISO2022. Actually, there exist many
3932 coding systems such as Compound Text (used in X11's inter client
3933 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3934 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3935 localized platforms), and all of these are variants of ISO2022.
3937 In addition to the above, Emacs handles two more kinds of escape
3938 sequences: ISO6429's direction specification and Emacs' private
3939 sequence for specifying character composition.
3941 ISO6429's direction specification takes the following form:
3942 o CSI ']' -- end of the current direction
3943 o CSI '0' ']' -- end of the current direction
3944 o CSI '1' ']' -- start of left-to-right text
3945 o CSI '2' ']' -- start of right-to-left text
3946 The control character CSI (0x9B: control sequence introducer) is
3947 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3949 Character composition specification takes the following form:
3950 o ESC '0' -- start character composition
3951 o ESC '1' -- end character composition
3952 Since these are not standard escape sequences of any ISO standard,
3953 their use with these meanings is restricted to Emacs only. */
3956 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3960 for (i = 0; i < 4; i++)
3962 if (!NILP (coding_system))
3964 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3966 iso->charset[i] = Qt;
3967 iso->invalid_designated[i] = 0;
3969 iso->esc = ISO_ESC_NOTHING;
3970 iso->esc_bytes_index = 0;
3971 iso->register_left = 0;
3972 iso->register_right = 1;
3973 iso->switched_dir_and_no_valid_charset_yet = 0;
3974 iso->invalid_switch_dir = 0;
3975 iso->output_direction_sequence = 0;
3976 iso->output_literally = 0;
3977 #ifdef ENABLE_COMPOSITE_CHARS
3978 if (iso->composite_chars)
3979 Dynarr_reset (iso->composite_chars);
3984 fit_to_be_escape_quoted (unsigned char c)
4001 /* Parse one byte of an ISO2022 escape sequence.
4002 If the result is an invalid escape sequence, return 0 and
4003 do not change anything in STR. Otherwise, if the result is
4004 an incomplete escape sequence, update ISO2022.ESC and
4005 ISO2022.ESC_BYTES and return -1. Otherwise, update
4006 all the state variables (but not ISO2022.ESC_BYTES) and
4009 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4010 or invocation of an invalid character set and treat that as
4011 an unrecognized escape sequence. */
4014 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4015 unsigned char c, unsigned int *flags,
4016 int check_invalid_charsets)
4018 /* (1) If we're at the end of a designation sequence, CS is the
4019 charset being designated and REG is the register to designate
4022 (2) If we're at the end of a locking-shift sequence, REG is
4023 the register to invoke and HALF (0 == left, 1 == right) is
4024 the half to invoke it into.
4026 (3) If we're at the end of a single-shift sequence, REG is
4027 the register to invoke. */
4028 Lisp_Object cs = Qnil;
4031 /* NOTE: This code does goto's all over the fucking place.
4032 The reason for this is that we're basically implementing
4033 a state machine here, and hierarchical languages like C
4034 don't really provide a clean way of doing this. */
4036 if (! (*flags & CODING_STATE_ESCAPE))
4037 /* At beginning of escape sequence; we need to reset our
4038 escape-state variables. */
4039 iso->esc = ISO_ESC_NOTHING;
4041 iso->output_literally = 0;
4042 iso->output_direction_sequence = 0;
4046 case ISO_ESC_NOTHING:
4047 iso->esc_bytes_index = 0;
4050 case ISO_CODE_ESC: /* Start escape sequence */
4051 *flags |= CODING_STATE_ESCAPE;
4055 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4056 *flags |= CODING_STATE_ESCAPE;
4057 iso->esc = ISO_ESC_5_11;
4060 case ISO_CODE_SO: /* locking shift 1 */
4063 case ISO_CODE_SI: /* locking shift 0 */
4067 case ISO_CODE_SS2: /* single shift */
4070 case ISO_CODE_SS3: /* single shift */
4074 default: /* Other control characters */
4081 /**** single shift ****/
4083 case 'N': /* single shift 2 */
4086 case 'O': /* single shift 3 */
4090 /**** locking shift ****/
4092 case '~': /* locking shift 1 right */
4095 case 'n': /* locking shift 2 */
4098 case '}': /* locking shift 2 right */
4101 case 'o': /* locking shift 3 */
4104 case '|': /* locking shift 3 right */
4108 #ifdef ENABLE_COMPOSITE_CHARS
4109 /**** composite ****/
4112 iso->esc = ISO_ESC_START_COMPOSITE;
4113 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4114 CODING_STATE_COMPOSITE;
4118 iso->esc = ISO_ESC_END_COMPOSITE;
4119 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4120 ~CODING_STATE_COMPOSITE;
4122 #endif /* ENABLE_COMPOSITE_CHARS */
4124 /**** directionality ****/
4127 iso->esc = ISO_ESC_5_11;
4130 /**** designation ****/
4132 case '$': /* multibyte charset prefix */
4133 iso->esc = ISO_ESC_2_4;
4137 if (0x28 <= c && c <= 0x2F)
4139 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4143 /* This function is called with CODESYS equal to nil when
4144 doing coding-system detection. */
4146 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4147 && fit_to_be_escape_quoted (c))
4149 iso->esc = ISO_ESC_LITERAL;
4150 *flags &= CODING_STATE_ISO2022_LOCK;
4160 /**** directionality ****/
4162 case ISO_ESC_5_11: /* ISO6429 direction control */
4165 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4166 goto directionality;
4168 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4169 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4170 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4174 case ISO_ESC_5_11_0:
4177 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4178 goto directionality;
4182 case ISO_ESC_5_11_1:
4185 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4186 goto directionality;
4190 case ISO_ESC_5_11_2:
4193 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4194 goto directionality;
4199 iso->esc = ISO_ESC_DIRECTIONALITY;
4200 /* Various junk here to attempt to preserve the direction sequences
4201 literally in the text if they would otherwise be swallowed due
4202 to invalid designations that don't show up as actual charset
4203 changes in the text. */
4204 if (iso->invalid_switch_dir)
4206 /* We already inserted a direction switch literally into the
4207 text. We assume (#### this may not be right) that the
4208 next direction switch is the one going the other way,
4209 and we need to output that literally as well. */
4210 iso->output_literally = 1;
4211 iso->invalid_switch_dir = 0;
4217 /* If we are in the thrall of an invalid designation,
4218 then stick the directionality sequence literally into the
4219 output stream so it ends up in the original text again. */
4220 for (jj = 0; jj < 4; jj++)
4221 if (iso->invalid_designated[jj])
4225 iso->output_literally = 1;
4226 iso->invalid_switch_dir = 1;
4229 /* Indicate that we haven't yet seen a valid designation,
4230 so that if a switch-dir is directly followed by an
4231 invalid designation, both get inserted literally. */
4232 iso->switched_dir_and_no_valid_charset_yet = 1;
4237 /**** designation ****/
4240 if (0x28 <= c && c <= 0x2F)
4242 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4245 if (0x40 <= c && c <= 0x42)
4247 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4248 *flags & CODING_STATE_R2L ?
4249 CHARSET_RIGHT_TO_LEFT :
4250 CHARSET_LEFT_TO_RIGHT);
4260 if (c < '0' || c > '~')
4261 return 0; /* bad final byte */
4263 if (iso->esc >= ISO_ESC_2_8 &&
4264 iso->esc <= ISO_ESC_2_15)
4266 type = ((iso->esc >= ISO_ESC_2_12) ?
4267 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4268 reg = (iso->esc - ISO_ESC_2_8) & 3;
4270 else if (iso->esc >= ISO_ESC_2_4_8 &&
4271 iso->esc <= ISO_ESC_2_4_15)
4273 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4274 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4275 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4279 /* Can this ever be reached? -slb */
4283 cs = CHARSET_BY_ATTRIBUTES (type, c,
4284 *flags & CODING_STATE_R2L ?
4285 CHARSET_RIGHT_TO_LEFT :
4286 CHARSET_LEFT_TO_RIGHT);
4292 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4296 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4297 /* can't invoke something that ain't there. */
4299 iso->esc = ISO_ESC_SINGLE_SHIFT;
4300 *flags &= CODING_STATE_ISO2022_LOCK;
4302 *flags |= CODING_STATE_SS2;
4304 *flags |= CODING_STATE_SS3;
4308 if (check_invalid_charsets &&
4309 !CHARSETP (iso->charset[reg]))
4310 /* can't invoke something that ain't there. */
4313 iso->register_right = reg;
4315 iso->register_left = reg;
4316 *flags &= CODING_STATE_ISO2022_LOCK;
4317 iso->esc = ISO_ESC_LOCKING_SHIFT;
4321 if (NILP (cs) && check_invalid_charsets)
4323 iso->invalid_designated[reg] = 1;
4324 iso->charset[reg] = Vcharset_ascii;
4325 iso->esc = ISO_ESC_DESIGNATE;
4326 *flags &= CODING_STATE_ISO2022_LOCK;
4327 iso->output_literally = 1;
4328 if (iso->switched_dir_and_no_valid_charset_yet)
4330 /* We encountered a switch-direction followed by an
4331 invalid designation. Ensure that the switch-direction
4332 gets outputted; otherwise it will probably get eaten
4333 when the text is written out again. */
4334 iso->switched_dir_and_no_valid_charset_yet = 0;
4335 iso->output_direction_sequence = 1;
4336 /* And make sure that the switch-dir going the other
4337 way gets outputted, as well. */
4338 iso->invalid_switch_dir = 1;
4342 /* This function is called with CODESYS equal to nil when
4343 doing coding-system detection. */
4344 if (!NILP (codesys))
4346 charset_conversion_spec_dynarr *dyn =
4347 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4353 for (i = 0; i < Dynarr_length (dyn); i++)
4355 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4356 if (EQ (cs, spec->from_charset))
4357 cs = spec->to_charset;
4362 iso->charset[reg] = cs;
4363 iso->esc = ISO_ESC_DESIGNATE;
4364 *flags &= CODING_STATE_ISO2022_LOCK;
4365 if (iso->invalid_designated[reg])
4367 iso->invalid_designated[reg] = 0;
4368 iso->output_literally = 1;
4370 if (iso->switched_dir_and_no_valid_charset_yet)
4371 iso->switched_dir_and_no_valid_charset_yet = 0;
4376 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4381 /* #### There are serious deficiencies in the recognition mechanism
4382 here. This needs to be much smarter if it's going to cut it.
4383 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4384 it should be detected as Latin-1.
4385 All the ISO2022 stuff in this file should be synced up with the
4386 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4387 Perhaps we should wait till R2L works in FSF Emacs? */
4389 if (!st->iso2022.initted)
4391 reset_iso2022 (Qnil, &st->iso2022.iso);
4392 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4393 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4394 CODING_CATEGORY_ISO_8_1_MASK |
4395 CODING_CATEGORY_ISO_8_2_MASK |
4396 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4397 st->iso2022.flags = 0;
4398 st->iso2022.high_byte_count = 0;
4399 st->iso2022.saw_single_shift = 0;
4400 st->iso2022.initted = 1;
4403 mask = st->iso2022.mask;
4410 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4411 st->iso2022.high_byte_count++;
4415 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4417 if (st->iso2022.high_byte_count & 1)
4418 /* odd number of high bytes; assume not iso-8-2 */
4419 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4421 st->iso2022.high_byte_count = 0;
4422 st->iso2022.saw_single_shift = 0;
4424 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4426 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4427 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4428 { /* control chars */
4431 /* Allow and ignore control characters that you might
4432 reasonably see in a text file */
4437 case 8: /* backspace */
4438 case 11: /* vertical tab */
4439 case 12: /* form feed */
4440 case 26: /* MS-DOS C-z junk */
4441 case 31: /* '^_' -- for info */
4442 goto label_continue_loop;
4449 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4452 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4453 &st->iso2022.flags, 0))
4455 switch (st->iso2022.iso.esc)
4457 case ISO_ESC_DESIGNATE:
4458 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4459 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4461 case ISO_ESC_LOCKING_SHIFT:
4462 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4463 goto ran_out_of_chars;
4464 case ISO_ESC_SINGLE_SHIFT:
4465 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4466 st->iso2022.saw_single_shift = 1;
4475 goto ran_out_of_chars;
4478 label_continue_loop:;
4487 postprocess_iso2022_mask (int mask)
4489 /* #### kind of cheesy */
4490 /* If seven-bit ISO is allowed, then assume that the encoding is
4491 entirely seven-bit and turn off the eight-bit ones. */
4492 if (mask & CODING_CATEGORY_ISO_7_MASK)
4493 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4494 CODING_CATEGORY_ISO_8_1_MASK |
4495 CODING_CATEGORY_ISO_8_2_MASK);
4499 /* If FLAGS is a null pointer or specifies right-to-left motion,
4500 output a switch-dir-to-left-to-right sequence to DST.
4501 Also update FLAGS if it is not a null pointer.
4502 If INTERNAL_P is set, we are outputting in internal format and
4503 need to handle the CSI differently. */
4506 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4507 unsigned_char_dynarr *dst,
4508 unsigned int *flags,
4511 if (!flags || (*flags & CODING_STATE_R2L))
4513 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4515 Dynarr_add (dst, ISO_CODE_ESC);
4516 Dynarr_add (dst, '[');
4518 else if (internal_p)
4519 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4521 Dynarr_add (dst, ISO_CODE_CSI);
4522 Dynarr_add (dst, '0');
4523 Dynarr_add (dst, ']');
4525 *flags &= ~CODING_STATE_R2L;
4529 /* If FLAGS is a null pointer or specifies a direction different from
4530 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4531 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4532 sequence to DST. Also update FLAGS if it is not a null pointer.
4533 If INTERNAL_P is set, we are outputting in internal format and
4534 need to handle the CSI differently. */
4537 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4538 unsigned_char_dynarr *dst, unsigned int *flags,
4541 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4542 direction == CHARSET_LEFT_TO_RIGHT)
4543 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4544 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4545 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4546 direction == CHARSET_RIGHT_TO_LEFT)
4548 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4550 Dynarr_add (dst, ISO_CODE_ESC);
4551 Dynarr_add (dst, '[');
4553 else if (internal_p)
4554 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4556 Dynarr_add (dst, ISO_CODE_CSI);
4557 Dynarr_add (dst, '2');
4558 Dynarr_add (dst, ']');
4560 *flags |= CODING_STATE_R2L;
4564 /* Convert ISO2022-format data to internal format. */
4567 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4568 unsigned_char_dynarr *dst, unsigned int n)
4570 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4571 unsigned int flags = str->flags;
4572 unsigned int ch = str->ch;
4573 eol_type_t eol_type = str->eol_type;
4574 #ifdef ENABLE_COMPOSITE_CHARS
4575 unsigned_char_dynarr *real_dst = dst;
4577 Lisp_Object coding_system;
4579 XSETCODING_SYSTEM (coding_system, str->codesys);
4581 #ifdef ENABLE_COMPOSITE_CHARS
4582 if (flags & CODING_STATE_COMPOSITE)
4583 dst = str->iso2022.composite_chars;
4584 #endif /* ENABLE_COMPOSITE_CHARS */
4588 unsigned char c = *src++;
4589 if (flags & CODING_STATE_ESCAPE)
4590 { /* Within ESC sequence */
4591 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4596 switch (str->iso2022.esc)
4598 #ifdef ENABLE_COMPOSITE_CHARS
4599 case ISO_ESC_START_COMPOSITE:
4600 if (str->iso2022.composite_chars)
4601 Dynarr_reset (str->iso2022.composite_chars);
4603 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4604 dst = str->iso2022.composite_chars;
4606 case ISO_ESC_END_COMPOSITE:
4608 Bufbyte comstr[MAX_EMCHAR_LEN];
4610 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4611 Dynarr_length (dst));
4613 len = set_charptr_emchar (comstr, emch);
4614 Dynarr_add_many (dst, comstr, len);
4617 #endif /* ENABLE_COMPOSITE_CHARS */
4619 case ISO_ESC_LITERAL:
4620 DECODE_ADD_BINARY_CHAR (c, dst);
4624 /* Everything else handled already */
4629 /* Attempted error recovery. */
4630 if (str->iso2022.output_direction_sequence)
4631 ensure_correct_direction (flags & CODING_STATE_R2L ?
4632 CHARSET_RIGHT_TO_LEFT :
4633 CHARSET_LEFT_TO_RIGHT,
4634 str->codesys, dst, 0, 1);
4635 /* More error recovery. */
4636 if (!retval || str->iso2022.output_literally)
4638 /* Output the (possibly invalid) sequence */
4640 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4641 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4642 flags &= CODING_STATE_ISO2022_LOCK;
4644 n++, src--;/* Repeat the loop with the same character. */
4647 /* No sense in reprocessing the final byte of the
4648 escape sequence; it could mess things up anyway.
4650 DECODE_ADD_BINARY_CHAR (c, dst);
4655 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4656 { /* Control characters */
4658 /***** Error-handling *****/
4660 /* If we were in the middle of a character, dump out the
4661 partial character. */
4662 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4664 /* If we just saw a single-shift character, dump it out.
4665 This may dump out the wrong sort of single-shift character,
4666 but least it will give an indication that something went
4668 if (flags & CODING_STATE_SS2)
4670 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4671 flags &= ~CODING_STATE_SS2;
4673 if (flags & CODING_STATE_SS3)
4675 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4676 flags &= ~CODING_STATE_SS3;
4679 /***** Now handle the control characters. *****/
4682 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4684 flags &= CODING_STATE_ISO2022_LOCK;
4686 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4687 DECODE_ADD_BINARY_CHAR (c, dst);
4690 { /* Graphic characters */
4691 Lisp_Object charset;
4695 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4697 /* Now determine the charset. */
4698 reg = ((flags & CODING_STATE_SS2) ? 2
4699 : (flags & CODING_STATE_SS3) ? 3
4700 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4701 : str->iso2022.register_left);
4702 charset = str->iso2022.charset[reg];
4704 /* Error checking: */
4705 if (! CHARSETP (charset)
4706 || str->iso2022.invalid_designated[reg]
4707 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4708 && XCHARSET_CHARS (charset) == 94))
4709 /* Mrmph. We are trying to invoke a register that has no
4710 or an invalid charset in it, or trying to add a character
4711 outside the range of the charset. Insert that char literally
4712 to preserve it for the output. */
4714 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4715 DECODE_ADD_BINARY_CHAR (c, dst);
4720 /* Things are probably hunky-dorey. */
4722 /* Fetch reverse charset, maybe. */
4723 if (((flags & CODING_STATE_R2L) &&
4724 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4726 (!(flags & CODING_STATE_R2L) &&
4727 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4729 Lisp_Object new_charset =
4730 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4731 if (!NILP (new_charset))
4732 charset = new_charset;
4735 lb = XCHARSET_LEADING_BYTE (charset);
4736 switch (XCHARSET_REP_BYTES (charset))
4739 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4740 Dynarr_add (dst, c & 0x7F);
4743 case 2: /* one-byte official */
4744 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4745 Dynarr_add (dst, lb);
4746 Dynarr_add (dst, c | 0x80);
4749 case 3: /* one-byte private or two-byte official */
4750 if (XCHARSET_PRIVATE_P (charset))
4752 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4753 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4754 Dynarr_add (dst, lb);
4755 Dynarr_add (dst, c | 0x80);
4761 Dynarr_add (dst, lb);
4762 Dynarr_add (dst, ch | 0x80);
4763 Dynarr_add (dst, c | 0x80);
4771 default: /* two-byte private */
4774 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4775 Dynarr_add (dst, lb);
4776 Dynarr_add (dst, ch | 0x80);
4777 Dynarr_add (dst, c | 0x80);
4786 flags &= CODING_STATE_ISO2022_LOCK;
4789 label_continue_loop:;
4792 if (flags & CODING_STATE_END)
4793 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4800 /***** ISO2022 encoder *****/
4802 /* Designate CHARSET into register REG. */
4805 iso2022_designate (Lisp_Object charset, unsigned char reg,
4806 struct encoding_stream *str, unsigned_char_dynarr *dst)
4808 static CONST char inter94[] = "()*+";
4809 static CONST char inter96[] = ",-./";
4811 unsigned char final;
4812 Lisp_Object old_charset = str->iso2022.charset[reg];
4814 str->iso2022.charset[reg] = charset;
4815 if (!CHARSETP (charset))
4816 /* charset might be an initial nil or t. */
4818 type = XCHARSET_TYPE (charset);
4819 final = XCHARSET_FINAL (charset);
4820 if (!str->iso2022.force_charset_on_output[reg] &&
4821 CHARSETP (old_charset) &&
4822 XCHARSET_TYPE (old_charset) == type &&
4823 XCHARSET_FINAL (old_charset) == final)
4826 str->iso2022.force_charset_on_output[reg] = 0;
4829 charset_conversion_spec_dynarr *dyn =
4830 str->codesys->iso2022.output_conv;
4836 for (i = 0; i < Dynarr_length (dyn); i++)
4838 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4839 if (EQ (charset, spec->from_charset))
4840 charset = spec->to_charset;
4845 Dynarr_add (dst, ISO_CODE_ESC);
4848 case CHARSET_TYPE_94:
4849 Dynarr_add (dst, inter94[reg]);
4851 case CHARSET_TYPE_96:
4852 Dynarr_add (dst, inter96[reg]);
4854 case CHARSET_TYPE_94X94:
4855 Dynarr_add (dst, '$');
4857 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4860 Dynarr_add (dst, inter94[reg]);
4862 case CHARSET_TYPE_96X96:
4863 Dynarr_add (dst, '$');
4864 Dynarr_add (dst, inter96[reg]);
4867 Dynarr_add (dst, final);
4871 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4873 if (str->iso2022.register_left != 0)
4875 Dynarr_add (dst, ISO_CODE_SI);
4876 str->iso2022.register_left = 0;
4881 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4883 if (str->iso2022.register_left != 1)
4885 Dynarr_add (dst, ISO_CODE_SO);
4886 str->iso2022.register_left = 1;
4890 /* Convert internally-formatted data to ISO2022 format. */
4893 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4894 unsigned_char_dynarr *dst, unsigned int n)
4896 unsigned char charmask, c;
4897 unsigned char char_boundary;
4898 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4899 unsigned int flags = str->flags;
4900 unsigned int ch = str->ch;
4901 Lisp_Coding_System *codesys = str->codesys;
4902 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4904 Lisp_Object charset;
4907 #ifdef ENABLE_COMPOSITE_CHARS
4908 /* flags for handling composite chars. We do a little switcharoo
4909 on the source while we're outputting the composite char. */
4910 unsigned int saved_n = 0;
4911 CONST unsigned char *saved_src = NULL;
4912 int in_composite = 0;
4913 #endif /* ENABLE_COMPOSITE_CHARS */
4915 char_boundary = str->iso2022.current_char_boundary;
4916 charset = str->iso2022.current_charset;
4917 half = str->iso2022.current_half;
4919 #ifdef ENABLE_COMPOSITE_CHARS
4926 if (BYTE_ASCII_P (c))
4927 { /* Processing ASCII character */
4930 restore_left_to_right_direction (codesys, dst, &flags, 0);
4932 /* Make sure G0 contains ASCII */
4933 if ((c > ' ' && c < ISO_CODE_DEL) ||
4934 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4936 ensure_normal_shift (str, dst);
4937 iso2022_designate (Vcharset_ascii, 0, str, dst);
4940 /* If necessary, restore everything to the default state
4943 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4945 restore_left_to_right_direction (codesys, dst, &flags, 0);
4947 ensure_normal_shift (str, dst);
4949 for (i = 0; i < 4; i++)
4951 Lisp_Object initial_charset =
4952 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4953 iso2022_designate (initial_charset, i, str, dst);
4958 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4959 Dynarr_add (dst, '\r');
4960 if (eol_type != EOL_CR)
4961 Dynarr_add (dst, c);
4965 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4966 && fit_to_be_escape_quoted (c))
4967 Dynarr_add (dst, ISO_CODE_ESC);
4968 Dynarr_add (dst, c);
4973 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4974 { /* Processing Leading Byte */
4976 charset = CHARSET_BY_LEADING_BYTE (c);
4977 if (LEADING_BYTE_PREFIX_P(c))
4979 else if (!EQ (charset, Vcharset_control_1)
4980 #ifdef ENABLE_COMPOSITE_CHARS
4981 && !EQ (charset, Vcharset_composite)
4987 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4988 codesys, dst, &flags, 0);
4990 /* Now determine which register to use. */
4992 for (i = 0; i < 4; i++)
4994 if (EQ (charset, str->iso2022.charset[i]) ||
4996 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5005 if (XCHARSET_GRAPHIC (charset) != 0)
5007 if (!NILP (str->iso2022.charset[1]) &&
5008 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5009 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5011 else if (!NILP (str->iso2022.charset[2]))
5013 else if (!NILP (str->iso2022.charset[3]))
5022 iso2022_designate (charset, reg, str, dst);
5024 /* Now invoke that register. */
5028 ensure_normal_shift (str, dst);
5033 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5035 ensure_shift_out (str, dst);
5043 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5045 Dynarr_add (dst, ISO_CODE_ESC);
5046 Dynarr_add (dst, 'N');
5051 Dynarr_add (dst, ISO_CODE_SS2);
5057 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5059 Dynarr_add (dst, ISO_CODE_ESC);
5060 Dynarr_add (dst, 'O');
5065 Dynarr_add (dst, ISO_CODE_SS3);
5077 { /* Processing Non-ASCII character */
5078 charmask = (half == 0 ? 0x7F : 0xFF);
5080 if (EQ (charset, Vcharset_control_1))
5082 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5083 && fit_to_be_escape_quoted (c))
5084 Dynarr_add (dst, ISO_CODE_ESC);
5085 /* you asked for it ... */
5086 Dynarr_add (dst, c - 0x20);
5090 switch (XCHARSET_REP_BYTES (charset))
5093 Dynarr_add (dst, c & charmask);
5096 if (XCHARSET_PRIVATE_P (charset))
5098 Dynarr_add (dst, c & charmask);
5103 #ifdef ENABLE_COMPOSITE_CHARS
5104 if (EQ (charset, Vcharset_composite))
5108 /* #### Bother! We don't know how to
5110 Dynarr_add (dst, '~');
5114 Emchar emch = MAKE_CHAR (Vcharset_composite,
5115 ch & 0x7F, c & 0x7F);
5116 Lisp_Object lstr = composite_char_string (emch);
5120 src = XSTRING_DATA (lstr);
5121 n = XSTRING_LENGTH (lstr);
5122 Dynarr_add (dst, ISO_CODE_ESC);
5123 Dynarr_add (dst, '0'); /* start composing */
5127 #endif /* ENABLE_COMPOSITE_CHARS */
5129 Dynarr_add (dst, ch & charmask);
5130 Dynarr_add (dst, c & charmask);
5143 Dynarr_add (dst, ch & charmask);
5144 Dynarr_add (dst, c & charmask);
5160 #ifdef ENABLE_COMPOSITE_CHARS
5166 Dynarr_add (dst, ISO_CODE_ESC);
5167 Dynarr_add (dst, '1'); /* end composing */
5168 goto back_to_square_n; /* Wheeeeeeeee ..... */
5170 #endif /* ENABLE_COMPOSITE_CHARS */
5172 if (char_boundary && flags & CODING_STATE_END)
5174 restore_left_to_right_direction (codesys, dst, &flags, 0);
5175 ensure_normal_shift (str, dst);
5176 for (i = 0; i < 4; i++)
5178 Lisp_Object initial_charset =
5179 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5180 iso2022_designate (initial_charset, i, str, dst);
5186 str->iso2022.current_char_boundary = char_boundary;
5187 str->iso2022.current_charset = charset;
5188 str->iso2022.current_half = half;
5190 /* Verbum caro factum est! */
5194 /************************************************************************/
5195 /* No-conversion methods */
5196 /************************************************************************/
5198 /* This is used when reading in "binary" files -- i.e. files that may
5199 contain all 256 possible byte values and that are not to be
5200 interpreted as being in any particular decoding. */
5202 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5203 unsigned_char_dynarr *dst, unsigned int n)
5206 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5207 unsigned int flags = str->flags;
5208 unsigned int ch = str->ch;
5209 eol_type_t eol_type = str->eol_type;
5215 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5216 DECODE_ADD_BINARY_CHAR (c, dst);
5217 label_continue_loop:;
5220 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5227 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5228 unsigned_char_dynarr *dst, unsigned int n)
5231 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5232 unsigned int flags = str->flags;
5233 unsigned int ch = str->ch;
5234 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5241 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5242 Dynarr_add (dst, '\r');
5243 if (eol_type != EOL_CR)
5244 Dynarr_add (dst, '\n');
5247 else if (BYTE_ASCII_P (c))
5250 Dynarr_add (dst, c);
5252 else if (BUFBYTE_LEADING_BYTE_P (c))
5255 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5256 c == LEADING_BYTE_CONTROL_1)
5259 Dynarr_add (dst, '~'); /* untranslatable character */
5263 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5264 Dynarr_add (dst, c);
5265 else if (ch == LEADING_BYTE_CONTROL_1)
5268 Dynarr_add (dst, c - 0x20);
5270 /* else it should be the second or third byte of an
5271 untranslatable character, so ignore it */
5281 /************************************************************************/
5282 /* Simple internal/external functions */
5283 /************************************************************************/
5285 static Extbyte_dynarr *conversion_out_dynarr;
5286 static Bufbyte_dynarr *conversion_in_dynarr;
5288 /* Determine coding system from coding format */
5290 /* #### not correct for all values of `fmt'! */
5292 external_data_format_to_coding_system (enum external_data_format fmt)
5296 case FORMAT_FILENAME:
5297 case FORMAT_TERMINAL:
5298 if (EQ (Vfile_name_coding_system, Qnil) ||
5299 EQ (Vfile_name_coding_system, Qbinary))
5302 return Fget_coding_system (Vfile_name_coding_system);
5305 return Fget_coding_system (Qctext);
5313 convert_to_external_format (CONST Bufbyte *ptr,
5316 enum external_data_format fmt)
5318 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5320 if (!conversion_out_dynarr)
5321 conversion_out_dynarr = Dynarr_new (Extbyte);
5323 Dynarr_reset (conversion_out_dynarr);
5325 if (NILP (coding_system))
5327 CONST Bufbyte *end = ptr + len;
5332 (BYTE_ASCII_P (*ptr)) ? *ptr :
5333 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5334 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5337 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5341 #ifdef ERROR_CHECK_BUFPOS
5342 assert (ptr == end);
5347 Lisp_Object instream, outstream, da_outstream;
5348 Lstream *istr, *ostr;
5349 struct gcpro gcpro1, gcpro2, gcpro3;
5350 char tempbuf[1024]; /* some random amount */
5352 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5353 da_outstream = make_dynarr_output_stream
5354 ((unsigned_char_dynarr *) conversion_out_dynarr);
5356 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5357 istr = XLSTREAM (instream);
5358 ostr = XLSTREAM (outstream);
5359 GCPRO3 (instream, outstream, da_outstream);
5362 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5365 Lstream_write (ostr, tempbuf, size_in_bytes);
5367 Lstream_close (istr);
5368 Lstream_close (ostr);
5370 Lstream_delete (istr);
5371 Lstream_delete (ostr);
5372 Lstream_delete (XLSTREAM (da_outstream));
5375 *len_out = Dynarr_length (conversion_out_dynarr);
5376 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5377 return Dynarr_atp (conversion_out_dynarr, 0);
5381 convert_from_external_format (CONST Extbyte *ptr,
5384 enum external_data_format fmt)
5386 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5388 if (!conversion_in_dynarr)
5389 conversion_in_dynarr = Dynarr_new (Bufbyte);
5391 Dynarr_reset (conversion_in_dynarr);
5393 if (NILP (coding_system))
5395 CONST Extbyte *end = ptr + len;
5396 for (; ptr < end; ptr++)
5399 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5404 Lisp_Object instream, outstream, da_outstream;
5405 Lstream *istr, *ostr;
5406 struct gcpro gcpro1, gcpro2, gcpro3;
5407 char tempbuf[1024]; /* some random amount */
5409 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5410 da_outstream = make_dynarr_output_stream
5411 ((unsigned_char_dynarr *) conversion_in_dynarr);
5413 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5414 istr = XLSTREAM (instream);
5415 ostr = XLSTREAM (outstream);
5416 GCPRO3 (instream, outstream, da_outstream);
5419 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5422 Lstream_write (ostr, tempbuf, size_in_bytes);
5424 Lstream_close (istr);
5425 Lstream_close (ostr);
5427 Lstream_delete (istr);
5428 Lstream_delete (ostr);
5429 Lstream_delete (XLSTREAM (da_outstream));
5432 *len_out = Dynarr_length (conversion_in_dynarr);
5433 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5434 return Dynarr_atp (conversion_in_dynarr, 0);
5438 /************************************************************************/
5439 /* Initialization */
5440 /************************************************************************/
5443 syms_of_file_coding (void)
5445 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5446 deferror (&Qcoding_system_error, "coding-system-error",
5447 "Coding-system error", Qio_error);
5449 DEFSUBR (Fcoding_system_p);
5450 DEFSUBR (Ffind_coding_system);
5451 DEFSUBR (Fget_coding_system);
5452 DEFSUBR (Fcoding_system_list);
5453 DEFSUBR (Fcoding_system_name);
5454 DEFSUBR (Fmake_coding_system);
5455 DEFSUBR (Fcopy_coding_system);
5456 DEFSUBR (Fdefine_coding_system_alias);
5457 DEFSUBR (Fsubsidiary_coding_system);
5459 DEFSUBR (Fcoding_system_type);
5460 DEFSUBR (Fcoding_system_doc_string);
5462 DEFSUBR (Fcoding_system_charset);
5464 DEFSUBR (Fcoding_system_property);
5466 DEFSUBR (Fcoding_category_list);
5467 DEFSUBR (Fset_coding_priority_list);
5468 DEFSUBR (Fcoding_priority_list);
5469 DEFSUBR (Fset_coding_category_system);
5470 DEFSUBR (Fcoding_category_system);
5472 DEFSUBR (Fdetect_coding_region);
5473 DEFSUBR (Fdecode_coding_region);
5474 DEFSUBR (Fencode_coding_region);
5476 DEFSUBR (Fdecode_shift_jis_char);
5477 DEFSUBR (Fencode_shift_jis_char);
5478 DEFSUBR (Fdecode_big5_char);
5479 DEFSUBR (Fencode_big5_char);
5480 DEFSUBR (Fset_ucs_char);
5481 DEFSUBR (Fucs_char);
5482 DEFSUBR (Fset_char_ucs);
5483 DEFSUBR (Fchar_ucs);
5485 defsymbol (&Qcoding_system_p, "coding-system-p");
5486 defsymbol (&Qno_conversion, "no-conversion");
5487 defsymbol (&Qraw_text, "raw-text");
5489 defsymbol (&Qbig5, "big5");
5490 defsymbol (&Qshift_jis, "shift-jis");
5491 defsymbol (&Qucs4, "ucs-4");
5492 defsymbol (&Qutf8, "utf-8");
5493 defsymbol (&Qccl, "ccl");
5494 defsymbol (&Qiso2022, "iso2022");
5496 defsymbol (&Qmnemonic, "mnemonic");
5497 defsymbol (&Qeol_type, "eol-type");
5498 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5499 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5501 defsymbol (&Qcr, "cr");
5502 defsymbol (&Qlf, "lf");
5503 defsymbol (&Qcrlf, "crlf");
5504 defsymbol (&Qeol_cr, "eol-cr");
5505 defsymbol (&Qeol_lf, "eol-lf");
5506 defsymbol (&Qeol_crlf, "eol-crlf");
5508 defsymbol (&Qcharset_g0, "charset-g0");
5509 defsymbol (&Qcharset_g1, "charset-g1");
5510 defsymbol (&Qcharset_g2, "charset-g2");
5511 defsymbol (&Qcharset_g3, "charset-g3");
5512 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5513 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5514 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5515 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5516 defsymbol (&Qno_iso6429, "no-iso6429");
5517 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5518 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5520 defsymbol (&Qshort, "short");
5521 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5522 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5523 defsymbol (&Qseven, "seven");
5524 defsymbol (&Qlock_shift, "lock-shift");
5525 defsymbol (&Qescape_quoted, "escape-quoted");
5527 defsymbol (&Qencode, "encode");
5528 defsymbol (&Qdecode, "decode");
5531 defsymbol (&Qctext, "ctext");
5532 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5534 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5536 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5538 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5540 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5542 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5544 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5546 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5548 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5551 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5556 lstream_type_create_file_coding (void)
5558 LSTREAM_HAS_METHOD (decoding, reader);
5559 LSTREAM_HAS_METHOD (decoding, writer);
5560 LSTREAM_HAS_METHOD (decoding, rewinder);
5561 LSTREAM_HAS_METHOD (decoding, seekable_p);
5562 LSTREAM_HAS_METHOD (decoding, flusher);
5563 LSTREAM_HAS_METHOD (decoding, closer);
5564 LSTREAM_HAS_METHOD (decoding, marker);
5566 LSTREAM_HAS_METHOD (encoding, reader);
5567 LSTREAM_HAS_METHOD (encoding, writer);
5568 LSTREAM_HAS_METHOD (encoding, rewinder);
5569 LSTREAM_HAS_METHOD (encoding, seekable_p);
5570 LSTREAM_HAS_METHOD (encoding, flusher);
5571 LSTREAM_HAS_METHOD (encoding, closer);
5572 LSTREAM_HAS_METHOD (encoding, marker);
5576 vars_of_file_coding (void)
5580 /* Initialize to something reasonable ... */
5581 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5583 coding_category_system[i] = Qnil;
5584 coding_category_by_priority[i] = i;
5587 Fprovide (intern ("file-coding"));
5589 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5590 Coding system used for TTY keyboard input.
5591 Not used under a windowing system.
5593 Vkeyboard_coding_system = Qnil;
5595 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5596 Coding system used for TTY display output.
5597 Not used under a windowing system.
5599 Vterminal_coding_system = Qnil;
5601 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5602 Overriding coding system used when writing a file or process.
5603 You should *bind* this, not set it. If this is non-nil, it specifies
5604 the coding system that will be used when a file or process is read
5605 in, and overrides `buffer-file-coding-system-for-read',
5606 `insert-file-contents-pre-hook', etc. Use those variables instead of
5607 this one for permanent changes to the environment.
5609 Vcoding_system_for_read = Qnil;
5611 DEFVAR_LISP ("coding-system-for-write",
5612 &Vcoding_system_for_write /*
5613 Overriding coding system used when writing a file or process.
5614 You should *bind* this, not set it. If this is non-nil, it specifies
5615 the coding system that will be used when a file or process is wrote
5616 in, and overrides `buffer-file-coding-system',
5617 `write-region-pre-hook', etc. Use those variables instead of this one
5618 for permanent changes to the environment.
5620 Vcoding_system_for_write = Qnil;
5622 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5623 Coding system used to convert pathnames when accessing files.
5625 Vfile_name_coding_system = Qnil;
5627 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5628 Non-nil means the buffer contents are regarded as multi-byte form
5629 of characters, not a binary code. This affects the display, file I/O,
5630 and behaviors of various editing commands.
5632 Setting this to nil does not do anything.
5634 enable_multibyte_characters = 1;
5638 complex_vars_of_file_coding (void)
5640 staticpro (&Vcoding_system_hash_table);
5641 Vcoding_system_hash_table =
5642 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5644 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5646 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5648 struct codesys_prop csp; \
5650 csp.prop_type = (Prop_Type); \
5651 Dynarr_add (the_codesys_prop_dynarr, csp); \
5654 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5655 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5656 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5657 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5658 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5659 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5660 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5662 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5663 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5664 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5665 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5666 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5667 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5668 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5669 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5670 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5671 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5672 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5673 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5674 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5675 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5676 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5677 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5678 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5680 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5681 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5683 /* Need to create this here or we're really screwed. */
5685 (Qraw_text, Qno_conversion,
5686 build_string ("Raw text, which means it converts only line-break-codes."),
5687 list2 (Qmnemonic, build_string ("Raw")));
5690 (Qbinary, Qno_conversion,
5691 build_string ("Binary, which means it does not convert anything."),
5692 list4 (Qeol_type, Qlf,
5693 Qmnemonic, build_string ("Binary")));
5695 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5697 /* Need this for bootstrapping */
5698 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5699 Fget_coding_system (Qraw_text);
5705 for (i = 0; i < 65536; i++)
5706 ucs_to_mule_table[i] = Qnil;
5708 staticpro (&mule_to_ucs_table);
5709 mule_to_ucs_table = Fmake_char_table(Qgeneric);