1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
37 #include "file-coding.h"
39 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
50 /* Coding system currently associated with each coding category. */
51 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
53 /* Table of all coding categories in decreasing order of priority.
54 This describes a permutation of the possible coding categories. */
55 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
57 Lisp_Object Qcoding_system_p;
59 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
60 /* Qinternal in general.c */
62 Lisp_Object Qmnemonic, Qeol_type;
63 Lisp_Object Qcr, Qcrlf, Qlf;
64 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
65 Lisp_Object Qpost_read_conversion;
66 Lisp_Object Qpre_write_conversion;
69 Lisp_Object Qucs4, Qutf8;
70 Lisp_Object Qbig5, Qshift_jis;
71 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
72 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
73 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
74 Lisp_Object Qno_iso6429;
75 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
76 Lisp_Object Qctext, Qescape_quoted;
77 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
79 Lisp_Object Qencode, Qdecode;
81 Lisp_Object Vcoding_system_hash_table;
83 int enable_multibyte_characters;
86 /* Additional information used by the ISO2022 decoder and detector. */
87 struct iso2022_decoder
89 /* CHARSET holds the character sets currently assigned to the G0
90 through G3 variables. It is initialized from the array
91 INITIAL_CHARSET in CODESYS. */
92 Lisp_Object charset[4];
94 /* Which registers are currently invoked into the left (GL) and
95 right (GR) halves of the 8-bit encoding space? */
96 int register_left, register_right;
98 /* ISO_ESC holds a value indicating part of an escape sequence
99 that has already been seen. */
100 enum iso_esc_flag esc;
102 /* This records the bytes we've seen so far in an escape sequence,
103 in case the sequence is invalid (we spit out the bytes unchanged). */
104 unsigned char esc_bytes[8];
106 /* Index for next byte to store in ISO escape sequence. */
109 #ifdef ENABLE_COMPOSITE_CHARS
110 /* Stuff seen so far when composing a string. */
111 unsigned_char_dynarr *composite_chars;
114 /* If we saw an invalid designation sequence for a particular
115 register, we flag it here and switch to ASCII. The next time we
116 see a valid designation for this register, we turn off the flag
117 and do the designation normally, but pretend the sequence was
118 invalid. The effect of all this is that (most of the time) the
119 escape sequences for both the switch to the unknown charset, and
120 the switch back to the known charset, get inserted literally into
121 the buffer and saved out as such. The hope is that we can
122 preserve the escape sequences so that the resulting written out
123 file makes sense. If we don't do any of this, the designation
124 to the invalid charset will be preserved but that switch back
125 to the known charset will probably get eaten because it was
126 the same charset that was already present in the register. */
127 unsigned char invalid_designated[4];
129 /* We try to do similar things as above for direction-switching
130 sequences. If we encountered a direction switch while an
131 invalid designation was present, or an invalid designation
132 just after a direction switch (i.e. no valid designation
133 encountered yet), we insert the direction-switch escape
134 sequence literally into the output stream, and later on
135 insert the corresponding direction-restoring escape sequence
137 unsigned int switched_dir_and_no_valid_charset_yet :1;
138 unsigned int invalid_switch_dir :1;
140 /* Tells the decoder to output the escape sequence literally
141 even though it was valid. Used in the games we play to
142 avoid lossage when we encounter invalid designations. */
143 unsigned int output_literally :1;
144 /* We encountered a direction switch followed by an invalid
145 designation. We didn't output the direction switch
146 literally because we didn't know about the invalid designation;
147 but we have to do so now. */
148 unsigned int output_direction_sequence :1;
151 EXFUN (Fcopy_coding_system, 2);
153 struct detection_state;
156 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
157 unsigned_char_dynarr *dst, unsigned int n);
159 static int detect_coding_sjis (struct detection_state *st,
160 CONST unsigned char *src,
162 static void decode_coding_sjis (Lstream *decoding,
163 CONST unsigned char *src,
164 unsigned_char_dynarr *dst,
166 static void encode_coding_sjis (Lstream *encoding,
167 CONST unsigned char *src,
168 unsigned_char_dynarr *dst,
170 static int detect_coding_big5 (struct detection_state *st,
171 CONST unsigned char *src,
173 static void decode_coding_big5 (Lstream *decoding,
174 CONST unsigned char *src,
175 unsigned_char_dynarr *dst, unsigned int n);
176 static void encode_coding_big5 (Lstream *encoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static int detect_coding_ucs4 (struct detection_state *st,
180 CONST unsigned char *src,
182 static void decode_coding_ucs4 (Lstream *decoding,
183 CONST unsigned char *src,
184 unsigned_char_dynarr *dst, unsigned int n);
185 static void encode_coding_ucs4 (Lstream *encoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static int detect_coding_utf8 (struct detection_state *st,
189 CONST unsigned char *src,
191 static void decode_coding_utf8 (Lstream *decoding,
192 CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
194 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
195 unsigned_char_dynarr *dst, unsigned int *flags);
196 void char_finish_utf8 (struct encoding_stream *str,
197 unsigned_char_dynarr *dst, unsigned int *flags);
199 static int postprocess_iso2022_mask (int mask);
200 static void reset_iso2022 (Lisp_Object coding_system,
201 struct iso2022_decoder *iso);
202 static int detect_coding_iso2022 (struct detection_state *st,
203 CONST unsigned char *src,
205 static void decode_coding_iso2022 (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst, unsigned int n);
208 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
209 unsigned_char_dynarr *dst, unsigned int *flags);
210 void char_finish_iso2022 (struct encoding_stream *str,
211 unsigned_char_dynarr *dst, unsigned int *flags);
213 static void decode_coding_no_conversion (Lstream *decoding,
214 CONST unsigned char *src,
215 unsigned_char_dynarr *dst,
217 static void encode_coding_no_conversion (Lstream *encoding,
218 CONST unsigned char *src,
219 unsigned_char_dynarr *dst,
221 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
222 unsigned_char_dynarr *dst, unsigned int n);
223 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
226 typedef struct codesys_prop codesys_prop;
235 Dynarr_declare (codesys_prop);
236 } codesys_prop_dynarr;
238 codesys_prop_dynarr *the_codesys_prop_dynarr;
240 enum codesys_prop_enum
243 CODESYS_PROP_ISO2022,
248 /************************************************************************/
249 /* Coding system functions */
250 /************************************************************************/
252 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
253 static void print_coding_system (Lisp_Object, Lisp_Object, int);
254 static void finalize_coding_system (void *header, int for_disksave);
257 static const struct lrecord_description ccs_description_1[] = {
258 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
262 static const struct struct_description ccs_description = {
263 sizeof(charset_conversion_spec),
267 static const struct lrecord_description ccsd_description_1[] = {
268 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
272 static const struct struct_description ccsd_description = {
273 sizeof(charset_conversion_spec_dynarr),
278 static const struct lrecord_description coding_system_description[] = {
279 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
280 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
281 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
283 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
284 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
285 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
286 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
291 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
292 mark_coding_system, print_coding_system,
293 finalize_coding_system,
294 0, 0, coding_system_description,
295 struct Lisp_Coding_System);
298 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
300 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
302 markobj (CODING_SYSTEM_NAME (codesys));
303 markobj (CODING_SYSTEM_DOC_STRING (codesys));
304 markobj (CODING_SYSTEM_MNEMONIC (codesys));
305 markobj (CODING_SYSTEM_EOL_LF (codesys));
306 markobj (CODING_SYSTEM_EOL_CRLF (codesys));
307 markobj (CODING_SYSTEM_EOL_CR (codesys));
309 switch (CODING_SYSTEM_TYPE (codesys))
313 case CODESYS_ISO2022:
314 for (i = 0; i < 4; i++)
315 markobj (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
316 if (codesys->iso2022.input_conv)
318 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
320 struct charset_conversion_spec *ccs =
321 Dynarr_atp (codesys->iso2022.input_conv, i);
322 markobj (ccs->from_charset);
323 markobj (ccs->to_charset);
326 if (codesys->iso2022.output_conv)
328 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
330 struct charset_conversion_spec *ccs =
331 Dynarr_atp (codesys->iso2022.output_conv, i);
332 markobj (ccs->from_charset);
333 markobj (ccs->to_charset);
339 markobj (CODING_SYSTEM_CCL_DECODE (codesys));
340 markobj (CODING_SYSTEM_CCL_ENCODE (codesys));
347 markobj (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
348 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
352 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
355 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
357 error ("printing unreadable object #<coding_system 0x%x>",
360 write_c_string ("#<coding_system ", printcharfun);
361 print_internal (c->name, printcharfun, 1);
362 write_c_string (">", printcharfun);
366 finalize_coding_system (void *header, int for_disksave)
368 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
369 /* Since coding systems never go away, this function is not
370 necessary. But it would be necessary if we changed things
371 so that coding systems could go away. */
372 if (!for_disksave) /* see comment in lstream.c */
374 switch (CODING_SYSTEM_TYPE (c))
377 case CODESYS_ISO2022:
378 if (c->iso2022.input_conv)
380 Dynarr_free (c->iso2022.input_conv);
381 c->iso2022.input_conv = 0;
383 if (c->iso2022.output_conv)
385 Dynarr_free (c->iso2022.output_conv);
386 c->iso2022.output_conv = 0;
397 symbol_to_eol_type (Lisp_Object symbol)
399 CHECK_SYMBOL (symbol);
400 if (NILP (symbol)) return EOL_AUTODETECT;
401 if (EQ (symbol, Qlf)) return EOL_LF;
402 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
403 if (EQ (symbol, Qcr)) return EOL_CR;
405 signal_simple_error ("Unrecognized eol type", symbol);
406 return EOL_AUTODETECT; /* not reached */
410 eol_type_to_symbol (enum eol_type type)
415 case EOL_LF: return Qlf;
416 case EOL_CRLF: return Qcrlf;
417 case EOL_CR: return Qcr;
418 case EOL_AUTODETECT: return Qnil;
423 setup_eol_coding_systems (Lisp_Coding_System *codesys)
425 Lisp_Object codesys_obj;
426 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
427 char *codesys_name = (char *) alloca (len + 7);
429 char *codesys_mnemonic=0;
431 Lisp_Object codesys_name_sym, sub_codesys_obj;
435 XSETCODING_SYSTEM (codesys_obj, codesys);
437 memcpy (codesys_name,
438 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
440 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
442 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
443 codesys_mnemonic = (char *) alloca (mlen + 7);
444 memcpy (codesys_mnemonic,
445 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
448 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
449 strcpy (codesys_name + len, "-" op_sys); \
451 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
452 codesys_name_sym = intern (codesys_name); \
453 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
454 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
456 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
457 build_string (codesys_mnemonic); \
458 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
461 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
462 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
463 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
466 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
467 Return t if OBJECT is a coding system.
468 A coding system is an object that defines how text containing multiple
469 character sets is encoded into a stream of (typically 8-bit) bytes.
470 The coding system is used to decode the stream into a series of
471 characters (which may be from multiple charsets) when the text is read
472 from a file or process, and is used to encode the text back into the
473 same format when it is written out to a file or process.
475 For example, many ISO2022-compliant coding systems (such as Compound
476 Text, which is used for inter-client data under the X Window System)
477 use escape sequences to switch between different charsets -- Japanese
478 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
479 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
480 `make-coding-system' for more information.
482 Coding systems are normally identified using a symbol, and the
483 symbol is accepted in place of the actual coding system object whenever
484 a coding system is called for. (This is similar to how faces work.)
488 return CODING_SYSTEMP (object) ? Qt : Qnil;
491 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
492 Retrieve the coding system of the given name.
494 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
495 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
496 If there is no such coding system, nil is returned. Otherwise the
497 associated coding system object is returned.
499 (coding_system_or_name))
501 if (CODING_SYSTEMP (coding_system_or_name))
502 return coding_system_or_name;
504 if (NILP (coding_system_or_name))
505 coding_system_or_name = Qbinary;
507 CHECK_SYMBOL (coding_system_or_name);
509 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
512 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
513 Retrieve the coding system of the given name.
514 Same as `find-coding-system' except that if there is no such
515 coding system, an error is signaled instead of returning nil.
519 Lisp_Object coding_system = Ffind_coding_system (name);
521 if (NILP (coding_system))
522 signal_simple_error ("No such coding system", name);
523 return coding_system;
526 /* We store the coding systems in hash tables with the names as the key and the
527 actual coding system object as the value. Occasionally we need to use them
528 in a list format. These routines provide us with that. */
529 struct coding_system_list_closure
531 Lisp_Object *coding_system_list;
535 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
536 void *coding_system_list_closure)
538 /* This function can GC */
539 struct coding_system_list_closure *cscl =
540 (struct coding_system_list_closure *) coding_system_list_closure;
541 Lisp_Object *coding_system_list = cscl->coding_system_list;
543 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
544 *coding_system_list);
548 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
549 Return a list of the names of all defined coding systems.
553 Lisp_Object coding_system_list = Qnil;
555 struct coding_system_list_closure coding_system_list_closure;
557 GCPRO1 (coding_system_list);
558 coding_system_list_closure.coding_system_list = &coding_system_list;
559 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
560 &coding_system_list_closure);
563 return coding_system_list;
566 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
567 Return the name of the given coding system.
571 coding_system = Fget_coding_system (coding_system);
572 return XCODING_SYSTEM_NAME (coding_system);
575 static Lisp_Coding_System *
576 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
578 Lisp_Coding_System *codesys =
579 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
581 zero_lcrecord (codesys);
582 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
583 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
584 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
585 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
586 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
587 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
588 CODING_SYSTEM_TYPE (codesys) = type;
589 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
591 if (type == CODESYS_ISO2022)
594 for (i = 0; i < 4; i++)
595 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
597 else if (type == CODESYS_CCL)
599 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
600 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
603 CODING_SYSTEM_NAME (codesys) = name;
609 /* Given a list of charset conversion specs as specified in a Lisp
610 program, parse it into STORE_HERE. */
613 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
614 Lisp_Object spec_list)
618 EXTERNAL_LIST_LOOP (rest, spec_list)
620 Lisp_Object car = XCAR (rest);
621 Lisp_Object from, to;
622 struct charset_conversion_spec spec;
624 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
625 signal_simple_error ("Invalid charset conversion spec", car);
626 from = Fget_charset (XCAR (car));
627 to = Fget_charset (XCAR (XCDR (car)));
628 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
629 signal_simple_error_2
630 ("Attempted conversion between different charset types",
632 spec.from_charset = from;
633 spec.to_charset = to;
635 Dynarr_add (store_here, spec);
639 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
640 specs, return the equivalent as the Lisp programmer would see it.
642 If LOAD_HERE is 0, return Qnil. */
645 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
652 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
654 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
655 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
658 return Fnreverse (result);
663 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
664 Register symbol NAME as a coding system.
666 TYPE describes the conversion method used and should be one of
669 Automatic conversion. XEmacs attempts to detect the coding system
672 No conversion. Use this for binary files and such. On output,
673 graphic characters that are not in ASCII or Latin-1 will be
674 replaced by a ?. (For a no-conversion-encoded buffer, these
675 characters will only be present if you explicitly insert them.)
677 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
679 ISO 10646 UCS-4 encoding.
681 ISO 10646 UTF-8 encoding.
683 Any ISO2022-compliant encoding. Among other things, this includes
684 JIS (the Japanese encoding commonly used for e-mail), EUC (the
685 standard Unix encoding for Japanese and other languages), and
686 Compound Text (the encoding used in X11). You can specify more
687 specific information about the conversion with the FLAGS argument.
689 Big5 (the encoding commonly used for Taiwanese).
691 The conversion is performed using a user-written pseudo-code
692 program. CCL (Code Conversion Language) is the name of this
695 Write out or read in the raw contents of the memory representing
696 the buffer's text. This is primarily useful for debugging
697 purposes, and is only enabled when XEmacs has been compiled with
698 DEBUG_XEMACS defined (via the --debug configure option).
699 WARNING: Reading in a file using 'internal conversion can result
700 in an internal inconsistency in the memory representing a
701 buffer's text, which will produce unpredictable results and may
702 cause XEmacs to crash. Under normal circumstances you should
703 never use 'internal conversion.
705 DOC-STRING is a string describing the coding system.
707 PROPS is a property list, describing the specific nature of the
708 character set. Recognized properties are:
711 String to be displayed in the modeline when this coding system is
715 End-of-line conversion to be used. It should be one of
718 Automatically detect the end-of-line type (LF, CRLF,
719 or CR). Also generate subsidiary coding systems named
720 `NAME-unix', `NAME-dos', and `NAME-mac', that are
721 identical to this coding system but have an EOL-TYPE
722 value of 'lf, 'crlf, and 'cr, respectively.
724 The end of a line is marked externally using ASCII LF.
725 Since this is also the way that XEmacs represents an
726 end-of-line internally, specifying this option results
727 in no end-of-line conversion. This is the standard
728 format for Unix text files.
730 The end of a line is marked externally using ASCII
731 CRLF. This is the standard format for MS-DOS text
734 The end of a line is marked externally using ASCII CR.
735 This is the standard format for Macintosh text files.
737 Automatically detect the end-of-line type but do not
738 generate subsidiary coding systems. (This value is
739 converted to nil when stored internally, and
740 `coding-system-property' will return nil.)
742 'post-read-conversion
743 Function called after a file has been read in, to perform the
744 decoding. Called with two arguments, BEG and END, denoting
745 a region of the current buffer to be decoded.
747 'pre-write-conversion
748 Function called before a file is written out, to perform the
749 encoding. Called with two arguments, BEG and END, denoting
750 a region of the current buffer to be encoded.
753 The following additional properties are recognized if TYPE is 'iso2022:
759 The character set initially designated to the G0 - G3 registers.
760 The value should be one of
762 -- A charset object (designate that character set)
763 -- nil (do not ever use this register)
764 -- t (no character set is initially designated to
765 the register, but may be later on; this automatically
766 sets the corresponding `force-g*-on-output' property)
772 If non-nil, send an explicit designation sequence on output before
773 using the specified register.
776 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
777 "ESC $ B" on output in place of the full designation sequences
778 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
781 If non-nil, don't designate ASCII to G0 at each end of line on output.
782 Setting this to non-nil also suppresses other state-resetting that
783 normally happens at the end of a line.
786 If non-nil, don't designate ASCII to G0 before control chars on output.
789 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
793 If non-nil, use locking-shift (SO/SI) instead of single-shift
794 or designation by escape sequence.
797 If non-nil, don't use ISO6429's direction specification.
800 If non-nil, literal control characters that are the same as
801 the beginning of a recognized ISO2022 or ISO6429 escape sequence
802 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
803 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
804 so that they can be properly distinguished from an escape sequence.
805 (Note that doing this results in a non-portable encoding.) This
806 encoding flag is used for byte-compiled files. Note that ESC
807 is a good choice for a quoting character because there are no
808 escape sequences whose second byte is a character from the Control-0
809 or Control-1 character sets; this is explicitly disallowed by the
812 'input-charset-conversion
813 A list of conversion specifications, specifying conversion of
814 characters in one charset to another when decoding is performed.
815 Each specification is a list of two elements: the source charset,
816 and the destination charset.
818 'output-charset-conversion
819 A list of conversion specifications, specifying conversion of
820 characters in one charset to another when encoding is performed.
821 The form of each specification is the same as for
822 'input-charset-conversion.
825 The following additional properties are recognized (and required)
829 CCL program used for decoding (converting to internal format).
832 CCL program used for encoding (converting to external format).
834 (name, type, doc_string, props))
836 Lisp_Coding_System *codesys;
837 Lisp_Object rest, key, value;
838 enum coding_system_type ty;
839 int need_to_setup_eol_systems = 1;
841 /* Convert type to constant */
842 if (NILP (type) || EQ (type, Qundecided))
843 { ty = CODESYS_AUTODETECT; }
845 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
846 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
847 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
848 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
849 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
850 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
852 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
854 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
857 signal_simple_error ("Invalid coding system type", type);
861 codesys = allocate_coding_system (ty, name);
863 if (NILP (doc_string))
864 doc_string = build_string ("");
866 CHECK_STRING (doc_string);
867 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
870 if (ty == CODESYS_NO_CONVERSION)
871 codesys->fixed.size = 1;
873 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
875 if (EQ (key, Qmnemonic))
878 CHECK_STRING (value);
879 CODING_SYSTEM_MNEMONIC (codesys) = value;
882 else if (EQ (key, Qeol_type))
884 need_to_setup_eol_systems = NILP (value);
887 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
890 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
891 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
893 else if (ty == CODESYS_ISO2022)
895 #define FROB_INITIAL_CHARSET(charset_num) \
896 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
897 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
899 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
900 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
901 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
902 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
904 #define FROB_FORCE_CHARSET(charset_num) \
905 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
907 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
908 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
909 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
910 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
912 #define FROB_BOOLEAN_PROPERTY(prop) \
913 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
915 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
916 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
917 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
918 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
919 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
920 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
921 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
923 else if (EQ (key, Qinput_charset_conversion))
925 codesys->iso2022.input_conv =
926 Dynarr_new (charset_conversion_spec);
927 parse_charset_conversion_specs (codesys->iso2022.input_conv,
930 else if (EQ (key, Qoutput_charset_conversion))
932 codesys->iso2022.output_conv =
933 Dynarr_new (charset_conversion_spec);
934 parse_charset_conversion_specs (codesys->iso2022.output_conv,
938 signal_simple_error ("Unrecognized property", key);
940 else if (EQ (type, Qccl))
942 if (EQ (key, Qdecode))
944 CHECK_VECTOR (value);
945 CODING_SYSTEM_CCL_DECODE (codesys) = value;
947 else if (EQ (key, Qencode))
949 CHECK_VECTOR (value);
950 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
953 signal_simple_error ("Unrecognized property", key);
957 signal_simple_error ("Unrecognized property", key);
960 if (need_to_setup_eol_systems)
961 setup_eol_coding_systems (codesys);
964 Lisp_Object codesys_obj;
965 XSETCODING_SYSTEM (codesys_obj, codesys);
966 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
971 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
972 Copy OLD-CODING-SYSTEM to NEW-NAME.
973 If NEW-NAME does not name an existing coding system, a new one will
976 (old_coding_system, new_name))
978 Lisp_Object new_coding_system;
979 old_coding_system = Fget_coding_system (old_coding_system);
980 new_coding_system = Ffind_coding_system (new_name);
981 if (NILP (new_coding_system))
983 XSETCODING_SYSTEM (new_coding_system,
984 allocate_coding_system
985 (XCODING_SYSTEM_TYPE (old_coding_system),
987 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
991 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
992 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
993 memcpy (((char *) to ) + sizeof (to->header),
994 ((char *) from) + sizeof (from->header),
995 sizeof (*from) - sizeof (from->header));
998 return new_coding_system;
1001 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1002 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1004 (alias, coding_system))
1006 CHECK_SYMBOL (alias);
1007 if (!NILP (Ffind_coding_system (alias)))
1008 signal_simple_error ("Symbol already names a coding system", alias);
1009 coding_system = Fget_coding_system (coding_system);
1010 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1012 /* Set up aliases for subsidiaries. */
1013 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1016 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1017 #define FROB(type, name) \
1019 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1020 if (!NILP (subsidiary)) \
1021 Fdefine_coding_system_alias \
1022 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1025 FROB (CRLF, "-dos");
1029 /* FSF return value is a vector of [ALIAS-unix ALIAS-doc ALIAS-mac],
1030 but it doesn't look intentional, so I'd rather return something
1031 meaningful or nothing at all. */
1036 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1038 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1039 Lisp_Object new_coding_system;
1041 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1042 return coding_system;
1046 case EOL_AUTODETECT: return coding_system;
1047 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1048 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1049 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1053 return NILP (new_coding_system) ? coding_system : new_coding_system;
1056 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1057 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1059 (coding_system, eol_type))
1061 coding_system = Fget_coding_system (coding_system);
1063 return subsidiary_coding_system (coding_system,
1064 symbol_to_eol_type (eol_type));
1068 /************************************************************************/
1069 /* Coding system accessors */
1070 /************************************************************************/
1072 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1073 Return the doc string for CODING-SYSTEM.
1077 coding_system = Fget_coding_system (coding_system);
1078 return XCODING_SYSTEM_DOC_STRING (coding_system);
1081 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1082 Return the type of CODING-SYSTEM.
1086 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1089 case CODESYS_AUTODETECT: return Qundecided;
1091 case CODESYS_SHIFT_JIS: return Qshift_jis;
1092 case CODESYS_ISO2022: return Qiso2022;
1093 case CODESYS_BIG5: return Qbig5;
1094 case CODESYS_UCS4: return Qucs4;
1095 case CODESYS_UTF8: return Qutf8;
1096 case CODESYS_CCL: return Qccl;
1098 case CODESYS_NO_CONVERSION: return Qno_conversion;
1100 case CODESYS_INTERNAL: return Qinternal;
1107 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1110 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1112 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1115 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1116 Return initial charset of CODING-SYSTEM designated to GNUM.
1119 (coding_system, gnum))
1121 coding_system = Fget_coding_system (coding_system);
1124 return coding_system_charset (coding_system, XINT (gnum));
1128 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1129 Return the PROP property of CODING-SYSTEM.
1131 (coding_system, prop))
1134 enum coding_system_type type;
1136 coding_system = Fget_coding_system (coding_system);
1137 CHECK_SYMBOL (prop);
1138 type = XCODING_SYSTEM_TYPE (coding_system);
1140 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1141 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1144 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1146 case CODESYS_PROP_ALL_OK:
1149 case CODESYS_PROP_ISO2022:
1150 if (type != CODESYS_ISO2022)
1152 ("Property only valid in ISO2022 coding systems",
1156 case CODESYS_PROP_CCL:
1157 if (type != CODESYS_CCL)
1159 ("Property only valid in CCL coding systems",
1169 signal_simple_error ("Unrecognized property", prop);
1171 if (EQ (prop, Qname))
1172 return XCODING_SYSTEM_NAME (coding_system);
1173 else if (EQ (prop, Qtype))
1174 return Fcoding_system_type (coding_system);
1175 else if (EQ (prop, Qdoc_string))
1176 return XCODING_SYSTEM_DOC_STRING (coding_system);
1177 else if (EQ (prop, Qmnemonic))
1178 return XCODING_SYSTEM_MNEMONIC (coding_system);
1179 else if (EQ (prop, Qeol_type))
1180 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1181 else if (EQ (prop, Qeol_lf))
1182 return XCODING_SYSTEM_EOL_LF (coding_system);
1183 else if (EQ (prop, Qeol_crlf))
1184 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1185 else if (EQ (prop, Qeol_cr))
1186 return XCODING_SYSTEM_EOL_CR (coding_system);
1187 else if (EQ (prop, Qpost_read_conversion))
1188 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1189 else if (EQ (prop, Qpre_write_conversion))
1190 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1192 else if (type == CODESYS_ISO2022)
1194 if (EQ (prop, Qcharset_g0))
1195 return coding_system_charset (coding_system, 0);
1196 else if (EQ (prop, Qcharset_g1))
1197 return coding_system_charset (coding_system, 1);
1198 else if (EQ (prop, Qcharset_g2))
1199 return coding_system_charset (coding_system, 2);
1200 else if (EQ (prop, Qcharset_g3))
1201 return coding_system_charset (coding_system, 3);
1203 #define FORCE_CHARSET(charset_num) \
1204 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1205 (coding_system, charset_num) ? Qt : Qnil)
1207 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1208 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1209 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1210 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1212 #define LISP_BOOLEAN(prop) \
1213 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1215 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1216 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1217 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1218 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1219 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1220 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1221 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1223 else if (EQ (prop, Qinput_charset_conversion))
1225 unparse_charset_conversion_specs
1226 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1227 else if (EQ (prop, Qoutput_charset_conversion))
1229 unparse_charset_conversion_specs
1230 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1234 else if (type == CODESYS_CCL)
1236 if (EQ (prop, Qdecode))
1237 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1238 else if (EQ (prop, Qencode))
1239 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1247 return Qnil; /* not reached */
1251 /************************************************************************/
1252 /* Coding category functions */
1253 /************************************************************************/
1256 decode_coding_category (Lisp_Object symbol)
1260 CHECK_SYMBOL (symbol);
1261 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1262 if (EQ (coding_category_symbol[i], symbol))
1265 signal_simple_error ("Unrecognized coding category", symbol);
1266 return 0; /* not reached */
1269 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1270 Return a list of all recognized coding categories.
1275 Lisp_Object list = Qnil;
1277 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1278 list = Fcons (coding_category_symbol[i], list);
1282 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1283 Change the priority order of the coding categories.
1284 LIST should be list of coding categories, in descending order of
1285 priority. Unspecified coding categories will be lower in priority
1286 than all specified ones, in the same relative order they were in
1291 int category_to_priority[CODING_CATEGORY_LAST + 1];
1295 /* First generate a list that maps coding categories to priorities. */
1297 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1298 category_to_priority[i] = -1;
1300 /* Highest priority comes from the specified list. */
1302 EXTERNAL_LIST_LOOP (rest, list)
1304 int cat = decode_coding_category (XCAR (rest));
1306 if (category_to_priority[cat] >= 0)
1307 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1308 category_to_priority[cat] = i++;
1311 /* Now go through the existing categories by priority to retrieve
1312 the categories not yet specified and preserve their priority
1314 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1316 int cat = coding_category_by_priority[j];
1317 if (category_to_priority[cat] < 0)
1318 category_to_priority[cat] = i++;
1321 /* Now we need to construct the inverse of the mapping we just
1324 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1325 coding_category_by_priority[category_to_priority[i]] = i;
1327 /* Phew! That was confusing. */
1331 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1332 Return a list of coding categories in descending order of priority.
1337 Lisp_Object list = Qnil;
1339 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1340 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1345 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1346 Change the coding system associated with a coding category.
1348 (coding_category, coding_system))
1350 int cat = decode_coding_category (coding_category);
1352 coding_system = Fget_coding_system (coding_system);
1353 coding_category_system[cat] = coding_system;
1357 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1358 Return the coding system associated with a coding category.
1362 int cat = decode_coding_category (coding_category);
1363 Lisp_Object sys = coding_category_system[cat];
1366 return XCODING_SYSTEM_NAME (sys);
1371 /************************************************************************/
1372 /* Detecting the encoding of data */
1373 /************************************************************************/
1375 struct detection_state
1377 enum eol_type eol_type;
1413 struct iso2022_decoder iso;
1415 int high_byte_count;
1416 unsigned int saw_single_shift:1;
1429 acceptable_control_char_p (int c)
1433 /* Allow and ignore control characters that you might
1434 reasonably see in a text file */
1439 case 8: /* backspace */
1440 case 11: /* vertical tab */
1441 case 12: /* form feed */
1442 case 26: /* MS-DOS C-z junk */
1443 case 31: /* '^_' -- for info */
1451 mask_has_at_most_one_bit_p (int mask)
1453 /* Perhaps the only thing useful you learn from intensive Microsoft
1454 technical interviews */
1455 return (mask & (mask - 1)) == 0;
1458 static enum eol_type
1459 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1468 st->eol.just_saw_cr = 1;
1473 if (st->eol.just_saw_cr)
1475 else if (st->eol.seen_anything)
1478 else if (st->eol.just_saw_cr)
1480 st->eol.just_saw_cr = 0;
1482 st->eol.seen_anything = 1;
1485 return EOL_AUTODETECT;
1488 /* Attempt to determine the encoding and EOL type of the given text.
1489 Before calling this function for the first type, you must initialize
1490 st->eol_type as appropriate and initialize st->mask to ~0.
1492 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1495 st->mask holds the determined coding category mask, or ~0 if only
1496 ASCII has been seen so far.
1500 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1501 is present in st->mask
1502 1 == definitive answers are here for both st->eol_type and st->mask
1506 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1507 unsigned int n, int just_do_eol)
1511 if (st->eol_type == EOL_AUTODETECT)
1512 st->eol_type = detect_eol_type (st, src, n);
1515 return st->eol_type != EOL_AUTODETECT;
1517 if (!st->seen_non_ascii)
1519 for (; n; n--, src++)
1522 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1524 st->seen_non_ascii = 1;
1526 st->shift_jis.mask = ~0;
1530 st->iso2022.mask = ~0;
1540 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1541 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1542 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1543 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1544 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1545 st->big5.mask = detect_coding_big5 (st, src, n);
1546 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1547 st->utf8.mask = detect_coding_utf8 (st, src, n);
1548 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1549 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1552 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1553 | st->utf8.mask | st->ucs4.mask;
1556 int retval = mask_has_at_most_one_bit_p (st->mask);
1557 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1558 return retval && st->eol_type != EOL_AUTODETECT;
1563 coding_system_from_mask (int mask)
1567 /* If the file was entirely or basically ASCII, use the
1568 default value of `buffer-file-coding-system'. */
1569 Lisp_Object retval =
1570 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1573 retval = Ffind_coding_system (retval);
1577 (Qbad_variable, Qwarning,
1578 "Invalid `default-buffer-file-coding-system', set to nil");
1579 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1583 retval = Fget_coding_system (Qraw_text);
1591 mask = postprocess_iso2022_mask (mask);
1593 /* Look through the coding categories by priority and find
1594 the first one that is allowed. */
1595 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1597 cat = coding_category_by_priority[i];
1598 if ((mask & (1 << cat)) &&
1599 !NILP (coding_category_system[cat]))
1603 return coding_category_system[cat];
1605 return Fget_coding_system (Qraw_text);
1609 /* Given a seekable read stream and potential coding system and EOL type
1610 as specified, do any autodetection that is called for. If the
1611 coding system and/or EOL type are not autodetect, they will be left
1612 alone; but this function will never return an autodetect coding system
1615 This function does not automatically fetch subsidiary coding systems;
1616 that should be unnecessary with the explicit eol-type argument. */
1619 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1620 enum eol_type *eol_type_in_out)
1622 struct detection_state decst;
1624 if (*eol_type_in_out == EOL_AUTODETECT)
1625 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1628 decst.eol_type = *eol_type_in_out;
1631 /* If autodetection is called for, do it now. */
1632 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1633 *eol_type_in_out == EOL_AUTODETECT)
1635 unsigned char random_buffer[4096];
1637 Lisp_Object coding_system = Qnil;
1639 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1642 unsigned char *cp = random_buffer;
1644 while (cp < random_buffer + nread)
1646 if ((*cp++ == 'c') && (cp < random_buffer + nread) &&
1647 (*cp++ == 'o') && (cp < random_buffer + nread) &&
1648 (*cp++ == 'd') && (cp < random_buffer + nread) &&
1649 (*cp++ == 'i') && (cp < random_buffer + nread) &&
1650 (*cp++ == 'n') && (cp < random_buffer + nread) &&
1651 (*cp++ == 'g') && (cp < random_buffer + nread) &&
1652 (*cp++ == ':') && (cp < random_buffer + nread))
1654 unsigned char coding_system_name[4096 - 6];
1655 unsigned char *np = coding_system_name;
1657 while ( (cp < random_buffer + nread)
1658 && ((*cp == ' ') || (*cp == '\t')) )
1662 while ( (cp < random_buffer + nread) &&
1663 (*cp != ' ') && (*cp != '\t') && (*cp != ';') )
1669 = Ffind_coding_system (intern (coding_system_name));
1673 if (EQ(coding_system, Qnil))
1675 if (detect_coding_type (&decst, random_buffer, nread,
1676 XCODING_SYSTEM_TYPE (*codesys_in_out)
1677 != CODESYS_AUTODETECT))
1679 nread = Lstream_read (stream,
1680 random_buffer, sizeof (random_buffer));
1685 *eol_type_in_out = decst.eol_type;
1686 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1688 if (EQ(coding_system, Qnil))
1689 *codesys_in_out = coding_system_from_mask (decst.mask);
1691 *codesys_in_out = coding_system;
1694 /* If we absolutely can't determine the EOL type, just assume LF. */
1695 if (*eol_type_in_out == EOL_AUTODETECT)
1696 *eol_type_in_out = EOL_LF;
1698 Lstream_rewind (stream);
1701 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1702 Detect coding system of the text in the region between START and END.
1703 Returned a list of possible coding systems ordered by priority.
1704 If only ASCII characters are found, it returns 'undecided or one of
1705 its subsidiary coding systems according to a detected end-of-line
1706 type. Optional arg BUFFER defaults to the current buffer.
1708 (start, end, buffer))
1710 Lisp_Object val = Qnil;
1711 struct buffer *buf = decode_buffer (buffer, 0);
1713 Lisp_Object instream, lb_instream;
1714 Lstream *istr, *lb_istr;
1715 struct detection_state decst;
1716 struct gcpro gcpro1, gcpro2;
1718 get_buffer_range_char (buf, start, end, &b, &e, 0);
1719 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1720 lb_istr = XLSTREAM (lb_instream);
1721 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1722 istr = XLSTREAM (instream);
1723 GCPRO2 (instream, lb_instream);
1725 decst.eol_type = EOL_AUTODETECT;
1729 unsigned char random_buffer[4096];
1730 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1734 if (detect_coding_type (&decst, random_buffer, nread, 0))
1738 if (decst.mask == ~0)
1739 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1747 decst.mask = postprocess_iso2022_mask (decst.mask);
1749 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1751 int sys = coding_category_by_priority[i];
1752 if (decst.mask & (1 << sys))
1754 Lisp_Object codesys = coding_category_system[sys];
1755 if (!NILP (codesys))
1756 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1757 val = Fcons (codesys, val);
1761 Lstream_close (istr);
1763 Lstream_delete (istr);
1764 Lstream_delete (lb_istr);
1769 /************************************************************************/
1770 /* Converting to internal Mule format ("decoding") */
1771 /************************************************************************/
1773 /* A decoding stream is a stream used for decoding text (i.e.
1774 converting from some external format to internal format).
1775 The decoding-stream object keeps track of the actual coding
1776 stream, the stream that is at the other end, and data that
1777 needs to be persistent across the lifetime of the stream. */
1779 /* Handle the EOL stuff related to just-read-in character C.
1780 EOL_TYPE is the EOL type of the coding stream.
1781 FLAGS is the current value of FLAGS in the coding stream, and may
1782 be modified by this macro. (The macro only looks at the
1783 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1784 bytes are to be written. You need to also define a local goto
1785 label "label_continue_loop" that is at the end of the main
1786 character-reading loop.
1788 If C is a CR character, then this macro handles it entirely and
1789 jumps to label_continue_loop. Otherwise, this macro does not add
1790 anything to DST, and continues normally. You should continue
1791 processing C normally after this macro. */
1793 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1797 if (eol_type == EOL_CR) \
1798 Dynarr_add (dst, '\n'); \
1799 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1800 Dynarr_add (dst, c); \
1802 flags |= CODING_STATE_CR; \
1803 goto label_continue_loop; \
1805 else if (flags & CODING_STATE_CR) \
1806 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1808 Dynarr_add (dst, '\r'); \
1809 flags &= ~CODING_STATE_CR; \
1813 /* C should be a binary character in the range 0 - 255; convert
1814 to internal format and add to Dynarr DST. */
1817 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1819 if (BYTE_ASCII_P (c)) \
1820 Dynarr_add (dst, c); \
1823 Dynarr_add (dst, (c >> 6) | 0xc0); \
1824 Dynarr_add (dst, (c & 0x3f) | 0x80); \
1829 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1833 Dynarr_add (dst, c);
1835 else if ( c <= 0x7ff )
1837 Dynarr_add (dst, (c >> 6) | 0xc0);
1838 Dynarr_add (dst, (c & 0x3f) | 0x80);
1840 else if ( c <= 0xffff )
1842 Dynarr_add (dst, (c >> 12) | 0xe0);
1843 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1844 Dynarr_add (dst, (c & 0x3f) | 0x80);
1846 else if ( c <= 0x1fffff )
1848 Dynarr_add (dst, (c >> 18) | 0xf0);
1849 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1850 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1851 Dynarr_add (dst, (c & 0x3f) | 0x80);
1853 else if ( c <= 0x3ffffff )
1855 Dynarr_add (dst, (c >> 24) | 0xf8);
1856 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1857 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1858 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1859 Dynarr_add (dst, (c & 0x3f) | 0x80);
1863 Dynarr_add (dst, (c >> 30) | 0xfc);
1864 Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1865 Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1866 Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1867 Dynarr_add (dst, ((c >> 6) & 0x3f) | 0x80);
1868 Dynarr_add (dst, (c & 0x3f) | 0x80);
1872 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1874 if (BYTE_ASCII_P (c)) \
1875 Dynarr_add (dst, c); \
1876 else if (BYTE_C1_P (c)) \
1878 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1879 Dynarr_add (dst, c + 0x20); \
1883 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1884 Dynarr_add (dst, c); \
1889 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1893 DECODE_ADD_BINARY_CHAR (ch, dst); \
1898 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1900 if (flags & CODING_STATE_END) \
1902 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1903 if (flags & CODING_STATE_CR) \
1904 Dynarr_add (dst, '\r'); \
1908 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1910 struct decoding_stream
1912 /* Coding system that governs the conversion. */
1913 Lisp_Coding_System *codesys;
1915 /* Stream that we read the encoded data from or
1916 write the decoded data to. */
1919 /* If we are reading, then we can return only a fixed amount of
1920 data, so if the conversion resulted in too much data, we store it
1921 here for retrieval the next time around. */
1922 unsigned_char_dynarr *runoff;
1924 /* FLAGS holds flags indicating the current state of the decoding.
1925 Some of these flags are dependent on the coding system. */
1928 /* CH holds a partially built-up character. Since we only deal
1929 with one- and two-byte characters at the moment, we only use
1930 this to store the first byte of a two-byte character. */
1933 /* EOL_TYPE specifies the type of end-of-line conversion that
1934 currently applies. We need to keep this separate from the
1935 EOL type stored in CODESYS because the latter might indicate
1936 automatic EOL-type detection while the former will always
1937 indicate a particular EOL type. */
1938 enum eol_type eol_type;
1940 /* Additional ISO2022 information. We define the structure above
1941 because it's also needed by the detection routines. */
1942 struct iso2022_decoder iso2022;
1944 /* Additional information (the state of the running CCL program)
1945 used by the CCL decoder. */
1946 struct ccl_program ccl;
1948 /* counter for UTF-8 or UCS-4 */
1949 unsigned char counter;
1951 struct detection_state decst;
1954 static int decoding_reader (Lstream *stream, unsigned char *data, size_t size);
1955 static int decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size);
1956 static int decoding_rewinder (Lstream *stream);
1957 static int decoding_seekable_p (Lstream *stream);
1958 static int decoding_flusher (Lstream *stream);
1959 static int decoding_closer (Lstream *stream);
1961 static Lisp_Object decoding_marker (Lisp_Object stream,
1962 void (*markobj) (Lisp_Object));
1964 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1965 sizeof (struct decoding_stream));
1968 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1970 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1971 Lisp_Object str_obj;
1973 /* We do not need to mark the coding systems or charsets stored
1974 within the stream because they are stored in a global list
1975 and automatically marked. */
1977 XSETLSTREAM (str_obj, str);
1979 if (str->imp->marker)
1980 return (str->imp->marker) (str_obj, markobj);
1985 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1986 so we read data from the other end, decode it, and store it into DATA. */
1989 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
1991 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1992 unsigned char *orig_data = data;
1994 int error_occurred = 0;
1996 /* We need to interface to mule_decode(), which expects to take some
1997 amount of data and store the result into a Dynarr. We have
1998 mule_decode() store into str->runoff, and take data from there
2001 /* We loop until we have enough data, reading chunks from the other
2002 end and decoding it. */
2005 /* Take data from the runoff if we can. Make sure to take at
2006 most SIZE bytes, and delete the data from the runoff. */
2007 if (Dynarr_length (str->runoff) > 0)
2009 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2010 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2011 Dynarr_delete_many (str->runoff, 0, chunk);
2017 break; /* No more room for data */
2019 if (str->flags & CODING_STATE_END)
2020 /* This means that on the previous iteration, we hit the EOF on
2021 the other end. We loop once more so that mule_decode() can
2022 output any final stuff it may be holding, or any "go back
2023 to a sane state" escape sequences. (This latter makes sense
2024 during encoding.) */
2027 /* Exhausted the runoff, so get some more. DATA has at least
2028 SIZE bytes left of storage in it, so it's OK to read directly
2029 into it. (We'll be overwriting above, after we've decoded it
2030 into the runoff.) */
2031 read_size = Lstream_read (str->other_end, data, size);
2038 /* There might be some more end data produced in the translation.
2039 See the comment above. */
2040 str->flags |= CODING_STATE_END;
2041 mule_decode (stream, data, str->runoff, read_size);
2044 if (data - orig_data == 0)
2045 return error_occurred ? -1 : 0;
2047 return data - orig_data;
2051 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2053 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2056 /* Decode all our data into the runoff, and then attempt to write
2057 it all out to the other end. Remove whatever chunk we succeeded
2059 mule_decode (stream, data, str->runoff, size);
2060 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2061 Dynarr_length (str->runoff));
2063 Dynarr_delete_many (str->runoff, 0, retval);
2064 /* Do NOT return retval. The return value indicates how much
2065 of the incoming data was written, not how many bytes were
2071 reset_decoding_stream (struct decoding_stream *str)
2074 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2076 Lisp_Object coding_system;
2077 XSETCODING_SYSTEM (coding_system, str->codesys);
2078 reset_iso2022 (coding_system, &str->iso2022);
2080 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2082 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2086 str->flags = str->ch = 0;
2090 decoding_rewinder (Lstream *stream)
2092 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2093 reset_decoding_stream (str);
2094 Dynarr_reset (str->runoff);
2095 return Lstream_rewind (str->other_end);
2099 decoding_seekable_p (Lstream *stream)
2101 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2102 return Lstream_seekable_p (str->other_end);
2106 decoding_flusher (Lstream *stream)
2108 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2109 return Lstream_flush (str->other_end);
2113 decoding_closer (Lstream *stream)
2115 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2116 if (stream->flags & LSTREAM_FL_WRITE)
2118 str->flags |= CODING_STATE_END;
2119 decoding_writer (stream, 0, 0);
2121 Dynarr_free (str->runoff);
2123 #ifdef ENABLE_COMPOSITE_CHARS
2124 if (str->iso2022.composite_chars)
2125 Dynarr_free (str->iso2022.composite_chars);
2128 return Lstream_close (str->other_end);
2132 decoding_stream_coding_system (Lstream *stream)
2134 Lisp_Object coding_system;
2135 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2137 XSETCODING_SYSTEM (coding_system, str->codesys);
2138 return subsidiary_coding_system (coding_system, str->eol_type);
2142 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2144 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2145 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2147 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2148 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2149 reset_decoding_stream (str);
2152 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2153 stream for writing, no automatic code detection will be performed.
2154 The reason for this is that automatic code detection requires a
2155 seekable input. Things will also fail if you open a decoding
2156 stream for reading using a non-fully-specified coding system and
2157 a non-seekable input stream. */
2160 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2163 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2164 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2168 str->other_end = stream;
2169 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2170 str->eol_type = EOL_AUTODETECT;
2171 if (!strcmp (mode, "r")
2172 && Lstream_seekable_p (stream))
2173 /* We can determine the coding system now. */
2174 determine_real_coding_system (stream, &codesys, &str->eol_type);
2175 set_decoding_stream_coding_system (lstr, codesys);
2176 str->decst.eol_type = str->eol_type;
2177 str->decst.mask = ~0;
2178 XSETLSTREAM (obj, lstr);
2183 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2185 return make_decoding_stream_1 (stream, codesys, "r");
2189 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2191 return make_decoding_stream_1 (stream, codesys, "w");
2194 /* Note: the decode_coding_* functions all take the same
2195 arguments as mule_decode(), which is to say some SRC data of
2196 size N, which is to be stored into dynamic array DST.
2197 DECODING is the stream within which the decoding is
2198 taking place, but no data is actually read from or
2199 written to that stream; that is handled in decoding_reader()
2200 or decoding_writer(). This allows the same functions to
2201 be used for both reading and writing. */
2204 mule_decode (Lstream *decoding, CONST unsigned char *src,
2205 unsigned_char_dynarr *dst, unsigned int n)
2207 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2209 /* If necessary, do encoding-detection now. We do this when
2210 we're a writing stream or a non-seekable reading stream,
2211 meaning that we can't just process the whole input,
2212 rewind, and start over. */
2214 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2215 str->eol_type == EOL_AUTODETECT)
2217 Lisp_Object codesys;
2219 XSETCODING_SYSTEM (codesys, str->codesys);
2220 detect_coding_type (&str->decst, src, n,
2221 CODING_SYSTEM_TYPE (str->codesys) !=
2222 CODESYS_AUTODETECT);
2223 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2224 str->decst.mask != ~0)
2225 /* #### This is cheesy. What we really ought to do is
2226 buffer up a certain amount of data so as to get a
2227 less random result. */
2228 codesys = coding_system_from_mask (str->decst.mask);
2229 str->eol_type = str->decst.eol_type;
2230 if (XCODING_SYSTEM (codesys) != str->codesys)
2232 /* Preserve the CODING_STATE_END flag in case it was set.
2233 If we erase it, bad things might happen. */
2234 int was_end = str->flags & CODING_STATE_END;
2235 set_decoding_stream_coding_system (decoding, codesys);
2237 str->flags |= CODING_STATE_END;
2241 switch (CODING_SYSTEM_TYPE (str->codesys))
2244 case CODESYS_INTERNAL:
2245 Dynarr_add_many (dst, src, n);
2248 case CODESYS_AUTODETECT:
2249 /* If we got this far and still haven't decided on the coding
2250 system, then do no conversion. */
2251 case CODESYS_NO_CONVERSION:
2252 decode_coding_no_conversion (decoding, src, dst, n);
2255 case CODESYS_SHIFT_JIS:
2256 decode_coding_sjis (decoding, src, dst, n);
2259 decode_coding_big5 (decoding, src, dst, n);
2262 decode_coding_ucs4 (decoding, src, dst, n);
2265 decode_coding_utf8 (decoding, src, dst, n);
2268 str->ccl.last_block = str->flags & CODING_STATE_END;
2269 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2271 case CODESYS_ISO2022:
2272 decode_coding_iso2022 (decoding, src, dst, n);
2280 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2281 Decode the text between START and END which is encoded in CODING-SYSTEM.
2282 This is useful if you've read in encoded text from a file without decoding
2283 it (e.g. you read in a JIS-formatted file but used the `binary' or
2284 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2285 Return length of decoded text.
2286 BUFFER defaults to the current buffer if unspecified.
2288 (start, end, coding_system, buffer))
2291 struct buffer *buf = decode_buffer (buffer, 0);
2292 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2293 Lstream *istr, *ostr;
2294 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2296 get_buffer_range_char (buf, start, end, &b, &e, 0);
2298 barf_if_buffer_read_only (buf, b, e);
2300 coding_system = Fget_coding_system (coding_system);
2301 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2302 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2303 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2305 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2306 Fget_coding_system (Qbinary));
2307 istr = XLSTREAM (instream);
2308 ostr = XLSTREAM (outstream);
2309 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2311 /* The chain of streams looks like this:
2313 [BUFFER] <----- send through
2314 ------> [ENCODE AS BINARY]
2315 ------> [DECODE AS SPECIFIED]
2321 char tempbuf[1024]; /* some random amount */
2322 Bufpos newpos, even_newer_pos;
2323 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2324 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2328 newpos = lisp_buffer_stream_startpos (istr);
2329 Lstream_write (ostr, tempbuf, size_in_bytes);
2330 even_newer_pos = lisp_buffer_stream_startpos (istr);
2331 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2334 Lstream_close (istr);
2335 Lstream_close (ostr);
2337 Lstream_delete (istr);
2338 Lstream_delete (ostr);
2339 Lstream_delete (XLSTREAM (de_outstream));
2340 Lstream_delete (XLSTREAM (lb_outstream));
2345 /************************************************************************/
2346 /* Converting to an external encoding ("encoding") */
2347 /************************************************************************/
2349 /* An encoding stream is an output stream. When you create the
2350 stream, you specify the coding system that governs the encoding
2351 and another stream that the resulting encoded data is to be
2352 sent to, and then start sending data to it. */
2354 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2356 struct encoding_stream
2358 /* Coding system that governs the conversion. */
2359 Lisp_Coding_System *codesys;
2361 /* Stream that we read the encoded data from or
2362 write the decoded data to. */
2365 /* If we are reading, then we can return only a fixed amount of
2366 data, so if the conversion resulted in too much data, we store it
2367 here for retrieval the next time around. */
2368 unsigned_char_dynarr *runoff;
2370 /* FLAGS holds flags indicating the current state of the encoding.
2371 Some of these flags are dependent on the coding system. */
2374 /* CH holds a partially built-up character. Since we only deal
2375 with one- and two-byte characters at the moment, we only use
2376 this to store the first byte of a two-byte character. */
2379 /* Additional information used by the ISO2022 encoder. */
2382 /* CHARSET holds the character sets currently assigned to the G0
2383 through G3 registers. It is initialized from the array
2384 INITIAL_CHARSET in CODESYS. */
2385 Lisp_Object charset[4];
2387 /* Which registers are currently invoked into the left (GL) and
2388 right (GR) halves of the 8-bit encoding space? */
2389 int register_left, register_right;
2391 /* Whether we need to explicitly designate the charset in the
2392 G? register before using it. It is initialized from the
2393 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2394 unsigned char force_charset_on_output[4];
2396 /* Other state variables that need to be preserved across
2398 Lisp_Object current_charset;
2400 int current_char_boundary;
2403 void (*encode_char) (struct encoding_stream *str, Emchar c,
2404 unsigned_char_dynarr *dst, unsigned int *flags);
2405 void (*finish) (struct encoding_stream *str,
2406 unsigned_char_dynarr *dst, unsigned int *flags);
2408 /* Additional information (the state of the running CCL program)
2409 used by the CCL encoder. */
2410 struct ccl_program ccl;
2414 static int encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2415 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2417 static int encoding_rewinder (Lstream *stream);
2418 static int encoding_seekable_p (Lstream *stream);
2419 static int encoding_flusher (Lstream *stream);
2420 static int encoding_closer (Lstream *stream);
2422 static Lisp_Object encoding_marker (Lisp_Object stream,
2423 void (*markobj) (Lisp_Object));
2425 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2426 sizeof (struct encoding_stream));
2429 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2431 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2432 Lisp_Object str_obj;
2434 /* We do not need to mark the coding systems or charsets stored
2435 within the stream because they are stored in a global list
2436 and automatically marked. */
2438 XSETLSTREAM (str_obj, str);
2440 if (str->imp->marker)
2441 return (str->imp->marker) (str_obj, markobj);
2446 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2447 so we read data from the other end, encode it, and store it into DATA. */
2450 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2452 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2453 unsigned char *orig_data = data;
2455 int error_occurred = 0;
2457 /* We need to interface to mule_encode(), which expects to take some
2458 amount of data and store the result into a Dynarr. We have
2459 mule_encode() store into str->runoff, and take data from there
2462 /* We loop until we have enough data, reading chunks from the other
2463 end and encoding it. */
2466 /* Take data from the runoff if we can. Make sure to take at
2467 most SIZE bytes, and delete the data from the runoff. */
2468 if (Dynarr_length (str->runoff) > 0)
2470 int chunk = min ((int) size, Dynarr_length (str->runoff));
2471 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2472 Dynarr_delete_many (str->runoff, 0, chunk);
2478 break; /* No more room for data */
2480 if (str->flags & CODING_STATE_END)
2481 /* This means that on the previous iteration, we hit the EOF on
2482 the other end. We loop once more so that mule_encode() can
2483 output any final stuff it may be holding, or any "go back
2484 to a sane state" escape sequences. (This latter makes sense
2485 during encoding.) */
2488 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2489 left of storage in it, so it's OK to read directly into it.
2490 (We'll be overwriting above, after we've encoded it into the
2492 read_size = Lstream_read (str->other_end, data, size);
2499 /* There might be some more end data produced in the translation.
2500 See the comment above. */
2501 str->flags |= CODING_STATE_END;
2502 mule_encode (stream, data, str->runoff, read_size);
2505 if (data == orig_data)
2506 return error_occurred ? -1 : 0;
2508 return data - orig_data;
2512 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2514 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2517 /* Encode all our data into the runoff, and then attempt to write
2518 it all out to the other end. Remove whatever chunk we succeeded
2520 mule_encode (stream, data, str->runoff, size);
2521 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2522 Dynarr_length (str->runoff));
2524 Dynarr_delete_many (str->runoff, 0, retval);
2525 /* Do NOT return retval. The return value indicates how much
2526 of the incoming data was written, not how many bytes were
2532 reset_encoding_stream (struct encoding_stream *str)
2535 switch (CODING_SYSTEM_TYPE (str->codesys))
2537 case CODESYS_ISO2022:
2541 str->encode_char = &char_encode_iso2022;
2542 str->finish = &char_finish_iso2022;
2543 for (i = 0; i < 4; i++)
2545 str->iso2022.charset[i] =
2546 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2547 str->iso2022.force_charset_on_output[i] =
2548 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2550 str->iso2022.register_left = 0;
2551 str->iso2022.register_right = 1;
2552 str->iso2022.current_charset = Qnil;
2553 str->iso2022.current_half = 0;
2557 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2560 str->encode_char = &char_encode_utf8;
2561 str->finish = &char_finish_utf8;
2566 str->iso2022.current_char_boundary = 0;
2567 str->flags = str->ch = 0;
2571 encoding_rewinder (Lstream *stream)
2573 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2574 reset_encoding_stream (str);
2575 Dynarr_reset (str->runoff);
2576 return Lstream_rewind (str->other_end);
2580 encoding_seekable_p (Lstream *stream)
2582 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2583 return Lstream_seekable_p (str->other_end);
2587 encoding_flusher (Lstream *stream)
2589 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2590 return Lstream_flush (str->other_end);
2594 encoding_closer (Lstream *stream)
2596 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2597 if (stream->flags & LSTREAM_FL_WRITE)
2599 str->flags |= CODING_STATE_END;
2600 encoding_writer (stream, 0, 0);
2602 Dynarr_free (str->runoff);
2603 return Lstream_close (str->other_end);
2607 encoding_stream_coding_system (Lstream *stream)
2609 Lisp_Object coding_system;
2610 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2612 XSETCODING_SYSTEM (coding_system, str->codesys);
2613 return coding_system;
2617 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2619 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2620 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2622 reset_encoding_stream (str);
2626 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2629 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2634 str->runoff = Dynarr_new (unsigned_char);
2635 str->other_end = stream;
2636 set_encoding_stream_coding_system (lstr, codesys);
2637 XSETLSTREAM (obj, lstr);
2642 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2644 return make_encoding_stream_1 (stream, codesys, "r");
2648 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2650 return make_encoding_stream_1 (stream, codesys, "w");
2653 /* Convert N bytes of internally-formatted data stored in SRC to an
2654 external format, according to the encoding stream ENCODING.
2655 Store the encoded data into DST. */
2658 mule_encode (Lstream *encoding, CONST unsigned char *src,
2659 unsigned_char_dynarr *dst, unsigned int n)
2661 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2663 switch (CODING_SYSTEM_TYPE (str->codesys))
2666 case CODESYS_INTERNAL:
2667 Dynarr_add_many (dst, src, n);
2670 case CODESYS_AUTODETECT:
2671 /* If we got this far and still haven't decided on the coding
2672 system, then do no conversion. */
2673 case CODESYS_NO_CONVERSION:
2674 encode_coding_no_conversion (encoding, src, dst, n);
2677 case CODESYS_SHIFT_JIS:
2678 encode_coding_sjis (encoding, src, dst, n);
2681 encode_coding_big5 (encoding, src, dst, n);
2684 encode_coding_ucs4 (encoding, src, dst, n);
2687 str->ccl.last_block = str->flags & CODING_STATE_END;
2688 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2692 text_encode_generic (encoding, src, dst, n);
2696 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2697 Encode the text between START and END using CODING-SYSTEM.
2698 This will, for example, convert Japanese characters into stuff such as
2699 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2700 text. BUFFER defaults to the current buffer if unspecified.
2702 (start, end, coding_system, buffer))
2705 struct buffer *buf = decode_buffer (buffer, 0);
2706 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2707 Lstream *istr, *ostr;
2708 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2710 get_buffer_range_char (buf, start, end, &b, &e, 0);
2712 barf_if_buffer_read_only (buf, b, e);
2714 coding_system = Fget_coding_system (coding_system);
2715 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2716 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2717 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2718 Fget_coding_system (Qbinary));
2719 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2721 istr = XLSTREAM (instream);
2722 ostr = XLSTREAM (outstream);
2723 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2724 /* The chain of streams looks like this:
2726 [BUFFER] <----- send through
2727 ------> [ENCODE AS SPECIFIED]
2728 ------> [DECODE AS BINARY]
2733 char tempbuf[1024]; /* some random amount */
2734 Bufpos newpos, even_newer_pos;
2735 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2736 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2740 newpos = lisp_buffer_stream_startpos (istr);
2741 Lstream_write (ostr, tempbuf, size_in_bytes);
2742 even_newer_pos = lisp_buffer_stream_startpos (istr);
2743 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2749 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2750 Lstream_close (istr);
2751 Lstream_close (ostr);
2753 Lstream_delete (istr);
2754 Lstream_delete (ostr);
2755 Lstream_delete (XLSTREAM (de_outstream));
2756 Lstream_delete (XLSTREAM (lb_outstream));
2757 return make_int (retlen);
2764 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2765 unsigned_char_dynarr *dst, unsigned int n)
2768 unsigned char char_boundary;
2769 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2770 unsigned int flags = str->flags;
2771 Emchar ch = str->ch;
2772 Lisp_Object charset;
2775 char_boundary = str->iso2022.current_char_boundary;
2776 charset = str->iso2022.current_charset;
2777 half = str->iso2022.current_half;
2783 switch (char_boundary)
2791 else if ( c >= 0xf8 )
2796 else if ( c >= 0xf0 )
2801 else if ( c >= 0xe0 )
2806 else if ( c >= 0xc0 )
2813 (*str->encode_char) (str, c, dst, &flags);
2819 (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2824 ch = ( ch << 6 ) | ( c & 0x3f );
2829 if ( (char_boundary == 0) && flags & CODING_STATE_END)
2831 (*str->finish) (str, dst, &flags);
2836 str->iso2022.current_char_boundary = char_boundary;
2837 str->iso2022.current_charset = charset;
2838 str->iso2022.current_half = half;
2840 /* Verbum caro factum est! */
2844 /************************************************************************/
2845 /* Shift-JIS methods */
2846 /************************************************************************/
2848 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2849 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2850 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2851 encoded by "position-code + 0x80". A character of JISX0208
2852 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2853 position-codes are divided and shifted so that it fit in the range
2856 --- CODE RANGE of Shift-JIS ---
2857 (character set) (range)
2859 JISX0201-Kana 0xA0 .. 0xDF
2860 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2861 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2862 -------------------------------
2866 /* Is this the first byte of a Shift-JIS two-byte char? */
2868 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2869 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2871 /* Is this the second byte of a Shift-JIS two-byte char? */
2873 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2874 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2876 #define BYTE_SJIS_KATAKANA_P(c) \
2877 ((c) >= 0xA1 && (c) <= 0xDF)
2880 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2888 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2890 if (st->shift_jis.in_second_byte)
2892 st->shift_jis.in_second_byte = 0;
2896 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2897 st->shift_jis.in_second_byte = 1;
2899 return CODING_CATEGORY_SHIFT_JIS_MASK;
2902 /* Convert Shift-JIS data to internal format. */
2905 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2906 unsigned_char_dynarr *dst, unsigned int n)
2909 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2910 unsigned int flags = str->flags;
2911 unsigned int ch = str->ch;
2912 eol_type_t eol_type = str->eol_type;
2920 /* Previous character was first byte of Shift-JIS Kanji char. */
2921 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2923 unsigned char e1, e2;
2925 DECODE_SJIS (ch, c, e1, e2);
2927 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
2931 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2932 Dynarr_add (dst, e1);
2933 Dynarr_add (dst, e2);
2938 DECODE_ADD_BINARY_CHAR (ch, dst);
2939 DECODE_ADD_BINARY_CHAR (c, dst);
2945 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2946 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2948 else if (BYTE_SJIS_KATAKANA_P (c))
2951 DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
2954 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2955 Dynarr_add (dst, c);
2959 DECODE_ADD_BINARY_CHAR (c, dst);
2961 label_continue_loop:;
2964 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2970 /* Convert internally-formatted data to Shift-JIS. */
2973 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2974 unsigned_char_dynarr *dst, unsigned int n)
2977 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2978 unsigned int flags = str->flags;
2979 unsigned int ch = str->ch;
2980 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2982 unsigned char char_boundary = str->iso2022.current_char_boundary;
2989 switch (char_boundary)
2997 else if ( c >= 0xf8 )
3002 else if ( c >= 0xf0 )
3007 else if ( c >= 0xe0 )
3012 else if ( c >= 0xc0 )
3022 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3023 Dynarr_add (dst, '\r');
3024 if (eol_type != EOL_CR)
3025 Dynarr_add (dst, c);
3028 Dynarr_add (dst, c);
3033 ch = ( ch << 6 ) | ( c & 0x3f );
3035 Lisp_Object charset;
3036 unsigned int c1, c2, s1, s2;
3038 BREAKUP_CHAR (ch, charset, c1, c2);
3039 if (EQ(charset, Vcharset_katakana_jisx0201))
3041 Dynarr_add (dst, c1 | 0x80);
3043 else if (EQ(charset, Vcharset_japanese_jisx0208))
3045 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3046 Dynarr_add (dst, s1);
3047 Dynarr_add (dst, s2);
3053 ch = ( ch << 6 ) | ( c & 0x3f );
3059 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3060 Dynarr_add (dst, '\r');
3061 if (eol_type != EOL_CR)
3062 Dynarr_add (dst, '\n');
3065 else if (BYTE_ASCII_P (c))
3067 Dynarr_add (dst, c);
3070 else if (BUFBYTE_LEADING_BYTE_P (c))
3071 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3072 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3073 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3076 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3078 Dynarr_add (dst, c);
3081 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3082 ch == LEADING_BYTE_JAPANESE_JISX0208)
3086 unsigned char j1, j2;
3087 ENCODE_SJIS (ch, c, j1, j2);
3088 Dynarr_add (dst, j1);
3089 Dynarr_add (dst, j2);
3099 str->iso2022.current_char_boundary = char_boundary;
3103 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3104 Decode a JISX0208 character of Shift-JIS coding-system.
3105 CODE is the character code in Shift-JIS as a cons of type bytes.
3106 Return the corresponding character.
3110 unsigned char c1, c2, s1, s2;
3113 CHECK_INT (XCAR (code));
3114 CHECK_INT (XCDR (code));
3115 s1 = XINT (XCAR (code));
3116 s2 = XINT (XCDR (code));
3117 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3118 BYTE_SJIS_TWO_BYTE_2_P (s2))
3120 DECODE_SJIS (s1, s2, c1, c2);
3121 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3122 c1 & 0x7F, c2 & 0x7F));
3128 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3129 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3130 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3134 Lisp_Object charset;
3137 CHECK_CHAR_COERCE_INT (ch);
3138 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3139 if (EQ (charset, Vcharset_japanese_jisx0208))
3141 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3142 return Fcons (make_int (s1), make_int (s2));
3149 /************************************************************************/
3151 /************************************************************************/
3153 /* BIG5 is a coding system encoding two character sets: ASCII and
3154 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3155 character set and is encoded in two-byte.
3157 --- CODE RANGE of BIG5 ---
3158 (character set) (range)
3160 Big5 (1st byte) 0xA1 .. 0xFE
3161 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3162 --------------------------
3164 Since the number of characters in Big5 is larger than maximum
3165 characters in Emacs' charset (96x96), it can't be handled as one
3166 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3167 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3168 contains frequently used characters and the latter contains less
3169 frequently used characters. */
3171 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3172 ((c) >= 0xA1 && (c) <= 0xFE)
3174 /* Is this the second byte of a Shift-JIS two-byte char? */
3176 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3177 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3179 /* Number of Big5 characters which have the same code in 1st byte. */
3181 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3183 /* Code conversion macros. These are macros because they are used in
3184 inner loops during code conversion.
3186 Note that temporary variables in macros introduce the classic
3187 dynamic-scoping problems with variable names. We use capital-
3188 lettered variables in the assumption that XEmacs does not use
3189 capital letters in variables except in a very formalized way
3192 /* Convert Big5 code (b1, b2) into its internal string representation
3195 /* There is a much simpler way to split the Big5 charset into two.
3196 For the moment I'm going to leave the algorithm as-is because it
3197 claims to separate out the most-used characters into a single
3198 charset, which perhaps will lead to optimizations in various
3201 The way the algorithm works is something like this:
3203 Big5 can be viewed as a 94x157 charset, where the row is
3204 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3205 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3206 the split between low and high column numbers is apparently
3207 meaningless; ascending rows produce less and less frequent chars.
3208 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3209 the first charset, and the upper half (0xC9 .. 0xFE) to the
3210 second. To do the conversion, we convert the character into
3211 a single number where 0 .. 156 is the first row, 157 .. 313
3212 is the second, etc. That way, the characters are ordered by
3213 decreasing frequency. Then we just chop the space in two
3214 and coerce the result into a 94x94 space.
3217 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3219 int B1 = b1, B2 = b2; \
3221 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3225 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3229 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3230 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3232 c1 = I / (0xFF - 0xA1) + 0xA1; \
3233 c2 = I % (0xFF - 0xA1) + 0xA1; \
3236 /* Convert the internal string representation of a Big5 character
3237 (lb, c1, c2) into Big5 code (b1, b2). */
3239 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3241 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3243 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3245 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3247 b1 = I / BIG5_SAME_ROW + 0xA1; \
3248 b2 = I % BIG5_SAME_ROW; \
3249 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3253 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3261 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3262 (c >= 0x80 && c <= 0xA0))
3264 if (st->big5.in_second_byte)
3266 st->big5.in_second_byte = 0;
3267 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3271 st->big5.in_second_byte = 1;
3273 return CODING_CATEGORY_BIG5_MASK;
3276 /* Convert Big5 data to internal format. */
3279 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3280 unsigned_char_dynarr *dst, unsigned int n)
3283 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3284 unsigned int flags = str->flags;
3285 unsigned int ch = str->ch;
3286 eol_type_t eol_type = str->eol_type;
3293 /* Previous character was first byte of Big5 char. */
3294 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3296 unsigned char b1, b2, b3;
3297 DECODE_BIG5 (ch, c, b1, b2, b3);
3298 Dynarr_add (dst, b1);
3299 Dynarr_add (dst, b2);
3300 Dynarr_add (dst, b3);
3304 DECODE_ADD_BINARY_CHAR (ch, dst);
3305 DECODE_ADD_BINARY_CHAR (c, dst);
3311 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3312 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3315 DECODE_ADD_BINARY_CHAR (c, dst);
3317 label_continue_loop:;
3320 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3326 /* Convert internally-formatted data to Big5. */
3329 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3330 unsigned_char_dynarr *dst, unsigned int n)
3334 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3335 unsigned int flags = str->flags;
3336 unsigned int ch = str->ch;
3337 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3344 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3345 Dynarr_add (dst, '\r');
3346 if (eol_type != EOL_CR)
3347 Dynarr_add (dst, '\n');
3349 else if (BYTE_ASCII_P (c))
3352 Dynarr_add (dst, c);
3354 else if (BUFBYTE_LEADING_BYTE_P (c))
3356 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3357 c == LEADING_BYTE_CHINESE_BIG5_2)
3359 /* A recognized leading byte. */
3361 continue; /* not done with this character. */
3363 /* otherwise just ignore this character. */
3365 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3366 ch == LEADING_BYTE_CHINESE_BIG5_2)
3368 /* Previous char was a recognized leading byte. */
3370 continue; /* not done with this character. */
3374 /* Encountering second byte of a Big5 character. */
3375 unsigned char b1, b2;
3377 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3378 Dynarr_add (dst, b1);
3379 Dynarr_add (dst, b2);
3391 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3392 Decode a Big5 character CODE of BIG5 coding-system.
3393 CODE is the character code in BIG5, a cons of two integers.
3394 Return the corresponding character.
3398 unsigned char c1, c2, b1, b2;
3401 CHECK_INT (XCAR (code));
3402 CHECK_INT (XCDR (code));
3403 b1 = XINT (XCAR (code));
3404 b2 = XINT (XCDR (code));
3405 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3406 BYTE_BIG5_TWO_BYTE_2_P (b2))
3408 Charset_ID leading_byte;
3409 Lisp_Object charset;
3410 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3411 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3412 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3418 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3419 Encode the Big5 character CH to BIG5 coding-system.
3420 Return the corresponding character code in Big5.
3424 Lisp_Object charset;
3427 CHECK_CHAR_COERCE_INT (ch);
3428 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3429 if (EQ (charset, Vcharset_chinese_big5_1) ||
3430 EQ (charset, Vcharset_chinese_big5_2))
3432 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3434 return Fcons (make_int (b1), make_int (b2));
3441 /************************************************************************/
3444 /* UCS-4 character codes are implemented as nonnegative integers. */
3446 /************************************************************************/
3448 Lisp_Object ucs_to_mule_table[65536];
3449 Lisp_Object mule_to_ucs_table;
3451 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3452 Map UCS-4 code CODE to Mule character CHARACTER.
3454 Return T on success, NIL on failure.
3460 CHECK_CHAR (character);
3464 if (c < sizeof (ucs_to_mule_table))
3466 ucs_to_mule_table[c] = character;
3474 ucs_to_char (unsigned long code)
3476 if (code < sizeof (ucs_to_mule_table))
3478 return ucs_to_mule_table[code];
3480 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3485 c = code % (94 * 94);
3487 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3488 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3489 CHARSET_LEFT_TO_RIGHT),
3490 c / 94 + 33, c % 94 + 33));
3496 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3497 Return Mule character corresponding to UCS code CODE (a positive integer).
3501 CHECK_NATNUM (code);
3502 return ucs_to_char (XINT (code));
3505 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3506 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3510 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3511 Fset_char_ucs is more restrictive on index arg, but should
3512 check code arg in a char_table method. */
3513 CHECK_CHAR (character);
3514 CHECK_NATNUM (code);
3515 return Fput_char_table (character, code, mule_to_ucs_table);
3518 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3519 Return the UCS code (a positive integer) corresponding to CHARACTER.
3523 return Fget_char_table (character, mule_to_ucs_table);
3527 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3529 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3530 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3531 is not found, instead.
3532 #### do something more appropriate (use blob?)
3533 Danger, Will Robinson! Data loss. Should we signal user? */
3535 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3537 Lisp_Object chr = ucs_to_char (ch);
3541 Bufbyte work[MAX_EMCHAR_LEN];
3546 simple_set_charptr_emchar (work, ch) :
3547 non_ascii_set_charptr_emchar (work, ch);
3548 Dynarr_add_many (dst, work, len);
3552 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3553 Dynarr_add (dst, 34 + 128);
3554 Dynarr_add (dst, 46 + 128);
3559 static unsigned long
3560 mule_char_to_ucs4 (Lisp_Object charset,
3561 unsigned char h, unsigned char l)
3564 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3571 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3572 (XCHARSET_CHARS (charset) == 94) )
3574 unsigned char final = XCHARSET_FINAL (charset);
3576 if ( ('@' <= final) && (final < 0x7f) )
3578 return 0xe00000 + (final - '@') * 94 * 94
3579 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3593 encode_ucs4 (Lisp_Object charset,
3594 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3596 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3597 Dynarr_add (dst, code >> 24);
3598 Dynarr_add (dst, (code >> 16) & 255);
3599 Dynarr_add (dst, (code >> 8) & 255);
3600 Dynarr_add (dst, code & 255);
3604 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3610 switch (st->ucs4.in_byte)
3619 st->ucs4.in_byte = 0;
3625 return CODING_CATEGORY_UCS4_MASK;
3629 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3630 unsigned_char_dynarr *dst, unsigned int n)
3632 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3633 unsigned int flags = str->flags;
3634 unsigned int ch = str->ch;
3635 unsigned char counter = str->counter;
3639 unsigned char c = *src++;
3647 decode_ucs4 ( ( ch << 8 ) | c, dst);
3652 ch = ( ch << 8 ) | c;
3656 if (counter & CODING_STATE_END)
3657 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3661 str->counter = counter;
3665 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3666 unsigned_char_dynarr *dst, unsigned int n)
3669 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3670 unsigned int flags = str->flags;
3671 unsigned int ch = str->ch;
3672 unsigned char char_boundary = str->iso2022.current_char_boundary;
3673 Lisp_Object charset = str->iso2022.current_charset;
3675 #ifdef ENABLE_COMPOSITE_CHARS
3676 /* flags for handling composite chars. We do a little switcharoo
3677 on the source while we're outputting the composite char. */
3678 unsigned int saved_n = 0;
3679 CONST unsigned char *saved_src = NULL;
3680 int in_composite = 0;
3687 unsigned char c = *src++;
3689 if (BYTE_ASCII_P (c))
3690 { /* Processing ASCII character */
3692 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3695 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3696 { /* Processing Leading Byte */
3698 charset = CHARSET_BY_LEADING_BYTE (c);
3699 if (LEADING_BYTE_PREFIX_P(c))
3704 { /* Processing Non-ASCII character */
3706 if (EQ (charset, Vcharset_control_1))
3708 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3712 switch (XCHARSET_REP_BYTES (charset))
3715 encode_ucs4 (charset, c, 0, dst);
3718 if (XCHARSET_PRIVATE_P (charset))
3720 encode_ucs4 (charset, c, 0, dst);
3725 #ifdef ENABLE_COMPOSITE_CHARS
3726 if (EQ (charset, Vcharset_composite))
3730 /* #### Bother! We don't know how to
3732 Dynarr_add (dst, 0);
3733 Dynarr_add (dst, 0);
3734 Dynarr_add (dst, 0);
3735 Dynarr_add (dst, '~');
3739 Emchar emch = MAKE_CHAR (Vcharset_composite,
3740 ch & 0x7F, c & 0x7F);
3741 Lisp_Object lstr = composite_char_string (emch);
3745 src = XSTRING_DATA (lstr);
3746 n = XSTRING_LENGTH (lstr);
3750 #endif /* ENABLE_COMPOSITE_CHARS */
3752 encode_ucs4(charset, ch, c, dst);
3765 encode_ucs4 (charset, ch, c, dst);
3781 #ifdef ENABLE_COMPOSITE_CHARS
3787 goto back_to_square_n; /* Wheeeeeeeee ..... */
3789 #endif /* ENABLE_COMPOSITE_CHARS */
3793 str->iso2022.current_char_boundary = char_boundary;
3794 str->iso2022.current_charset = charset;
3796 /* Verbum caro factum est! */
3801 /************************************************************************/
3803 /************************************************************************/
3806 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3811 unsigned char c = *src++;
3812 switch (st->utf8.in_byte)
3815 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3818 st->utf8.in_byte = 5;
3820 st->utf8.in_byte = 4;
3822 st->utf8.in_byte = 3;
3824 st->utf8.in_byte = 2;
3826 st->utf8.in_byte = 1;
3831 if ((c & 0xc0) != 0x80)
3837 return CODING_CATEGORY_UTF8_MASK;
3841 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3842 unsigned_char_dynarr *dst, unsigned int n)
3844 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3845 unsigned int flags = str->flags;
3846 unsigned int ch = str->ch;
3847 eol_type_t eol_type = str->eol_type;
3848 unsigned char counter = str->counter;
3852 unsigned char c = *src++;
3861 else if ( c >= 0xf8 )
3866 else if ( c >= 0xf0 )
3871 else if ( c >= 0xe0 )
3876 else if ( c >= 0xc0 )
3883 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3884 decode_ucs4 (c, dst);
3888 ch = ( ch << 6 ) | ( c & 0x3f );
3889 decode_ucs4 (ch, dst);
3894 ch = ( ch << 6 ) | ( c & 0x3f );
3897 label_continue_loop:;
3900 if (flags & CODING_STATE_END)
3901 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3905 str->counter = counter;
3909 char_encode_utf8 (struct encoding_stream *str, Emchar code,
3910 unsigned_char_dynarr *dst, unsigned int *flags)
3914 Dynarr_add (dst, code);
3916 else if ( code <= 0x7ff )
3918 Dynarr_add (dst, (code >> 6) | 0xc0);
3919 Dynarr_add (dst, (code & 0x3f) | 0x80);
3921 else if ( code <= 0xffff )
3923 Dynarr_add (dst, (code >> 12) | 0xe0);
3924 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3925 Dynarr_add (dst, (code & 0x3f) | 0x80);
3927 else if ( code <= 0x1fffff )
3929 Dynarr_add (dst, (code >> 18) | 0xf0);
3930 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3931 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3932 Dynarr_add (dst, (code & 0x3f) | 0x80);
3934 else if ( code <= 0x3ffffff )
3936 Dynarr_add (dst, (code >> 24) | 0xf8);
3937 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3938 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3939 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3940 Dynarr_add (dst, (code & 0x3f) | 0x80);
3944 Dynarr_add (dst, (code >> 30) | 0xfc);
3945 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3946 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3947 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3948 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3949 Dynarr_add (dst, (code & 0x3f) | 0x80);
3954 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3955 unsigned int *flags)
3960 /************************************************************************/
3961 /* ISO2022 methods */
3962 /************************************************************************/
3964 /* The following note describes the coding system ISO2022 briefly.
3965 Since the intention of this note is to help understand the
3966 functions in this file, some parts are NOT ACCURATE or OVERLY
3967 SIMPLIFIED. For thorough understanding, please refer to the
3968 original document of ISO2022.
3970 ISO2022 provides many mechanisms to encode several character sets
3971 in 7-bit and 8-bit environments. For 7-bit environments, all text
3972 is encoded using bytes less than 128. This may make the encoded
3973 text a little bit longer, but the text passes more easily through
3974 several gateways, some of which strip off MSB (Most Signigant Bit).
3976 There are two kinds of character sets: control character set and
3977 graphic character set. The former contains control characters such
3978 as `newline' and `escape' to provide control functions (control
3979 functions are also provided by escape sequences). The latter
3980 contains graphic characters such as 'A' and '-'. Emacs recognizes
3981 two control character sets and many graphic character sets.
3983 Graphic character sets are classified into one of the following
3984 four classes, according to the number of bytes (DIMENSION) and
3985 number of characters in one dimension (CHARS) of the set:
3986 - DIMENSION1_CHARS94
3987 - DIMENSION1_CHARS96
3988 - DIMENSION2_CHARS94
3989 - DIMENSION2_CHARS96
3991 In addition, each character set is assigned an identification tag,
3992 unique for each set, called "final character" (denoted as <F>
3993 hereafter). The <F> of each character set is decided by ECMA(*)
3994 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3995 (0x30..0x3F are for private use only).
3997 Note (*): ECMA = European Computer Manufacturers Association
3999 Here are examples of graphic character set [NAME(<F>)]:
4000 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4001 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4002 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4003 o DIMENSION2_CHARS96 -- none for the moment
4005 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4006 C0 [0x00..0x1F] -- control character plane 0
4007 GL [0x20..0x7F] -- graphic character plane 0
4008 C1 [0x80..0x9F] -- control character plane 1
4009 GR [0xA0..0xFF] -- graphic character plane 1
4011 A control character set is directly designated and invoked to C0 or
4012 C1 by an escape sequence. The most common case is that:
4013 - ISO646's control character set is designated/invoked to C0, and
4014 - ISO6429's control character set is designated/invoked to C1,
4015 and usually these designations/invocations are omitted in encoded
4016 text. In a 7-bit environment, only C0 can be used, and a control
4017 character for C1 is encoded by an appropriate escape sequence to
4018 fit into the environment. All control characters for C1 are
4019 defined to have corresponding escape sequences.
4021 A graphic character set is at first designated to one of four
4022 graphic registers (G0 through G3), then these graphic registers are
4023 invoked to GL or GR. These designations and invocations can be
4024 done independently. The most common case is that G0 is invoked to
4025 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4026 these invocations and designations are omitted in encoded text.
4027 In a 7-bit environment, only GL can be used.
4029 When a graphic character set of CHARS94 is invoked to GL, codes
4030 0x20 and 0x7F of the GL area work as control characters SPACE and
4031 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4034 There are two ways of invocation: locking-shift and single-shift.
4035 With locking-shift, the invocation lasts until the next different
4036 invocation, whereas with single-shift, the invocation affects the
4037 following character only and doesn't affect the locking-shift
4038 state. Invocations are done by the following control characters or
4041 ----------------------------------------------------------------------
4042 abbrev function cntrl escape seq description
4043 ----------------------------------------------------------------------
4044 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4045 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4046 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4047 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4048 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4049 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4050 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4051 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4052 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4053 ----------------------------------------------------------------------
4054 (*) These are not used by any known coding system.
4056 Control characters for these functions are defined by macros
4057 ISO_CODE_XXX in `coding.h'.
4059 Designations are done by the following escape sequences:
4060 ----------------------------------------------------------------------
4061 escape sequence description
4062 ----------------------------------------------------------------------
4063 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4064 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4065 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4066 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4067 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4068 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4069 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4070 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4071 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4072 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4073 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4074 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4075 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4076 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4077 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4078 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4079 ----------------------------------------------------------------------
4081 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4082 of dimension 1, chars 94, and final character <F>, etc...
4084 Note (*): Although these designations are not allowed in ISO2022,
4085 Emacs accepts them on decoding, and produces them on encoding
4086 CHARS96 character sets in a coding system which is characterized as
4087 7-bit environment, non-locking-shift, and non-single-shift.
4089 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4090 '(' can be omitted. We refer to this as "short-form" hereafter.
4092 Now you may notice that there are a lot of ways for encoding the
4093 same multilingual text in ISO2022. Actually, there exist many
4094 coding systems such as Compound Text (used in X11's inter client
4095 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4096 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4097 localized platforms), and all of these are variants of ISO2022.
4099 In addition to the above, Emacs handles two more kinds of escape
4100 sequences: ISO6429's direction specification and Emacs' private
4101 sequence for specifying character composition.
4103 ISO6429's direction specification takes the following form:
4104 o CSI ']' -- end of the current direction
4105 o CSI '0' ']' -- end of the current direction
4106 o CSI '1' ']' -- start of left-to-right text
4107 o CSI '2' ']' -- start of right-to-left text
4108 The control character CSI (0x9B: control sequence introducer) is
4109 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4111 Character composition specification takes the following form:
4112 o ESC '0' -- start character composition
4113 o ESC '1' -- end character composition
4114 Since these are not standard escape sequences of any ISO standard,
4115 their use with these meanings is restricted to Emacs only. */
4118 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4122 for (i = 0; i < 4; i++)
4124 if (!NILP (coding_system))
4126 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4128 iso->charset[i] = Qt;
4129 iso->invalid_designated[i] = 0;
4131 iso->esc = ISO_ESC_NOTHING;
4132 iso->esc_bytes_index = 0;
4133 iso->register_left = 0;
4134 iso->register_right = 1;
4135 iso->switched_dir_and_no_valid_charset_yet = 0;
4136 iso->invalid_switch_dir = 0;
4137 iso->output_direction_sequence = 0;
4138 iso->output_literally = 0;
4139 #ifdef ENABLE_COMPOSITE_CHARS
4140 if (iso->composite_chars)
4141 Dynarr_reset (iso->composite_chars);
4146 fit_to_be_escape_quoted (unsigned char c)
4163 /* Parse one byte of an ISO2022 escape sequence.
4164 If the result is an invalid escape sequence, return 0 and
4165 do not change anything in STR. Otherwise, if the result is
4166 an incomplete escape sequence, update ISO2022.ESC and
4167 ISO2022.ESC_BYTES and return -1. Otherwise, update
4168 all the state variables (but not ISO2022.ESC_BYTES) and
4171 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4172 or invocation of an invalid character set and treat that as
4173 an unrecognized escape sequence. */
4176 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4177 unsigned char c, unsigned int *flags,
4178 int check_invalid_charsets)
4180 /* (1) If we're at the end of a designation sequence, CS is the
4181 charset being designated and REG is the register to designate
4184 (2) If we're at the end of a locking-shift sequence, REG is
4185 the register to invoke and HALF (0 == left, 1 == right) is
4186 the half to invoke it into.
4188 (3) If we're at the end of a single-shift sequence, REG is
4189 the register to invoke. */
4190 Lisp_Object cs = Qnil;
4193 /* NOTE: This code does goto's all over the fucking place.
4194 The reason for this is that we're basically implementing
4195 a state machine here, and hierarchical languages like C
4196 don't really provide a clean way of doing this. */
4198 if (! (*flags & CODING_STATE_ESCAPE))
4199 /* At beginning of escape sequence; we need to reset our
4200 escape-state variables. */
4201 iso->esc = ISO_ESC_NOTHING;
4203 iso->output_literally = 0;
4204 iso->output_direction_sequence = 0;
4208 case ISO_ESC_NOTHING:
4209 iso->esc_bytes_index = 0;
4212 case ISO_CODE_ESC: /* Start escape sequence */
4213 *flags |= CODING_STATE_ESCAPE;
4217 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4218 *flags |= CODING_STATE_ESCAPE;
4219 iso->esc = ISO_ESC_5_11;
4222 case ISO_CODE_SO: /* locking shift 1 */
4225 case ISO_CODE_SI: /* locking shift 0 */
4229 case ISO_CODE_SS2: /* single shift */
4232 case ISO_CODE_SS3: /* single shift */
4236 default: /* Other control characters */
4243 /**** single shift ****/
4245 case 'N': /* single shift 2 */
4248 case 'O': /* single shift 3 */
4252 /**** locking shift ****/
4254 case '~': /* locking shift 1 right */
4257 case 'n': /* locking shift 2 */
4260 case '}': /* locking shift 2 right */
4263 case 'o': /* locking shift 3 */
4266 case '|': /* locking shift 3 right */
4270 #ifdef ENABLE_COMPOSITE_CHARS
4271 /**** composite ****/
4274 iso->esc = ISO_ESC_START_COMPOSITE;
4275 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4276 CODING_STATE_COMPOSITE;
4280 iso->esc = ISO_ESC_END_COMPOSITE;
4281 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4282 ~CODING_STATE_COMPOSITE;
4284 #endif /* ENABLE_COMPOSITE_CHARS */
4286 /**** directionality ****/
4289 iso->esc = ISO_ESC_5_11;
4292 /**** designation ****/
4294 case '$': /* multibyte charset prefix */
4295 iso->esc = ISO_ESC_2_4;
4299 if (0x28 <= c && c <= 0x2F)
4301 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4305 /* This function is called with CODESYS equal to nil when
4306 doing coding-system detection. */
4308 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4309 && fit_to_be_escape_quoted (c))
4311 iso->esc = ISO_ESC_LITERAL;
4312 *flags &= CODING_STATE_ISO2022_LOCK;
4322 /**** directionality ****/
4324 case ISO_ESC_5_11: /* ISO6429 direction control */
4327 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4328 goto directionality;
4330 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4331 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4332 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4336 case ISO_ESC_5_11_0:
4339 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4340 goto directionality;
4344 case ISO_ESC_5_11_1:
4347 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4348 goto directionality;
4352 case ISO_ESC_5_11_2:
4355 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4356 goto directionality;
4361 iso->esc = ISO_ESC_DIRECTIONALITY;
4362 /* Various junk here to attempt to preserve the direction sequences
4363 literally in the text if they would otherwise be swallowed due
4364 to invalid designations that don't show up as actual charset
4365 changes in the text. */
4366 if (iso->invalid_switch_dir)
4368 /* We already inserted a direction switch literally into the
4369 text. We assume (#### this may not be right) that the
4370 next direction switch is the one going the other way,
4371 and we need to output that literally as well. */
4372 iso->output_literally = 1;
4373 iso->invalid_switch_dir = 0;
4379 /* If we are in the thrall of an invalid designation,
4380 then stick the directionality sequence literally into the
4381 output stream so it ends up in the original text again. */
4382 for (jj = 0; jj < 4; jj++)
4383 if (iso->invalid_designated[jj])
4387 iso->output_literally = 1;
4388 iso->invalid_switch_dir = 1;
4391 /* Indicate that we haven't yet seen a valid designation,
4392 so that if a switch-dir is directly followed by an
4393 invalid designation, both get inserted literally. */
4394 iso->switched_dir_and_no_valid_charset_yet = 1;
4399 /**** designation ****/
4402 if (0x28 <= c && c <= 0x2F)
4404 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4407 if (0x40 <= c && c <= 0x42)
4409 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4410 *flags & CODING_STATE_R2L ?
4411 CHARSET_RIGHT_TO_LEFT :
4412 CHARSET_LEFT_TO_RIGHT);
4422 if (c < '0' || c > '~')
4423 return 0; /* bad final byte */
4425 if (iso->esc >= ISO_ESC_2_8 &&
4426 iso->esc <= ISO_ESC_2_15)
4428 type = ((iso->esc >= ISO_ESC_2_12) ?
4429 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4430 reg = (iso->esc - ISO_ESC_2_8) & 3;
4432 else if (iso->esc >= ISO_ESC_2_4_8 &&
4433 iso->esc <= ISO_ESC_2_4_15)
4435 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4436 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4437 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4441 /* Can this ever be reached? -slb */
4445 cs = CHARSET_BY_ATTRIBUTES (type, c,
4446 *flags & CODING_STATE_R2L ?
4447 CHARSET_RIGHT_TO_LEFT :
4448 CHARSET_LEFT_TO_RIGHT);
4454 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4458 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4459 /* can't invoke something that ain't there. */
4461 iso->esc = ISO_ESC_SINGLE_SHIFT;
4462 *flags &= CODING_STATE_ISO2022_LOCK;
4464 *flags |= CODING_STATE_SS2;
4466 *flags |= CODING_STATE_SS3;
4470 if (check_invalid_charsets &&
4471 !CHARSETP (iso->charset[reg]))
4472 /* can't invoke something that ain't there. */
4475 iso->register_right = reg;
4477 iso->register_left = reg;
4478 *flags &= CODING_STATE_ISO2022_LOCK;
4479 iso->esc = ISO_ESC_LOCKING_SHIFT;
4483 if (NILP (cs) && check_invalid_charsets)
4485 iso->invalid_designated[reg] = 1;
4486 iso->charset[reg] = Vcharset_ascii;
4487 iso->esc = ISO_ESC_DESIGNATE;
4488 *flags &= CODING_STATE_ISO2022_LOCK;
4489 iso->output_literally = 1;
4490 if (iso->switched_dir_and_no_valid_charset_yet)
4492 /* We encountered a switch-direction followed by an
4493 invalid designation. Ensure that the switch-direction
4494 gets outputted; otherwise it will probably get eaten
4495 when the text is written out again. */
4496 iso->switched_dir_and_no_valid_charset_yet = 0;
4497 iso->output_direction_sequence = 1;
4498 /* And make sure that the switch-dir going the other
4499 way gets outputted, as well. */
4500 iso->invalid_switch_dir = 1;
4504 /* This function is called with CODESYS equal to nil when
4505 doing coding-system detection. */
4506 if (!NILP (codesys))
4508 charset_conversion_spec_dynarr *dyn =
4509 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4515 for (i = 0; i < Dynarr_length (dyn); i++)
4517 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4518 if (EQ (cs, spec->from_charset))
4519 cs = spec->to_charset;
4524 iso->charset[reg] = cs;
4525 iso->esc = ISO_ESC_DESIGNATE;
4526 *flags &= CODING_STATE_ISO2022_LOCK;
4527 if (iso->invalid_designated[reg])
4529 iso->invalid_designated[reg] = 0;
4530 iso->output_literally = 1;
4532 if (iso->switched_dir_and_no_valid_charset_yet)
4533 iso->switched_dir_and_no_valid_charset_yet = 0;
4538 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4543 /* #### There are serious deficiencies in the recognition mechanism
4544 here. This needs to be much smarter if it's going to cut it.
4545 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4546 it should be detected as Latin-1.
4547 All the ISO2022 stuff in this file should be synced up with the
4548 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4549 Perhaps we should wait till R2L works in FSF Emacs? */
4551 if (!st->iso2022.initted)
4553 reset_iso2022 (Qnil, &st->iso2022.iso);
4554 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4555 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4556 CODING_CATEGORY_ISO_8_1_MASK |
4557 CODING_CATEGORY_ISO_8_2_MASK |
4558 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4559 st->iso2022.flags = 0;
4560 st->iso2022.high_byte_count = 0;
4561 st->iso2022.saw_single_shift = 0;
4562 st->iso2022.initted = 1;
4565 mask = st->iso2022.mask;
4572 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4573 st->iso2022.high_byte_count++;
4577 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4579 if (st->iso2022.high_byte_count & 1)
4580 /* odd number of high bytes; assume not iso-8-2 */
4581 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4583 st->iso2022.high_byte_count = 0;
4584 st->iso2022.saw_single_shift = 0;
4586 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4588 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4589 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4590 { /* control chars */
4593 /* Allow and ignore control characters that you might
4594 reasonably see in a text file */
4599 case 8: /* backspace */
4600 case 11: /* vertical tab */
4601 case 12: /* form feed */
4602 case 26: /* MS-DOS C-z junk */
4603 case 31: /* '^_' -- for info */
4604 goto label_continue_loop;
4611 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4614 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4615 &st->iso2022.flags, 0))
4617 switch (st->iso2022.iso.esc)
4619 case ISO_ESC_DESIGNATE:
4620 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4621 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4623 case ISO_ESC_LOCKING_SHIFT:
4624 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4625 goto ran_out_of_chars;
4626 case ISO_ESC_SINGLE_SHIFT:
4627 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4628 st->iso2022.saw_single_shift = 1;
4637 goto ran_out_of_chars;
4640 label_continue_loop:;
4649 postprocess_iso2022_mask (int mask)
4651 /* #### kind of cheesy */
4652 /* If seven-bit ISO is allowed, then assume that the encoding is
4653 entirely seven-bit and turn off the eight-bit ones. */
4654 if (mask & CODING_CATEGORY_ISO_7_MASK)
4655 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4656 CODING_CATEGORY_ISO_8_1_MASK |
4657 CODING_CATEGORY_ISO_8_2_MASK);
4661 /* If FLAGS is a null pointer or specifies right-to-left motion,
4662 output a switch-dir-to-left-to-right sequence to DST.
4663 Also update FLAGS if it is not a null pointer.
4664 If INTERNAL_P is set, we are outputting in internal format and
4665 need to handle the CSI differently. */
4668 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4669 unsigned_char_dynarr *dst,
4670 unsigned int *flags,
4673 if (!flags || (*flags & CODING_STATE_R2L))
4675 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4677 Dynarr_add (dst, ISO_CODE_ESC);
4678 Dynarr_add (dst, '[');
4680 else if (internal_p)
4681 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4683 Dynarr_add (dst, ISO_CODE_CSI);
4684 Dynarr_add (dst, '0');
4685 Dynarr_add (dst, ']');
4687 *flags &= ~CODING_STATE_R2L;
4691 /* If FLAGS is a null pointer or specifies a direction different from
4692 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4693 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4694 sequence to DST. Also update FLAGS if it is not a null pointer.
4695 If INTERNAL_P is set, we are outputting in internal format and
4696 need to handle the CSI differently. */
4699 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4700 unsigned_char_dynarr *dst, unsigned int *flags,
4703 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4704 direction == CHARSET_LEFT_TO_RIGHT)
4705 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4706 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4707 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4708 direction == CHARSET_RIGHT_TO_LEFT)
4710 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4712 Dynarr_add (dst, ISO_CODE_ESC);
4713 Dynarr_add (dst, '[');
4715 else if (internal_p)
4716 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4718 Dynarr_add (dst, ISO_CODE_CSI);
4719 Dynarr_add (dst, '2');
4720 Dynarr_add (dst, ']');
4722 *flags |= CODING_STATE_R2L;
4726 /* Convert ISO2022-format data to internal format. */
4729 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4730 unsigned_char_dynarr *dst, unsigned int n)
4732 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4733 unsigned int flags = str->flags;
4734 unsigned int ch = str->ch;
4735 eol_type_t eol_type = str->eol_type;
4736 #ifdef ENABLE_COMPOSITE_CHARS
4737 unsigned_char_dynarr *real_dst = dst;
4739 Lisp_Object coding_system;
4741 XSETCODING_SYSTEM (coding_system, str->codesys);
4743 #ifdef ENABLE_COMPOSITE_CHARS
4744 if (flags & CODING_STATE_COMPOSITE)
4745 dst = str->iso2022.composite_chars;
4746 #endif /* ENABLE_COMPOSITE_CHARS */
4750 unsigned char c = *src++;
4751 if (flags & CODING_STATE_ESCAPE)
4752 { /* Within ESC sequence */
4753 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4758 switch (str->iso2022.esc)
4760 #ifdef ENABLE_COMPOSITE_CHARS
4761 case ISO_ESC_START_COMPOSITE:
4762 if (str->iso2022.composite_chars)
4763 Dynarr_reset (str->iso2022.composite_chars);
4765 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4766 dst = str->iso2022.composite_chars;
4768 case ISO_ESC_END_COMPOSITE:
4770 Bufbyte comstr[MAX_EMCHAR_LEN];
4772 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4773 Dynarr_length (dst));
4775 len = set_charptr_emchar (comstr, emch);
4776 Dynarr_add_many (dst, comstr, len);
4779 #endif /* ENABLE_COMPOSITE_CHARS */
4781 case ISO_ESC_LITERAL:
4782 DECODE_ADD_BINARY_CHAR (c, dst);
4786 /* Everything else handled already */
4791 /* Attempted error recovery. */
4792 if (str->iso2022.output_direction_sequence)
4793 ensure_correct_direction (flags & CODING_STATE_R2L ?
4794 CHARSET_RIGHT_TO_LEFT :
4795 CHARSET_LEFT_TO_RIGHT,
4796 str->codesys, dst, 0, 1);
4797 /* More error recovery. */
4798 if (!retval || str->iso2022.output_literally)
4800 /* Output the (possibly invalid) sequence */
4802 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4803 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4804 flags &= CODING_STATE_ISO2022_LOCK;
4806 n++, src--;/* Repeat the loop with the same character. */
4809 /* No sense in reprocessing the final byte of the
4810 escape sequence; it could mess things up anyway.
4812 DECODE_ADD_BINARY_CHAR (c, dst);
4817 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4818 { /* Control characters */
4820 /***** Error-handling *****/
4822 /* If we were in the middle of a character, dump out the
4823 partial character. */
4824 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4826 /* If we just saw a single-shift character, dump it out.
4827 This may dump out the wrong sort of single-shift character,
4828 but least it will give an indication that something went
4830 if (flags & CODING_STATE_SS2)
4832 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4833 flags &= ~CODING_STATE_SS2;
4835 if (flags & CODING_STATE_SS3)
4837 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4838 flags &= ~CODING_STATE_SS3;
4841 /***** Now handle the control characters. *****/
4844 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4846 flags &= CODING_STATE_ISO2022_LOCK;
4848 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4849 DECODE_ADD_BINARY_CHAR (c, dst);
4852 { /* Graphic characters */
4853 Lisp_Object charset;
4859 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4861 /* Now determine the charset. */
4862 reg = ((flags & CODING_STATE_SS2) ? 2
4863 : (flags & CODING_STATE_SS3) ? 3
4864 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4865 : str->iso2022.register_left);
4866 charset = str->iso2022.charset[reg];
4868 /* Error checking: */
4869 if (! CHARSETP (charset)
4870 || str->iso2022.invalid_designated[reg]
4871 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4872 && XCHARSET_CHARS (charset) == 94))
4873 /* Mrmph. We are trying to invoke a register that has no
4874 or an invalid charset in it, or trying to add a character
4875 outside the range of the charset. Insert that char literally
4876 to preserve it for the output. */
4878 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4879 DECODE_ADD_BINARY_CHAR (c, dst);
4884 /* Things are probably hunky-dorey. */
4886 /* Fetch reverse charset, maybe. */
4887 if (((flags & CODING_STATE_R2L) &&
4888 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4890 (!(flags & CODING_STATE_R2L) &&
4891 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4893 Lisp_Object new_charset =
4894 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4895 if (!NILP (new_charset))
4896 charset = new_charset;
4900 lb = XCHARSET_LEADING_BYTE (charset);
4902 switch (XCHARSET_REP_BYTES (charset))
4905 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4906 Dynarr_add (dst, c & 0x7F);
4909 case 2: /* one-byte official */
4910 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4912 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0), dst);
4914 Dynarr_add (dst, lb);
4915 Dynarr_add (dst, c | 0x80);
4919 case 3: /* one-byte private or two-byte official */
4921 if (XCHARSET_DIMENSION (charset) == 1)
4923 if (XCHARSET_PRIVATE_P (charset))
4926 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4928 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset, c & 0x7F, 0),
4931 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4932 Dynarr_add (dst, lb);
4933 Dynarr_add (dst, c | 0x80);
4941 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4945 Dynarr_add (dst, lb);
4946 Dynarr_add (dst, ch | 0x80);
4947 Dynarr_add (dst, c | 0x80);
4956 default: /* two-byte private */
4960 DECODE_ADD_UCS_CHAR(MAKE_CHAR(charset,
4964 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4965 Dynarr_add (dst, lb);
4966 Dynarr_add (dst, ch | 0x80);
4967 Dynarr_add (dst, c | 0x80);
4977 flags &= CODING_STATE_ISO2022_LOCK;
4980 label_continue_loop:;
4983 if (flags & CODING_STATE_END)
4984 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4991 /***** ISO2022 encoder *****/
4993 /* Designate CHARSET into register REG. */
4996 iso2022_designate (Lisp_Object charset, unsigned char reg,
4997 struct encoding_stream *str, unsigned_char_dynarr *dst)
4999 static CONST char inter94[] = "()*+";
5000 static CONST char inter96[] = ",-./";
5002 unsigned char final;
5003 Lisp_Object old_charset = str->iso2022.charset[reg];
5005 str->iso2022.charset[reg] = charset;
5006 if (!CHARSETP (charset))
5007 /* charset might be an initial nil or t. */
5009 type = XCHARSET_TYPE (charset);
5010 final = XCHARSET_FINAL (charset);
5011 if (!str->iso2022.force_charset_on_output[reg] &&
5012 CHARSETP (old_charset) &&
5013 XCHARSET_TYPE (old_charset) == type &&
5014 XCHARSET_FINAL (old_charset) == final)
5017 str->iso2022.force_charset_on_output[reg] = 0;
5020 charset_conversion_spec_dynarr *dyn =
5021 str->codesys->iso2022.output_conv;
5027 for (i = 0; i < Dynarr_length (dyn); i++)
5029 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5030 if (EQ (charset, spec->from_charset))
5031 charset = spec->to_charset;
5036 Dynarr_add (dst, ISO_CODE_ESC);
5039 case CHARSET_TYPE_94:
5040 Dynarr_add (dst, inter94[reg]);
5042 case CHARSET_TYPE_96:
5043 Dynarr_add (dst, inter96[reg]);
5045 case CHARSET_TYPE_94X94:
5046 Dynarr_add (dst, '$');
5048 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5051 Dynarr_add (dst, inter94[reg]);
5053 case CHARSET_TYPE_96X96:
5054 Dynarr_add (dst, '$');
5055 Dynarr_add (dst, inter96[reg]);
5058 Dynarr_add (dst, final);
5062 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5064 if (str->iso2022.register_left != 0)
5066 Dynarr_add (dst, ISO_CODE_SI);
5067 str->iso2022.register_left = 0;
5072 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5074 if (str->iso2022.register_left != 1)
5076 Dynarr_add (dst, ISO_CODE_SO);
5077 str->iso2022.register_left = 1;
5082 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5083 unsigned_char_dynarr *dst, unsigned int *flags)
5085 unsigned char charmask;
5086 Lisp_Coding_System* codesys = str->codesys;
5087 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5089 Lisp_Object charset;
5091 unsigned int byte1, byte2;
5095 restore_left_to_right_direction (codesys, dst, flags, 0);
5097 /* Make sure G0 contains ASCII */
5098 if ((ch > ' ' && ch < ISO_CODE_DEL)
5099 || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5101 ensure_normal_shift (str, dst);
5102 iso2022_designate (Vcharset_ascii, 0, str, dst);
5105 /* If necessary, restore everything to the default state
5107 if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5109 restore_left_to_right_direction (codesys, dst, flags, 0);
5111 ensure_normal_shift (str, dst);
5113 for (i = 0; i < 4; i++)
5115 Lisp_Object initial_charset =
5116 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5117 iso2022_designate (initial_charset, i, str, dst);
5122 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5123 Dynarr_add (dst, '\r');
5124 if (eol_type != EOL_CR)
5125 Dynarr_add (dst, ch);
5129 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5130 && fit_to_be_escape_quoted (ch))
5131 Dynarr_add (dst, ISO_CODE_ESC);
5132 Dynarr_add (dst, ch);
5135 else if ( (0x80 <= ch) && (ch <= 0x9f) )
5137 charmask = (half == 0 ? 0x00 : 0x80);
5139 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5140 && fit_to_be_escape_quoted (ch))
5141 Dynarr_add (dst, ISO_CODE_ESC);
5142 /* you asked for it ... */
5143 Dynarr_add (dst, ch);
5149 BREAKUP_CHAR (ch, charset, byte1, byte2);
5150 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5151 codesys, dst, flags, 0);
5153 /* Now determine which register to use. */
5155 for (i = 0; i < 4; i++)
5157 if (EQ (charset, str->iso2022.charset[i]) ||
5159 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5168 if (XCHARSET_GRAPHIC (charset) != 0)
5170 if (!NILP (str->iso2022.charset[1]) &&
5171 (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5172 || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5174 else if (!NILP (str->iso2022.charset[2]))
5176 else if (!NILP (str->iso2022.charset[3]))
5185 iso2022_designate (charset, reg, str, dst);
5187 /* Now invoke that register. */
5191 ensure_normal_shift (str, dst);
5195 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5197 ensure_shift_out (str, dst);
5204 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5206 Dynarr_add (dst, ISO_CODE_ESC);
5207 Dynarr_add (dst, 'N');
5212 Dynarr_add (dst, ISO_CODE_SS2);
5217 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5219 Dynarr_add (dst, ISO_CODE_ESC);
5220 Dynarr_add (dst, 'O');
5225 Dynarr_add (dst, ISO_CODE_SS3);
5233 charmask = (half == 0 ? 0x00 : 0x80);
5235 switch (XCHARSET_DIMENSION (charset))
5238 Dynarr_add (dst, byte1 | charmask);
5241 Dynarr_add (dst, byte1 | charmask);
5242 Dynarr_add (dst, byte2 | charmask);
5251 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5252 unsigned int *flags)
5254 Lisp_Coding_System* codesys = str->codesys;
5257 restore_left_to_right_direction (codesys, dst, flags, 0);
5258 ensure_normal_shift (str, dst);
5259 for (i = 0; i < 4; i++)
5261 Lisp_Object initial_charset
5262 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5263 iso2022_designate (initial_charset, i, str, dst);
5268 /************************************************************************/
5269 /* No-conversion methods */
5270 /************************************************************************/
5272 /* This is used when reading in "binary" files -- i.e. files that may
5273 contain all 256 possible byte values and that are not to be
5274 interpreted as being in any particular decoding. */
5276 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5277 unsigned_char_dynarr *dst, unsigned int n)
5280 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5281 unsigned int flags = str->flags;
5282 unsigned int ch = str->ch;
5283 eol_type_t eol_type = str->eol_type;
5289 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5290 DECODE_ADD_BINARY_CHAR (c, dst);
5291 label_continue_loop:;
5294 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5301 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5302 unsigned_char_dynarr *dst, unsigned int n)
5305 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5306 unsigned int flags = str->flags;
5307 unsigned int ch = str->ch;
5308 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5310 unsigned char char_boundary = str->iso2022.current_char_boundary;
5317 switch (char_boundary)
5325 else if ( c >= 0xf8 )
5330 else if ( c >= 0xf0 )
5335 else if ( c >= 0xe0 )
5340 else if ( c >= 0xc0 )
5351 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5352 Dynarr_add (dst, '\r');
5353 if (eol_type != EOL_CR)
5354 Dynarr_add (dst, c);
5357 Dynarr_add (dst, c);
5362 ch = ( ch << 6 ) | ( c & 0x3f );
5363 switch ( str->codesys->fixed.size )
5366 Dynarr_add (dst, ch & 0xff);
5369 Dynarr_add (dst, (ch >> 8) & 0xff);
5370 Dynarr_add (dst, ch & 0xff);
5373 Dynarr_add (dst, (ch >> 16) & 0xff);
5374 Dynarr_add (dst, (ch >> 8) & 0xff);
5375 Dynarr_add (dst, ch & 0xff);
5378 Dynarr_add (dst, (ch >> 24) & 0xff);
5379 Dynarr_add (dst, (ch >> 16) & 0xff);
5380 Dynarr_add (dst, (ch >> 8) & 0xff);
5381 Dynarr_add (dst, ch & 0xff);
5384 fprintf(stderr, "It seems %d bytes stream.\n",
5385 str->codesys->fixed.size);
5391 ch = ( ch << 6 ) | ( c & 0x3f );
5394 #else /* not UTF2000 */
5397 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5398 Dynarr_add (dst, '\r');
5399 if (eol_type != EOL_CR)
5400 Dynarr_add (dst, '\n');
5403 else if (BYTE_ASCII_P (c))
5406 Dynarr_add (dst, c);
5408 else if (BUFBYTE_LEADING_BYTE_P (c))
5411 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5412 c == LEADING_BYTE_CONTROL_1)
5415 Dynarr_add (dst, '~'); /* untranslatable character */
5419 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5420 Dynarr_add (dst, c);
5421 else if (ch == LEADING_BYTE_CONTROL_1)
5424 Dynarr_add (dst, c - 0x20);
5426 /* else it should be the second or third byte of an
5427 untranslatable character, so ignore it */
5430 #endif /* not UTF2000 */
5436 str->iso2022.current_char_boundary = char_boundary;
5441 /************************************************************************/
5442 /* Simple internal/external functions */
5443 /************************************************************************/
5445 static Extbyte_dynarr *conversion_out_dynarr;
5446 static Bufbyte_dynarr *conversion_in_dynarr;
5448 /* Determine coding system from coding format */
5450 /* #### not correct for all values of `fmt'! */
5452 external_data_format_to_coding_system (enum external_data_format fmt)
5456 case FORMAT_FILENAME:
5457 case FORMAT_TERMINAL:
5458 if (EQ (Vfile_name_coding_system, Qnil) ||
5459 EQ (Vfile_name_coding_system, Qbinary))
5462 return Fget_coding_system (Vfile_name_coding_system);
5465 return Fget_coding_system (Qctext);
5473 convert_to_external_format (CONST Bufbyte *ptr,
5476 enum external_data_format fmt)
5478 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5480 if (!conversion_out_dynarr)
5481 conversion_out_dynarr = Dynarr_new (Extbyte);
5483 Dynarr_reset (conversion_out_dynarr);
5485 if (NILP (coding_system))
5487 CONST Bufbyte *end = ptr + len;
5493 (*ptr < 0xc0) ? *ptr :
5494 ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5497 (BYTE_ASCII_P (*ptr)) ? *ptr :
5498 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5499 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5502 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5506 #ifdef ERROR_CHECK_BUFPOS
5507 assert (ptr == end);
5512 Lisp_Object instream, outstream, da_outstream;
5513 Lstream *istr, *ostr;
5514 struct gcpro gcpro1, gcpro2, gcpro3;
5515 char tempbuf[1024]; /* some random amount */
5517 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5518 da_outstream = make_dynarr_output_stream
5519 ((unsigned_char_dynarr *) conversion_out_dynarr);
5521 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5522 istr = XLSTREAM (instream);
5523 ostr = XLSTREAM (outstream);
5524 GCPRO3 (instream, outstream, da_outstream);
5527 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5530 Lstream_write (ostr, tempbuf, size_in_bytes);
5532 Lstream_close (istr);
5533 Lstream_close (ostr);
5535 Lstream_delete (istr);
5536 Lstream_delete (ostr);
5537 Lstream_delete (XLSTREAM (da_outstream));
5540 *len_out = Dynarr_length (conversion_out_dynarr);
5541 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5542 return Dynarr_atp (conversion_out_dynarr, 0);
5546 convert_from_external_format (CONST Extbyte *ptr,
5549 enum external_data_format fmt)
5551 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5553 if (!conversion_in_dynarr)
5554 conversion_in_dynarr = Dynarr_new (Bufbyte);
5556 Dynarr_reset (conversion_in_dynarr);
5558 if (NILP (coding_system))
5560 CONST Extbyte *end = ptr + len;
5561 for (; ptr < end; ptr++)
5564 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5569 Lisp_Object instream, outstream, da_outstream;
5570 Lstream *istr, *ostr;
5571 struct gcpro gcpro1, gcpro2, gcpro3;
5572 char tempbuf[1024]; /* some random amount */
5574 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5575 da_outstream = make_dynarr_output_stream
5576 ((unsigned_char_dynarr *) conversion_in_dynarr);
5578 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5579 istr = XLSTREAM (instream);
5580 ostr = XLSTREAM (outstream);
5581 GCPRO3 (instream, outstream, da_outstream);
5584 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5587 Lstream_write (ostr, tempbuf, size_in_bytes);
5589 Lstream_close (istr);
5590 Lstream_close (ostr);
5592 Lstream_delete (istr);
5593 Lstream_delete (ostr);
5594 Lstream_delete (XLSTREAM (da_outstream));
5597 *len_out = Dynarr_length (conversion_in_dynarr);
5598 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5599 return Dynarr_atp (conversion_in_dynarr, 0);
5603 /************************************************************************/
5604 /* Initialization */
5605 /************************************************************************/
5608 syms_of_file_coding (void)
5610 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
5611 deferror (&Qcoding_system_error, "coding-system-error",
5612 "Coding-system error", Qio_error);
5614 DEFSUBR (Fcoding_system_p);
5615 DEFSUBR (Ffind_coding_system);
5616 DEFSUBR (Fget_coding_system);
5617 DEFSUBR (Fcoding_system_list);
5618 DEFSUBR (Fcoding_system_name);
5619 DEFSUBR (Fmake_coding_system);
5620 DEFSUBR (Fcopy_coding_system);
5621 DEFSUBR (Fdefine_coding_system_alias);
5622 DEFSUBR (Fsubsidiary_coding_system);
5624 DEFSUBR (Fcoding_system_type);
5625 DEFSUBR (Fcoding_system_doc_string);
5627 DEFSUBR (Fcoding_system_charset);
5629 DEFSUBR (Fcoding_system_property);
5631 DEFSUBR (Fcoding_category_list);
5632 DEFSUBR (Fset_coding_priority_list);
5633 DEFSUBR (Fcoding_priority_list);
5634 DEFSUBR (Fset_coding_category_system);
5635 DEFSUBR (Fcoding_category_system);
5637 DEFSUBR (Fdetect_coding_region);
5638 DEFSUBR (Fdecode_coding_region);
5639 DEFSUBR (Fencode_coding_region);
5641 DEFSUBR (Fdecode_shift_jis_char);
5642 DEFSUBR (Fencode_shift_jis_char);
5643 DEFSUBR (Fdecode_big5_char);
5644 DEFSUBR (Fencode_big5_char);
5645 DEFSUBR (Fset_ucs_char);
5646 DEFSUBR (Fucs_char);
5647 DEFSUBR (Fset_char_ucs);
5648 DEFSUBR (Fchar_ucs);
5650 defsymbol (&Qcoding_system_p, "coding-system-p");
5651 defsymbol (&Qno_conversion, "no-conversion");
5652 defsymbol (&Qraw_text, "raw-text");
5654 defsymbol (&Qbig5, "big5");
5655 defsymbol (&Qshift_jis, "shift-jis");
5656 defsymbol (&Qucs4, "ucs-4");
5657 defsymbol (&Qutf8, "utf-8");
5658 defsymbol (&Qccl, "ccl");
5659 defsymbol (&Qiso2022, "iso2022");
5661 defsymbol (&Qmnemonic, "mnemonic");
5662 defsymbol (&Qeol_type, "eol-type");
5663 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5664 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5666 defsymbol (&Qcr, "cr");
5667 defsymbol (&Qlf, "lf");
5668 defsymbol (&Qcrlf, "crlf");
5669 defsymbol (&Qeol_cr, "eol-cr");
5670 defsymbol (&Qeol_lf, "eol-lf");
5671 defsymbol (&Qeol_crlf, "eol-crlf");
5673 defsymbol (&Qcharset_g0, "charset-g0");
5674 defsymbol (&Qcharset_g1, "charset-g1");
5675 defsymbol (&Qcharset_g2, "charset-g2");
5676 defsymbol (&Qcharset_g3, "charset-g3");
5677 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5678 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5679 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5680 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5681 defsymbol (&Qno_iso6429, "no-iso6429");
5682 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5683 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5685 defsymbol (&Qshort, "short");
5686 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5687 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5688 defsymbol (&Qseven, "seven");
5689 defsymbol (&Qlock_shift, "lock-shift");
5690 defsymbol (&Qescape_quoted, "escape-quoted");
5692 defsymbol (&Qencode, "encode");
5693 defsymbol (&Qdecode, "decode");
5696 defsymbol (&Qctext, "ctext");
5697 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5699 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5701 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5703 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5705 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5707 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5709 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5711 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5713 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5716 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5721 lstream_type_create_file_coding (void)
5723 LSTREAM_HAS_METHOD (decoding, reader);
5724 LSTREAM_HAS_METHOD (decoding, writer);
5725 LSTREAM_HAS_METHOD (decoding, rewinder);
5726 LSTREAM_HAS_METHOD (decoding, seekable_p);
5727 LSTREAM_HAS_METHOD (decoding, flusher);
5728 LSTREAM_HAS_METHOD (decoding, closer);
5729 LSTREAM_HAS_METHOD (decoding, marker);
5731 LSTREAM_HAS_METHOD (encoding, reader);
5732 LSTREAM_HAS_METHOD (encoding, writer);
5733 LSTREAM_HAS_METHOD (encoding, rewinder);
5734 LSTREAM_HAS_METHOD (encoding, seekable_p);
5735 LSTREAM_HAS_METHOD (encoding, flusher);
5736 LSTREAM_HAS_METHOD (encoding, closer);
5737 LSTREAM_HAS_METHOD (encoding, marker);
5741 vars_of_file_coding (void)
5745 /* Initialize to something reasonable ... */
5746 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5748 coding_category_system[i] = Qnil;
5749 coding_category_by_priority[i] = i;
5752 Fprovide (intern ("file-coding"));
5754 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5755 Coding system used for TTY keyboard input.
5756 Not used under a windowing system.
5758 Vkeyboard_coding_system = Qnil;
5760 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5761 Coding system used for TTY display output.
5762 Not used under a windowing system.
5764 Vterminal_coding_system = Qnil;
5766 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5767 Overriding coding system used when writing a file or process.
5768 You should *bind* this, not set it. If this is non-nil, it specifies
5769 the coding system that will be used when a file or process is read
5770 in, and overrides `buffer-file-coding-system-for-read',
5771 `insert-file-contents-pre-hook', etc. Use those variables instead of
5772 this one for permanent changes to the environment.
5774 Vcoding_system_for_read = Qnil;
5776 DEFVAR_LISP ("coding-system-for-write",
5777 &Vcoding_system_for_write /*
5778 Overriding coding system used when writing a file or process.
5779 You should *bind* this, not set it. If this is non-nil, it specifies
5780 the coding system that will be used when a file or process is wrote
5781 in, and overrides `buffer-file-coding-system',
5782 `write-region-pre-hook', etc. Use those variables instead of this one
5783 for permanent changes to the environment.
5785 Vcoding_system_for_write = Qnil;
5787 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5788 Coding system used to convert pathnames when accessing files.
5790 Vfile_name_coding_system = Qnil;
5792 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5793 Non-nil means the buffer contents are regarded as multi-byte form
5794 of characters, not a binary code. This affects the display, file I/O,
5795 and behaviors of various editing commands.
5797 Setting this to nil does not do anything.
5799 enable_multibyte_characters = 1;
5803 complex_vars_of_file_coding (void)
5805 staticpro (&Vcoding_system_hash_table);
5806 Vcoding_system_hash_table =
5807 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5809 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5811 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5813 struct codesys_prop csp; \
5815 csp.prop_type = (Prop_Type); \
5816 Dynarr_add (the_codesys_prop_dynarr, csp); \
5819 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5820 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5821 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5822 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5823 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5824 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5825 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5827 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5828 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5829 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5830 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5831 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5832 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5833 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5834 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5835 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5836 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5837 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5838 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5839 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5840 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5841 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5842 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5843 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5845 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5846 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5848 /* Need to create this here or we're really screwed. */
5850 (Qraw_text, Qno_conversion,
5851 build_string ("Raw text, which means it converts only line-break-codes."),
5852 list2 (Qmnemonic, build_string ("Raw")));
5855 (Qbinary, Qno_conversion,
5856 build_string ("Binary, which means it does not convert anything."),
5857 list4 (Qeol_type, Qlf,
5858 Qmnemonic, build_string ("Binary")));
5863 build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5864 list2 (Qmnemonic, build_string ("UTF8")));
5867 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5869 /* Need this for bootstrapping */
5870 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5871 Fget_coding_system (Qraw_text);
5874 coding_category_system[CODING_CATEGORY_UTF8]
5875 = Fget_coding_system (Qutf8);
5882 for (i = 0; i < 65536; i++)
5883 ucs_to_mule_table[i] = Qnil;
5885 staticpro (&mule_to_ucs_table);
5886 mule_to_ucs_table = Fmake_char_table(Qgeneric);