1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
5 This file is part of XEmacs.
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* Synched up with: Mule 2.3. Not in FSF. */
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
38 #include "file-coding.h"
40 Lisp_Object Qcoding_system_error;
42 Lisp_Object Vkeyboard_coding_system;
43 Lisp_Object Vterminal_coding_system;
44 Lisp_Object Vcoding_system_for_read;
45 Lisp_Object Vcoding_system_for_write;
46 Lisp_Object Vfile_name_coding_system;
48 /* Table of symbols identifying each coding category. */
49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
53 struct file_coding_dump {
54 /* Coding system currently associated with each coding category. */
55 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
57 /* Table of all coding categories in decreasing order of priority.
58 This describes a permutation of the possible coding categories. */
59 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
62 Lisp_Object ucs_to_mule_table[65536];
66 static const struct lrecord_description fcd_description_1[] = {
67 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
69 { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
74 static const struct struct_description fcd_description = {
75 sizeof (struct file_coding_dump),
79 Lisp_Object mule_to_ucs_table;
81 Lisp_Object Qcoding_systemp;
83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
84 /* Qinternal in general.c */
86 Lisp_Object Qmnemonic, Qeol_type;
87 Lisp_Object Qcr, Qcrlf, Qlf;
88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
89 Lisp_Object Qpost_read_conversion;
90 Lisp_Object Qpre_write_conversion;
93 Lisp_Object Qucs4, Qutf8;
94 Lisp_Object Qbig5, Qshift_jis;
95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
98 Lisp_Object Qno_iso6429;
99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
100 Lisp_Object Qescape_quoted;
101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
103 Lisp_Object Qencode, Qdecode;
105 Lisp_Object Vcoding_system_hash_table;
107 int enable_multibyte_characters;
110 /* Additional information used by the ISO2022 decoder and detector. */
111 struct iso2022_decoder
113 /* CHARSET holds the character sets currently assigned to the G0
114 through G3 variables. It is initialized from the array
115 INITIAL_CHARSET in CODESYS. */
116 Lisp_Object charset[4];
118 /* Which registers are currently invoked into the left (GL) and
119 right (GR) halves of the 8-bit encoding space? */
120 int register_left, register_right;
122 /* ISO_ESC holds a value indicating part of an escape sequence
123 that has already been seen. */
124 enum iso_esc_flag esc;
126 /* This records the bytes we've seen so far in an escape sequence,
127 in case the sequence is invalid (we spit out the bytes unchanged). */
128 unsigned char esc_bytes[8];
130 /* Index for next byte to store in ISO escape sequence. */
133 #ifdef ENABLE_COMPOSITE_CHARS
134 /* Stuff seen so far when composing a string. */
135 unsigned_char_dynarr *composite_chars;
138 /* If we saw an invalid designation sequence for a particular
139 register, we flag it here and switch to ASCII. The next time we
140 see a valid designation for this register, we turn off the flag
141 and do the designation normally, but pretend the sequence was
142 invalid. The effect of all this is that (most of the time) the
143 escape sequences for both the switch to the unknown charset, and
144 the switch back to the known charset, get inserted literally into
145 the buffer and saved out as such. The hope is that we can
146 preserve the escape sequences so that the resulting written out
147 file makes sense. If we don't do any of this, the designation
148 to the invalid charset will be preserved but that switch back
149 to the known charset will probably get eaten because it was
150 the same charset that was already present in the register. */
151 unsigned char invalid_designated[4];
153 /* We try to do similar things as above for direction-switching
154 sequences. If we encountered a direction switch while an
155 invalid designation was present, or an invalid designation
156 just after a direction switch (i.e. no valid designation
157 encountered yet), we insert the direction-switch escape
158 sequence literally into the output stream, and later on
159 insert the corresponding direction-restoring escape sequence
161 unsigned int switched_dir_and_no_valid_charset_yet :1;
162 unsigned int invalid_switch_dir :1;
164 /* Tells the decoder to output the escape sequence literally
165 even though it was valid. Used in the games we play to
166 avoid lossage when we encounter invalid designations. */
167 unsigned int output_literally :1;
168 /* We encountered a direction switch followed by an invalid
169 designation. We didn't output the direction switch
170 literally because we didn't know about the invalid designation;
171 but we have to do so now. */
172 unsigned int output_direction_sequence :1;
175 EXFUN (Fcopy_coding_system, 2);
177 struct detection_state;
178 static int detect_coding_sjis (struct detection_state *st,
179 const unsigned char *src,
181 static void decode_coding_sjis (Lstream *decoding,
182 const unsigned char *src,
183 unsigned_char_dynarr *dst,
185 static void encode_coding_sjis (Lstream *encoding,
186 const unsigned char *src,
187 unsigned_char_dynarr *dst,
189 static int detect_coding_big5 (struct detection_state *st,
190 const unsigned char *src,
192 static void decode_coding_big5 (Lstream *decoding,
193 const unsigned char *src,
194 unsigned_char_dynarr *dst, unsigned int n);
195 static void encode_coding_big5 (Lstream *encoding,
196 const unsigned char *src,
197 unsigned_char_dynarr *dst, unsigned int n);
198 static int detect_coding_ucs4 (struct detection_state *st,
199 const unsigned char *src,
201 static void decode_coding_ucs4 (Lstream *decoding,
202 const unsigned char *src,
203 unsigned_char_dynarr *dst, unsigned int n);
204 static void encode_coding_ucs4 (Lstream *encoding,
205 const unsigned char *src,
206 unsigned_char_dynarr *dst, unsigned int n);
207 static int detect_coding_utf8 (struct detection_state *st,
208 const unsigned char *src,
210 static void decode_coding_utf8 (Lstream *decoding,
211 const unsigned char *src,
212 unsigned_char_dynarr *dst, unsigned int n);
213 static void encode_coding_utf8 (Lstream *encoding,
214 const unsigned char *src,
215 unsigned_char_dynarr *dst, unsigned int n);
216 static int postprocess_iso2022_mask (int mask);
217 static void reset_iso2022 (Lisp_Object coding_system,
218 struct iso2022_decoder *iso);
219 static int detect_coding_iso2022 (struct detection_state *st,
220 const unsigned char *src,
222 static void decode_coding_iso2022 (Lstream *decoding,
223 const unsigned char *src,
224 unsigned_char_dynarr *dst, unsigned int n);
225 static void encode_coding_iso2022 (Lstream *encoding,
226 const unsigned char *src,
227 unsigned_char_dynarr *dst, unsigned int n);
229 static void decode_coding_no_conversion (Lstream *decoding,
230 const unsigned char *src,
231 unsigned_char_dynarr *dst,
233 static void encode_coding_no_conversion (Lstream *encoding,
234 const unsigned char *src,
235 unsigned_char_dynarr *dst,
237 static void mule_decode (Lstream *decoding, const unsigned char *src,
238 unsigned_char_dynarr *dst, unsigned int n);
239 static void mule_encode (Lstream *encoding, const unsigned char *src,
240 unsigned_char_dynarr *dst, unsigned int n);
242 typedef struct codesys_prop codesys_prop;
251 Dynarr_declare (codesys_prop);
252 } codesys_prop_dynarr;
254 static const struct lrecord_description codesys_prop_description_1[] = {
255 { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
259 static const struct struct_description codesys_prop_description = {
260 sizeof (codesys_prop),
261 codesys_prop_description_1
264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
265 XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
269 static const struct struct_description codesys_prop_dynarr_description = {
270 sizeof (codesys_prop_dynarr),
271 codesys_prop_dynarr_description_1
274 codesys_prop_dynarr *the_codesys_prop_dynarr;
276 enum codesys_prop_enum
279 CODESYS_PROP_ISO2022,
284 /************************************************************************/
285 /* Coding system functions */
286 /************************************************************************/
288 static Lisp_Object mark_coding_system (Lisp_Object);
289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
290 static void finalize_coding_system (void *header, int for_disksave);
293 static const struct lrecord_description ccs_description_1[] = {
294 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
295 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
299 static const struct struct_description ccs_description = {
300 sizeof (charset_conversion_spec),
304 static const struct lrecord_description ccsd_description_1[] = {
305 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
309 static const struct struct_description ccsd_description = {
310 sizeof (charset_conversion_spec_dynarr),
315 static const struct lrecord_description coding_system_description[] = {
316 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
317 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
318 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
319 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
320 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
321 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
322 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
323 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
325 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
326 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
327 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
328 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
329 { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
335 mark_coding_system, print_coding_system,
336 finalize_coding_system,
337 0, 0, coding_system_description,
341 mark_coding_system (Lisp_Object obj)
343 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
345 mark_object (CODING_SYSTEM_NAME (codesys));
346 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
347 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
348 mark_object (CODING_SYSTEM_EOL_LF (codesys));
349 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
350 mark_object (CODING_SYSTEM_EOL_CR (codesys));
352 switch (CODING_SYSTEM_TYPE (codesys))
356 case CODESYS_ISO2022:
357 for (i = 0; i < 4; i++)
358 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
359 if (codesys->iso2022.input_conv)
361 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
363 struct charset_conversion_spec *ccs =
364 Dynarr_atp (codesys->iso2022.input_conv, i);
365 mark_object (ccs->from_charset);
366 mark_object (ccs->to_charset);
369 if (codesys->iso2022.output_conv)
371 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
373 struct charset_conversion_spec *ccs =
374 Dynarr_atp (codesys->iso2022.output_conv, i);
375 mark_object (ccs->from_charset);
376 mark_object (ccs->to_charset);
382 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
383 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
390 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
391 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
398 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
400 error ("printing unreadable object #<coding_system 0x%x>",
403 write_c_string ("#<coding_system ", printcharfun);
404 print_internal (c->name, printcharfun, 1);
405 write_c_string (">", printcharfun);
409 finalize_coding_system (void *header, int for_disksave)
411 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
412 /* Since coding systems never go away, this function is not
413 necessary. But it would be necessary if we changed things
414 so that coding systems could go away. */
415 if (!for_disksave) /* see comment in lstream.c */
417 switch (CODING_SYSTEM_TYPE (c))
420 case CODESYS_ISO2022:
421 if (c->iso2022.input_conv)
423 Dynarr_free (c->iso2022.input_conv);
424 c->iso2022.input_conv = 0;
426 if (c->iso2022.output_conv)
428 Dynarr_free (c->iso2022.output_conv);
429 c->iso2022.output_conv = 0;
440 symbol_to_eol_type (Lisp_Object symbol)
442 CHECK_SYMBOL (symbol);
443 if (NILP (symbol)) return EOL_AUTODETECT;
444 if (EQ (symbol, Qlf)) return EOL_LF;
445 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
446 if (EQ (symbol, Qcr)) return EOL_CR;
448 signal_simple_error ("Unrecognized eol type", symbol);
449 return EOL_AUTODETECT; /* not reached */
453 eol_type_to_symbol (eol_type_t type)
458 case EOL_LF: return Qlf;
459 case EOL_CRLF: return Qcrlf;
460 case EOL_CR: return Qcr;
461 case EOL_AUTODETECT: return Qnil;
466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
468 Lisp_Object codesys_obj;
469 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
470 char *codesys_name = (char *) alloca (len + 7);
472 char *codesys_mnemonic=0;
474 Lisp_Object codesys_name_sym, sub_codesys_obj;
478 XSETCODING_SYSTEM (codesys_obj, codesys);
480 memcpy (codesys_name,
481 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
483 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
485 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
486 codesys_mnemonic = (char *) alloca (mlen + 7);
487 memcpy (codesys_mnemonic,
488 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
492 strcpy (codesys_name + len, "-" op_sys); \
494 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
495 codesys_name_sym = intern (codesys_name); \
496 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
497 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
499 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
500 build_string (codesys_mnemonic); \
501 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
504 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
505 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
506 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
510 Return t if OBJECT is a coding system.
511 A coding system is an object that defines how text containing multiple
512 character sets is encoded into a stream of (typically 8-bit) bytes.
513 The coding system is used to decode the stream into a series of
514 characters (which may be from multiple charsets) when the text is read
515 from a file or process, and is used to encode the text back into the
516 same format when it is written out to a file or process.
518 For example, many ISO2022-compliant coding systems (such as Compound
519 Text, which is used for inter-client data under the X Window System)
520 use escape sequences to switch between different charsets -- Japanese
521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
523 `make-coding-system' for more information.
525 Coding systems are normally identified using a symbol, and the
526 symbol is accepted in place of the actual coding system object whenever
527 a coding system is called for. (This is similar to how faces work.)
531 return CODING_SYSTEMP (object) ? Qt : Qnil;
534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
535 Retrieve the coding system of the given name.
537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
538 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
539 If there is no such coding system, nil is returned. Otherwise the
540 associated coding system object is returned.
542 (coding_system_or_name))
544 if (NILP (coding_system_or_name))
545 coding_system_or_name = Qbinary;
546 else if (CODING_SYSTEMP (coding_system_or_name))
547 return coding_system_or_name;
549 CHECK_SYMBOL (coding_system_or_name);
553 coding_system_or_name =
554 Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
556 if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
557 return coding_system_or_name;
561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
562 Retrieve the coding system of the given name.
563 Same as `find-coding-system' except that if there is no such
564 coding system, an error is signaled instead of returning nil.
568 Lisp_Object coding_system = Ffind_coding_system (name);
570 if (NILP (coding_system))
571 signal_simple_error ("No such coding system", name);
572 return coding_system;
575 /* We store the coding systems in hash tables with the names as the key and the
576 actual coding system object as the value. Occasionally we need to use them
577 in a list format. These routines provide us with that. */
578 struct coding_system_list_closure
580 Lisp_Object *coding_system_list;
584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
585 void *coding_system_list_closure)
587 /* This function can GC */
588 struct coding_system_list_closure *cscl =
589 (struct coding_system_list_closure *) coding_system_list_closure;
590 Lisp_Object *coding_system_list = cscl->coding_system_list;
592 *coding_system_list = Fcons (key, *coding_system_list);
596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
597 Return a list of the names of all defined coding systems.
601 Lisp_Object coding_system_list = Qnil;
603 struct coding_system_list_closure coding_system_list_closure;
605 GCPRO1 (coding_system_list);
606 coding_system_list_closure.coding_system_list = &coding_system_list;
607 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
608 &coding_system_list_closure);
611 return coding_system_list;
614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
615 Return the name of the given coding system.
619 coding_system = Fget_coding_system (coding_system);
620 return XCODING_SYSTEM_NAME (coding_system);
623 static Lisp_Coding_System *
624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
626 Lisp_Coding_System *codesys =
627 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
629 zero_lcrecord (codesys);
630 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
631 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
632 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
633 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
634 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
635 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
636 CODING_SYSTEM_TYPE (codesys) = type;
637 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
639 if (type == CODESYS_ISO2022)
642 for (i = 0; i < 4; i++)
643 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
645 else if (type == CODESYS_CCL)
647 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
648 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
651 CODING_SYSTEM_NAME (codesys) = name;
657 /* Given a list of charset conversion specs as specified in a Lisp
658 program, parse it into STORE_HERE. */
661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
662 Lisp_Object spec_list)
666 EXTERNAL_LIST_LOOP (rest, spec_list)
668 Lisp_Object car = XCAR (rest);
669 Lisp_Object from, to;
670 struct charset_conversion_spec spec;
672 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
673 signal_simple_error ("Invalid charset conversion spec", car);
674 from = Fget_charset (XCAR (car));
675 to = Fget_charset (XCAR (XCDR (car)));
676 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
677 signal_simple_error_2
678 ("Attempted conversion between different charset types",
680 spec.from_charset = from;
681 spec.to_charset = to;
683 Dynarr_add (store_here, spec);
687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
688 specs, return the equivalent as the Lisp programmer would see it.
690 If LOAD_HERE is 0, return Qnil. */
693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
700 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
702 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
703 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
706 return Fnreverse (result);
711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
712 Register symbol NAME as a coding system.
714 TYPE describes the conversion method used and should be one of
717 Automatic conversion. XEmacs attempts to detect the coding system
720 No conversion. Use this for binary files and such. On output,
721 graphic characters that are not in ASCII or Latin-1 will be
722 replaced by a ?. (For a no-conversion-encoded buffer, these
723 characters will only be present if you explicitly insert them.)
725 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
727 ISO 10646 UCS-4 encoding.
729 ISO 10646 UTF-8 encoding.
731 Any ISO2022-compliant encoding. Among other things, this includes
732 JIS (the Japanese encoding commonly used for e-mail), EUC (the
733 standard Unix encoding for Japanese and other languages), and
734 Compound Text (the encoding used in X11). You can specify more
735 specific information about the conversion with the FLAGS argument.
737 Big5 (the encoding commonly used for Taiwanese).
739 The conversion is performed using a user-written pseudo-code
740 program. CCL (Code Conversion Language) is the name of this
743 Write out or read in the raw contents of the memory representing
744 the buffer's text. This is primarily useful for debugging
745 purposes, and is only enabled when XEmacs has been compiled with
746 DEBUG_XEMACS defined (via the --debug configure option).
747 WARNING: Reading in a file using 'internal conversion can result
748 in an internal inconsistency in the memory representing a
749 buffer's text, which will produce unpredictable results and may
750 cause XEmacs to crash. Under normal circumstances you should
751 never use 'internal conversion.
753 DOC-STRING is a string describing the coding system.
755 PROPS is a property list, describing the specific nature of the
756 character set. Recognized properties are:
759 String to be displayed in the modeline when this coding system is
763 End-of-line conversion to be used. It should be one of
766 Automatically detect the end-of-line type (LF, CRLF,
767 or CR). Also generate subsidiary coding systems named
768 `NAME-unix', `NAME-dos', and `NAME-mac', that are
769 identical to this coding system but have an EOL-TYPE
770 value of 'lf, 'crlf, and 'cr, respectively.
772 The end of a line is marked externally using ASCII LF.
773 Since this is also the way that XEmacs represents an
774 end-of-line internally, specifying this option results
775 in no end-of-line conversion. This is the standard
776 format for Unix text files.
778 The end of a line is marked externally using ASCII
779 CRLF. This is the standard format for MS-DOS text
782 The end of a line is marked externally using ASCII CR.
783 This is the standard format for Macintosh text files.
785 Automatically detect the end-of-line type but do not
786 generate subsidiary coding systems. (This value is
787 converted to nil when stored internally, and
788 `coding-system-property' will return nil.)
790 'post-read-conversion
791 Function called after a file has been read in, to perform the
792 decoding. Called with two arguments, BEG and END, denoting
793 a region of the current buffer to be decoded.
795 'pre-write-conversion
796 Function called before a file is written out, to perform the
797 encoding. Called with two arguments, BEG and END, denoting
798 a region of the current buffer to be encoded.
801 The following additional properties are recognized if TYPE is 'iso2022:
807 The character set initially designated to the G0 - G3 registers.
808 The value should be one of
810 -- A charset object (designate that character set)
811 -- nil (do not ever use this register)
812 -- t (no character set is initially designated to
813 the register, but may be later on; this automatically
814 sets the corresponding `force-g*-on-output' property)
820 If non-nil, send an explicit designation sequence on output before
821 using the specified register.
824 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
825 "ESC $ B" on output in place of the full designation sequences
826 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
829 If non-nil, don't designate ASCII to G0 at each end of line on output.
830 Setting this to non-nil also suppresses other state-resetting that
831 normally happens at the end of a line.
834 If non-nil, don't designate ASCII to G0 before control chars on output.
837 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
841 If non-nil, use locking-shift (SO/SI) instead of single-shift
842 or designation by escape sequence.
845 If non-nil, don't use ISO6429's direction specification.
848 If non-nil, literal control characters that are the same as
849 the beginning of a recognized ISO2022 or ISO6429 escape sequence
850 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
851 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
852 so that they can be properly distinguished from an escape sequence.
853 (Note that doing this results in a non-portable encoding.) This
854 encoding flag is used for byte-compiled files. Note that ESC
855 is a good choice for a quoting character because there are no
856 escape sequences whose second byte is a character from the Control-0
857 or Control-1 character sets; this is explicitly disallowed by the
860 'input-charset-conversion
861 A list of conversion specifications, specifying conversion of
862 characters in one charset to another when decoding is performed.
863 Each specification is a list of two elements: the source charset,
864 and the destination charset.
866 'output-charset-conversion
867 A list of conversion specifications, specifying conversion of
868 characters in one charset to another when encoding is performed.
869 The form of each specification is the same as for
870 'input-charset-conversion.
873 The following additional properties are recognized (and required)
877 CCL program used for decoding (converting to internal format).
880 CCL program used for encoding (converting to external format).
882 (name, type, doc_string, props))
884 Lisp_Coding_System *codesys;
885 Lisp_Object rest, key, value;
886 enum coding_system_type ty;
887 int need_to_setup_eol_systems = 1;
889 /* Convert type to constant */
890 if (NILP (type) || EQ (type, Qundecided))
891 { ty = CODESYS_AUTODETECT; }
893 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
894 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
895 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
896 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
897 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
898 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
900 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
902 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
905 signal_simple_error ("Invalid coding system type", type);
909 codesys = allocate_coding_system (ty, name);
911 if (NILP (doc_string))
912 doc_string = build_string ("");
914 CHECK_STRING (doc_string);
915 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
917 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
919 if (EQ (key, Qmnemonic))
922 CHECK_STRING (value);
923 CODING_SYSTEM_MNEMONIC (codesys) = value;
926 else if (EQ (key, Qeol_type))
928 need_to_setup_eol_systems = NILP (value);
931 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
934 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
935 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
937 else if (ty == CODESYS_ISO2022)
939 #define FROB_INITIAL_CHARSET(charset_num) \
940 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
941 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
943 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
944 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
945 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
946 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
948 #define FROB_FORCE_CHARSET(charset_num) \
949 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
951 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
952 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
953 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
954 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
956 #define FROB_BOOLEAN_PROPERTY(prop) \
957 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
959 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
960 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
961 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
962 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
963 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
964 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
965 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
967 else if (EQ (key, Qinput_charset_conversion))
969 codesys->iso2022.input_conv =
970 Dynarr_new (charset_conversion_spec);
971 parse_charset_conversion_specs (codesys->iso2022.input_conv,
974 else if (EQ (key, Qoutput_charset_conversion))
976 codesys->iso2022.output_conv =
977 Dynarr_new (charset_conversion_spec);
978 parse_charset_conversion_specs (codesys->iso2022.output_conv,
982 signal_simple_error ("Unrecognized property", key);
984 else if (EQ (type, Qccl))
986 if (EQ (key, Qdecode))
988 CHECK_VECTOR (value);
989 CODING_SYSTEM_CCL_DECODE (codesys) = value;
991 else if (EQ (key, Qencode))
993 CHECK_VECTOR (value);
994 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
997 signal_simple_error ("Unrecognized property", key);
1001 signal_simple_error ("Unrecognized property", key);
1004 if (need_to_setup_eol_systems)
1005 setup_eol_coding_systems (codesys);
1008 Lisp_Object codesys_obj;
1009 XSETCODING_SYSTEM (codesys_obj, codesys);
1010 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1016 Copy OLD-CODING-SYSTEM to NEW-NAME.
1017 If NEW-NAME does not name an existing coding system, a new one will
1020 (old_coding_system, new_name))
1022 Lisp_Object new_coding_system;
1023 old_coding_system = Fget_coding_system (old_coding_system);
1024 new_coding_system = Ffind_coding_system (new_name);
1025 if (NILP (new_coding_system))
1027 XSETCODING_SYSTEM (new_coding_system,
1028 allocate_coding_system
1029 (XCODING_SYSTEM_TYPE (old_coding_system),
1031 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1035 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1036 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1037 memcpy (((char *) to ) + sizeof (to->header),
1038 ((char *) from) + sizeof (from->header),
1039 sizeof (*from) - sizeof (from->header));
1040 to->name = new_name;
1042 return new_coding_system;
1045 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1046 Return t if OBJECT names a coding system, and is not a coding system alias.
1050 return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1054 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1055 Return t if OBJECT is a coding system alias.
1056 All coding system aliases are created by `define-coding-system-alias'.
1060 return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1064 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1065 Return the coding-system symbol for which symbol ALIAS is an alias.
1069 Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1070 if (SYMBOLP (aliasee))
1073 signal_simple_error ("Symbol is not a coding system alias", alias);
1074 return Qnil; /* To keep the compiler happy */
1078 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1080 return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1084 /* A maphash function, for removing dangling coding system aliases. */
1086 dangling_coding_system_alias_p (Lisp_Object alias,
1087 Lisp_Object aliasee,
1088 void *dangling_aliases)
1090 if (SYMBOLP (aliasee)
1091 && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1093 (*(int *) dangling_aliases)++;
1100 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1101 Define symbol ALIAS as an alias for coding system ALIASEE.
1103 You can use this function to redefine an alias that has already been defined,
1104 but you cannot redefine a name which is the canonical name for a coding system.
1105 \(a canonical name of a coding system is what is returned when you call
1106 `coding-system-name' on a coding system).
1108 ALIASEE itself can be an alias, which allows you to define nested aliases.
1110 You are forbidden, however, from creating alias loops or `dangling' aliases.
1111 These will be detected, and an error will be signaled if you attempt to do so.
1113 If ALIASEE is nil, then ALIAS will simply be undefined.
1115 See also `coding-system-alias-p', `coding-system-aliasee',
1116 and `coding-system-canonical-name-p'.
1120 Lisp_Object real_coding_system, probe;
1122 CHECK_SYMBOL (alias);
1124 if (!NILP (Fcoding_system_canonical_name_p (alias)))
1126 ("Symbol is the canonical name of a coding system and cannot be redefined",
1131 Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1132 Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos");
1133 Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac");
1135 Fremhash (alias, Vcoding_system_hash_table);
1137 /* Undefine subsidiary aliases,
1138 presumably created by a previous call to this function */
1139 if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1140 ! NILP (Fcoding_system_alias_p (subsidiary_dos)) &&
1141 ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1143 Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1144 Fdefine_coding_system_alias (subsidiary_dos, Qnil);
1145 Fdefine_coding_system_alias (subsidiary_mac, Qnil);
1148 /* Undefine dangling coding system aliases. */
1150 int dangling_aliases;
1153 dangling_aliases = 0;
1154 elisp_map_remhash (dangling_coding_system_alias_p,
1155 Vcoding_system_hash_table,
1157 } while (dangling_aliases > 0);
1163 if (CODING_SYSTEMP (aliasee))
1164 aliasee = XCODING_SYSTEM_NAME (aliasee);
1166 /* Checks that aliasee names a coding-system */
1167 real_coding_system = Fget_coding_system (aliasee);
1169 /* Check for coding system alias loops */
1170 if (EQ (alias, aliasee))
1171 alias_loop: signal_simple_error_2
1172 ("Attempt to create a coding system alias loop", alias, aliasee);
1174 for (probe = aliasee;
1176 probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1178 if (EQ (probe, alias))
1182 Fputhash (alias, aliasee, Vcoding_system_hash_table);
1184 /* Set up aliases for subsidiaries.
1185 #### There must be a better way to handle subsidiary coding systems. */
1187 static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1189 for (i = 0; i < countof (suffixes); i++)
1191 Lisp_Object alias_subsidiary =
1192 append_suffix_to_symbol (alias, suffixes[i]);
1193 Lisp_Object aliasee_subsidiary =
1194 append_suffix_to_symbol (aliasee, suffixes[i]);
1196 if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1197 Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1200 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1201 but it doesn't look intentional, so I'd rather return something
1202 meaningful or nothing at all. */
1207 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1209 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1210 Lisp_Object new_coding_system;
1212 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1213 return coding_system;
1217 case EOL_AUTODETECT: return coding_system;
1218 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1219 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1220 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1224 return NILP (new_coding_system) ? coding_system : new_coding_system;
1227 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1228 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1230 (coding_system, eol_type))
1232 coding_system = Fget_coding_system (coding_system);
1234 return subsidiary_coding_system (coding_system,
1235 symbol_to_eol_type (eol_type));
1239 /************************************************************************/
1240 /* Coding system accessors */
1241 /************************************************************************/
1243 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1244 Return the doc string for CODING-SYSTEM.
1248 coding_system = Fget_coding_system (coding_system);
1249 return XCODING_SYSTEM_DOC_STRING (coding_system);
1252 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1253 Return the type of CODING-SYSTEM.
1257 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1260 case CODESYS_AUTODETECT: return Qundecided;
1262 case CODESYS_SHIFT_JIS: return Qshift_jis;
1263 case CODESYS_ISO2022: return Qiso2022;
1264 case CODESYS_BIG5: return Qbig5;
1265 case CODESYS_UCS4: return Qucs4;
1266 case CODESYS_UTF8: return Qutf8;
1267 case CODESYS_CCL: return Qccl;
1269 case CODESYS_NO_CONVERSION: return Qno_conversion;
1271 case CODESYS_INTERNAL: return Qinternal;
1278 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1281 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1283 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1286 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1287 Return initial charset of CODING-SYSTEM designated to GNUM.
1290 (coding_system, gnum))
1292 coding_system = Fget_coding_system (coding_system);
1295 return coding_system_charset (coding_system, XINT (gnum));
1299 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1300 Return the PROP property of CODING-SYSTEM.
1302 (coding_system, prop))
1305 enum coding_system_type type;
1307 coding_system = Fget_coding_system (coding_system);
1308 CHECK_SYMBOL (prop);
1309 type = XCODING_SYSTEM_TYPE (coding_system);
1311 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1312 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1315 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1317 case CODESYS_PROP_ALL_OK:
1320 case CODESYS_PROP_ISO2022:
1321 if (type != CODESYS_ISO2022)
1323 ("Property only valid in ISO2022 coding systems",
1327 case CODESYS_PROP_CCL:
1328 if (type != CODESYS_CCL)
1330 ("Property only valid in CCL coding systems",
1340 signal_simple_error ("Unrecognized property", prop);
1342 if (EQ (prop, Qname))
1343 return XCODING_SYSTEM_NAME (coding_system);
1344 else if (EQ (prop, Qtype))
1345 return Fcoding_system_type (coding_system);
1346 else if (EQ (prop, Qdoc_string))
1347 return XCODING_SYSTEM_DOC_STRING (coding_system);
1348 else if (EQ (prop, Qmnemonic))
1349 return XCODING_SYSTEM_MNEMONIC (coding_system);
1350 else if (EQ (prop, Qeol_type))
1351 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1352 else if (EQ (prop, Qeol_lf))
1353 return XCODING_SYSTEM_EOL_LF (coding_system);
1354 else if (EQ (prop, Qeol_crlf))
1355 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1356 else if (EQ (prop, Qeol_cr))
1357 return XCODING_SYSTEM_EOL_CR (coding_system);
1358 else if (EQ (prop, Qpost_read_conversion))
1359 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1360 else if (EQ (prop, Qpre_write_conversion))
1361 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1363 else if (type == CODESYS_ISO2022)
1365 if (EQ (prop, Qcharset_g0))
1366 return coding_system_charset (coding_system, 0);
1367 else if (EQ (prop, Qcharset_g1))
1368 return coding_system_charset (coding_system, 1);
1369 else if (EQ (prop, Qcharset_g2))
1370 return coding_system_charset (coding_system, 2);
1371 else if (EQ (prop, Qcharset_g3))
1372 return coding_system_charset (coding_system, 3);
1374 #define FORCE_CHARSET(charset_num) \
1375 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1376 (coding_system, charset_num) ? Qt : Qnil)
1378 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1379 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1380 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1381 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1383 #define LISP_BOOLEAN(prop) \
1384 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1386 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1387 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1388 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1389 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1390 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1391 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1392 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1394 else if (EQ (prop, Qinput_charset_conversion))
1396 unparse_charset_conversion_specs
1397 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1398 else if (EQ (prop, Qoutput_charset_conversion))
1400 unparse_charset_conversion_specs
1401 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1405 else if (type == CODESYS_CCL)
1407 if (EQ (prop, Qdecode))
1408 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1409 else if (EQ (prop, Qencode))
1410 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1418 return Qnil; /* not reached */
1422 /************************************************************************/
1423 /* Coding category functions */
1424 /************************************************************************/
1427 decode_coding_category (Lisp_Object symbol)
1431 CHECK_SYMBOL (symbol);
1432 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1433 if (EQ (coding_category_symbol[i], symbol))
1436 signal_simple_error ("Unrecognized coding category", symbol);
1437 return 0; /* not reached */
1440 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1441 Return a list of all recognized coding categories.
1446 Lisp_Object list = Qnil;
1448 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1449 list = Fcons (coding_category_symbol[i], list);
1453 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1454 Change the priority order of the coding categories.
1455 LIST should be list of coding categories, in descending order of
1456 priority. Unspecified coding categories will be lower in priority
1457 than all specified ones, in the same relative order they were in
1462 int category_to_priority[CODING_CATEGORY_LAST + 1];
1466 /* First generate a list that maps coding categories to priorities. */
1468 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1469 category_to_priority[i] = -1;
1471 /* Highest priority comes from the specified list. */
1473 EXTERNAL_LIST_LOOP (rest, list)
1475 int cat = decode_coding_category (XCAR (rest));
1477 if (category_to_priority[cat] >= 0)
1478 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1479 category_to_priority[cat] = i++;
1482 /* Now go through the existing categories by priority to retrieve
1483 the categories not yet specified and preserve their priority
1485 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1487 int cat = fcd->coding_category_by_priority[j];
1488 if (category_to_priority[cat] < 0)
1489 category_to_priority[cat] = i++;
1492 /* Now we need to construct the inverse of the mapping we just
1495 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1496 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1498 /* Phew! That was confusing. */
1502 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1503 Return a list of coding categories in descending order of priority.
1508 Lisp_Object list = Qnil;
1510 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1511 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1516 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1517 Change the coding system associated with a coding category.
1519 (coding_category, coding_system))
1521 int cat = decode_coding_category (coding_category);
1523 coding_system = Fget_coding_system (coding_system);
1524 fcd->coding_category_system[cat] = coding_system;
1528 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1529 Return the coding system associated with a coding category.
1533 int cat = decode_coding_category (coding_category);
1534 Lisp_Object sys = fcd->coding_category_system[cat];
1537 return XCODING_SYSTEM_NAME (sys);
1542 /************************************************************************/
1543 /* Detecting the encoding of data */
1544 /************************************************************************/
1546 struct detection_state
1548 eol_type_t eol_type;
1584 struct iso2022_decoder iso;
1586 int high_byte_count;
1587 unsigned int saw_single_shift:1;
1600 acceptable_control_char_p (int c)
1604 /* Allow and ignore control characters that you might
1605 reasonably see in a text file */
1610 case 8: /* backspace */
1611 case 11: /* vertical tab */
1612 case 12: /* form feed */
1613 case 26: /* MS-DOS C-z junk */
1614 case 31: /* '^_' -- for info */
1622 mask_has_at_most_one_bit_p (int mask)
1624 /* Perhaps the only thing useful you learn from intensive Microsoft
1625 technical interviews */
1626 return (mask & (mask - 1)) == 0;
1630 detect_eol_type (struct detection_state *st, const unsigned char *src,
1640 if (st->eol.just_saw_cr)
1642 else if (st->eol.seen_anything)
1645 else if (st->eol.just_saw_cr)
1648 st->eol.just_saw_cr = 1;
1650 st->eol.just_saw_cr = 0;
1651 st->eol.seen_anything = 1;
1654 return EOL_AUTODETECT;
1657 /* Attempt to determine the encoding and EOL type of the given text.
1658 Before calling this function for the first type, you must initialize
1659 st->eol_type as appropriate and initialize st->mask to ~0.
1661 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1664 st->mask holds the determined coding category mask, or ~0 if only
1665 ASCII has been seen so far.
1669 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1670 is present in st->mask
1671 1 == definitive answers are here for both st->eol_type and st->mask
1675 detect_coding_type (struct detection_state *st, const Extbyte *src,
1676 unsigned int n, int just_do_eol)
1680 if (st->eol_type == EOL_AUTODETECT)
1681 st->eol_type = detect_eol_type (st, src, n);
1684 return st->eol_type != EOL_AUTODETECT;
1686 if (!st->seen_non_ascii)
1688 for (; n; n--, src++)
1691 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693 st->seen_non_ascii = 1;
1695 st->shift_jis.mask = ~0;
1699 st->iso2022.mask = ~0;
1709 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714 st->big5.mask = detect_coding_big5 (st, src, n);
1715 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716 st->utf8.mask = detect_coding_utf8 (st, src, n);
1717 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1721 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722 | st->utf8.mask | st->ucs4.mask;
1725 int retval = mask_has_at_most_one_bit_p (st->mask);
1726 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727 return retval && st->eol_type != EOL_AUTODETECT;
1732 coding_system_from_mask (int mask)
1736 /* If the file was entirely or basically ASCII, use the
1737 default value of `buffer-file-coding-system'. */
1738 Lisp_Object retval =
1739 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1742 retval = Ffind_coding_system (retval);
1746 (Qbad_variable, Qwarning,
1747 "Invalid `default-buffer-file-coding-system', set to nil");
1748 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1752 retval = Fget_coding_system (Qraw_text);
1760 mask = postprocess_iso2022_mask (mask);
1762 /* Look through the coding categories by priority and find
1763 the first one that is allowed. */
1764 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1766 cat = fcd->coding_category_by_priority[i];
1767 if ((mask & (1 << cat)) &&
1768 !NILP (fcd->coding_category_system[cat]))
1772 return fcd->coding_category_system[cat];
1774 return Fget_coding_system (Qraw_text);
1778 /* Given a seekable read stream and potential coding system and EOL type
1779 as specified, do any autodetection that is called for. If the
1780 coding system and/or EOL type are not `autodetect', they will be left
1781 alone; but this function will never return an autodetect coding system
1784 This function does not automatically fetch subsidiary coding systems;
1785 that should be unnecessary with the explicit eol-type argument. */
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1790 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1791 eol_type_t *eol_type_in_out)
1793 struct detection_state decst;
1795 if (*eol_type_in_out == EOL_AUTODETECT)
1796 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1799 decst.eol_type = *eol_type_in_out;
1802 /* If autodetection is called for, do it now. */
1803 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1804 || *eol_type_in_out == EOL_AUTODETECT)
1807 Lisp_Object coding_system = Qnil;
1809 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1812 /* Look for initial "-*-"; mode line prefix */
1814 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1819 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1821 Extbyte *local_vars_beg = p + 3;
1822 /* Look for final "-*-"; mode line suffix */
1823 for (p = local_vars_beg,
1824 scan_end = buf + nread - LENGTH ("-*-");
1829 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831 Extbyte *suffix = p;
1832 /* Look for "coding:" */
1833 for (p = local_vars_beg,
1834 scan_end = suffix - LENGTH ("coding:?");
1837 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1838 && (p == local_vars_beg
1839 || (*(p-1) == ' ' ||
1845 p += LENGTH ("coding:");
1846 while (*p == ' ' || *p == '\t') p++;
1848 /* Get coding system name */
1849 save = *suffix; *suffix = '\0';
1850 /* Characters valid in a MIME charset name (rfc 1521),
1851 and in a Lisp symbol name. */
1852 n = strspn ( (char *) p,
1853 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1854 "abcdefghijklmnopqrstuvwxyz"
1860 save = p[n]; p[n] = '\0';
1862 Ffind_coding_system (intern ((char *) p));
1872 if (NILP (coding_system))
1875 if (detect_coding_type (&decst, buf, nread,
1876 XCODING_SYSTEM_TYPE (*codesys_in_out)
1877 != CODESYS_AUTODETECT))
1879 nread = Lstream_read (stream, buf, sizeof (buf));
1885 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1886 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1889 if (detect_coding_type (&decst, buf, nread, 1))
1891 nread = Lstream_read (stream, buf, sizeof (buf));
1897 *eol_type_in_out = decst.eol_type;
1898 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1900 if (NILP (coding_system))
1901 *codesys_in_out = coding_system_from_mask (decst.mask);
1903 *codesys_in_out = coding_system;
1907 /* If we absolutely can't determine the EOL type, just assume LF. */
1908 if (*eol_type_in_out == EOL_AUTODETECT)
1909 *eol_type_in_out = EOL_LF;
1911 Lstream_rewind (stream);
1914 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1915 Detect coding system of the text in the region between START and END.
1916 Returned a list of possible coding systems ordered by priority.
1917 If only ASCII characters are found, it returns 'undecided or one of
1918 its subsidiary coding systems according to a detected end-of-line
1919 type. Optional arg BUFFER defaults to the current buffer.
1921 (start, end, buffer))
1923 Lisp_Object val = Qnil;
1924 struct buffer *buf = decode_buffer (buffer, 0);
1926 Lisp_Object instream, lb_instream;
1927 Lstream *istr, *lb_istr;
1928 struct detection_state decst;
1929 struct gcpro gcpro1, gcpro2;
1931 get_buffer_range_char (buf, start, end, &b, &e, 0);
1932 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1933 lb_istr = XLSTREAM (lb_instream);
1934 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1935 istr = XLSTREAM (instream);
1936 GCPRO2 (instream, lb_instream);
1938 decst.eol_type = EOL_AUTODETECT;
1942 unsigned char random_buffer[4096];
1943 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1947 if (detect_coding_type (&decst, random_buffer, nread, 0))
1951 if (decst.mask == ~0)
1952 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1960 decst.mask = postprocess_iso2022_mask (decst.mask);
1962 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1964 int sys = fcd->coding_category_by_priority[i];
1965 if (decst.mask & (1 << sys))
1967 Lisp_Object codesys = fcd->coding_category_system[sys];
1968 if (!NILP (codesys))
1969 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1970 val = Fcons (codesys, val);
1974 Lstream_close (istr);
1976 Lstream_delete (istr);
1977 Lstream_delete (lb_istr);
1982 /************************************************************************/
1983 /* Converting to internal Mule format ("decoding") */
1984 /************************************************************************/
1986 /* A decoding stream is a stream used for decoding text (i.e.
1987 converting from some external format to internal format).
1988 The decoding-stream object keeps track of the actual coding
1989 stream, the stream that is at the other end, and data that
1990 needs to be persistent across the lifetime of the stream. */
1992 /* Handle the EOL stuff related to just-read-in character C.
1993 EOL_TYPE is the EOL type of the coding stream.
1994 FLAGS is the current value of FLAGS in the coding stream, and may
1995 be modified by this macro. (The macro only looks at the
1996 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1997 bytes are to be written. You need to also define a local goto
1998 label "label_continue_loop" that is at the end of the main
1999 character-reading loop.
2001 If C is a CR character, then this macro handles it entirely and
2002 jumps to label_continue_loop. Otherwise, this macro does not add
2003 anything to DST, and continues normally. You should continue
2004 processing C normally after this macro. */
2006 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
2010 if (eol_type == EOL_CR) \
2011 Dynarr_add (dst, '\n'); \
2012 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2013 Dynarr_add (dst, c); \
2015 flags |= CODING_STATE_CR; \
2016 goto label_continue_loop; \
2018 else if (flags & CODING_STATE_CR) \
2019 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
2021 Dynarr_add (dst, '\r'); \
2022 flags &= ~CODING_STATE_CR; \
2026 /* C should be a binary character in the range 0 - 255; convert
2027 to internal format and add to Dynarr DST. */
2029 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2031 if (BYTE_ASCII_P (c)) \
2032 Dynarr_add (dst, c); \
2033 else if (BYTE_C1_P (c)) \
2035 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2036 Dynarr_add (dst, c + 0x20); \
2040 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2041 Dynarr_add (dst, c); \
2045 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
2049 DECODE_ADD_BINARY_CHAR (ch, dst); \
2054 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2056 if (flags & CODING_STATE_END) \
2058 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
2059 if (flags & CODING_STATE_CR) \
2060 Dynarr_add (dst, '\r'); \
2064 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2066 struct decoding_stream
2068 /* Coding system that governs the conversion. */
2069 Lisp_Coding_System *codesys;
2071 /* Stream that we read the encoded data from or
2072 write the decoded data to. */
2075 /* If we are reading, then we can return only a fixed amount of
2076 data, so if the conversion resulted in too much data, we store it
2077 here for retrieval the next time around. */
2078 unsigned_char_dynarr *runoff;
2080 /* FLAGS holds flags indicating the current state of the decoding.
2081 Some of these flags are dependent on the coding system. */
2084 /* CH holds a partially built-up character. Since we only deal
2085 with one- and two-byte characters at the moment, we only use
2086 this to store the first byte of a two-byte character. */
2089 /* EOL_TYPE specifies the type of end-of-line conversion that
2090 currently applies. We need to keep this separate from the
2091 EOL type stored in CODESYS because the latter might indicate
2092 automatic EOL-type detection while the former will always
2093 indicate a particular EOL type. */
2094 eol_type_t eol_type;
2096 /* Additional ISO2022 information. We define the structure above
2097 because it's also needed by the detection routines. */
2098 struct iso2022_decoder iso2022;
2100 /* Additional information (the state of the running CCL program)
2101 used by the CCL decoder. */
2102 struct ccl_program ccl;
2104 /* counter for UTF-8 or UCS-4 */
2105 unsigned char counter;
2107 struct detection_state decst;
2110 static ssize_t decoding_reader (Lstream *stream,
2111 unsigned char *data, size_t size);
2112 static ssize_t decoding_writer (Lstream *stream,
2113 const unsigned char *data, size_t size);
2114 static int decoding_rewinder (Lstream *stream);
2115 static int decoding_seekable_p (Lstream *stream);
2116 static int decoding_flusher (Lstream *stream);
2117 static int decoding_closer (Lstream *stream);
2119 static Lisp_Object decoding_marker (Lisp_Object stream);
2121 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2122 sizeof (struct decoding_stream));
2125 decoding_marker (Lisp_Object stream)
2127 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2128 Lisp_Object str_obj;
2130 /* We do not need to mark the coding systems or charsets stored
2131 within the stream because they are stored in a global list
2132 and automatically marked. */
2134 XSETLSTREAM (str_obj, str);
2135 mark_object (str_obj);
2136 if (str->imp->marker)
2137 return (str->imp->marker) (str_obj);
2142 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
2143 so we read data from the other end, decode it, and store it into DATA. */
2146 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2148 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2149 unsigned char *orig_data = data;
2151 int error_occurred = 0;
2153 /* We need to interface to mule_decode(), which expects to take some
2154 amount of data and store the result into a Dynarr. We have
2155 mule_decode() store into str->runoff, and take data from there
2158 /* We loop until we have enough data, reading chunks from the other
2159 end and decoding it. */
2162 /* Take data from the runoff if we can. Make sure to take at
2163 most SIZE bytes, and delete the data from the runoff. */
2164 if (Dynarr_length (str->runoff) > 0)
2166 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2167 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2168 Dynarr_delete_many (str->runoff, 0, chunk);
2174 break; /* No more room for data */
2176 if (str->flags & CODING_STATE_END)
2177 /* This means that on the previous iteration, we hit the EOF on
2178 the other end. We loop once more so that mule_decode() can
2179 output any final stuff it may be holding, or any "go back
2180 to a sane state" escape sequences. (This latter makes sense
2181 during encoding.) */
2184 /* Exhausted the runoff, so get some more. DATA has at least
2185 SIZE bytes left of storage in it, so it's OK to read directly
2186 into it. (We'll be overwriting above, after we've decoded it
2187 into the runoff.) */
2188 read_size = Lstream_read (str->other_end, data, size);
2195 /* There might be some more end data produced in the translation.
2196 See the comment above. */
2197 str->flags |= CODING_STATE_END;
2198 mule_decode (stream, data, str->runoff, read_size);
2201 if (data - orig_data == 0)
2202 return error_occurred ? -1 : 0;
2204 return data - orig_data;
2208 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2210 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2213 /* Decode all our data into the runoff, and then attempt to write
2214 it all out to the other end. Remove whatever chunk we succeeded
2216 mule_decode (stream, data, str->runoff, size);
2217 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2218 Dynarr_length (str->runoff));
2220 Dynarr_delete_many (str->runoff, 0, retval);
2221 /* Do NOT return retval. The return value indicates how much
2222 of the incoming data was written, not how many bytes were
2228 reset_decoding_stream (struct decoding_stream *str)
2231 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2233 Lisp_Object coding_system;
2234 XSETCODING_SYSTEM (coding_system, str->codesys);
2235 reset_iso2022 (coding_system, &str->iso2022);
2237 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2239 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2243 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2244 || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2247 str->decst.eol_type = EOL_AUTODETECT;
2248 str->decst.mask = ~0;
2250 str->flags = str->ch = 0;
2254 decoding_rewinder (Lstream *stream)
2256 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2257 reset_decoding_stream (str);
2258 Dynarr_reset (str->runoff);
2259 return Lstream_rewind (str->other_end);
2263 decoding_seekable_p (Lstream *stream)
2265 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2266 return Lstream_seekable_p (str->other_end);
2270 decoding_flusher (Lstream *stream)
2272 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2273 return Lstream_flush (str->other_end);
2277 decoding_closer (Lstream *stream)
2279 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2280 if (stream->flags & LSTREAM_FL_WRITE)
2282 str->flags |= CODING_STATE_END;
2283 decoding_writer (stream, 0, 0);
2285 Dynarr_free (str->runoff);
2287 #ifdef ENABLE_COMPOSITE_CHARS
2288 if (str->iso2022.composite_chars)
2289 Dynarr_free (str->iso2022.composite_chars);
2292 return Lstream_close (str->other_end);
2296 decoding_stream_coding_system (Lstream *stream)
2298 Lisp_Object coding_system;
2299 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2301 XSETCODING_SYSTEM (coding_system, str->codesys);
2302 return subsidiary_coding_system (coding_system, str->eol_type);
2306 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2308 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2309 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2311 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2312 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2313 reset_decoding_stream (str);
2316 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2317 stream for writing, no automatic code detection will be performed.
2318 The reason for this is that automatic code detection requires a
2319 seekable input. Things will also fail if you open a decoding
2320 stream for reading using a non-fully-specified coding system and
2321 a non-seekable input stream. */
2324 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2327 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2328 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2332 str->other_end = stream;
2333 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2334 str->eol_type = EOL_AUTODETECT;
2335 if (!strcmp (mode, "r")
2336 && Lstream_seekable_p (stream))
2337 /* We can determine the coding system now. */
2338 determine_real_coding_system (stream, &codesys, &str->eol_type);
2339 set_decoding_stream_coding_system (lstr, codesys);
2340 str->decst.eol_type = str->eol_type;
2341 str->decst.mask = ~0;
2342 XSETLSTREAM (obj, lstr);
2347 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2349 return make_decoding_stream_1 (stream, codesys, "r");
2353 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2355 return make_decoding_stream_1 (stream, codesys, "w");
2358 /* Note: the decode_coding_* functions all take the same
2359 arguments as mule_decode(), which is to say some SRC data of
2360 size N, which is to be stored into dynamic array DST.
2361 DECODING is the stream within which the decoding is
2362 taking place, but no data is actually read from or
2363 written to that stream; that is handled in decoding_reader()
2364 or decoding_writer(). This allows the same functions to
2365 be used for both reading and writing. */
2368 mule_decode (Lstream *decoding, const unsigned char *src,
2369 unsigned_char_dynarr *dst, unsigned int n)
2371 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2373 /* If necessary, do encoding-detection now. We do this when
2374 we're a writing stream or a non-seekable reading stream,
2375 meaning that we can't just process the whole input,
2376 rewind, and start over. */
2378 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2379 str->eol_type == EOL_AUTODETECT)
2381 Lisp_Object codesys;
2383 XSETCODING_SYSTEM (codesys, str->codesys);
2384 detect_coding_type (&str->decst, src, n,
2385 CODING_SYSTEM_TYPE (str->codesys) !=
2386 CODESYS_AUTODETECT);
2387 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2388 str->decst.mask != ~0)
2389 /* #### This is cheesy. What we really ought to do is
2390 buffer up a certain amount of data so as to get a
2391 less random result. */
2392 codesys = coding_system_from_mask (str->decst.mask);
2393 str->eol_type = str->decst.eol_type;
2394 if (XCODING_SYSTEM (codesys) != str->codesys)
2396 /* Preserve the CODING_STATE_END flag in case it was set.
2397 If we erase it, bad things might happen. */
2398 int was_end = str->flags & CODING_STATE_END;
2399 set_decoding_stream_coding_system (decoding, codesys);
2401 str->flags |= CODING_STATE_END;
2405 switch (CODING_SYSTEM_TYPE (str->codesys))
2408 case CODESYS_INTERNAL:
2409 Dynarr_add_many (dst, src, n);
2412 case CODESYS_AUTODETECT:
2413 /* If we got this far and still haven't decided on the coding
2414 system, then do no conversion. */
2415 case CODESYS_NO_CONVERSION:
2416 decode_coding_no_conversion (decoding, src, dst, n);
2419 case CODESYS_SHIFT_JIS:
2420 decode_coding_sjis (decoding, src, dst, n);
2423 decode_coding_big5 (decoding, src, dst, n);
2426 decode_coding_ucs4 (decoding, src, dst, n);
2429 decode_coding_utf8 (decoding, src, dst, n);
2432 str->ccl.last_block = str->flags & CODING_STATE_END;
2433 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2435 case CODESYS_ISO2022:
2436 decode_coding_iso2022 (decoding, src, dst, n);
2444 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2445 Decode the text between START and END which is encoded in CODING-SYSTEM.
2446 This is useful if you've read in encoded text from a file without decoding
2447 it (e.g. you read in a JIS-formatted file but used the `binary' or
2448 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2449 Return length of decoded text.
2450 BUFFER defaults to the current buffer if unspecified.
2452 (start, end, coding_system, buffer))
2455 struct buffer *buf = decode_buffer (buffer, 0);
2456 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2457 Lstream *istr, *ostr;
2458 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2460 get_buffer_range_char (buf, start, end, &b, &e, 0);
2462 barf_if_buffer_read_only (buf, b, e);
2464 coding_system = Fget_coding_system (coding_system);
2465 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2466 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2467 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2469 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2470 Fget_coding_system (Qbinary));
2471 istr = XLSTREAM (instream);
2472 ostr = XLSTREAM (outstream);
2473 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2475 /* The chain of streams looks like this:
2477 [BUFFER] <----- send through
2478 ------> [ENCODE AS BINARY]
2479 ------> [DECODE AS SPECIFIED]
2485 char tempbuf[1024]; /* some random amount */
2486 Bufpos newpos, even_newer_pos;
2487 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2488 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2492 newpos = lisp_buffer_stream_startpos (istr);
2493 Lstream_write (ostr, tempbuf, size_in_bytes);
2494 even_newer_pos = lisp_buffer_stream_startpos (istr);
2495 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2498 Lstream_close (istr);
2499 Lstream_close (ostr);
2501 Lstream_delete (istr);
2502 Lstream_delete (ostr);
2503 Lstream_delete (XLSTREAM (de_outstream));
2504 Lstream_delete (XLSTREAM (lb_outstream));
2509 /************************************************************************/
2510 /* Converting to an external encoding ("encoding") */
2511 /************************************************************************/
2513 /* An encoding stream is an output stream. When you create the
2514 stream, you specify the coding system that governs the encoding
2515 and another stream that the resulting encoded data is to be
2516 sent to, and then start sending data to it. */
2518 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2520 struct encoding_stream
2522 /* Coding system that governs the conversion. */
2523 Lisp_Coding_System *codesys;
2525 /* Stream that we read the encoded data from or
2526 write the decoded data to. */
2529 /* If we are reading, then we can return only a fixed amount of
2530 data, so if the conversion resulted in too much data, we store it
2531 here for retrieval the next time around. */
2532 unsigned_char_dynarr *runoff;
2534 /* FLAGS holds flags indicating the current state of the encoding.
2535 Some of these flags are dependent on the coding system. */
2538 /* CH holds a partially built-up character. Since we only deal
2539 with one- and two-byte characters at the moment, we only use
2540 this to store the first byte of a two-byte character. */
2543 /* Additional information used by the ISO2022 encoder. */
2546 /* CHARSET holds the character sets currently assigned to the G0
2547 through G3 registers. It is initialized from the array
2548 INITIAL_CHARSET in CODESYS. */
2549 Lisp_Object charset[4];
2551 /* Which registers are currently invoked into the left (GL) and
2552 right (GR) halves of the 8-bit encoding space? */
2553 int register_left, register_right;
2555 /* Whether we need to explicitly designate the charset in the
2556 G? register before using it. It is initialized from the
2557 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2558 unsigned char force_charset_on_output[4];
2560 /* Other state variables that need to be preserved across
2562 Lisp_Object current_charset;
2564 int current_char_boundary;
2567 /* Additional information (the state of the running CCL program)
2568 used by the CCL encoder. */
2569 struct ccl_program ccl;
2573 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2574 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2576 static int encoding_rewinder (Lstream *stream);
2577 static int encoding_seekable_p (Lstream *stream);
2578 static int encoding_flusher (Lstream *stream);
2579 static int encoding_closer (Lstream *stream);
2581 static Lisp_Object encoding_marker (Lisp_Object stream);
2583 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2584 sizeof (struct encoding_stream));
2587 encoding_marker (Lisp_Object stream)
2589 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2590 Lisp_Object str_obj;
2592 /* We do not need to mark the coding systems or charsets stored
2593 within the stream because they are stored in a global list
2594 and automatically marked. */
2596 XSETLSTREAM (str_obj, str);
2597 mark_object (str_obj);
2598 if (str->imp->marker)
2599 return (str->imp->marker) (str_obj);
2604 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2605 so we read data from the other end, encode it, and store it into DATA. */
2608 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2610 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2611 unsigned char *orig_data = data;
2613 int error_occurred = 0;
2615 /* We need to interface to mule_encode(), which expects to take some
2616 amount of data and store the result into a Dynarr. We have
2617 mule_encode() store into str->runoff, and take data from there
2620 /* We loop until we have enough data, reading chunks from the other
2621 end and encoding it. */
2624 /* Take data from the runoff if we can. Make sure to take at
2625 most SIZE bytes, and delete the data from the runoff. */
2626 if (Dynarr_length (str->runoff) > 0)
2628 int chunk = min ((int) size, Dynarr_length (str->runoff));
2629 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2630 Dynarr_delete_many (str->runoff, 0, chunk);
2636 break; /* No more room for data */
2638 if (str->flags & CODING_STATE_END)
2639 /* This means that on the previous iteration, we hit the EOF on
2640 the other end. We loop once more so that mule_encode() can
2641 output any final stuff it may be holding, or any "go back
2642 to a sane state" escape sequences. (This latter makes sense
2643 during encoding.) */
2646 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2647 left of storage in it, so it's OK to read directly into it.
2648 (We'll be overwriting above, after we've encoded it into the
2650 read_size = Lstream_read (str->other_end, data, size);
2657 /* There might be some more end data produced in the translation.
2658 See the comment above. */
2659 str->flags |= CODING_STATE_END;
2660 mule_encode (stream, data, str->runoff, read_size);
2663 if (data == orig_data)
2664 return error_occurred ? -1 : 0;
2666 return data - orig_data;
2670 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2672 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2675 /* Encode all our data into the runoff, and then attempt to write
2676 it all out to the other end. Remove whatever chunk we succeeded
2678 mule_encode (stream, data, str->runoff, size);
2679 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2680 Dynarr_length (str->runoff));
2682 Dynarr_delete_many (str->runoff, 0, retval);
2683 /* Do NOT return retval. The return value indicates how much
2684 of the incoming data was written, not how many bytes were
2690 reset_encoding_stream (struct encoding_stream *str)
2693 switch (CODING_SYSTEM_TYPE (str->codesys))
2695 case CODESYS_ISO2022:
2699 for (i = 0; i < 4; i++)
2701 str->iso2022.charset[i] =
2702 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2703 str->iso2022.force_charset_on_output[i] =
2704 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2706 str->iso2022.register_left = 0;
2707 str->iso2022.register_right = 1;
2708 str->iso2022.current_charset = Qnil;
2709 str->iso2022.current_half = 0;
2710 str->iso2022.current_char_boundary = 1;
2714 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2721 str->flags = str->ch = 0;
2725 encoding_rewinder (Lstream *stream)
2727 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2728 reset_encoding_stream (str);
2729 Dynarr_reset (str->runoff);
2730 return Lstream_rewind (str->other_end);
2734 encoding_seekable_p (Lstream *stream)
2736 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2737 return Lstream_seekable_p (str->other_end);
2741 encoding_flusher (Lstream *stream)
2743 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2744 return Lstream_flush (str->other_end);
2748 encoding_closer (Lstream *stream)
2750 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2751 if (stream->flags & LSTREAM_FL_WRITE)
2753 str->flags |= CODING_STATE_END;
2754 encoding_writer (stream, 0, 0);
2756 Dynarr_free (str->runoff);
2757 return Lstream_close (str->other_end);
2761 encoding_stream_coding_system (Lstream *stream)
2763 Lisp_Object coding_system;
2764 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2766 XSETCODING_SYSTEM (coding_system, str->codesys);
2767 return coding_system;
2771 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2773 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2774 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2776 reset_encoding_stream (str);
2780 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2783 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2784 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2788 str->runoff = Dynarr_new (unsigned_char);
2789 str->other_end = stream;
2790 set_encoding_stream_coding_system (lstr, codesys);
2791 XSETLSTREAM (obj, lstr);
2796 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2798 return make_encoding_stream_1 (stream, codesys, "r");
2802 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2804 return make_encoding_stream_1 (stream, codesys, "w");
2807 /* Convert N bytes of internally-formatted data stored in SRC to an
2808 external format, according to the encoding stream ENCODING.
2809 Store the encoded data into DST. */
2812 mule_encode (Lstream *encoding, const unsigned char *src,
2813 unsigned_char_dynarr *dst, unsigned int n)
2815 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2817 switch (CODING_SYSTEM_TYPE (str->codesys))
2820 case CODESYS_INTERNAL:
2821 Dynarr_add_many (dst, src, n);
2824 case CODESYS_AUTODETECT:
2825 /* If we got this far and still haven't decided on the coding
2826 system, then do no conversion. */
2827 case CODESYS_NO_CONVERSION:
2828 encode_coding_no_conversion (encoding, src, dst, n);
2831 case CODESYS_SHIFT_JIS:
2832 encode_coding_sjis (encoding, src, dst, n);
2835 encode_coding_big5 (encoding, src, dst, n);
2838 encode_coding_ucs4 (encoding, src, dst, n);
2841 encode_coding_utf8 (encoding, src, dst, n);
2844 str->ccl.last_block = str->flags & CODING_STATE_END;
2845 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2847 case CODESYS_ISO2022:
2848 encode_coding_iso2022 (encoding, src, dst, n);
2856 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2857 Encode the text between START and END using CODING-SYSTEM.
2858 This will, for example, convert Japanese characters into stuff such as
2859 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2860 text. BUFFER defaults to the current buffer if unspecified.
2862 (start, end, coding_system, buffer))
2865 struct buffer *buf = decode_buffer (buffer, 0);
2866 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2867 Lstream *istr, *ostr;
2868 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2870 get_buffer_range_char (buf, start, end, &b, &e, 0);
2872 barf_if_buffer_read_only (buf, b, e);
2874 coding_system = Fget_coding_system (coding_system);
2875 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2876 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2877 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2878 Fget_coding_system (Qbinary));
2879 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2881 istr = XLSTREAM (instream);
2882 ostr = XLSTREAM (outstream);
2883 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2884 /* The chain of streams looks like this:
2886 [BUFFER] <----- send through
2887 ------> [ENCODE AS SPECIFIED]
2888 ------> [DECODE AS BINARY]
2893 char tempbuf[1024]; /* some random amount */
2894 Bufpos newpos, even_newer_pos;
2895 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2896 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2900 newpos = lisp_buffer_stream_startpos (istr);
2901 Lstream_write (ostr, tempbuf, size_in_bytes);
2902 even_newer_pos = lisp_buffer_stream_startpos (istr);
2903 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2909 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2910 Lstream_close (istr);
2911 Lstream_close (ostr);
2913 Lstream_delete (istr);
2914 Lstream_delete (ostr);
2915 Lstream_delete (XLSTREAM (de_outstream));
2916 Lstream_delete (XLSTREAM (lb_outstream));
2917 return make_int (retlen);
2923 /************************************************************************/
2924 /* Shift-JIS methods */
2925 /************************************************************************/
2927 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2928 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2929 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2930 encoded by "position-code + 0x80". A character of JISX0208
2931 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2932 position-codes are divided and shifted so that it fit in the range
2935 --- CODE RANGE of Shift-JIS ---
2936 (character set) (range)
2938 JISX0201-Kana 0xA0 .. 0xDF
2939 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2940 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2941 -------------------------------
2945 /* Is this the first byte of a Shift-JIS two-byte char? */
2947 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2948 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2950 /* Is this the second byte of a Shift-JIS two-byte char? */
2952 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2953 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2955 #define BYTE_SJIS_KATAKANA_P(c) \
2956 ((c) >= 0xA1 && (c) <= 0xDF)
2959 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
2967 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2969 if (st->shift_jis.in_second_byte)
2971 st->shift_jis.in_second_byte = 0;
2975 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2976 st->shift_jis.in_second_byte = 1;
2978 return CODING_CATEGORY_SHIFT_JIS_MASK;
2981 /* Convert Shift-JIS data to internal format. */
2984 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
2985 unsigned_char_dynarr *dst, unsigned int n)
2988 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2989 unsigned int flags = str->flags;
2990 unsigned int ch = str->ch;
2991 eol_type_t eol_type = str->eol_type;
2999 /* Previous character was first byte of Shift-JIS Kanji char. */
3000 if (BYTE_SJIS_TWO_BYTE_2_P (c))
3002 unsigned char e1, e2;
3004 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3005 DECODE_SJIS (ch, c, e1, e2);
3006 Dynarr_add (dst, e1);
3007 Dynarr_add (dst, e2);
3011 DECODE_ADD_BINARY_CHAR (ch, dst);
3012 DECODE_ADD_BINARY_CHAR (c, dst);
3018 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3019 if (BYTE_SJIS_TWO_BYTE_1_P (c))
3021 else if (BYTE_SJIS_KATAKANA_P (c))
3023 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3024 Dynarr_add (dst, c);
3027 DECODE_ADD_BINARY_CHAR (c, dst);
3029 label_continue_loop:;
3032 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3038 /* Convert internally-formatted data to Shift-JIS. */
3041 encode_coding_sjis (Lstream *encoding, const unsigned char *src,
3042 unsigned_char_dynarr *dst, unsigned int n)
3045 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3046 unsigned int flags = str->flags;
3047 unsigned int ch = str->ch;
3048 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3055 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3056 Dynarr_add (dst, '\r');
3057 if (eol_type != EOL_CR)
3058 Dynarr_add (dst, '\n');
3061 else if (BYTE_ASCII_P (c))
3063 Dynarr_add (dst, c);
3066 else if (BUFBYTE_LEADING_BYTE_P (c))
3067 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3068 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3069 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3072 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3074 Dynarr_add (dst, c);
3077 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3078 ch == LEADING_BYTE_JAPANESE_JISX0208)
3082 unsigned char j1, j2;
3083 ENCODE_SJIS (ch, c, j1, j2);
3084 Dynarr_add (dst, j1);
3085 Dynarr_add (dst, j2);
3095 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3096 Decode a JISX0208 character of Shift-JIS coding-system.
3097 CODE is the character code in Shift-JIS as a cons of type bytes.
3098 Return the corresponding character.
3102 unsigned char c1, c2, s1, s2;
3105 CHECK_INT (XCAR (code));
3106 CHECK_INT (XCDR (code));
3107 s1 = XINT (XCAR (code));
3108 s2 = XINT (XCDR (code));
3109 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3110 BYTE_SJIS_TWO_BYTE_2_P (s2))
3112 DECODE_SJIS (s1, s2, c1, c2);
3113 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3114 c1 & 0x7F, c2 & 0x7F));
3120 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3121 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3122 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3126 Lisp_Object charset;
3129 CHECK_CHAR_COERCE_INT (ch);
3130 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3131 if (EQ (charset, Vcharset_japanese_jisx0208))
3133 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3134 return Fcons (make_int (s1), make_int (s2));
3141 /************************************************************************/
3143 /************************************************************************/
3145 /* BIG5 is a coding system encoding two character sets: ASCII and
3146 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3147 character set and is encoded in two-byte.
3149 --- CODE RANGE of BIG5 ---
3150 (character set) (range)
3152 Big5 (1st byte) 0xA1 .. 0xFE
3153 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3154 --------------------------
3156 Since the number of characters in Big5 is larger than maximum
3157 characters in Emacs' charset (96x96), it can't be handled as one
3158 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3159 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3160 contains frequently used characters and the latter contains less
3161 frequently used characters. */
3163 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3164 ((c) >= 0xA1 && (c) <= 0xFE)
3166 /* Is this the second byte of a Shift-JIS two-byte char? */
3168 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3169 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3171 /* Number of Big5 characters which have the same code in 1st byte. */
3173 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3175 /* Code conversion macros. These are macros because they are used in
3176 inner loops during code conversion.
3178 Note that temporary variables in macros introduce the classic
3179 dynamic-scoping problems with variable names. We use capital-
3180 lettered variables in the assumption that XEmacs does not use
3181 capital letters in variables except in a very formalized way
3184 /* Convert Big5 code (b1, b2) into its internal string representation
3187 /* There is a much simpler way to split the Big5 charset into two.
3188 For the moment I'm going to leave the algorithm as-is because it
3189 claims to separate out the most-used characters into a single
3190 charset, which perhaps will lead to optimizations in various
3193 The way the algorithm works is something like this:
3195 Big5 can be viewed as a 94x157 charset, where the row is
3196 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3197 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3198 the split between low and high column numbers is apparently
3199 meaningless; ascending rows produce less and less frequent chars.
3200 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3201 the first charset, and the upper half (0xC9 .. 0xFE) to the
3202 second. To do the conversion, we convert the character into
3203 a single number where 0 .. 156 is the first row, 157 .. 313
3204 is the second, etc. That way, the characters are ordered by
3205 decreasing frequency. Then we just chop the space in two
3206 and coerce the result into a 94x94 space.
3209 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3211 int B1 = b1, B2 = b2; \
3213 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3217 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3221 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3222 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3224 c1 = I / (0xFF - 0xA1) + 0xA1; \
3225 c2 = I % (0xFF - 0xA1) + 0xA1; \
3228 /* Convert the internal string representation of a Big5 character
3229 (lb, c1, c2) into Big5 code (b1, b2). */
3231 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3233 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3235 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3237 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3239 b1 = I / BIG5_SAME_ROW + 0xA1; \
3240 b2 = I % BIG5_SAME_ROW; \
3241 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3245 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3253 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3254 (c >= 0x80 && c <= 0xA0))
3256 if (st->big5.in_second_byte)
3258 st->big5.in_second_byte = 0;
3259 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3263 st->big5.in_second_byte = 1;
3265 return CODING_CATEGORY_BIG5_MASK;
3268 /* Convert Big5 data to internal format. */
3271 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3272 unsigned_char_dynarr *dst, unsigned int n)
3275 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3276 unsigned int flags = str->flags;
3277 unsigned int ch = str->ch;
3278 eol_type_t eol_type = str->eol_type;
3285 /* Previous character was first byte of Big5 char. */
3286 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3288 unsigned char b1, b2, b3;
3289 DECODE_BIG5 (ch, c, b1, b2, b3);
3290 Dynarr_add (dst, b1);
3291 Dynarr_add (dst, b2);
3292 Dynarr_add (dst, b3);
3296 DECODE_ADD_BINARY_CHAR (ch, dst);
3297 DECODE_ADD_BINARY_CHAR (c, dst);
3303 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3304 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3307 DECODE_ADD_BINARY_CHAR (c, dst);
3309 label_continue_loop:;
3312 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3318 /* Convert internally-formatted data to Big5. */
3321 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3322 unsigned_char_dynarr *dst, unsigned int n)
3325 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3326 unsigned int flags = str->flags;
3327 unsigned int ch = str->ch;
3328 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3335 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3336 Dynarr_add (dst, '\r');
3337 if (eol_type != EOL_CR)
3338 Dynarr_add (dst, '\n');
3340 else if (BYTE_ASCII_P (c))
3343 Dynarr_add (dst, c);
3345 else if (BUFBYTE_LEADING_BYTE_P (c))
3347 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3348 c == LEADING_BYTE_CHINESE_BIG5_2)
3350 /* A recognized leading byte. */
3352 continue; /* not done with this character. */
3354 /* otherwise just ignore this character. */
3356 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3357 ch == LEADING_BYTE_CHINESE_BIG5_2)
3359 /* Previous char was a recognized leading byte. */
3361 continue; /* not done with this character. */
3365 /* Encountering second byte of a Big5 character. */
3366 unsigned char b1, b2;
3368 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3369 Dynarr_add (dst, b1);
3370 Dynarr_add (dst, b2);
3381 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3382 Decode a Big5 character CODE of BIG5 coding-system.
3383 CODE is the character code in BIG5, a cons of two integers.
3384 Return the corresponding character.
3388 unsigned char c1, c2, b1, b2;
3391 CHECK_INT (XCAR (code));
3392 CHECK_INT (XCDR (code));
3393 b1 = XINT (XCAR (code));
3394 b2 = XINT (XCDR (code));
3395 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3396 BYTE_BIG5_TWO_BYTE_2_P (b2))
3399 Lisp_Object charset;
3400 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3401 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3402 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3408 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3409 Encode the Big5 character CH to BIG5 coding-system.
3410 Return the corresponding character code in Big5.
3414 Lisp_Object charset;
3417 CHECK_CHAR_COERCE_INT (ch);
3418 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3419 if (EQ (charset, Vcharset_chinese_big5_1) ||
3420 EQ (charset, Vcharset_chinese_big5_2))
3422 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3424 return Fcons (make_int (b1), make_int (b2));
3431 /************************************************************************/
3434 /* UCS-4 character codes are implemented as nonnegative integers. */
3436 /************************************************************************/
3439 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3440 Map UCS-4 code CODE to Mule character CHARACTER.
3442 Return T on success, NIL on failure.
3448 CHECK_CHAR (character);
3452 if (c < sizeof (fcd->ucs_to_mule_table))
3454 fcd->ucs_to_mule_table[c] = character;
3462 ucs_to_char (unsigned long code)
3464 if (code < sizeof (fcd->ucs_to_mule_table))
3466 return fcd->ucs_to_mule_table[code];
3468 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3473 c = code % (94 * 94);
3475 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3476 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3477 CHARSET_LEFT_TO_RIGHT),
3478 c / 94 + 33, c % 94 + 33));
3484 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3485 Return Mule character corresponding to UCS code CODE (a positive integer).
3489 CHECK_NATNUM (code);
3490 return ucs_to_char (XINT (code));
3493 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3494 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3498 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3499 Fset_char_ucs is more restrictive on index arg, but should
3500 check code arg in a char_table method. */
3501 CHECK_CHAR (character);
3502 CHECK_NATNUM (code);
3503 return Fput_char_table (character, code, mule_to_ucs_table);
3506 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3507 Return the UCS code (a positive integer) corresponding to CHARACTER.
3511 return Fget_char_table (character, mule_to_ucs_table);
3514 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3515 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3516 is not found, instead.
3517 #### do something more appropriate (use blob?)
3518 Danger, Will Robinson! Data loss. Should we signal user? */
3520 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3522 Lisp_Object chr = ucs_to_char (ch);
3526 Bufbyte work[MAX_EMCHAR_LEN];
3531 simple_set_charptr_emchar (work, ch) :
3532 non_ascii_set_charptr_emchar (work, ch);
3533 Dynarr_add_many (dst, work, len);
3537 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3538 Dynarr_add (dst, 34 + 128);
3539 Dynarr_add (dst, 46 + 128);
3543 static unsigned long
3544 mule_char_to_ucs4 (Lisp_Object charset,
3545 unsigned char h, unsigned char l)
3548 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3555 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3556 (XCHARSET_CHARS (charset) == 94) )
3558 unsigned char final = XCHARSET_FINAL (charset);
3560 if ( ('@' <= final) && (final < 0x7f) )
3562 return 0xe00000 + (final - '@') * 94 * 94
3563 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3577 encode_ucs4 (Lisp_Object charset,
3578 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3580 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3581 Dynarr_add (dst, code >> 24);
3582 Dynarr_add (dst, (code >> 16) & 255);
3583 Dynarr_add (dst, (code >> 8) & 255);
3584 Dynarr_add (dst, code & 255);
3588 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3594 switch (st->ucs4.in_byte)
3603 st->ucs4.in_byte = 0;
3609 return CODING_CATEGORY_UCS4_MASK;
3613 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3614 unsigned_char_dynarr *dst, unsigned int n)
3616 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3617 unsigned int flags = str->flags;
3618 unsigned int ch = str->ch;
3619 unsigned char counter = str->counter;
3623 unsigned char c = *src++;
3631 decode_ucs4 ( ( ch << 8 ) | c, dst);
3636 ch = ( ch << 8 ) | c;
3640 if (counter & CODING_STATE_END)
3641 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3645 str->counter = counter;
3649 encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
3650 unsigned_char_dynarr *dst, unsigned int n)
3652 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3653 unsigned int flags = str->flags;
3654 unsigned int ch = str->ch;
3655 unsigned char char_boundary = str->iso2022.current_char_boundary;
3656 Lisp_Object charset = str->iso2022.current_charset;
3658 #ifdef ENABLE_COMPOSITE_CHARS
3659 /* flags for handling composite chars. We do a little switcharoo
3660 on the source while we're outputting the composite char. */
3661 unsigned int saved_n = 0;
3662 const unsigned char *saved_src = NULL;
3663 int in_composite = 0;
3670 unsigned char c = *src++;
3672 if (BYTE_ASCII_P (c))
3673 { /* Processing ASCII character */
3675 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3678 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3679 { /* Processing Leading Byte */
3681 charset = CHARSET_BY_LEADING_BYTE (c);
3682 if (LEADING_BYTE_PREFIX_P(c))
3687 { /* Processing Non-ASCII character */
3689 if (EQ (charset, Vcharset_control_1))
3691 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3695 switch (XCHARSET_REP_BYTES (charset))
3698 encode_ucs4 (charset, c, 0, dst);
3701 if (XCHARSET_PRIVATE_P (charset))
3703 encode_ucs4 (charset, c, 0, dst);
3708 #ifdef ENABLE_COMPOSITE_CHARS
3709 if (EQ (charset, Vcharset_composite))
3713 /* #### Bother! We don't know how to
3715 Dynarr_add (dst, 0);
3716 Dynarr_add (dst, 0);
3717 Dynarr_add (dst, 0);
3718 Dynarr_add (dst, '~');
3722 Emchar emch = MAKE_CHAR (Vcharset_composite,
3723 ch & 0x7F, c & 0x7F);
3724 Lisp_Object lstr = composite_char_string (emch);
3728 src = XSTRING_DATA (lstr);
3729 n = XSTRING_LENGTH (lstr);
3733 #endif /* ENABLE_COMPOSITE_CHARS */
3735 encode_ucs4(charset, ch, c, dst);
3748 encode_ucs4 (charset, ch, c, dst);
3764 #ifdef ENABLE_COMPOSITE_CHARS
3770 goto back_to_square_n; /* Wheeeeeeeee ..... */
3772 #endif /* ENABLE_COMPOSITE_CHARS */
3776 str->iso2022.current_char_boundary = char_boundary;
3777 str->iso2022.current_charset = charset;
3779 /* Verbum caro factum est! */
3783 /************************************************************************/
3785 /************************************************************************/
3788 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3793 unsigned char c = *src++;
3794 switch (st->utf8.in_byte)
3797 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3800 st->utf8.in_byte = 5;
3802 st->utf8.in_byte = 4;
3804 st->utf8.in_byte = 3;
3806 st->utf8.in_byte = 2;
3808 st->utf8.in_byte = 1;
3813 if ((c & 0xc0) != 0x80)
3819 return CODING_CATEGORY_UTF8_MASK;
3823 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3824 unsigned_char_dynarr *dst, unsigned int n)
3826 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3827 unsigned int flags = str->flags;
3828 unsigned int ch = str->ch;
3829 eol_type_t eol_type = str->eol_type;
3830 unsigned char counter = str->counter;
3834 unsigned char c = *src++;
3843 else if ( c >= 0xf8 )
3848 else if ( c >= 0xf0 )
3853 else if ( c >= 0xe0 )
3858 else if ( c >= 0xc0 )
3865 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3866 decode_ucs4 (c, dst);
3870 ch = ( ch << 6 ) | ( c & 0x3f );
3871 decode_ucs4 (ch, dst);
3876 ch = ( ch << 6 ) | ( c & 0x3f );
3879 label_continue_loop:;
3882 if (flags & CODING_STATE_END)
3883 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3887 str->counter = counter;
3891 encode_utf8 (Lisp_Object charset,
3892 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3894 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3897 Dynarr_add (dst, code);
3899 else if ( code <= 0x7ff )
3901 Dynarr_add (dst, (code >> 6) | 0xc0);
3902 Dynarr_add (dst, (code & 0x3f) | 0x80);
3904 else if ( code <= 0xffff )
3906 Dynarr_add (dst, (code >> 12) | 0xe0);
3907 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3908 Dynarr_add (dst, (code & 0x3f) | 0x80);
3910 else if ( code <= 0x1fffff )
3912 Dynarr_add (dst, (code >> 18) | 0xf0);
3913 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3914 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3915 Dynarr_add (dst, (code & 0x3f) | 0x80);
3917 else if ( code <= 0x3ffffff )
3919 Dynarr_add (dst, (code >> 24) | 0xf8);
3920 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3921 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3922 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3923 Dynarr_add (dst, (code & 0x3f) | 0x80);
3927 Dynarr_add (dst, (code >> 30) | 0xfc);
3928 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3929 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3930 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3931 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3932 Dynarr_add (dst, (code & 0x3f) | 0x80);
3937 encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
3938 unsigned_char_dynarr *dst, unsigned int n)
3940 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3941 unsigned int flags = str->flags;
3942 unsigned int ch = str->ch;
3943 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3944 unsigned char char_boundary = str->iso2022.current_char_boundary;
3945 Lisp_Object charset = str->iso2022.current_charset;
3947 #ifdef ENABLE_COMPOSITE_CHARS
3948 /* flags for handling composite chars. We do a little switcharoo
3949 on the source while we're outputting the composite char. */
3950 unsigned int saved_n = 0;
3951 const unsigned char *saved_src = NULL;
3952 int in_composite = 0;
3955 #endif /* ENABLE_COMPOSITE_CHARS */
3959 unsigned char c = *src++;
3961 if (BYTE_ASCII_P (c))
3962 { /* Processing ASCII character */
3966 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3967 Dynarr_add (dst, '\r');
3968 if (eol_type != EOL_CR)
3969 Dynarr_add (dst, c);
3972 encode_utf8 (Vcharset_ascii, c, 0, dst);
3975 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3976 { /* Processing Leading Byte */
3978 charset = CHARSET_BY_LEADING_BYTE (c);
3979 if (LEADING_BYTE_PREFIX_P(c))
3984 { /* Processing Non-ASCII character */
3986 if (EQ (charset, Vcharset_control_1))
3988 encode_utf8 (Vcharset_control_1, c, 0, dst);
3992 switch (XCHARSET_REP_BYTES (charset))
3995 encode_utf8 (charset, c, 0, dst);
3998 if (XCHARSET_PRIVATE_P (charset))
4000 encode_utf8 (charset, c, 0, dst);
4005 #ifdef ENABLE_COMPOSITE_CHARS
4006 if (EQ (charset, Vcharset_composite))
4010 /* #### Bother! We don't know how to
4012 encode_utf8 (Vcharset_ascii, '~', 0, dst);
4016 Emchar emch = MAKE_CHAR (Vcharset_composite,
4017 ch & 0x7F, c & 0x7F);
4018 Lisp_Object lstr = composite_char_string (emch);
4022 src = XSTRING_DATA (lstr);
4023 n = XSTRING_LENGTH (lstr);
4027 #endif /* ENABLE_COMPOSITE_CHARS */
4029 encode_utf8 (charset, ch, c, dst);
4042 encode_utf8 (charset, ch, c, dst);
4058 #ifdef ENABLE_COMPOSITE_CHARS
4064 goto back_to_square_n; /* Wheeeeeeeee ..... */
4070 str->iso2022.current_char_boundary = char_boundary;
4071 str->iso2022.current_charset = charset;
4073 /* Verbum caro factum est! */
4077 /************************************************************************/
4078 /* ISO2022 methods */
4079 /************************************************************************/
4081 /* The following note describes the coding system ISO2022 briefly.
4082 Since the intention of this note is to help understand the
4083 functions in this file, some parts are NOT ACCURATE or OVERLY
4084 SIMPLIFIED. For thorough understanding, please refer to the
4085 original document of ISO2022.
4087 ISO2022 provides many mechanisms to encode several character sets
4088 in 7-bit and 8-bit environments. For 7-bit environments, all text
4089 is encoded using bytes less than 128. This may make the encoded
4090 text a little bit longer, but the text passes more easily through
4091 several gateways, some of which strip off MSB (Most Signigant Bit).
4093 There are two kinds of character sets: control character set and
4094 graphic character set. The former contains control characters such
4095 as `newline' and `escape' to provide control functions (control
4096 functions are also provided by escape sequences). The latter
4097 contains graphic characters such as 'A' and '-'. Emacs recognizes
4098 two control character sets and many graphic character sets.
4100 Graphic character sets are classified into one of the following
4101 four classes, according to the number of bytes (DIMENSION) and
4102 number of characters in one dimension (CHARS) of the set:
4103 - DIMENSION1_CHARS94
4104 - DIMENSION1_CHARS96
4105 - DIMENSION2_CHARS94
4106 - DIMENSION2_CHARS96
4108 In addition, each character set is assigned an identification tag,
4109 unique for each set, called "final character" (denoted as <F>
4110 hereafter). The <F> of each character set is decided by ECMA(*)
4111 when it is registered in ISO. The code range of <F> is 0x30..0x7F
4112 (0x30..0x3F are for private use only).
4114 Note (*): ECMA = European Computer Manufacturers Association
4116 Here are examples of graphic character set [NAME(<F>)]:
4117 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4118 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4119 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4120 o DIMENSION2_CHARS96 -- none for the moment
4122 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4123 C0 [0x00..0x1F] -- control character plane 0
4124 GL [0x20..0x7F] -- graphic character plane 0
4125 C1 [0x80..0x9F] -- control character plane 1
4126 GR [0xA0..0xFF] -- graphic character plane 1
4128 A control character set is directly designated and invoked to C0 or
4129 C1 by an escape sequence. The most common case is that:
4130 - ISO646's control character set is designated/invoked to C0, and
4131 - ISO6429's control character set is designated/invoked to C1,
4132 and usually these designations/invocations are omitted in encoded
4133 text. In a 7-bit environment, only C0 can be used, and a control
4134 character for C1 is encoded by an appropriate escape sequence to
4135 fit into the environment. All control characters for C1 are
4136 defined to have corresponding escape sequences.
4138 A graphic character set is at first designated to one of four
4139 graphic registers (G0 through G3), then these graphic registers are
4140 invoked to GL or GR. These designations and invocations can be
4141 done independently. The most common case is that G0 is invoked to
4142 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
4143 these invocations and designations are omitted in encoded text.
4144 In a 7-bit environment, only GL can be used.
4146 When a graphic character set of CHARS94 is invoked to GL, codes
4147 0x20 and 0x7F of the GL area work as control characters SPACE and
4148 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4151 There are two ways of invocation: locking-shift and single-shift.
4152 With locking-shift, the invocation lasts until the next different
4153 invocation, whereas with single-shift, the invocation affects the
4154 following character only and doesn't affect the locking-shift
4155 state. Invocations are done by the following control characters or
4158 ----------------------------------------------------------------------
4159 abbrev function cntrl escape seq description
4160 ----------------------------------------------------------------------
4161 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4162 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4163 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4164 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4165 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4166 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4167 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4168 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4169 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4170 ----------------------------------------------------------------------
4171 (*) These are not used by any known coding system.
4173 Control characters for these functions are defined by macros
4174 ISO_CODE_XXX in `coding.h'.
4176 Designations are done by the following escape sequences:
4177 ----------------------------------------------------------------------
4178 escape sequence description
4179 ----------------------------------------------------------------------
4180 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4181 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4182 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4183 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4184 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4185 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4186 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4187 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4188 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4189 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4190 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4191 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4192 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4193 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4194 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4195 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4196 ----------------------------------------------------------------------
4198 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4199 of dimension 1, chars 94, and final character <F>, etc...
4201 Note (*): Although these designations are not allowed in ISO2022,
4202 Emacs accepts them on decoding, and produces them on encoding
4203 CHARS96 character sets in a coding system which is characterized as
4204 7-bit environment, non-locking-shift, and non-single-shift.
4206 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4207 '(' can be omitted. We refer to this as "short-form" hereafter.
4209 Now you may notice that there are a lot of ways for encoding the
4210 same multilingual text in ISO2022. Actually, there exist many
4211 coding systems such as Compound Text (used in X11's inter client
4212 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4213 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4214 localized platforms), and all of these are variants of ISO2022.
4216 In addition to the above, Emacs handles two more kinds of escape
4217 sequences: ISO6429's direction specification and Emacs' private
4218 sequence for specifying character composition.
4220 ISO6429's direction specification takes the following form:
4221 o CSI ']' -- end of the current direction
4222 o CSI '0' ']' -- end of the current direction
4223 o CSI '1' ']' -- start of left-to-right text
4224 o CSI '2' ']' -- start of right-to-left text
4225 The control character CSI (0x9B: control sequence introducer) is
4226 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4228 Character composition specification takes the following form:
4229 o ESC '0' -- start character composition
4230 o ESC '1' -- end character composition
4231 Since these are not standard escape sequences of any ISO standard,
4232 their use with these meanings is restricted to Emacs only. */
4235 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4239 for (i = 0; i < 4; i++)
4241 if (!NILP (coding_system))
4243 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4245 iso->charset[i] = Qt;
4246 iso->invalid_designated[i] = 0;
4248 iso->esc = ISO_ESC_NOTHING;
4249 iso->esc_bytes_index = 0;
4250 iso->register_left = 0;
4251 iso->register_right = 1;
4252 iso->switched_dir_and_no_valid_charset_yet = 0;
4253 iso->invalid_switch_dir = 0;
4254 iso->output_direction_sequence = 0;
4255 iso->output_literally = 0;
4256 #ifdef ENABLE_COMPOSITE_CHARS
4257 if (iso->composite_chars)
4258 Dynarr_reset (iso->composite_chars);
4263 fit_to_be_escape_quoted (unsigned char c)
4280 /* Parse one byte of an ISO2022 escape sequence.
4281 If the result is an invalid escape sequence, return 0 and
4282 do not change anything in STR. Otherwise, if the result is
4283 an incomplete escape sequence, update ISO2022.ESC and
4284 ISO2022.ESC_BYTES and return -1. Otherwise, update
4285 all the state variables (but not ISO2022.ESC_BYTES) and
4288 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4289 or invocation of an invalid character set and treat that as
4290 an unrecognized escape sequence. */
4293 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4294 unsigned char c, unsigned int *flags,
4295 int check_invalid_charsets)
4297 /* (1) If we're at the end of a designation sequence, CS is the
4298 charset being designated and REG is the register to designate
4301 (2) If we're at the end of a locking-shift sequence, REG is
4302 the register to invoke and HALF (0 == left, 1 == right) is
4303 the half to invoke it into.
4305 (3) If we're at the end of a single-shift sequence, REG is
4306 the register to invoke. */
4307 Lisp_Object cs = Qnil;
4310 /* NOTE: This code does goto's all over the fucking place.
4311 The reason for this is that we're basically implementing
4312 a state machine here, and hierarchical languages like C
4313 don't really provide a clean way of doing this. */
4315 if (! (*flags & CODING_STATE_ESCAPE))
4316 /* At beginning of escape sequence; we need to reset our
4317 escape-state variables. */
4318 iso->esc = ISO_ESC_NOTHING;
4320 iso->output_literally = 0;
4321 iso->output_direction_sequence = 0;
4325 case ISO_ESC_NOTHING:
4326 iso->esc_bytes_index = 0;
4329 case ISO_CODE_ESC: /* Start escape sequence */
4330 *flags |= CODING_STATE_ESCAPE;
4334 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4335 *flags |= CODING_STATE_ESCAPE;
4336 iso->esc = ISO_ESC_5_11;
4339 case ISO_CODE_SO: /* locking shift 1 */
4342 case ISO_CODE_SI: /* locking shift 0 */
4346 case ISO_CODE_SS2: /* single shift */
4349 case ISO_CODE_SS3: /* single shift */
4353 default: /* Other control characters */
4360 /**** single shift ****/
4362 case 'N': /* single shift 2 */
4365 case 'O': /* single shift 3 */
4369 /**** locking shift ****/
4371 case '~': /* locking shift 1 right */
4374 case 'n': /* locking shift 2 */
4377 case '}': /* locking shift 2 right */
4380 case 'o': /* locking shift 3 */
4383 case '|': /* locking shift 3 right */
4387 #ifdef ENABLE_COMPOSITE_CHARS
4388 /**** composite ****/
4391 iso->esc = ISO_ESC_START_COMPOSITE;
4392 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4393 CODING_STATE_COMPOSITE;
4397 iso->esc = ISO_ESC_END_COMPOSITE;
4398 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4399 ~CODING_STATE_COMPOSITE;
4401 #endif /* ENABLE_COMPOSITE_CHARS */
4403 /**** directionality ****/
4406 iso->esc = ISO_ESC_5_11;
4409 /**** designation ****/
4411 case '$': /* multibyte charset prefix */
4412 iso->esc = ISO_ESC_2_4;
4416 if (0x28 <= c && c <= 0x2F)
4418 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4422 /* This function is called with CODESYS equal to nil when
4423 doing coding-system detection. */
4425 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4426 && fit_to_be_escape_quoted (c))
4428 iso->esc = ISO_ESC_LITERAL;
4429 *flags &= CODING_STATE_ISO2022_LOCK;
4439 /**** directionality ****/
4441 case ISO_ESC_5_11: /* ISO6429 direction control */
4444 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4445 goto directionality;
4447 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4448 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4449 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4453 case ISO_ESC_5_11_0:
4456 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4457 goto directionality;
4461 case ISO_ESC_5_11_1:
4464 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4465 goto directionality;
4469 case ISO_ESC_5_11_2:
4472 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4473 goto directionality;
4478 iso->esc = ISO_ESC_DIRECTIONALITY;
4479 /* Various junk here to attempt to preserve the direction sequences
4480 literally in the text if they would otherwise be swallowed due
4481 to invalid designations that don't show up as actual charset
4482 changes in the text. */
4483 if (iso->invalid_switch_dir)
4485 /* We already inserted a direction switch literally into the
4486 text. We assume (#### this may not be right) that the
4487 next direction switch is the one going the other way,
4488 and we need to output that literally as well. */
4489 iso->output_literally = 1;
4490 iso->invalid_switch_dir = 0;
4496 /* If we are in the thrall of an invalid designation,
4497 then stick the directionality sequence literally into the
4498 output stream so it ends up in the original text again. */
4499 for (jj = 0; jj < 4; jj++)
4500 if (iso->invalid_designated[jj])
4504 iso->output_literally = 1;
4505 iso->invalid_switch_dir = 1;
4508 /* Indicate that we haven't yet seen a valid designation,
4509 so that if a switch-dir is directly followed by an
4510 invalid designation, both get inserted literally. */
4511 iso->switched_dir_and_no_valid_charset_yet = 1;
4516 /**** designation ****/
4519 if (0x28 <= c && c <= 0x2F)
4521 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4524 if (0x40 <= c && c <= 0x42)
4526 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4527 *flags & CODING_STATE_R2L ?
4528 CHARSET_RIGHT_TO_LEFT :
4529 CHARSET_LEFT_TO_RIGHT);
4539 if (c < '0' || c > '~')
4540 return 0; /* bad final byte */
4542 if (iso->esc >= ISO_ESC_2_8 &&
4543 iso->esc <= ISO_ESC_2_15)
4545 type = ((iso->esc >= ISO_ESC_2_12) ?
4546 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4547 reg = (iso->esc - ISO_ESC_2_8) & 3;
4549 else if (iso->esc >= ISO_ESC_2_4_8 &&
4550 iso->esc <= ISO_ESC_2_4_15)
4552 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4553 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4554 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4558 /* Can this ever be reached? -slb */
4562 cs = CHARSET_BY_ATTRIBUTES (type, c,
4563 *flags & CODING_STATE_R2L ?
4564 CHARSET_RIGHT_TO_LEFT :
4565 CHARSET_LEFT_TO_RIGHT);
4571 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4575 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4576 /* can't invoke something that ain't there. */
4578 iso->esc = ISO_ESC_SINGLE_SHIFT;
4579 *flags &= CODING_STATE_ISO2022_LOCK;
4581 *flags |= CODING_STATE_SS2;
4583 *flags |= CODING_STATE_SS3;
4587 if (check_invalid_charsets &&
4588 !CHARSETP (iso->charset[reg]))
4589 /* can't invoke something that ain't there. */
4592 iso->register_right = reg;
4594 iso->register_left = reg;
4595 *flags &= CODING_STATE_ISO2022_LOCK;
4596 iso->esc = ISO_ESC_LOCKING_SHIFT;
4600 if (NILP (cs) && check_invalid_charsets)
4602 iso->invalid_designated[reg] = 1;
4603 iso->charset[reg] = Vcharset_ascii;
4604 iso->esc = ISO_ESC_DESIGNATE;
4605 *flags &= CODING_STATE_ISO2022_LOCK;
4606 iso->output_literally = 1;
4607 if (iso->switched_dir_and_no_valid_charset_yet)
4609 /* We encountered a switch-direction followed by an
4610 invalid designation. Ensure that the switch-direction
4611 gets outputted; otherwise it will probably get eaten
4612 when the text is written out again. */
4613 iso->switched_dir_and_no_valid_charset_yet = 0;
4614 iso->output_direction_sequence = 1;
4615 /* And make sure that the switch-dir going the other
4616 way gets outputted, as well. */
4617 iso->invalid_switch_dir = 1;
4621 /* This function is called with CODESYS equal to nil when
4622 doing coding-system detection. */
4623 if (!NILP (codesys))
4625 charset_conversion_spec_dynarr *dyn =
4626 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4632 for (i = 0; i < Dynarr_length (dyn); i++)
4634 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4635 if (EQ (cs, spec->from_charset))
4636 cs = spec->to_charset;
4641 iso->charset[reg] = cs;
4642 iso->esc = ISO_ESC_DESIGNATE;
4643 *flags &= CODING_STATE_ISO2022_LOCK;
4644 if (iso->invalid_designated[reg])
4646 iso->invalid_designated[reg] = 0;
4647 iso->output_literally = 1;
4649 if (iso->switched_dir_and_no_valid_charset_yet)
4650 iso->switched_dir_and_no_valid_charset_yet = 0;
4655 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4660 /* #### There are serious deficiencies in the recognition mechanism
4661 here. This needs to be much smarter if it's going to cut it.
4662 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4663 it should be detected as Latin-1.
4664 All the ISO2022 stuff in this file should be synced up with the
4665 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4666 Perhaps we should wait till R2L works in FSF Emacs? */
4668 if (!st->iso2022.initted)
4670 reset_iso2022 (Qnil, &st->iso2022.iso);
4671 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4672 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4673 CODING_CATEGORY_ISO_8_1_MASK |
4674 CODING_CATEGORY_ISO_8_2_MASK |
4675 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4676 st->iso2022.flags = 0;
4677 st->iso2022.high_byte_count = 0;
4678 st->iso2022.saw_single_shift = 0;
4679 st->iso2022.initted = 1;
4682 mask = st->iso2022.mask;
4689 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4690 st->iso2022.high_byte_count++;
4694 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4696 if (st->iso2022.high_byte_count & 1)
4697 /* odd number of high bytes; assume not iso-8-2 */
4698 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4700 st->iso2022.high_byte_count = 0;
4701 st->iso2022.saw_single_shift = 0;
4703 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4705 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4706 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4707 { /* control chars */
4710 /* Allow and ignore control characters that you might
4711 reasonably see in a text file */
4716 case 8: /* backspace */
4717 case 11: /* vertical tab */
4718 case 12: /* form feed */
4719 case 26: /* MS-DOS C-z junk */
4720 case 31: /* '^_' -- for info */
4721 goto label_continue_loop;
4728 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4731 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4732 &st->iso2022.flags, 0))
4734 switch (st->iso2022.iso.esc)
4736 case ISO_ESC_DESIGNATE:
4737 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4738 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4740 case ISO_ESC_LOCKING_SHIFT:
4741 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4742 goto ran_out_of_chars;
4743 case ISO_ESC_SINGLE_SHIFT:
4744 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4745 st->iso2022.saw_single_shift = 1;
4754 goto ran_out_of_chars;
4757 label_continue_loop:;
4766 postprocess_iso2022_mask (int mask)
4768 /* #### kind of cheesy */
4769 /* If seven-bit ISO is allowed, then assume that the encoding is
4770 entirely seven-bit and turn off the eight-bit ones. */
4771 if (mask & CODING_CATEGORY_ISO_7_MASK)
4772 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4773 CODING_CATEGORY_ISO_8_1_MASK |
4774 CODING_CATEGORY_ISO_8_2_MASK);
4778 /* If FLAGS is a null pointer or specifies right-to-left motion,
4779 output a switch-dir-to-left-to-right sequence to DST.
4780 Also update FLAGS if it is not a null pointer.
4781 If INTERNAL_P is set, we are outputting in internal format and
4782 need to handle the CSI differently. */
4785 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4786 unsigned_char_dynarr *dst,
4787 unsigned int *flags,
4790 if (!flags || (*flags & CODING_STATE_R2L))
4792 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4794 Dynarr_add (dst, ISO_CODE_ESC);
4795 Dynarr_add (dst, '[');
4797 else if (internal_p)
4798 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4800 Dynarr_add (dst, ISO_CODE_CSI);
4801 Dynarr_add (dst, '0');
4802 Dynarr_add (dst, ']');
4804 *flags &= ~CODING_STATE_R2L;
4808 /* If FLAGS is a null pointer or specifies a direction different from
4809 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4810 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4811 sequence to DST. Also update FLAGS if it is not a null pointer.
4812 If INTERNAL_P is set, we are outputting in internal format and
4813 need to handle the CSI differently. */
4816 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4817 unsigned_char_dynarr *dst, unsigned int *flags,
4820 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4821 direction == CHARSET_LEFT_TO_RIGHT)
4822 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4823 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4824 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4825 direction == CHARSET_RIGHT_TO_LEFT)
4827 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4829 Dynarr_add (dst, ISO_CODE_ESC);
4830 Dynarr_add (dst, '[');
4832 else if (internal_p)
4833 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4835 Dynarr_add (dst, ISO_CODE_CSI);
4836 Dynarr_add (dst, '2');
4837 Dynarr_add (dst, ']');
4839 *flags |= CODING_STATE_R2L;
4843 /* Convert ISO2022-format data to internal format. */
4846 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4847 unsigned_char_dynarr *dst, unsigned int n)
4849 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4850 unsigned int flags = str->flags;
4851 unsigned int ch = str->ch;
4852 eol_type_t eol_type = str->eol_type;
4853 #ifdef ENABLE_COMPOSITE_CHARS
4854 unsigned_char_dynarr *real_dst = dst;
4856 Lisp_Object coding_system;
4858 XSETCODING_SYSTEM (coding_system, str->codesys);
4860 #ifdef ENABLE_COMPOSITE_CHARS
4861 if (flags & CODING_STATE_COMPOSITE)
4862 dst = str->iso2022.composite_chars;
4863 #endif /* ENABLE_COMPOSITE_CHARS */
4867 unsigned char c = *src++;
4868 if (flags & CODING_STATE_ESCAPE)
4869 { /* Within ESC sequence */
4870 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4875 switch (str->iso2022.esc)
4877 #ifdef ENABLE_COMPOSITE_CHARS
4878 case ISO_ESC_START_COMPOSITE:
4879 if (str->iso2022.composite_chars)
4880 Dynarr_reset (str->iso2022.composite_chars);
4882 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4883 dst = str->iso2022.composite_chars;
4885 case ISO_ESC_END_COMPOSITE:
4887 Bufbyte comstr[MAX_EMCHAR_LEN];
4889 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4890 Dynarr_length (dst));
4892 len = set_charptr_emchar (comstr, emch);
4893 Dynarr_add_many (dst, comstr, len);
4896 #endif /* ENABLE_COMPOSITE_CHARS */
4898 case ISO_ESC_LITERAL:
4899 DECODE_ADD_BINARY_CHAR (c, dst);
4903 /* Everything else handled already */
4908 /* Attempted error recovery. */
4909 if (str->iso2022.output_direction_sequence)
4910 ensure_correct_direction (flags & CODING_STATE_R2L ?
4911 CHARSET_RIGHT_TO_LEFT :
4912 CHARSET_LEFT_TO_RIGHT,
4913 str->codesys, dst, 0, 1);
4914 /* More error recovery. */
4915 if (!retval || str->iso2022.output_literally)
4917 /* Output the (possibly invalid) sequence */
4919 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4920 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4921 flags &= CODING_STATE_ISO2022_LOCK;
4923 n++, src--;/* Repeat the loop with the same character. */
4926 /* No sense in reprocessing the final byte of the
4927 escape sequence; it could mess things up anyway.
4929 DECODE_ADD_BINARY_CHAR (c, dst);
4934 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4935 { /* Control characters */
4937 /***** Error-handling *****/
4939 /* If we were in the middle of a character, dump out the
4940 partial character. */
4941 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4943 /* If we just saw a single-shift character, dump it out.
4944 This may dump out the wrong sort of single-shift character,
4945 but least it will give an indication that something went
4947 if (flags & CODING_STATE_SS2)
4949 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4950 flags &= ~CODING_STATE_SS2;
4952 if (flags & CODING_STATE_SS3)
4954 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4955 flags &= ~CODING_STATE_SS3;
4958 /***** Now handle the control characters. *****/
4961 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4963 flags &= CODING_STATE_ISO2022_LOCK;
4965 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4966 DECODE_ADD_BINARY_CHAR (c, dst);
4969 { /* Graphic characters */
4970 Lisp_Object charset;
4974 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4976 /* Now determine the charset. */
4977 reg = ((flags & CODING_STATE_SS2) ? 2
4978 : (flags & CODING_STATE_SS3) ? 3
4979 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4980 : str->iso2022.register_left);
4981 charset = str->iso2022.charset[reg];
4983 /* Error checking: */
4984 if (! CHARSETP (charset)
4985 || str->iso2022.invalid_designated[reg]
4986 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4987 && XCHARSET_CHARS (charset) == 94))
4988 /* Mrmph. We are trying to invoke a register that has no
4989 or an invalid charset in it, or trying to add a character
4990 outside the range of the charset. Insert that char literally
4991 to preserve it for the output. */
4993 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4994 DECODE_ADD_BINARY_CHAR (c, dst);
4999 /* Things are probably hunky-dorey. */
5001 /* Fetch reverse charset, maybe. */
5002 if (((flags & CODING_STATE_R2L) &&
5003 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5005 (!(flags & CODING_STATE_R2L) &&
5006 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5008 Lisp_Object new_charset =
5009 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5010 if (!NILP (new_charset))
5011 charset = new_charset;
5014 lb = XCHARSET_LEADING_BYTE (charset);
5015 switch (XCHARSET_REP_BYTES (charset))
5018 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5019 Dynarr_add (dst, c & 0x7F);
5022 case 2: /* one-byte official */
5023 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5024 Dynarr_add (dst, lb);
5025 Dynarr_add (dst, c | 0x80);
5028 case 3: /* one-byte private or two-byte official */
5029 if (XCHARSET_PRIVATE_P (charset))
5031 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5032 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5033 Dynarr_add (dst, lb);
5034 Dynarr_add (dst, c | 0x80);
5040 Dynarr_add (dst, lb);
5041 Dynarr_add (dst, ch | 0x80);
5042 Dynarr_add (dst, c | 0x80);
5050 default: /* two-byte private */
5053 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5054 Dynarr_add (dst, lb);
5055 Dynarr_add (dst, ch | 0x80);
5056 Dynarr_add (dst, c | 0x80);
5065 flags &= CODING_STATE_ISO2022_LOCK;
5068 label_continue_loop:;
5071 if (flags & CODING_STATE_END)
5072 DECODE_OUTPUT_PARTIAL_CHAR (ch);
5079 /***** ISO2022 encoder *****/
5081 /* Designate CHARSET into register REG. */
5084 iso2022_designate (Lisp_Object charset, unsigned char reg,
5085 struct encoding_stream *str, unsigned_char_dynarr *dst)
5087 static const char inter94[] = "()*+";
5088 static const char inter96[] = ",-./";
5090 unsigned char final;
5091 Lisp_Object old_charset = str->iso2022.charset[reg];
5093 str->iso2022.charset[reg] = charset;
5094 if (!CHARSETP (charset))
5095 /* charset might be an initial nil or t. */
5097 type = XCHARSET_TYPE (charset);
5098 final = XCHARSET_FINAL (charset);
5099 if (!str->iso2022.force_charset_on_output[reg] &&
5100 CHARSETP (old_charset) &&
5101 XCHARSET_TYPE (old_charset) == type &&
5102 XCHARSET_FINAL (old_charset) == final)
5105 str->iso2022.force_charset_on_output[reg] = 0;
5108 charset_conversion_spec_dynarr *dyn =
5109 str->codesys->iso2022.output_conv;
5115 for (i = 0; i < Dynarr_length (dyn); i++)
5117 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5118 if (EQ (charset, spec->from_charset))
5119 charset = spec->to_charset;
5124 Dynarr_add (dst, ISO_CODE_ESC);
5127 case CHARSET_TYPE_94:
5128 Dynarr_add (dst, inter94[reg]);
5130 case CHARSET_TYPE_96:
5131 Dynarr_add (dst, inter96[reg]);
5133 case CHARSET_TYPE_94X94:
5134 Dynarr_add (dst, '$');
5136 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5139 Dynarr_add (dst, inter94[reg]);
5141 case CHARSET_TYPE_96X96:
5142 Dynarr_add (dst, '$');
5143 Dynarr_add (dst, inter96[reg]);
5146 Dynarr_add (dst, final);
5150 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5152 if (str->iso2022.register_left != 0)
5154 Dynarr_add (dst, ISO_CODE_SI);
5155 str->iso2022.register_left = 0;
5160 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5162 if (str->iso2022.register_left != 1)
5164 Dynarr_add (dst, ISO_CODE_SO);
5165 str->iso2022.register_left = 1;
5169 /* Convert internally-formatted data to ISO2022 format. */
5172 encode_coding_iso2022 (Lstream *encoding, const unsigned char *src,
5173 unsigned_char_dynarr *dst, unsigned int n)
5175 unsigned char charmask, c;
5176 unsigned char char_boundary;
5177 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5178 unsigned int flags = str->flags;
5179 unsigned int ch = str->ch;
5180 Lisp_Coding_System *codesys = str->codesys;
5181 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5183 Lisp_Object charset;
5186 #ifdef ENABLE_COMPOSITE_CHARS
5187 /* flags for handling composite chars. We do a little switcharoo
5188 on the source while we're outputting the composite char. */
5189 unsigned int saved_n = 0;
5190 const unsigned char *saved_src = NULL;
5191 int in_composite = 0;
5192 #endif /* ENABLE_COMPOSITE_CHARS */
5194 char_boundary = str->iso2022.current_char_boundary;
5195 charset = str->iso2022.current_charset;
5196 half = str->iso2022.current_half;
5198 #ifdef ENABLE_COMPOSITE_CHARS
5205 if (BYTE_ASCII_P (c))
5206 { /* Processing ASCII character */
5209 restore_left_to_right_direction (codesys, dst, &flags, 0);
5211 /* Make sure G0 contains ASCII */
5212 if ((c > ' ' && c < ISO_CODE_DEL) ||
5213 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5215 ensure_normal_shift (str, dst);
5216 iso2022_designate (Vcharset_ascii, 0, str, dst);
5219 /* If necessary, restore everything to the default state
5222 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5224 restore_left_to_right_direction (codesys, dst, &flags, 0);
5226 ensure_normal_shift (str, dst);
5228 for (i = 0; i < 4; i++)
5230 Lisp_Object initial_charset =
5231 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5232 iso2022_designate (initial_charset, i, str, dst);
5237 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5238 Dynarr_add (dst, '\r');
5239 if (eol_type != EOL_CR)
5240 Dynarr_add (dst, c);
5244 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5245 && fit_to_be_escape_quoted (c))
5246 Dynarr_add (dst, ISO_CODE_ESC);
5247 Dynarr_add (dst, c);
5252 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5253 { /* Processing Leading Byte */
5255 charset = CHARSET_BY_LEADING_BYTE (c);
5256 if (LEADING_BYTE_PREFIX_P(c))
5258 else if (!EQ (charset, Vcharset_control_1)
5259 #ifdef ENABLE_COMPOSITE_CHARS
5260 && !EQ (charset, Vcharset_composite)
5266 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5267 codesys, dst, &flags, 0);
5269 /* Now determine which register to use. */
5271 for (i = 0; i < 4; i++)
5273 if (EQ (charset, str->iso2022.charset[i]) ||
5275 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5284 if (XCHARSET_GRAPHIC (charset) != 0)
5286 if (!NILP (str->iso2022.charset[1]) &&
5287 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5288 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5290 else if (!NILP (str->iso2022.charset[2]))
5292 else if (!NILP (str->iso2022.charset[3]))
5301 iso2022_designate (charset, reg, str, dst);
5303 /* Now invoke that register. */
5307 ensure_normal_shift (str, dst);
5312 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5314 ensure_shift_out (str, dst);
5322 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5324 Dynarr_add (dst, ISO_CODE_ESC);
5325 Dynarr_add (dst, 'N');
5330 Dynarr_add (dst, ISO_CODE_SS2);
5336 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5338 Dynarr_add (dst, ISO_CODE_ESC);
5339 Dynarr_add (dst, 'O');
5344 Dynarr_add (dst, ISO_CODE_SS3);
5356 { /* Processing Non-ASCII character */
5357 charmask = (half == 0 ? 0x7F : 0xFF);
5359 if (EQ (charset, Vcharset_control_1))
5361 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5362 && fit_to_be_escape_quoted (c))
5363 Dynarr_add (dst, ISO_CODE_ESC);
5364 /* you asked for it ... */
5365 Dynarr_add (dst, c - 0x20);
5369 switch (XCHARSET_REP_BYTES (charset))
5372 Dynarr_add (dst, c & charmask);
5375 if (XCHARSET_PRIVATE_P (charset))
5377 Dynarr_add (dst, c & charmask);
5382 #ifdef ENABLE_COMPOSITE_CHARS
5383 if (EQ (charset, Vcharset_composite))
5387 /* #### Bother! We don't know how to
5389 Dynarr_add (dst, '~');
5393 Emchar emch = MAKE_CHAR (Vcharset_composite,
5394 ch & 0x7F, c & 0x7F);
5395 Lisp_Object lstr = composite_char_string (emch);
5399 src = XSTRING_DATA (lstr);
5400 n = XSTRING_LENGTH (lstr);
5401 Dynarr_add (dst, ISO_CODE_ESC);
5402 Dynarr_add (dst, '0'); /* start composing */
5406 #endif /* ENABLE_COMPOSITE_CHARS */
5408 Dynarr_add (dst, ch & charmask);
5409 Dynarr_add (dst, c & charmask);
5422 Dynarr_add (dst, ch & charmask);
5423 Dynarr_add (dst, c & charmask);
5439 #ifdef ENABLE_COMPOSITE_CHARS
5445 Dynarr_add (dst, ISO_CODE_ESC);
5446 Dynarr_add (dst, '1'); /* end composing */
5447 goto back_to_square_n; /* Wheeeeeeeee ..... */
5449 #endif /* ENABLE_COMPOSITE_CHARS */
5451 if (char_boundary && flags & CODING_STATE_END)
5453 restore_left_to_right_direction (codesys, dst, &flags, 0);
5454 ensure_normal_shift (str, dst);
5455 for (i = 0; i < 4; i++)
5457 Lisp_Object initial_charset =
5458 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5459 iso2022_designate (initial_charset, i, str, dst);
5465 str->iso2022.current_char_boundary = char_boundary;
5466 str->iso2022.current_charset = charset;
5467 str->iso2022.current_half = half;
5469 /* Verbum caro factum est! */
5473 /************************************************************************/
5474 /* No-conversion methods */
5475 /************************************************************************/
5477 /* This is used when reading in "binary" files -- i.e. files that may
5478 contain all 256 possible byte values and that are not to be
5479 interpreted as being in any particular decoding. */
5481 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5482 unsigned_char_dynarr *dst, unsigned int n)
5485 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5486 unsigned int flags = str->flags;
5487 unsigned int ch = str->ch;
5488 eol_type_t eol_type = str->eol_type;
5494 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5495 DECODE_ADD_BINARY_CHAR (c, dst);
5496 label_continue_loop:;
5499 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5506 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5507 unsigned_char_dynarr *dst, unsigned int n)
5510 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5511 unsigned int flags = str->flags;
5512 unsigned int ch = str->ch;
5513 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5520 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5521 Dynarr_add (dst, '\r');
5522 if (eol_type != EOL_CR)
5523 Dynarr_add (dst, '\n');
5526 else if (BYTE_ASCII_P (c))
5529 Dynarr_add (dst, c);
5531 else if (BUFBYTE_LEADING_BYTE_P (c))
5534 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5535 c == LEADING_BYTE_CONTROL_1)
5538 Dynarr_add (dst, '~'); /* untranslatable character */
5542 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5543 Dynarr_add (dst, c);
5544 else if (ch == LEADING_BYTE_CONTROL_1)
5547 Dynarr_add (dst, c - 0x20);
5549 /* else it should be the second or third byte of an
5550 untranslatable character, so ignore it */
5561 /************************************************************************/
5562 /* Initialization */
5563 /************************************************************************/
5566 syms_of_file_coding (void)
5568 INIT_LRECORD_IMPLEMENTATION (coding_system);
5570 deferror (&Qcoding_system_error, "coding-system-error",
5571 "Coding-system error", Qio_error);
5573 DEFSUBR (Fcoding_system_p);
5574 DEFSUBR (Ffind_coding_system);
5575 DEFSUBR (Fget_coding_system);
5576 DEFSUBR (Fcoding_system_list);
5577 DEFSUBR (Fcoding_system_name);
5578 DEFSUBR (Fmake_coding_system);
5579 DEFSUBR (Fcopy_coding_system);
5580 DEFSUBR (Fcoding_system_canonical_name_p);
5581 DEFSUBR (Fcoding_system_alias_p);
5582 DEFSUBR (Fcoding_system_aliasee);
5583 DEFSUBR (Fdefine_coding_system_alias);
5584 DEFSUBR (Fsubsidiary_coding_system);
5586 DEFSUBR (Fcoding_system_type);
5587 DEFSUBR (Fcoding_system_doc_string);
5589 DEFSUBR (Fcoding_system_charset);
5591 DEFSUBR (Fcoding_system_property);
5593 DEFSUBR (Fcoding_category_list);
5594 DEFSUBR (Fset_coding_priority_list);
5595 DEFSUBR (Fcoding_priority_list);
5596 DEFSUBR (Fset_coding_category_system);
5597 DEFSUBR (Fcoding_category_system);
5599 DEFSUBR (Fdetect_coding_region);
5600 DEFSUBR (Fdecode_coding_region);
5601 DEFSUBR (Fencode_coding_region);
5603 DEFSUBR (Fdecode_shift_jis_char);
5604 DEFSUBR (Fencode_shift_jis_char);
5605 DEFSUBR (Fdecode_big5_char);
5606 DEFSUBR (Fencode_big5_char);
5607 DEFSUBR (Fset_ucs_char);
5608 DEFSUBR (Fucs_char);
5609 DEFSUBR (Fset_char_ucs);
5610 DEFSUBR (Fchar_ucs);
5612 defsymbol (&Qcoding_systemp, "coding-system-p");
5613 defsymbol (&Qno_conversion, "no-conversion");
5614 defsymbol (&Qraw_text, "raw-text");
5616 defsymbol (&Qbig5, "big5");
5617 defsymbol (&Qshift_jis, "shift-jis");
5618 defsymbol (&Qucs4, "ucs-4");
5619 defsymbol (&Qutf8, "utf-8");
5620 defsymbol (&Qccl, "ccl");
5621 defsymbol (&Qiso2022, "iso2022");
5623 defsymbol (&Qmnemonic, "mnemonic");
5624 defsymbol (&Qeol_type, "eol-type");
5625 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5626 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5628 defsymbol (&Qcr, "cr");
5629 defsymbol (&Qlf, "lf");
5630 defsymbol (&Qcrlf, "crlf");
5631 defsymbol (&Qeol_cr, "eol-cr");
5632 defsymbol (&Qeol_lf, "eol-lf");
5633 defsymbol (&Qeol_crlf, "eol-crlf");
5635 defsymbol (&Qcharset_g0, "charset-g0");
5636 defsymbol (&Qcharset_g1, "charset-g1");
5637 defsymbol (&Qcharset_g2, "charset-g2");
5638 defsymbol (&Qcharset_g3, "charset-g3");
5639 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5640 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5641 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5642 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5643 defsymbol (&Qno_iso6429, "no-iso6429");
5644 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5645 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5647 defsymbol (&Qshort, "short");
5648 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5649 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5650 defsymbol (&Qseven, "seven");
5651 defsymbol (&Qlock_shift, "lock-shift");
5652 defsymbol (&Qescape_quoted, "escape-quoted");
5654 defsymbol (&Qencode, "encode");
5655 defsymbol (&Qdecode, "decode");
5658 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5660 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5662 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5664 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5666 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5668 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5670 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5672 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5674 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5677 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5682 lstream_type_create_file_coding (void)
5684 LSTREAM_HAS_METHOD (decoding, reader);
5685 LSTREAM_HAS_METHOD (decoding, writer);
5686 LSTREAM_HAS_METHOD (decoding, rewinder);
5687 LSTREAM_HAS_METHOD (decoding, seekable_p);
5688 LSTREAM_HAS_METHOD (decoding, flusher);
5689 LSTREAM_HAS_METHOD (decoding, closer);
5690 LSTREAM_HAS_METHOD (decoding, marker);
5692 LSTREAM_HAS_METHOD (encoding, reader);
5693 LSTREAM_HAS_METHOD (encoding, writer);
5694 LSTREAM_HAS_METHOD (encoding, rewinder);
5695 LSTREAM_HAS_METHOD (encoding, seekable_p);
5696 LSTREAM_HAS_METHOD (encoding, flusher);
5697 LSTREAM_HAS_METHOD (encoding, closer);
5698 LSTREAM_HAS_METHOD (encoding, marker);
5702 vars_of_file_coding (void)
5706 fcd = xnew (struct file_coding_dump);
5707 dumpstruct (&fcd, &fcd_description);
5709 /* Initialize to something reasonable ... */
5710 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5712 fcd->coding_category_system[i] = Qnil;
5713 fcd->coding_category_by_priority[i] = i;
5716 Fprovide (intern ("file-coding"));
5718 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5719 Coding system used for TTY keyboard input.
5720 Not used under a windowing system.
5722 Vkeyboard_coding_system = Qnil;
5724 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5725 Coding system used for TTY display output.
5726 Not used under a windowing system.
5728 Vterminal_coding_system = Qnil;
5730 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5731 Overriding coding system used when reading from a file or process.
5732 You should bind this variable with `let', but do not set it globally.
5733 If this is non-nil, it specifies the coding system that will be used
5734 to decode input on read operations, such as from a file or process.
5735 It overrides `buffer-file-coding-system-for-read',
5736 `insert-file-contents-pre-hook', etc. Use those variables instead of
5737 this one for permanent changes to the environment. */ );
5738 Vcoding_system_for_read = Qnil;
5740 DEFVAR_LISP ("coding-system-for-write",
5741 &Vcoding_system_for_write /*
5742 Overriding coding system used when writing to a file or process.
5743 You should bind this variable with `let', but do not set it globally.
5744 If this is non-nil, it specifies the coding system that will be used
5745 to encode output for write operations, such as to a file or process.
5746 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5747 Use those variables instead of this one for permanent changes to the
5749 Vcoding_system_for_write = Qnil;
5751 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5752 Coding system used to convert pathnames when accessing files.
5754 Vfile_name_coding_system = Qnil;
5756 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5757 Non-nil means the buffer contents are regarded as multi-byte form
5758 of characters, not a binary code. This affects the display, file I/O,
5759 and behaviors of various editing commands.
5761 Setting this to nil does not do anything.
5763 enable_multibyte_characters = 1;
5767 complex_vars_of_file_coding (void)
5769 staticpro (&Vcoding_system_hash_table);
5770 Vcoding_system_hash_table =
5771 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5773 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5774 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5776 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5778 struct codesys_prop csp; \
5780 csp.prop_type = (Prop_Type); \
5781 Dynarr_add (the_codesys_prop_dynarr, csp); \
5784 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5785 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5786 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5787 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5788 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5789 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5790 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5792 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5799 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5800 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5801 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5802 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5803 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5804 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5805 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5806 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5807 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5808 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5810 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5811 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5813 /* Need to create this here or we're really screwed. */
5815 (Qraw_text, Qno_conversion,
5816 build_string ("Raw text, which means it converts only line-break-codes."),
5817 list2 (Qmnemonic, build_string ("Raw")));
5820 (Qbinary, Qno_conversion,
5821 build_string ("Binary, which means it does not convert anything."),
5822 list4 (Qeol_type, Qlf,
5823 Qmnemonic, build_string ("Binary")));
5825 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5827 Fdefine_coding_system_alias (Qfile_name, Qbinary);
5829 Fdefine_coding_system_alias (Qterminal, Qbinary);
5830 Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5832 /* Need this for bootstrapping */
5833 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5834 Fget_coding_system (Qraw_text);
5840 for (i = 0; i < 65536; i++)
5841 fcd->ucs_to_mule_table[i] = Qnil;
5843 staticpro (&mule_to_ucs_table);
5844 mule_to_ucs_table = Fmake_char_table(Qgeneric);