src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #include "opaque.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  50
  51
  52
  53 struct file_coding_dump {
  54   /* Coding system currently associated with each coding category. */
  55   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  56
  57   /* Table of all coding categories in decreasing order of priority.
  58      This describes a permutation of the possible coding categories. */
  59   int coding_category_by_priority[CODING_CATEGORY_LAST];
  60
  61 #ifdef MULE
  62   Lisp_Object ucs_to_mule_table[65536];
  63 #endif
  64 } *fcd;
  65
  66 static const struct lrecord_description fcd_description_1[] = {
  67   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  68 #ifdef MULE
  69   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  70 #endif
  71   { XD_END }
  72 };
  73
  74 static const struct struct_description fcd_description = {
  75   sizeof (struct file_coding_dump),
  76   fcd_description_1
  77 };
  78
  79 Lisp_Object mule_to_ucs_table;
  80
  81 Lisp_Object Qcoding_systemp;
  82
  83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  84 /* Qinternal in general.c */
  85
  86 Lisp_Object Qmnemonic, Qeol_type;
  87 Lisp_Object Qcr, Qcrlf, Qlf;
  88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  89 Lisp_Object Qpost_read_conversion;
  90 Lisp_Object Qpre_write_conversion;
  91
  92 #ifdef MULE
  93 Lisp_Object Qucs4, Qutf8;
  94 Lisp_Object Qbig5, Qshift_jis;
  95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  98 Lisp_Object Qno_iso6429;
  99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 100 Lisp_Object Qescape_quoted;
 101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 102 #endif
 103 Lisp_Object Qencode, Qdecode;
 104
 105 Lisp_Object Vcoding_system_hash_table;
 106
 107 int enable_multibyte_characters;
 108
 109 #ifdef MULE
 110 /* Additional information used by the ISO2022 decoder and detector. */
 111 struct iso2022_decoder
 112 {
 113   /* CHARSET holds the character sets currently assigned to the G0
 114      through G3 variables.  It is initialized from the array
 115      INITIAL_CHARSET in CODESYS. */
 116   Lisp_Object charset[4];
 117
 118   /* Which registers are currently invoked into the left (GL) and
 119      right (GR) halves of the 8-bit encoding space? */
 120   int register_left, register_right;
 121
 122   /* ISO_ESC holds a value indicating part of an escape sequence
 123      that has already been seen. */
 124   enum iso_esc_flag esc;
 125
 126   /* This records the bytes we've seen so far in an escape sequence,
 127      in case the sequence is invalid (we spit out the bytes unchanged). */
 128   unsigned char esc_bytes[8];
 129
 130   /* Index for next byte to store in ISO escape sequence. */
 131   int esc_bytes_index;
 132
 133 #ifdef ENABLE_COMPOSITE_CHARS
 134   /* Stuff seen so far when composing a string. */
 135   unsigned_char_dynarr *composite_chars;
 136 #endif
 137
 138   /* If we saw an invalid designation sequence for a particular
 139      register, we flag it here and switch to ASCII.  The next time we
 140      see a valid designation for this register, we turn off the flag
 141      and do the designation normally, but pretend the sequence was
 142      invalid.  The effect of all this is that (most of the time) the
 143      escape sequences for both the switch to the unknown charset, and
 144      the switch back to the known charset, get inserted literally into
 145      the buffer and saved out as such.  The hope is that we can
 146      preserve the escape sequences so that the resulting written out
 147      file makes sense.  If we don't do any of this, the designation
 148      to the invalid charset will be preserved but that switch back
 149      to the known charset will probably get eaten because it was
 150      the same charset that was already present in the register. */
 151   unsigned char invalid_designated[4];
 152
 153   /* We try to do similar things as above for direction-switching
 154      sequences.  If we encountered a direction switch while an
 155      invalid designation was present, or an invalid designation
 156      just after a direction switch (i.e. no valid designation
 157      encountered yet), we insert the direction-switch escape
 158      sequence literally into the output stream, and later on
 159      insert the corresponding direction-restoring escape sequence
 160      literally also. */
 161   unsigned int switched_dir_and_no_valid_charset_yet :1;
 162   unsigned int invalid_switch_dir :1;
 163
 164   /* Tells the decoder to output the escape sequence literally
 165      even though it was valid.  Used in the games we play to
 166      avoid lossage when we encounter invalid designations. */
 167   unsigned int output_literally :1;
 168   /* We encountered a direction switch followed by an invalid
 169      designation.  We didn't output the direction switch
 170      literally because we didn't know about the invalid designation;
 171      but we have to do so now. */
 172   unsigned int output_direction_sequence :1;
 173 };
 174 #endif /* MULE */
 175 EXFUN (Fcopy_coding_system, 2);
 176 #ifdef MULE
 177 struct detection_state;
 178 static int detect_coding_sjis (struct detection_state *st,
 179                                const Extbyte *src, Lstream_data_count n);
 180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 181                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
 183                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 184 static int detect_coding_big5 (struct detection_state *st,
 185                                const Extbyte *src, Lstream_data_count n);
 186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 187                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
 189                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 190 static int detect_coding_ucs4 (struct detection_state *st,
 191                                const Extbyte *src, Lstream_data_count n);
 192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 193                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
 195                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 196 static int detect_coding_utf8 (struct detection_state *st,
 197                                const Extbyte *src, Lstream_data_count n);
 198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
 201                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   const Extbyte *src, Lstream_data_count n);
 207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 208                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
 210                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 211 #endif /* MULE */
 212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 213                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 215                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 216 static void mule_decode (Lstream *decoding, const Extbyte *src,
 217                          unsigned_char_dynarr *dst, Lstream_data_count n);
 218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 219                          unsigned_char_dynarr *dst, Lstream_data_count n);
 220
 221 typedef struct codesys_prop codesys_prop;
 222 struct codesys_prop
 223 {
 224   Lisp_Object sym;
 225   int prop_type;
 226 };
 227
 228 typedef struct
 229 {
 230   Dynarr_declare (codesys_prop);
 231 } codesys_prop_dynarr;
 232
 233 static const struct lrecord_description codesys_prop_description_1[] = {
 234   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 235   { XD_END }
 236 };
 237
 238 static const struct struct_description codesys_prop_description = {
 239   sizeof (codesys_prop),
 240   codesys_prop_description_1
 241 };
 242
 243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 244   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 245   { XD_END }
 246 };
 247
 248 static const struct struct_description codesys_prop_dynarr_description = {
 249   sizeof (codesys_prop_dynarr),
 250   codesys_prop_dynarr_description_1
 251 };
 252
 253 codesys_prop_dynarr *the_codesys_prop_dynarr;
 254
 255 enum codesys_prop_enum
 256 {
 257   CODESYS_PROP_ALL_OK,
 258   CODESYS_PROP_ISO2022,
 259   CODESYS_PROP_CCL
 260 };
 261
 262 \f
 263 /************************************************************************/
 264 /*                       Coding system functions                        */
 265 /************************************************************************/
 266
 267 static Lisp_Object mark_coding_system (Lisp_Object);
 268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 269 static void finalize_coding_system (void *header, int for_disksave);
 270
 271 #ifdef MULE
 272 static const struct lrecord_description ccs_description_1[] = {
 273   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 274   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 275   { XD_END }
 276 };
 277
 278 static const struct struct_description ccs_description = {
 279   sizeof (charset_conversion_spec),
 280   ccs_description_1
 281 };
 282
 283 static const struct lrecord_description ccsd_description_1[] = {
 284   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 285   { XD_END }
 286 };
 287
 288 static const struct struct_description ccsd_description = {
 289   sizeof (charset_conversion_spec_dynarr),
 290   ccsd_description_1
 291 };
 292 #endif
 293
 294 static const struct lrecord_description coding_system_description[] = {
 295   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 296   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 297   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 298   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 299   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 300   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 301   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 302   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 303 #ifdef MULE
 304   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 305   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 306   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 307   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 308   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 309 #endif
 310   { XD_END }
 311 };
 312
 313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 314                                mark_coding_system, print_coding_system,
 315                                finalize_coding_system,
 316                                0, 0, coding_system_description,
 317                                Lisp_Coding_System);
 318
 319 static Lisp_Object
 320 mark_coding_system (Lisp_Object obj)
 321 {
 322   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 323
 324   mark_object (CODING_SYSTEM_NAME (codesys));
 325   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 326   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 327   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 328   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 329   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 330
 331   switch (CODING_SYSTEM_TYPE (codesys))
 332     {
 333 #ifdef MULE
 334       int i;
 335     case CODESYS_ISO2022:
 336       for (i = 0; i < 4; i++)
 337         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 338       if (codesys->iso2022.input_conv)
 339         {
 340           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 341             {
 342               struct charset_conversion_spec *ccs =
 343                 Dynarr_atp (codesys->iso2022.input_conv, i);
 344               mark_object (ccs->from_charset);
 345               mark_object (ccs->to_charset);
 346             }
 347         }
 348       if (codesys->iso2022.output_conv)
 349         {
 350           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 351             {
 352               struct charset_conversion_spec *ccs =
 353                 Dynarr_atp (codesys->iso2022.output_conv, i);
 354               mark_object (ccs->from_charset);
 355               mark_object (ccs->to_charset);
 356             }
 357         }
 358       break;
 359
 360     case CODESYS_CCL:
 361       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 362       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 363       break;
 364 #endif /* MULE */
 365     default:
 366       break;
 367     }
 368
 369   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 370   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 371 }
 372
 373 static void
 374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 375                      int escapeflag)
 376 {
 377   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 378   if (print_readably)
 379     error ("printing unreadable object #<coding_system 0x%x>",
 380            c->header.uid);
 381
 382   write_c_string ("#<coding_system ", printcharfun);
 383   print_internal (c->name, printcharfun, 1);
 384   write_c_string (">", printcharfun);
 385 }
 386
 387 static void
 388 finalize_coding_system (void *header, int for_disksave)
 389 {
 390   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 391   /* Since coding systems never go away, this function is not
 392      necessary.  But it would be necessary if we changed things
 393      so that coding systems could go away. */
 394   if (!for_disksave) /* see comment in lstream.c */
 395     {
 396       switch (CODING_SYSTEM_TYPE (c))
 397         {
 398 #ifdef MULE
 399         case CODESYS_ISO2022:
 400           if (c->iso2022.input_conv)
 401             {
 402               Dynarr_free (c->iso2022.input_conv);
 403               c->iso2022.input_conv = 0;
 404             }
 405           if (c->iso2022.output_conv)
 406             {
 407               Dynarr_free (c->iso2022.output_conv);
 408               c->iso2022.output_conv = 0;
 409             }
 410           break;
 411 #endif /* MULE */
 412         default:
 413           break;
 414         }
 415     }
 416 }
 417
 418 static eol_type_t
 419 symbol_to_eol_type (Lisp_Object symbol)
 420 {
 421   CHECK_SYMBOL (symbol);
 422   if (NILP (symbol))      return EOL_AUTODETECT;
 423   if (EQ (symbol, Qlf))   return EOL_LF;
 424   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 425   if (EQ (symbol, Qcr))   return EOL_CR;
 426
 427   signal_simple_error ("Unrecognized eol type", symbol);
 428   return EOL_AUTODETECT; /* not reached */
 429 }
 430
 431 static Lisp_Object
 432 eol_type_to_symbol (eol_type_t type)
 433 {
 434   switch (type)
 435     {
 436     default: abort ();
 437     case EOL_LF:         return Qlf;
 438     case EOL_CRLF:       return Qcrlf;
 439     case EOL_CR:         return Qcr;
 440     case EOL_AUTODETECT: return Qnil;
 441     }
 442 }
 443
 444 static void
 445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 446 {
 447   Lisp_Object codesys_obj;
 448   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 449   char *codesys_name = (char *) alloca (len + 7);
 450   int mlen = -1;
 451   char *codesys_mnemonic=0;
 452
 453   Lisp_Object codesys_name_sym, sub_codesys_obj;
 454
 455   /* kludge */
 456
 457   XSETCODING_SYSTEM (codesys_obj, codesys);
 458
 459   memcpy (codesys_name,
 460           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 461
 462   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 463     {
 464       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 465       codesys_mnemonic = (char *) alloca (mlen + 7);
 466       memcpy (codesys_mnemonic,
 467               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 468     }
 469
 470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 471   strcpy (codesys_name + len, "-" op_sys);                                      \
 472   if (mlen != -1)                                                               \
 473     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 474   codesys_name_sym = intern (codesys_name);                                     \
 475   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 476   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 477   if (mlen != -1)                                                               \
 478     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 479       build_string (codesys_mnemonic);                                          \
 480   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 481 } while (0)
 482
 483   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 484   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 485   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 486 }
 487
 488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 489 Return t if OBJECT is a coding system.
 490 A coding system is an object that defines how text containing multiple
 491 character sets is encoded into a stream of (typically 8-bit) bytes.
 492 The coding system is used to decode the stream into a series of
 493 characters (which may be from multiple charsets) when the text is read
 494 from a file or process, and is used to encode the text back into the
 495 same format when it is written out to a file or process.
 496
 497 For example, many ISO2022-compliant coding systems (such as Compound
 498 Text, which is used for inter-client data under the X Window System)
 499 use escape sequences to switch between different charsets -- Japanese
 500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 502 `make-coding-system' for more information.
 503
 504 Coding systems are normally identified using a symbol, and the
 505 symbol is accepted in place of the actual coding system object whenever
 506 a coding system is called for. (This is similar to how faces work.)
 507 */
 508        (object))
 509 {
 510   return CODING_SYSTEMP (object) ? Qt : Qnil;
 511 }
 512
 513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 514 Retrieve the coding system of the given name.
 515
 516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 517 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 518 If there is no such coding system, nil is returned.  Otherwise the
 519 associated coding system object is returned.
 520 */
 521        (coding_system_or_name))
 522 {
 523   if (NILP (coding_system_or_name))
 524     coding_system_or_name = Qbinary;
 525   else if (CODING_SYSTEMP (coding_system_or_name))
 526     return coding_system_or_name;
 527   else
 528     CHECK_SYMBOL (coding_system_or_name);
 529
 530   while (1)
 531     {
 532       coding_system_or_name =
 533         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 534
 535       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 536         return coding_system_or_name;
 537     }
 538 }
 539
 540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 541 Retrieve the coding system of the given name.
 542 Same as `find-coding-system' except that if there is no such
 543 coding system, an error is signaled instead of returning nil.
 544 */
 545        (name))
 546 {
 547   Lisp_Object coding_system = Ffind_coding_system (name);
 548
 549   if (NILP (coding_system))
 550     signal_simple_error ("No such coding system", name);
 551   return coding_system;
 552 }
 553
 554 /* We store the coding systems in hash tables with the names as the key and the
 555    actual coding system object as the value.  Occasionally we need to use them
 556    in a list format.  These routines provide us with that. */
 557 struct coding_system_list_closure
 558 {
 559   Lisp_Object *coding_system_list;
 560 };
 561
 562 static int
 563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 564                                   void *coding_system_list_closure)
 565 {
 566   /* This function can GC */
 567   struct coding_system_list_closure *cscl =
 568     (struct coding_system_list_closure *) coding_system_list_closure;
 569   Lisp_Object *coding_system_list = cscl->coding_system_list;
 570
 571   *coding_system_list = Fcons (key, *coding_system_list);
 572   return 0;
 573 }
 574
 575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 576 Return a list of the names of all defined coding systems.
 577 */
 578        ())
 579 {
 580   Lisp_Object coding_system_list = Qnil;
 581   struct gcpro gcpro1;
 582   struct coding_system_list_closure coding_system_list_closure;
 583
 584   GCPRO1 (coding_system_list);
 585   coding_system_list_closure.coding_system_list = &coding_system_list;
 586   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 587                  &coding_system_list_closure);
 588   UNGCPRO;
 589
 590   return coding_system_list;
 591 }
 592
 593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 594 Return the name of the given coding system.
 595 */
 596        (coding_system))
 597 {
 598   coding_system = Fget_coding_system (coding_system);
 599   return XCODING_SYSTEM_NAME (coding_system);
 600 }
 601
 602 static Lisp_Coding_System *
 603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 604 {
 605   Lisp_Coding_System *codesys =
 606     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 607
 608   zero_lcrecord (codesys);
 609   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 610   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 611   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 612   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 613   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 614   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 615   CODING_SYSTEM_TYPE     (codesys) = type;
 616   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 617 #ifdef MULE
 618   if (type == CODESYS_ISO2022)
 619     {
 620       int i;
 621       for (i = 0; i < 4; i++)
 622         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 623     }
 624   else if (type == CODESYS_CCL)
 625     {
 626       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 627       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 628     }
 629 #endif /* MULE */
 630   CODING_SYSTEM_NAME (codesys) = name;
 631
 632   return codesys;
 633 }
 634
 635 #ifdef MULE
 636 /* Given a list of charset conversion specs as specified in a Lisp
 637    program, parse it into STORE_HERE. */
 638
 639 static void
 640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 641                                 Lisp_Object spec_list)
 642 {
 643   Lisp_Object rest;
 644
 645   EXTERNAL_LIST_LOOP (rest, spec_list)
 646     {
 647       Lisp_Object car = XCAR (rest);
 648       Lisp_Object from, to;
 649       struct charset_conversion_spec spec;
 650
 651       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 652         signal_simple_error ("Invalid charset conversion spec", car);
 653       from = Fget_charset (XCAR (car));
 654       to = Fget_charset (XCAR (XCDR (car)));
 655       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 656         signal_simple_error_2
 657           ("Attempted conversion between different charset types",
 658            from, to);
 659       spec.from_charset = from;
 660       spec.to_charset = to;
 661
 662       Dynarr_add (store_here, spec);
 663     }
 664 }
 665
 666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 667    specs, return the equivalent as the Lisp programmer would see it.
 668
 669    If LOAD_HERE is 0, return Qnil. */
 670
 671 static Lisp_Object
 672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 673 {
 674   int i;
 675   Lisp_Object result;
 676
 677   if (!load_here)
 678     return Qnil;
 679   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 680     {
 681       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 682       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 683     }
 684
 685   return Fnreverse (result);
 686 }
 687
 688 #endif /* MULE */
 689
 690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 691 Register symbol NAME as a coding system.
 692
 693 TYPE describes the conversion method used and should be one of
 694
 695 nil or 'undecided
 696      Automatic conversion.  XEmacs attempts to detect the coding system
 697      used in the file.
 698 'no-conversion
 699      No conversion.  Use this for binary files and such.  On output,
 700      graphic characters that are not in ASCII or Latin-1 will be
 701      replaced by a ?. (For a no-conversion-encoded buffer, these
 702      characters will only be present if you explicitly insert them.)
 703 'shift-jis
 704      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 705 'ucs-4
 706      ISO 10646 UCS-4 encoding.
 707 'utf-8
 708      ISO 10646 UTF-8 encoding.
 709 'iso2022
 710      Any ISO2022-compliant encoding.  Among other things, this includes
 711      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 712      standard Unix encoding for Japanese and other languages), and
 713      Compound Text (the encoding used in X11).  You can specify more
 714      specific information about the conversion with the PROPS argument.
 715 'big5
 716      Big5 (the encoding commonly used for Taiwanese).
 717 'ccl
 718      The conversion is performed using a user-written pseudo-code
 719      program.  CCL (Code Conversion Language) is the name of this
 720      pseudo-code.
 721 'internal
 722      Write out or read in the raw contents of the memory representing
 723      the buffer's text.  This is primarily useful for debugging
 724      purposes, and is only enabled when XEmacs has been compiled with
 725      DEBUG_XEMACS defined (via the --debug configure option).
 726      WARNING: Reading in a file using 'internal conversion can result
 727      in an internal inconsistency in the memory representing a
 728      buffer's text, which will produce unpredictable results and may
 729      cause XEmacs to crash.  Under normal circumstances you should
 730      never use 'internal conversion.
 731
 732 DOC-STRING is a string describing the coding system.
 733
 734 PROPS is a property list, describing the specific nature of the
 735 character set.  Recognized properties are:
 736
 737 'mnemonic
 738      String to be displayed in the modeline when this coding system is
 739      active.
 740
 741 'eol-type
 742      End-of-line conversion to be used.  It should be one of
 743
 744         nil
 745                 Automatically detect the end-of-line type (LF, CRLF,
 746                 or CR).  Also generate subsidiary coding systems named
 747                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 748                 identical to this coding system but have an EOL-TYPE
 749                 value of 'lf, 'crlf, and 'cr, respectively.
 750         'lf
 751                 The end of a line is marked externally using ASCII LF.
 752                 Since this is also the way that XEmacs represents an
 753                 end-of-line internally, specifying this option results
 754                 in no end-of-line conversion.  This is the standard
 755                 format for Unix text files.
 756         'crlf
 757                 The end of a line is marked externally using ASCII
 758                 CRLF.  This is the standard format for MS-DOS text
 759                 files.
 760         'cr
 761                 The end of a line is marked externally using ASCII CR.
 762                 This is the standard format for Macintosh text files.
 763         t
 764                 Automatically detect the end-of-line type but do not
 765                 generate subsidiary coding systems.  (This value is
 766                 converted to nil when stored internally, and
 767                 `coding-system-property' will return nil.)
 768
 769 'post-read-conversion
 770      Function called after a file has been read in, to perform the
 771      decoding.  Called with two arguments, START and END, denoting
 772      a region of the current buffer to be decoded.
 773
 774 'pre-write-conversion
 775      Function called before a file is written out, to perform the
 776      encoding.  Called with two arguments, START and END, denoting
 777      a region of the current buffer to be encoded.
 778
 779
 780 The following additional properties are recognized if TYPE is 'iso2022:
 781
 782 'charset-g0
 783 'charset-g1
 784 'charset-g2
 785 'charset-g3
 786      The character set initially designated to the G0 - G3 registers.
 787      The value should be one of
 788
 789           -- A charset object (designate that character set)
 790           -- nil (do not ever use this register)
 791           -- t (no character set is initially designated to
 792                 the register, but may be later on; this automatically
 793                 sets the corresponding `force-g*-on-output' property)
 794
 795 'force-g0-on-output
 796 'force-g1-on-output
 797 'force-g2-on-output
 798 'force-g2-on-output
 799      If non-nil, send an explicit designation sequence on output before
 800      using the specified register.
 801
 802 'short
 803      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 804      "ESC $ B" on output in place of the full designation sequences
 805      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 806
 807 'no-ascii-eol
 808      If non-nil, don't designate ASCII to G0 at each end of line on output.
 809      Setting this to non-nil also suppresses other state-resetting that
 810      normally happens at the end of a line.
 811
 812 'no-ascii-cntl
 813      If non-nil, don't designate ASCII to G0 before control chars on output.
 814
 815 'seven
 816      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 817      environment.
 818
 819 'lock-shift
 820      If non-nil, use locking-shift (SO/SI) instead of single-shift
 821      or designation by escape sequence.
 822
 823 'no-iso6429
 824      If non-nil, don't use ISO6429's direction specification.
 825
 826 'escape-quoted
 827      If non-nil, literal control characters that are the same as
 828      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 829      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 830      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 831      so that they can be properly distinguished from an escape sequence.
 832      (Note that doing this results in a non-portable encoding.) This
 833      encoding flag is used for byte-compiled files.  Note that ESC
 834      is a good choice for a quoting character because there are no
 835      escape sequences whose second byte is a character from the Control-0
 836      or Control-1 character sets; this is explicitly disallowed by the
 837      ISO2022 standard.
 838
 839 'input-charset-conversion
 840      A list of conversion specifications, specifying conversion of
 841      characters in one charset to another when decoding is performed.
 842      Each specification is a list of two elements: the source charset,
 843      and the destination charset.
 844
 845 'output-charset-conversion
 846      A list of conversion specifications, specifying conversion of
 847      characters in one charset to another when encoding is performed.
 848      The form of each specification is the same as for
 849      'input-charset-conversion.
 850
 851
 852 The following additional properties are recognized (and required)
 853 if TYPE is 'ccl:
 854
 855 'decode
 856      CCL program used for decoding (converting to internal format).
 857
 858 'encode
 859      CCL program used for encoding (converting to external format).
 860 */
 861        (name, type, doc_string, props))
 862 {
 863   Lisp_Coding_System *codesys;
 864   enum coding_system_type ty;
 865   int need_to_setup_eol_systems = 1;
 866
 867   /* Convert type to constant */
 868   if (NILP (type) || EQ (type, Qundecided))
 869                                       { ty = CODESYS_AUTODETECT; }
 870 #ifdef MULE
 871   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 872   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 873   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 874   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 875   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 876   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 877 #endif
 878   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 879 #ifdef DEBUG_XEMACS
 880   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 881 #endif
 882   else
 883     signal_simple_error ("Invalid coding system type", type);
 884
 885   CHECK_SYMBOL (name);
 886
 887   codesys = allocate_coding_system (ty, name);
 888
 889   if (NILP (doc_string))
 890     doc_string = build_string ("");
 891   else
 892     CHECK_STRING (doc_string);
 893   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 894
 895   {
 896     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 897       {
 898         if (EQ (key, Qmnemonic))
 899           {
 900             if (!NILP (value))
 901               CHECK_STRING (value);
 902             CODING_SYSTEM_MNEMONIC (codesys) = value;
 903           }
 904
 905         else if (EQ (key, Qeol_type))
 906           {
 907             need_to_setup_eol_systems = NILP (value);
 908             if (EQ (value, Qt))
 909               value = Qnil;
 910             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 911           }
 912
 913         else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 914         else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 915 #ifdef MULE
 916         else if (ty == CODESYS_ISO2022)
 917           {
 918 #define FROB_INITIAL_CHARSET(charset_num) \
 919   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 920     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 921
 922             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 923             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 924             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 925             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 926
 927 #define FROB_FORCE_CHARSET(charset_num) \
 928   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 929
 930             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 931             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 932             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 933             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 934
 935 #define FROB_BOOLEAN_PROPERTY(prop) \
 936   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 937
 938             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 939             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 940             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 941             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 942             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 943             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 944             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 945
 946             else if (EQ (key, Qinput_charset_conversion))
 947               {
 948                 codesys->iso2022.input_conv =
 949                   Dynarr_new (charset_conversion_spec);
 950                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 951                                                 value);
 952               }
 953             else if (EQ (key, Qoutput_charset_conversion))
 954               {
 955                 codesys->iso2022.output_conv =
 956                   Dynarr_new (charset_conversion_spec);
 957                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 958                                                 value);
 959               }
 960             else
 961               signal_simple_error ("Unrecognized property", key);
 962           }
 963         else if (EQ (type, Qccl))
 964           {
 965             Lisp_Object sym;
 966             struct ccl_program test_ccl;
 967             Extbyte *suffix;
 968
 969             /* Check key first.  */
 970             if (EQ (key, Qdecode))
 971               suffix = "-ccl-decode";
 972             else if (EQ (key, Qencode))
 973               suffix = "-ccl-encode";
 974             else
 975               signal_simple_error ("Unrecognized property", key);
 976
 977             /* If value is vector, register it as a ccl program
 978                associated with an newly created symbol for
 979                backward compatibility.  */
 980             if (VECTORP (value))
 981               {
 982                 sym = Fintern (concat2 (Fsymbol_name (name),
 983                                         build_string (suffix)),
 984                                Qnil);
 985                 Fregister_ccl_program (sym, value);
 986               }
 987             else
 988               {
 989                 CHECK_SYMBOL (value);
 990                 sym = value;
 991               }
 992             /* check if the given ccl programs are valid.  */
 993             if (setup_ccl_program (&test_ccl, sym) < 0)
 994               signal_simple_error ("Invalid CCL program", value);
 995
 996             if (EQ (key, Qdecode))
 997               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
 998             else if (EQ (key, Qencode))
 999               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1000
1001           }
1002 #endif /* MULE */
1003         else
1004           signal_simple_error ("Unrecognized property", key);
1005       }
1006   }
1007
1008   if (need_to_setup_eol_systems)
1009     setup_eol_coding_systems (codesys);
1010
1011   {
1012     Lisp_Object codesys_obj;
1013     XSETCODING_SYSTEM (codesys_obj, codesys);
1014     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015     return codesys_obj;
1016   }
1017 }
1018
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1022 be created.
1023 */
1024        (old_coding_system, new_name))
1025 {
1026   Lisp_Object new_coding_system;
1027   old_coding_system = Fget_coding_system (old_coding_system);
1028   new_coding_system = Ffind_coding_system (new_name);
1029   if (NILP (new_coding_system))
1030     {
1031       XSETCODING_SYSTEM (new_coding_system,
1032                          allocate_coding_system
1033                          (XCODING_SYSTEM_TYPE (old_coding_system),
1034                           new_name));
1035       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1036     }
1037
1038   {
1039     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041     memcpy (((char *) to  ) + sizeof (to->header),
1042             ((char *) from) + sizeof (from->header),
1043             sizeof (*from) - sizeof (from->header));
1044     to->name = new_name;
1045   }
1046   return new_coding_system;
1047 }
1048
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1051 */
1052        (object))
1053 {
1054   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1055     ? Qt : Qnil;
1056 }
1057
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1061 */
1062        (object))
1063 {
1064   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1065     ? Qt : Qnil;
1066 }
1067
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1070 */
1071        (alias))
1072 {
1073   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074   if (SYMBOLP (aliasee))
1075     return aliasee;
1076   else
1077     signal_simple_error ("Symbol is not a coding system alias", alias);
1078   return Qnil;          /* To keep the compiler happy */
1079 }
1080
1081 static Lisp_Object
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1083 {
1084   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1085                   Qnil);
1086 }
1087
1088 /* A maphash function, for removing dangling coding system aliases. */
1089 static int
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091                                 Lisp_Object aliasee,
1092                                 void *dangling_aliases)
1093 {
1094   if (SYMBOLP (aliasee)
1095       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1096     {
1097       (*(int *) dangling_aliases)++;
1098       return 1;
1099     }
1100   else
1101     return 0;
1102 }
1103
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1106
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1111
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1113
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1116
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1118
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1121 */
1122        (alias, aliasee))
1123 {
1124   Lisp_Object real_coding_system, probe;
1125
1126   CHECK_SYMBOL (alias);
1127
1128   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1129     signal_simple_error
1130       ("Symbol is the canonical name of a coding system and cannot be redefined",
1131        alias);
1132
1133   if (NILP (aliasee))
1134     {
1135       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1137       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1138
1139       Fremhash (alias, Vcoding_system_hash_table);
1140
1141       /* Undefine subsidiary aliases,
1142          presumably created by a previous call to this function */
1143       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1145           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1146         {
1147           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1149           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1150         }
1151
1152       /* Undefine dangling coding system aliases. */
1153       {
1154         int dangling_aliases;
1155
1156         do {
1157           dangling_aliases = 0;
1158           elisp_map_remhash (dangling_coding_system_alias_p,
1159                              Vcoding_system_hash_table,
1160                              &dangling_aliases);
1161         } while (dangling_aliases > 0);
1162       }
1163
1164       return Qnil;
1165     }
1166
1167   if (CODING_SYSTEMP (aliasee))
1168     aliasee = XCODING_SYSTEM_NAME (aliasee);
1169
1170   /* Checks that aliasee names a coding-system */
1171   real_coding_system = Fget_coding_system (aliasee);
1172
1173   /* Check for coding system alias loops */
1174   if (EQ (alias, aliasee))
1175     alias_loop: signal_simple_error_2
1176       ("Attempt to create a coding system alias loop", alias, aliasee);
1177
1178   for (probe = aliasee;
1179        SYMBOLP (probe);
1180        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1181     {
1182       if (EQ (probe, alias))
1183         goto alias_loop;
1184     }
1185
1186   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1187
1188   /* Set up aliases for subsidiaries.
1189      #### There must be a better way to handle subsidiary coding systems. */
1190   {
1191     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1192     int i;
1193     for (i = 0; i < countof (suffixes); i++)
1194       {
1195         Lisp_Object alias_subsidiary =
1196           append_suffix_to_symbol (alias, suffixes[i]);
1197         Lisp_Object aliasee_subsidiary =
1198           append_suffix_to_symbol (aliasee, suffixes[i]);
1199
1200         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1202       }
1203   }
1204   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205      but it doesn't look intentional, so I'd rather return something
1206      meaningful or nothing at all. */
1207   return Qnil;
1208 }
1209
1210 static Lisp_Object
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1212 {
1213   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214   Lisp_Object new_coding_system;
1215
1216   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217     return coding_system;
1218
1219   switch (type)
1220     {
1221     case EOL_AUTODETECT: return coding_system;
1222     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1223     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1224     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225     default:       abort (); return Qnil;
1226     }
1227
1228   return NILP (new_coding_system) ? coding_system : new_coding_system;
1229 }
1230
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1233 */
1234        (coding_system, eol_type))
1235 {
1236   coding_system = Fget_coding_system (coding_system);
1237
1238   return subsidiary_coding_system (coding_system,
1239                                    symbol_to_eol_type (eol_type));
1240 }
1241
1242 \f
1243 /************************************************************************/
1244 /*                         Coding system accessors                      */
1245 /************************************************************************/
1246
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1249 */
1250        (coding_system))
1251 {
1252   coding_system = Fget_coding_system (coding_system);
1253   return XCODING_SYSTEM_DOC_STRING (coding_system);
1254 }
1255
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1258 */
1259        (coding_system))
1260 {
1261   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1262     {
1263     default: abort ();
1264     case CODESYS_AUTODETECT:    return Qundecided;
1265 #ifdef MULE
1266     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1267     case CODESYS_ISO2022:       return Qiso2022;
1268     case CODESYS_BIG5:          return Qbig5;
1269     case CODESYS_UCS4:          return Qucs4;
1270     case CODESYS_UTF8:          return Qutf8;
1271     case CODESYS_CCL:           return Qccl;
1272 #endif
1273     case CODESYS_NO_CONVERSION: return Qno_conversion;
1274 #ifdef DEBUG_XEMACS
1275     case CODESYS_INTERNAL:      return Qinternal;
1276 #endif
1277     }
1278 }
1279
1280 #ifdef MULE
1281 static
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1283 {
1284   Lisp_Object cs
1285     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1286
1287   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1288 }
1289
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1292 GNUM allows 0 .. 3.
1293 */
1294        (coding_system, gnum))
1295 {
1296   coding_system = Fget_coding_system (coding_system);
1297   CHECK_INT (gnum);
1298
1299   return coding_system_charset (coding_system, XINT (gnum));
1300 }
1301 #endif /* MULE */
1302
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1305 */
1306        (coding_system, prop))
1307 {
1308   int i, ok = 0;
1309   enum coding_system_type type;
1310
1311   coding_system = Fget_coding_system (coding_system);
1312   CHECK_SYMBOL (prop);
1313   type = XCODING_SYSTEM_TYPE (coding_system);
1314
1315   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1317       {
1318         ok = 1;
1319         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1320           {
1321           case CODESYS_PROP_ALL_OK:
1322             break;
1323 #ifdef MULE
1324           case CODESYS_PROP_ISO2022:
1325             if (type != CODESYS_ISO2022)
1326               signal_simple_error
1327                 ("Property only valid in ISO2022 coding systems",
1328                  prop);
1329             break;
1330
1331           case CODESYS_PROP_CCL:
1332             if (type != CODESYS_CCL)
1333               signal_simple_error
1334                 ("Property only valid in CCL coding systems",
1335                  prop);
1336             break;
1337 #endif /* MULE */
1338           default:
1339             abort ();
1340           }
1341       }
1342
1343   if (!ok)
1344     signal_simple_error ("Unrecognized property", prop);
1345
1346   if (EQ (prop, Qname))
1347     return XCODING_SYSTEM_NAME (coding_system);
1348   else if (EQ (prop, Qtype))
1349     return Fcoding_system_type (coding_system);
1350   else if (EQ (prop, Qdoc_string))
1351     return XCODING_SYSTEM_DOC_STRING (coding_system);
1352   else if (EQ (prop, Qmnemonic))
1353     return XCODING_SYSTEM_MNEMONIC (coding_system);
1354   else if (EQ (prop, Qeol_type))
1355     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356   else if (EQ (prop, Qeol_lf))
1357     return XCODING_SYSTEM_EOL_LF (coding_system);
1358   else if (EQ (prop, Qeol_crlf))
1359     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360   else if (EQ (prop, Qeol_cr))
1361     return XCODING_SYSTEM_EOL_CR (coding_system);
1362   else if (EQ (prop, Qpost_read_conversion))
1363     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364   else if (EQ (prop, Qpre_write_conversion))
1365     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1366 #ifdef MULE
1367   else if (type == CODESYS_ISO2022)
1368     {
1369       if (EQ (prop, Qcharset_g0))
1370         return coding_system_charset (coding_system, 0);
1371       else if (EQ (prop, Qcharset_g1))
1372         return coding_system_charset (coding_system, 1);
1373       else if (EQ (prop, Qcharset_g2))
1374         return coding_system_charset (coding_system, 2);
1375       else if (EQ (prop, Qcharset_g3))
1376         return coding_system_charset (coding_system, 3);
1377
1378 #define FORCE_CHARSET(charset_num) \
1379   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380    (coding_system, charset_num) ? Qt : Qnil)
1381
1382       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1386
1387 #define LISP_BOOLEAN(prop) \
1388   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1389
1390       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1391       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1392       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1394       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1395       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1396       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1397
1398       else if (EQ (prop, Qinput_charset_conversion))
1399         return
1400           unparse_charset_conversion_specs
1401             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402       else if (EQ (prop, Qoutput_charset_conversion))
1403         return
1404           unparse_charset_conversion_specs
1405             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1406       else
1407         abort ();
1408     }
1409   else if (type == CODESYS_CCL)
1410     {
1411       if (EQ (prop, Qdecode))
1412         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413       else if (EQ (prop, Qencode))
1414         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1415       else
1416         abort ();
1417     }
1418 #endif /* MULE */
1419   else
1420     abort ();
1421
1422   return Qnil; /* not reached */
1423 }
1424
1425 \f
1426 /************************************************************************/
1427 /*                       Coding category functions                      */
1428 /************************************************************************/
1429
1430 static int
1431 decode_coding_category (Lisp_Object symbol)
1432 {
1433   int i;
1434
1435   CHECK_SYMBOL (symbol);
1436   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437     if (EQ (coding_category_symbol[i], symbol))
1438       return i;
1439
1440   signal_simple_error ("Unrecognized coding category", symbol);
1441   return 0; /* not reached */
1442 }
1443
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1446 */
1447        ())
1448 {
1449   int i;
1450   Lisp_Object list = Qnil;
1451
1452   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453     list = Fcons (coding_category_symbol[i], list);
1454   return list;
1455 }
1456
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority.  Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1462 previously.
1463 */
1464        (list))
1465 {
1466   int category_to_priority[CODING_CATEGORY_LAST];
1467   int i, j;
1468   Lisp_Object rest;
1469
1470   /* First generate a list that maps coding categories to priorities. */
1471
1472   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473     category_to_priority[i] = -1;
1474
1475   /* Highest priority comes from the specified list. */
1476   i = 0;
1477   EXTERNAL_LIST_LOOP (rest, list)
1478     {
1479       int cat = decode_coding_category (XCAR (rest));
1480
1481       if (category_to_priority[cat] >= 0)
1482         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483       category_to_priority[cat] = i++;
1484     }
1485
1486   /* Now go through the existing categories by priority to retrieve
1487      the categories not yet specified and preserve their priority
1488      order. */
1489   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1490     {
1491       int cat = fcd->coding_category_by_priority[j];
1492       if (category_to_priority[cat] < 0)
1493         category_to_priority[cat] = i++;
1494     }
1495
1496   /* Now we need to construct the inverse of the mapping we just
1497      constructed. */
1498
1499   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1501
1502   /* Phew!  That was confusing. */
1503   return Qnil;
1504 }
1505
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1508 */
1509        ())
1510 {
1511   int i;
1512   Lisp_Object list = Qnil;
1513
1514   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1516                   list);
1517   return list;
1518 }
1519
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1522 */
1523        (coding_category, coding_system))
1524 {
1525   int cat = decode_coding_category (coding_category);
1526
1527   coding_system = Fget_coding_system (coding_system);
1528   fcd->coding_category_system[cat] = coding_system;
1529   return Qnil;
1530 }
1531
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1534 */
1535        (coding_category))
1536 {
1537   int cat = decode_coding_category (coding_category);
1538   Lisp_Object sys = fcd->coding_category_system[cat];
1539
1540   if (!NILP (sys))
1541     return XCODING_SYSTEM_NAME (sys);
1542   return Qnil;
1543 }
1544
1545 \f
1546 /************************************************************************/
1547 /*                     Detecting the encoding of data                   */
1548 /************************************************************************/
1549
1550 struct detection_state
1551 {
1552   eol_type_t eol_type;
1553   int seen_non_ascii;
1554   int mask;
1555 #ifdef MULE
1556   struct
1557     {
1558       int mask;
1559       int in_second_byte;
1560     }
1561   big5;
1562
1563   struct
1564     {
1565       int mask;
1566       int in_second_byte;
1567     }
1568   shift_jis;
1569
1570   struct
1571     {
1572       int mask;
1573       int in_byte;
1574   }
1575   ucs4;
1576
1577   struct
1578     {
1579       int mask;
1580       int in_byte;
1581     }
1582   utf8;
1583
1584   struct
1585     {
1586       int mask;
1587       int initted;
1588       struct iso2022_decoder iso;
1589       unsigned int flags;
1590       int high_byte_count;
1591       unsigned int saw_single_shift:1;
1592     }
1593   iso2022;
1594 #endif
1595   struct
1596     {
1597       int seen_anything;
1598       int just_saw_cr;
1599     }
1600   eol;
1601 };
1602
1603 static int
1604 acceptable_control_char_p (int c)
1605 {
1606   switch (c)
1607     {
1608       /* Allow and ignore control characters that you might
1609          reasonably see in a text file */
1610     case '\r':
1611     case '\n':
1612     case '\t':
1613     case  7: /* bell */
1614     case  8: /* backspace */
1615     case 11: /* vertical tab */
1616     case 12: /* form feed */
1617     case 26: /* MS-DOS C-z junk */
1618     case 31: /* '^_' -- for info */
1619       return 1;
1620     default:
1621       return 0;
1622     }
1623 }
1624
1625 static int
1626 mask_has_at_most_one_bit_p (int mask)
1627 {
1628   /* Perhaps the only thing useful you learn from intensive Microsoft
1629      technical interviews */
1630   return (mask & (mask - 1)) == 0;
1631 }
1632
1633 static eol_type_t
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1635                  Lstream_data_count n)
1636 {
1637   while (n--)
1638     {
1639       unsigned char c = *(unsigned char *)src++;
1640       if (c == '\n')
1641         {
1642           if (st->eol.just_saw_cr)
1643             return EOL_CRLF;
1644           else if (st->eol.seen_anything)
1645             return EOL_LF;
1646         }
1647       else if (st->eol.just_saw_cr)
1648         return EOL_CR;
1649       else if (c == '\r')
1650         st->eol.just_saw_cr = 1;
1651       else
1652         st->eol.just_saw_cr = 0;
1653       st->eol.seen_anything = 1;
1654     }
1655
1656   return EOL_AUTODETECT;
1657 }
1658
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660    Before calling this function for the first type, you must initialize
1661    st->eol_type as appropriate and initialize st->mask to ~0.
1662
1663    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1664    not yet known.
1665
1666    st->mask holds the determined coding category mask, or ~0 if only
1667    ASCII has been seen so far.
1668
1669    Returns:
1670
1671    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1672         is present in st->mask
1673    1 == definitive answers are here for both st->eol_type and st->mask
1674 */
1675
1676 static int
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678                     Lstream_data_count n, int just_do_eol)
1679 {
1680   if (st->eol_type == EOL_AUTODETECT)
1681     st->eol_type = detect_eol_type (st, src, n);
1682
1683   if (just_do_eol)
1684     return st->eol_type != EOL_AUTODETECT;
1685
1686   if (!st->seen_non_ascii)
1687     {
1688       for (; n; n--, src++)
1689         {
1690           unsigned char c = *(unsigned char *) src;
1691           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1692             {
1693               st->seen_non_ascii = 1;
1694 #ifdef MULE
1695               st->shift_jis.mask = ~0;
1696               st->big5.mask = ~0;
1697               st->ucs4.mask = ~0;
1698               st->utf8.mask = ~0;
1699               st->iso2022.mask = ~0;
1700 #endif
1701               break;
1702             }
1703         }
1704     }
1705
1706   if (!n)
1707     return 0;
1708 #ifdef MULE
1709   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714     st->big5.mask = detect_coding_big5 (st, src, n);
1715   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716     st->utf8.mask = detect_coding_utf8 (st, src, n);
1717   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1719
1720   st->mask
1721     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722     | st->utf8.mask | st->ucs4.mask;
1723 #endif
1724   {
1725     int retval = mask_has_at_most_one_bit_p (st->mask);
1726     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727     return retval && st->eol_type != EOL_AUTODETECT;
1728   }
1729 }
1730
1731 static Lisp_Object
1732 coding_system_from_mask (int mask)
1733 {
1734   if (mask == ~0)
1735     {
1736       /* If the file was entirely or basically ASCII, use the
1737          default value of `buffer-file-coding-system'. */
1738       Lisp_Object retval =
1739         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1740       if (!NILP (retval))
1741         {
1742           retval = Ffind_coding_system (retval);
1743           if (NILP (retval))
1744             {
1745               warn_when_safe
1746                 (Qbad_variable, Qwarning,
1747                  "Invalid `default-buffer-file-coding-system', set to nil");
1748               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1749             }
1750         }
1751       if (NILP (retval))
1752         retval = Fget_coding_system (Qraw_text);
1753       return retval;
1754     }
1755   else
1756     {
1757       int i;
1758       int cat = -1;
1759 #ifdef MULE
1760       mask = postprocess_iso2022_mask (mask);
1761 #endif
1762       /* Look through the coding categories by priority and find
1763          the first one that is allowed. */
1764       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1765         {
1766           cat = fcd->coding_category_by_priority[i];
1767           if ((mask & (1 << cat)) &&
1768               !NILP (fcd->coding_category_system[cat]))
1769             break;
1770         }
1771       if (cat >= 0)
1772         return fcd->coding_category_system[cat];
1773       else
1774         return Fget_coding_system (Qraw_text);
1775     }
1776 }
1777
1778 /* Given a seekable read stream and potential coding system and EOL type
1779    as specified, do any autodetection that is called for.  If the
1780    coding system and/or EOL type are not `autodetect', they will be left
1781    alone; but this function will never return an autodetect coding system
1782    or EOL type.
1783
1784    This function does not automatically fetch subsidiary coding systems;
1785    that should be unnecessary with the explicit eol-type argument. */
1786
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1788 /* number of leading lines to check for a coding cookie */
1789 #define LINES_TO_CHECK 2
1790
1791 void
1792 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1793                               eol_type_t *eol_type_in_out)
1794 {
1795   struct detection_state decst;
1796
1797   if (*eol_type_in_out == EOL_AUTODETECT)
1798     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1799
1800   xzero (decst);
1801   decst.eol_type = *eol_type_in_out;
1802   decst.mask = ~0;
1803
1804   /* If autodetection is called for, do it now. */
1805   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1806       || *eol_type_in_out == EOL_AUTODETECT)
1807     {
1808       Extbyte buf[4096];
1809       Lisp_Object coding_system = Qnil;
1810       Extbyte *p;
1811       Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1812       Extbyte *scan_end;
1813       int lines_checked = 0;
1814
1815       /* Look for initial "-*-"; mode line prefix */
1816       for (p = buf,
1817              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1818            p <= scan_end
1819              && lines_checked < LINES_TO_CHECK;
1820            p++)
1821         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1822           {
1823             Extbyte *local_vars_beg = p + 3;
1824             /* Look for final "-*-"; mode line suffix */
1825             for (p = local_vars_beg,
1826                    scan_end = buf + nread - LENGTH ("-*-");
1827                  p <= scan_end
1828                    && lines_checked < LINES_TO_CHECK;
1829                  p++)
1830               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831                 {
1832                   Extbyte *suffix = p;
1833                   /* Look for "coding:" */
1834                   for (p = local_vars_beg,
1835                          scan_end = suffix - LENGTH ("coding:?");
1836                        p <= scan_end;
1837                        p++)
1838                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839                         && (p == local_vars_beg
1840                             || (*(p-1) == ' '  ||
1841                                 *(p-1) == '\t' ||
1842                                 *(p-1) == ';')))
1843                       {
1844                         Extbyte save;
1845                         int n;
1846                         p += LENGTH ("coding:");
1847                         while (*p == ' ' || *p == '\t') p++;
1848
1849                         /* Get coding system name */
1850                         save = *suffix; *suffix = '\0';
1851                         /* Characters valid in a MIME charset name (rfc 1521),
1852                            and in a Lisp symbol name. */
1853                         n = strspn ( (char *) p,
1854                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855                                     "abcdefghijklmnopqrstuvwxyz"
1856                                     "0123456789"
1857                                     "!$%&*+-.^_{|}~");
1858                         *suffix = save;
1859                         if (n > 0)
1860                           {
1861                             save = p[n]; p[n] = '\0';
1862                             coding_system =
1863                               Ffind_coding_system (intern ((char *) p));
1864                             p[n] = save;
1865                           }
1866                         break;
1867                       }
1868                   break;
1869                 }
1870               /* #### file must use standard EOLs or we miss 2d line */
1871               /* #### not to mention this is broken for UTF-16 DOS files */
1872               else if (*p == '\n' || *p == '\r')
1873                 {
1874                   lines_checked++;
1875                   /* skip past multibyte (DOS) newline */
1876                   if (*p == '\r' && *(p+1) == '\n') p++;
1877                 }
1878             break;
1879           }
1880         /* #### file must use standard EOLs or we miss 2d line */
1881         /* #### not to mention this is broken for UTF-16 DOS files */
1882         else if (*p == '\n' || *p == '\r')
1883           {
1884             lines_checked++;
1885             /* skip past multibyte (DOS) newline */
1886             if (*p == '\r' && *(p+1) == '\n') p++;
1887           }
1888
1889       if (NILP (coding_system))
1890         do
1891           {
1892             if (detect_coding_type (&decst, buf, nread,
1893                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1894                                     != CODESYS_AUTODETECT))
1895               break;
1896             nread = Lstream_read (stream, buf, sizeof (buf));
1897             if (nread == 0)
1898               break;
1899           }
1900         while (1);
1901
1902       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1903                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1904         do
1905           {
1906             if (detect_coding_type (&decst, buf, nread, 1))
1907               break;
1908             nread = Lstream_read (stream, buf, sizeof (buf));
1909             if (!nread)
1910               break;
1911           }
1912         while (1);
1913
1914       *eol_type_in_out = decst.eol_type;
1915       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1916         {
1917           if (NILP (coding_system))
1918             *codesys_in_out = coding_system_from_mask (decst.mask);
1919           else
1920             *codesys_in_out = coding_system;
1921         }
1922     }
1923
1924   /* If we absolutely can't determine the EOL type, just assume LF. */
1925   if (*eol_type_in_out == EOL_AUTODETECT)
1926     *eol_type_in_out = EOL_LF;
1927
1928   Lstream_rewind (stream);
1929 }
1930
1931 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1932 Detect coding system of the text in the region between START and END.
1933 Return a list of possible coding systems ordered by priority.
1934 If only ASCII characters are found, return 'undecided or one of
1935 its subsidiary coding systems according to a detected end-of-line
1936 type.  Optional arg BUFFER defaults to the current buffer.
1937 */
1938        (start, end, buffer))
1939 {
1940   Lisp_Object val = Qnil;
1941   struct buffer *buf = decode_buffer (buffer, 0);
1942   Bufpos b, e;
1943   Lisp_Object instream, lb_instream;
1944   Lstream *istr, *lb_istr;
1945   struct detection_state decst;
1946   struct gcpro gcpro1, gcpro2;
1947
1948   get_buffer_range_char (buf, start, end, &b, &e, 0);
1949   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1950   lb_istr = XLSTREAM (lb_instream);
1951   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1952   istr = XLSTREAM (instream);
1953   GCPRO2 (instream, lb_instream);
1954   xzero (decst);
1955   decst.eol_type = EOL_AUTODETECT;
1956   decst.mask = ~0;
1957   while (1)
1958     {
1959       Extbyte random_buffer[4096];
1960       Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1961
1962       if (!nread)
1963         break;
1964       if (detect_coding_type (&decst, random_buffer, nread, 0))
1965         break;
1966     }
1967
1968   if (decst.mask == ~0)
1969     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1970                                     decst.eol_type);
1971   else
1972     {
1973       int i;
1974
1975       val = Qnil;
1976 #ifdef MULE
1977       decst.mask = postprocess_iso2022_mask (decst.mask);
1978 #endif
1979       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1980         {
1981           int sys = fcd->coding_category_by_priority[i];
1982           if (decst.mask & (1 << sys))
1983             {
1984               Lisp_Object codesys = fcd->coding_category_system[sys];
1985               if (!NILP (codesys))
1986                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1987               val = Fcons (codesys, val);
1988             }
1989         }
1990     }
1991   Lstream_close (istr);
1992   UNGCPRO;
1993   Lstream_delete (istr);
1994   Lstream_delete (lb_istr);
1995   return val;
1996 }
1997
1998 \f
1999 /************************************************************************/
2000 /*           Converting to internal Mule format ("decoding")            */
2001 /************************************************************************/
2002
2003 /* A decoding stream is a stream used for decoding text (i.e.
2004    converting from some external format to internal format).
2005    The decoding-stream object keeps track of the actual coding
2006    stream, the stream that is at the other end, and data that
2007    needs to be persistent across the lifetime of the stream. */
2008
2009 /* Handle the EOL stuff related to just-read-in character C.
2010    EOL_TYPE is the EOL type of the coding stream.
2011    FLAGS is the current value of FLAGS in the coding stream, and may
2012    be modified by this macro.  (The macro only looks at the
2013    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2014    bytes are to be written.  You need to also define a local goto
2015    label "label_continue_loop" that is at the end of the main
2016    character-reading loop.
2017
2018    If C is a CR character, then this macro handles it entirely and
2019    jumps to label_continue_loop.  Otherwise, this macro does not add
2020    anything to DST, and continues normally.  You should continue
2021    processing C normally after this macro. */
2022
2023 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2024 do {                                                            \
2025   if (c == '\r')                                                \
2026     {                                                           \
2027       if (eol_type == EOL_CR)                                   \
2028         Dynarr_add (dst, '\n');                                 \
2029       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2030         Dynarr_add (dst, c);                                    \
2031       else                                                      \
2032         flags |= CODING_STATE_CR;                               \
2033       goto label_continue_loop;                                 \
2034     }                                                           \
2035   else if (flags & CODING_STATE_CR)                             \
2036     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2037       if (c != '\n')                                            \
2038         Dynarr_add (dst, '\r');                                 \
2039       flags &= ~CODING_STATE_CR;                                \
2040     }                                                           \
2041 } while (0)
2042
2043 /* C should be a binary character in the range 0 - 255; convert
2044    to internal format and add to Dynarr DST. */
2045
2046 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2047 do {                                            \
2048   if (BYTE_ASCII_P (c))                         \
2049     Dynarr_add (dst, c);                        \
2050   else if (BYTE_C1_P (c))                       \
2051     {                                           \
2052       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2053       Dynarr_add (dst, c + 0x20);               \
2054     }                                           \
2055   else                                          \
2056     {                                           \
2057       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2058       Dynarr_add (dst, c);                      \
2059     }                                           \
2060 } while (0)
2061
2062 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2063 do {                                    \
2064   if (ch)                               \
2065     {                                   \
2066       DECODE_ADD_BINARY_CHAR (ch, dst); \
2067       ch = 0;                           \
2068     }                                   \
2069 } while (0)
2070
2071 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2072 do {                                    \
2073   if (flags & CODING_STATE_END)         \
2074     {                                   \
2075       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2076       if (flags & CODING_STATE_CR)      \
2077         Dynarr_add (dst, '\r');         \
2078     }                                   \
2079 } while (0)
2080
2081 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2082
2083 struct decoding_stream
2084 {
2085   /* Coding system that governs the conversion. */
2086   Lisp_Coding_System *codesys;
2087
2088   /* Stream that we read the encoded data from or
2089      write the decoded data to. */
2090   Lstream *other_end;
2091
2092   /* If we are reading, then we can return only a fixed amount of
2093      data, so if the conversion resulted in too much data, we store it
2094      here for retrieval the next time around. */
2095   unsigned_char_dynarr *runoff;
2096
2097   /* FLAGS holds flags indicating the current state of the decoding.
2098      Some of these flags are dependent on the coding system. */
2099   unsigned int flags;
2100
2101   /* CH holds a partially built-up character.  Since we only deal
2102      with one- and two-byte characters at the moment, we only use
2103      this to store the first byte of a two-byte character. */
2104   unsigned int ch;
2105
2106   /* EOL_TYPE specifies the type of end-of-line conversion that
2107      currently applies.  We need to keep this separate from the
2108      EOL type stored in CODESYS because the latter might indicate
2109      automatic EOL-type detection while the former will always
2110      indicate a particular EOL type. */
2111   eol_type_t eol_type;
2112 #ifdef MULE
2113   /* Additional ISO2022 information.  We define the structure above
2114      because it's also needed by the detection routines. */
2115   struct iso2022_decoder iso2022;
2116
2117   /* Additional information (the state of the running CCL program)
2118      used by the CCL decoder. */
2119   struct ccl_program ccl;
2120
2121   /* counter for UTF-8 or UCS-4 */
2122   unsigned char counter;
2123 #endif
2124   struct detection_state decst;
2125 };
2126
2127 static Lstream_data_count decoding_reader (Lstream *stream,
2128                                 unsigned char *data, Lstream_data_count size);
2129 static Lstream_data_count decoding_writer (Lstream *stream,
2130                                 const unsigned char *data, Lstream_data_count size);
2131 static int decoding_rewinder   (Lstream *stream);
2132 static int decoding_seekable_p (Lstream *stream);
2133 static int decoding_flusher    (Lstream *stream);
2134 static int decoding_closer     (Lstream *stream);
2135
2136 static Lisp_Object decoding_marker (Lisp_Object stream);
2137
2138 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2139                                sizeof (struct decoding_stream));
2140
2141 static Lisp_Object
2142 decoding_marker (Lisp_Object stream)
2143 {
2144   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2145   Lisp_Object str_obj;
2146
2147   /* We do not need to mark the coding systems or charsets stored
2148      within the stream because they are stored in a global list
2149      and automatically marked. */
2150
2151   XSETLSTREAM (str_obj, str);
2152   mark_object (str_obj);
2153   if (str->imp->marker)
2154     return (str->imp->marker) (str_obj);
2155   else
2156     return Qnil;
2157 }
2158
2159 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2160    so we read data from the other end, decode it, and store it into DATA. */
2161
2162 static Lstream_data_count
2163 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2164 {
2165   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2166   unsigned char *orig_data = data;
2167   Lstream_data_count read_size;
2168   int error_occurred = 0;
2169
2170   /* We need to interface to mule_decode(), which expects to take some
2171      amount of data and store the result into a Dynarr.  We have
2172      mule_decode() store into str->runoff, and take data from there
2173      as necessary. */
2174
2175   /* We loop until we have enough data, reading chunks from the other
2176      end and decoding it. */
2177   while (1)
2178     {
2179       /* Take data from the runoff if we can.  Make sure to take at
2180          most SIZE bytes, and delete the data from the runoff. */
2181       if (Dynarr_length (str->runoff) > 0)
2182         {
2183           Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2184           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2185           Dynarr_delete_many (str->runoff, 0, chunk);
2186           data += chunk;
2187           size -= chunk;
2188         }
2189
2190       if (size == 0)
2191         break; /* No more room for data */
2192
2193       if (str->flags & CODING_STATE_END)
2194         /* This means that on the previous iteration, we hit the EOF on
2195            the other end.  We loop once more so that mule_decode() can
2196            output any final stuff it may be holding, or any "go back
2197            to a sane state" escape sequences. (This latter makes sense
2198            during encoding.) */
2199         break;
2200
2201       /* Exhausted the runoff, so get some more.  DATA has at least
2202          SIZE bytes left of storage in it, so it's OK to read directly
2203          into it.  (We'll be overwriting above, after we've decoded it
2204          into the runoff.) */
2205       read_size = Lstream_read (str->other_end, data, size);
2206       if (read_size < 0)
2207         {
2208           error_occurred = 1;
2209           break;
2210         }
2211       if (read_size == 0)
2212         /* There might be some more end data produced in the translation.
2213            See the comment above. */
2214         str->flags |= CODING_STATE_END;
2215       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2216     }
2217
2218   if (data - orig_data == 0)
2219     return error_occurred ? -1 : 0;
2220   else
2221     return data - orig_data;
2222 }
2223
2224 static Lstream_data_count
2225 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2226 {
2227   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2228   Lstream_data_count retval;
2229
2230   /* Decode all our data into the runoff, and then attempt to write
2231      it all out to the other end.  Remove whatever chunk we succeeded
2232      in writing. */
2233   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2234   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2235                           Dynarr_length (str->runoff));
2236   if (retval > 0)
2237     Dynarr_delete_many (str->runoff, 0, retval);
2238   /* Do NOT return retval.  The return value indicates how much
2239      of the incoming data was written, not how many bytes were
2240      written. */
2241   return size;
2242 }
2243
2244 static void
2245 reset_decoding_stream (struct decoding_stream *str)
2246 {
2247 #ifdef MULE
2248   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2249     {
2250       Lisp_Object coding_system;
2251       XSETCODING_SYSTEM (coding_system, str->codesys);
2252       reset_iso2022 (coding_system, &str->iso2022);
2253     }
2254   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2255     {
2256       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2257     }
2258   str->counter = 0;
2259 #endif /* MULE */
2260   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2261       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2262     {
2263       xzero (str->decst);
2264       str->decst.eol_type = EOL_AUTODETECT;
2265       str->decst.mask = ~0;
2266     }
2267   str->flags = str->ch = 0;
2268 }
2269
2270 static int
2271 decoding_rewinder (Lstream *stream)
2272 {
2273   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2274   reset_decoding_stream (str);
2275   Dynarr_reset (str->runoff);
2276   return Lstream_rewind (str->other_end);
2277 }
2278
2279 static int
2280 decoding_seekable_p (Lstream *stream)
2281 {
2282   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2283   return Lstream_seekable_p (str->other_end);
2284 }
2285
2286 static int
2287 decoding_flusher (Lstream *stream)
2288 {
2289   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2290   return Lstream_flush (str->other_end);
2291 }
2292
2293 static int
2294 decoding_closer (Lstream *stream)
2295 {
2296   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2297   if (stream->flags & LSTREAM_FL_WRITE)
2298     {
2299       str->flags |= CODING_STATE_END;
2300       decoding_writer (stream, 0, 0);
2301     }
2302   Dynarr_free (str->runoff);
2303 #ifdef MULE
2304 #ifdef ENABLE_COMPOSITE_CHARS
2305   if (str->iso2022.composite_chars)
2306     Dynarr_free (str->iso2022.composite_chars);
2307 #endif
2308 #endif
2309   return Lstream_close (str->other_end);
2310 }
2311
2312 Lisp_Object
2313 decoding_stream_coding_system (Lstream *stream)
2314 {
2315   Lisp_Object coding_system;
2316   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2317
2318   XSETCODING_SYSTEM (coding_system, str->codesys);
2319   return subsidiary_coding_system (coding_system, str->eol_type);
2320 }
2321
2322 void
2323 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2324 {
2325   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2326   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2327   str->codesys = cs;
2328   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2329     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2330   reset_decoding_stream (str);
2331 }
2332
2333 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2334    stream for writing, no automatic code detection will be performed.
2335    The reason for this is that automatic code detection requires a
2336    seekable input.  Things will also fail if you open a decoding
2337    stream for reading using a non-fully-specified coding system and
2338    a non-seekable input stream. */
2339
2340 static Lisp_Object
2341 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2342                         const char *mode)
2343 {
2344   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2345   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2346   Lisp_Object obj;
2347
2348   xzero (*str);
2349   str->other_end = stream;
2350   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2351   str->eol_type = EOL_AUTODETECT;
2352   if (!strcmp (mode, "r")
2353       && Lstream_seekable_p (stream))
2354     /* We can determine the coding system now. */
2355     determine_real_coding_system (stream, &codesys, &str->eol_type);
2356   set_decoding_stream_coding_system (lstr, codesys);
2357   str->decst.eol_type = str->eol_type;
2358   str->decst.mask = ~0;
2359   XSETLSTREAM (obj, lstr);
2360   return obj;
2361 }
2362
2363 Lisp_Object
2364 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2365 {
2366   return make_decoding_stream_1 (stream, codesys, "r");
2367 }
2368
2369 Lisp_Object
2370 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2371 {
2372   return make_decoding_stream_1 (stream, codesys, "w");
2373 }
2374
2375 /* Note: the decode_coding_* functions all take the same
2376    arguments as mule_decode(), which is to say some SRC data of
2377    size N, which is to be stored into dynamic array DST.
2378    DECODING is the stream within which the decoding is
2379    taking place, but no data is actually read from or
2380    written to that stream; that is handled in decoding_reader()
2381    or decoding_writer().  This allows the same functions to
2382    be used for both reading and writing. */
2383
2384 static void
2385 mule_decode (Lstream *decoding, const Extbyte *src,
2386              unsigned_char_dynarr *dst, Lstream_data_count n)
2387 {
2388   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2389
2390   /* If necessary, do encoding-detection now.  We do this when
2391      we're a writing stream or a non-seekable reading stream,
2392      meaning that we can't just process the whole input,
2393      rewind, and start over. */
2394
2395   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2396       str->eol_type == EOL_AUTODETECT)
2397     {
2398       Lisp_Object codesys;
2399
2400       XSETCODING_SYSTEM (codesys, str->codesys);
2401       detect_coding_type (&str->decst, src, n,
2402                           CODING_SYSTEM_TYPE (str->codesys) !=
2403                           CODESYS_AUTODETECT);
2404       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2405           str->decst.mask != ~0)
2406         /* #### This is cheesy.  What we really ought to do is
2407            buffer up a certain amount of data so as to get a
2408            less random result. */
2409         codesys = coding_system_from_mask (str->decst.mask);
2410       str->eol_type = str->decst.eol_type;
2411       if (XCODING_SYSTEM (codesys) != str->codesys)
2412         {
2413           /* Preserve the CODING_STATE_END flag in case it was set.
2414              If we erase it, bad things might happen. */
2415           int was_end = str->flags & CODING_STATE_END;
2416           set_decoding_stream_coding_system (decoding, codesys);
2417           if (was_end)
2418             str->flags |= CODING_STATE_END;
2419         }
2420     }
2421
2422   switch (CODING_SYSTEM_TYPE (str->codesys))
2423     {
2424 #ifdef DEBUG_XEMACS
2425     case CODESYS_INTERNAL:
2426       Dynarr_add_many (dst, src, n);
2427       break;
2428 #endif
2429     case CODESYS_AUTODETECT:
2430       /* If we got this far and still haven't decided on the coding
2431          system, then do no conversion. */
2432     case CODESYS_NO_CONVERSION:
2433       decode_coding_no_conversion (decoding, src, dst, n);
2434       break;
2435 #ifdef MULE
2436     case CODESYS_SHIFT_JIS:
2437       decode_coding_sjis (decoding, src, dst, n);
2438       break;
2439     case CODESYS_BIG5:
2440       decode_coding_big5 (decoding, src, dst, n);
2441       break;
2442     case CODESYS_UCS4:
2443       decode_coding_ucs4 (decoding, src, dst, n);
2444       break;
2445     case CODESYS_UTF8:
2446       decode_coding_utf8 (decoding, src, dst, n);
2447       break;
2448     case CODESYS_CCL:
2449       str->ccl.last_block = str->flags & CODING_STATE_END;
2450       /* When applying ccl program to stream, MUST NOT set NULL
2451          pointer to src.  */
2452       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2453                   dst, n, 0, CCL_MODE_DECODING);
2454       break;
2455     case CODESYS_ISO2022:
2456       decode_coding_iso2022 (decoding, src, dst, n);
2457       break;
2458 #endif /* MULE */
2459     default:
2460       abort ();
2461     }
2462 }
2463
2464 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2465 Decode the text between START and END which is encoded in CODING-SYSTEM.
2466 This is useful if you've read in encoded text from a file without decoding
2467 it (e.g. you read in a JIS-formatted file but used the `binary' or
2468 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2469 Return length of decoded text.
2470 BUFFER defaults to the current buffer if unspecified.
2471 */
2472        (start, end, coding_system, buffer))
2473 {
2474   Bufpos b, e;
2475   struct buffer *buf = decode_buffer (buffer, 0);
2476   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2477   Lstream *istr, *ostr;
2478   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2479
2480   get_buffer_range_char (buf, start, end, &b, &e, 0);
2481
2482   barf_if_buffer_read_only (buf, b, e);
2483
2484   coding_system = Fget_coding_system (coding_system);
2485   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2486   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2487   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2488                                               coding_system);
2489   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2490                                            Fget_coding_system (Qbinary));
2491   istr = XLSTREAM (instream);
2492   ostr = XLSTREAM (outstream);
2493   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2494
2495   /* The chain of streams looks like this:
2496
2497      [BUFFER] <----- send through
2498                      ------> [ENCODE AS BINARY]
2499                              ------> [DECODE AS SPECIFIED]
2500                                      ------> [BUFFER]
2501    */
2502
2503   while (1)
2504     {
2505       char tempbuf[1024]; /* some random amount */
2506       Bufpos newpos, even_newer_pos;
2507       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2508       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2509
2510       if (!size_in_bytes)
2511         break;
2512       newpos = lisp_buffer_stream_startpos (istr);
2513       Lstream_write (ostr, tempbuf, size_in_bytes);
2514       even_newer_pos = lisp_buffer_stream_startpos (istr);
2515       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2516                            even_newer_pos, 0);
2517     }
2518   Lstream_close (istr);
2519   Lstream_close (ostr);
2520   UNGCPRO;
2521   Lstream_delete (istr);
2522   Lstream_delete (ostr);
2523   Lstream_delete (XLSTREAM (de_outstream));
2524   Lstream_delete (XLSTREAM (lb_outstream));
2525   return Qnil;
2526 }
2527
2528 \f
2529 /************************************************************************/
2530 /*           Converting to an external encoding ("encoding")            */
2531 /************************************************************************/
2532
2533 /* An encoding stream is an output stream.  When you create the
2534    stream, you specify the coding system that governs the encoding
2535    and another stream that the resulting encoded data is to be
2536    sent to, and then start sending data to it. */
2537
2538 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2539
2540 struct encoding_stream
2541 {
2542   /* Coding system that governs the conversion. */
2543   Lisp_Coding_System *codesys;
2544
2545   /* Stream that we read the encoded data from or
2546      write the decoded data to. */
2547   Lstream *other_end;
2548
2549   /* If we are reading, then we can return only a fixed amount of
2550      data, so if the conversion resulted in too much data, we store it
2551      here for retrieval the next time around. */
2552   unsigned_char_dynarr *runoff;
2553
2554   /* FLAGS holds flags indicating the current state of the encoding.
2555      Some of these flags are dependent on the coding system. */
2556   unsigned int flags;
2557
2558   /* CH holds a partially built-up character.  Since we only deal
2559      with one- and two-byte characters at the moment, we only use
2560      this to store the first byte of a two-byte character. */
2561   unsigned int ch;
2562 #ifdef MULE
2563   /* Additional information used by the ISO2022 encoder. */
2564   struct
2565     {
2566       /* CHARSET holds the character sets currently assigned to the G0
2567          through G3 registers.  It is initialized from the array
2568          INITIAL_CHARSET in CODESYS. */
2569       Lisp_Object charset[4];
2570
2571       /* Which registers are currently invoked into the left (GL) and
2572          right (GR) halves of the 8-bit encoding space? */
2573       int register_left, register_right;
2574
2575       /* Whether we need to explicitly designate the charset in the
2576          G? register before using it.  It is initialized from the
2577          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2578       unsigned char force_charset_on_output[4];
2579
2580       /* Other state variables that need to be preserved across
2581          invocations. */
2582       Lisp_Object current_charset;
2583       int current_half;
2584       int current_char_boundary;
2585     } iso2022;
2586
2587   /* Additional information (the state of the running CCL program)
2588      used by the CCL encoder. */
2589   struct ccl_program ccl;
2590 #endif /* MULE */
2591 };
2592
2593 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2594 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2595                                 Lstream_data_count size);
2596 static int encoding_rewinder   (Lstream *stream);
2597 static int encoding_seekable_p (Lstream *stream);
2598 static int encoding_flusher    (Lstream *stream);
2599 static int encoding_closer     (Lstream *stream);
2600
2601 static Lisp_Object encoding_marker (Lisp_Object stream);
2602
2603 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2604                                sizeof (struct encoding_stream));
2605
2606 static Lisp_Object
2607 encoding_marker (Lisp_Object stream)
2608 {
2609   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2610   Lisp_Object str_obj;
2611
2612   /* We do not need to mark the coding systems or charsets stored
2613      within the stream because they are stored in a global list
2614      and automatically marked. */
2615
2616   XSETLSTREAM (str_obj, str);
2617   mark_object (str_obj);
2618   if (str->imp->marker)
2619     return (str->imp->marker) (str_obj);
2620   else
2621     return Qnil;
2622 }
2623
2624 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2625    so we read data from the other end, encode it, and store it into DATA. */
2626
2627 static Lstream_data_count
2628 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2629 {
2630   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2631   unsigned char *orig_data = data;
2632   Lstream_data_count read_size;
2633   int error_occurred = 0;
2634
2635   /* We need to interface to mule_encode(), which expects to take some
2636      amount of data and store the result into a Dynarr.  We have
2637      mule_encode() store into str->runoff, and take data from there
2638      as necessary. */
2639
2640   /* We loop until we have enough data, reading chunks from the other
2641      end and encoding it. */
2642   while (1)
2643     {
2644       /* Take data from the runoff if we can.  Make sure to take at
2645          most SIZE bytes, and delete the data from the runoff. */
2646       if (Dynarr_length (str->runoff) > 0)
2647         {
2648           int chunk = min ((int) size, Dynarr_length (str->runoff));
2649           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2650           Dynarr_delete_many (str->runoff, 0, chunk);
2651           data += chunk;
2652           size -= chunk;
2653         }
2654
2655       if (size == 0)
2656         break; /* No more room for data */
2657
2658       if (str->flags & CODING_STATE_END)
2659         /* This means that on the previous iteration, we hit the EOF on
2660            the other end.  We loop once more so that mule_encode() can
2661            output any final stuff it may be holding, or any "go back
2662            to a sane state" escape sequences. (This latter makes sense
2663            during encoding.) */
2664         break;
2665
2666       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2667          left of storage in it, so it's OK to read directly into it.
2668          (We'll be overwriting above, after we've encoded it into the
2669          runoff.) */
2670       read_size = Lstream_read (str->other_end, data, size);
2671       if (read_size < 0)
2672         {
2673           error_occurred = 1;
2674           break;
2675         }
2676       if (read_size == 0)
2677         /* There might be some more end data produced in the translation.
2678            See the comment above. */
2679         str->flags |= CODING_STATE_END;
2680       mule_encode (stream, data, str->runoff, read_size);
2681     }
2682
2683   if (data == orig_data)
2684     return error_occurred ? -1 : 0;
2685   else
2686     return data - orig_data;
2687 }
2688
2689 static Lstream_data_count
2690 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2691 {
2692   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2693   Lstream_data_count retval;
2694
2695   /* Encode all our data into the runoff, and then attempt to write
2696      it all out to the other end.  Remove whatever chunk we succeeded
2697      in writing. */
2698   mule_encode (stream, data, str->runoff, size);
2699   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2700                           Dynarr_length (str->runoff));
2701   if (retval > 0)
2702     Dynarr_delete_many (str->runoff, 0, retval);
2703   /* Do NOT return retval.  The return value indicates how much
2704      of the incoming data was written, not how many bytes were
2705      written. */
2706   return size;
2707 }
2708
2709 static void
2710 reset_encoding_stream (struct encoding_stream *str)
2711 {
2712 #ifdef MULE
2713   switch (CODING_SYSTEM_TYPE (str->codesys))
2714     {
2715     case CODESYS_ISO2022:
2716       {
2717         int i;
2718
2719         for (i = 0; i < 4; i++)
2720           {
2721             str->iso2022.charset[i] =
2722               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2723             str->iso2022.force_charset_on_output[i] =
2724               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2725           }
2726         str->iso2022.register_left = 0;
2727         str->iso2022.register_right = 1;
2728         str->iso2022.current_charset = Qnil;
2729         str->iso2022.current_half = 0;
2730         str->iso2022.current_char_boundary = 1;
2731         break;
2732       }
2733     case CODESYS_CCL:
2734       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2735       break;
2736     default:
2737       break;
2738     }
2739 #endif /* MULE */
2740
2741   str->flags = str->ch = 0;
2742 }
2743
2744 static int
2745 encoding_rewinder (Lstream *stream)
2746 {
2747   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2748   reset_encoding_stream (str);
2749   Dynarr_reset (str->runoff);
2750   return Lstream_rewind (str->other_end);
2751 }
2752
2753 static int
2754 encoding_seekable_p (Lstream *stream)
2755 {
2756   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2757   return Lstream_seekable_p (str->other_end);
2758 }
2759
2760 static int
2761 encoding_flusher (Lstream *stream)
2762 {
2763   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2764   return Lstream_flush (str->other_end);
2765 }
2766
2767 static int
2768 encoding_closer (Lstream *stream)
2769 {
2770   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2771   if (stream->flags & LSTREAM_FL_WRITE)
2772     {
2773       str->flags |= CODING_STATE_END;
2774       encoding_writer (stream, 0, 0);
2775     }
2776   Dynarr_free (str->runoff);
2777   return Lstream_close (str->other_end);
2778 }
2779
2780 Lisp_Object
2781 encoding_stream_coding_system (Lstream *stream)
2782 {
2783   Lisp_Object coding_system;
2784   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2785
2786   XSETCODING_SYSTEM (coding_system, str->codesys);
2787   return coding_system;
2788 }
2789
2790 void
2791 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2792 {
2793   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2794   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2795   str->codesys = cs;
2796   reset_encoding_stream (str);
2797 }
2798
2799 static Lisp_Object
2800 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2801                         const char *mode)
2802 {
2803   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2804   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2805   Lisp_Object obj;
2806
2807   xzero (*str);
2808   str->runoff = Dynarr_new (unsigned_char);
2809   str->other_end = stream;
2810   set_encoding_stream_coding_system (lstr, codesys);
2811   XSETLSTREAM (obj, lstr);
2812   return obj;
2813 }
2814
2815 Lisp_Object
2816 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2817 {
2818   return make_encoding_stream_1 (stream, codesys, "r");
2819 }
2820
2821 Lisp_Object
2822 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2823 {
2824   return make_encoding_stream_1 (stream, codesys, "w");
2825 }
2826
2827 /* Convert N bytes of internally-formatted data stored in SRC to an
2828    external format, according to the encoding stream ENCODING.
2829    Store the encoded data into DST. */
2830
2831 static void
2832 mule_encode (Lstream *encoding, const Bufbyte *src,
2833              unsigned_char_dynarr *dst, Lstream_data_count n)
2834 {
2835   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2836
2837   switch (CODING_SYSTEM_TYPE (str->codesys))
2838     {
2839 #ifdef DEBUG_XEMACS
2840     case CODESYS_INTERNAL:
2841       Dynarr_add_many (dst, src, n);
2842       break;
2843 #endif
2844     case CODESYS_AUTODETECT:
2845       /* If we got this far and still haven't decided on the coding
2846          system, then do no conversion. */
2847     case CODESYS_NO_CONVERSION:
2848       encode_coding_no_conversion (encoding, src, dst, n);
2849       break;
2850 #ifdef MULE
2851     case CODESYS_SHIFT_JIS:
2852       encode_coding_sjis (encoding, src, dst, n);
2853       break;
2854     case CODESYS_BIG5:
2855       encode_coding_big5 (encoding, src, dst, n);
2856       break;
2857     case CODESYS_UCS4:
2858       encode_coding_ucs4 (encoding, src, dst, n);
2859       break;
2860     case CODESYS_UTF8:
2861       encode_coding_utf8 (encoding, src, dst, n);
2862       break;
2863     case CODESYS_CCL:
2864       str->ccl.last_block = str->flags & CODING_STATE_END;
2865       /* When applying ccl program to stream, MUST NOT set NULL
2866          pointer to src.  */
2867       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2868                   dst, n, 0, CCL_MODE_ENCODING);
2869       break;
2870     case CODESYS_ISO2022:
2871       encode_coding_iso2022 (encoding, src, dst, n);
2872       break;
2873 #endif /* MULE */
2874     default:
2875       abort ();
2876     }
2877 }
2878
2879 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2880 Encode the text between START and END using CODING-SYSTEM.
2881 This will, for example, convert Japanese characters into stuff such as
2882 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2883 text.  BUFFER defaults to the current buffer if unspecified.
2884 */
2885        (start, end, coding_system, buffer))
2886 {
2887   Bufpos b, e;
2888   struct buffer *buf = decode_buffer (buffer, 0);
2889   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2890   Lstream *istr, *ostr;
2891   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2892
2893   get_buffer_range_char (buf, start, end, &b, &e, 0);
2894
2895   barf_if_buffer_read_only (buf, b, e);
2896
2897   coding_system = Fget_coding_system (coding_system);
2898   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2899   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2900   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2901                                               Fget_coding_system (Qbinary));
2902   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2903                                            coding_system);
2904   istr = XLSTREAM (instream);
2905   ostr = XLSTREAM (outstream);
2906   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2907   /* The chain of streams looks like this:
2908
2909      [BUFFER] <----- send through
2910                      ------> [ENCODE AS SPECIFIED]
2911                              ------> [DECODE AS BINARY]
2912                                      ------> [BUFFER]
2913    */
2914   while (1)
2915     {
2916       char tempbuf[1024]; /* some random amount */
2917       Bufpos newpos, even_newer_pos;
2918       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2919       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2920
2921       if (!size_in_bytes)
2922         break;
2923       newpos = lisp_buffer_stream_startpos (istr);
2924       Lstream_write (ostr, tempbuf, size_in_bytes);
2925       even_newer_pos = lisp_buffer_stream_startpos (istr);
2926       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2927                            even_newer_pos, 0);
2928     }
2929
2930   {
2931     Charcount retlen =
2932       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2933     Lstream_close (istr);
2934     Lstream_close (ostr);
2935     UNGCPRO;
2936     Lstream_delete (istr);
2937     Lstream_delete (ostr);
2938     Lstream_delete (XLSTREAM (de_outstream));
2939     Lstream_delete (XLSTREAM (lb_outstream));
2940     return make_int (retlen);
2941   }
2942 }
2943
2944 #ifdef MULE
2945 \f
2946 /************************************************************************/
2947 /*                          Shift-JIS methods                           */
2948 /************************************************************************/
2949
2950 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2951    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2952    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2953    encoded by "position-code + 0x80".  A character of JISX0208
2954    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2955    position-codes are divided and shifted so that it fit in the range
2956    below.
2957
2958    --- CODE RANGE of Shift-JIS ---
2959    (character set)      (range)
2960    ASCII                0x00 .. 0x7F
2961    JISX0201-Kana        0xA0 .. 0xDF
2962    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2963             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2964    -------------------------------
2965
2966 */
2967
2968 /* Is this the first byte of a Shift-JIS two-byte char? */
2969
2970 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2971   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2972
2973 /* Is this the second byte of a Shift-JIS two-byte char? */
2974
2975 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2976   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2977
2978 #define BYTE_SJIS_KATAKANA_P(c) \
2979   ((c) >= 0xA1 && (c) <= 0xDF)
2980
2981 static int
2982 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
2983 {
2984   while (n--)
2985     {
2986       unsigned char c = *(unsigned char *)src++;
2987       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2988         return 0;
2989       if (st->shift_jis.in_second_byte)
2990         {
2991           st->shift_jis.in_second_byte = 0;
2992           if (c < 0x40)
2993             return 0;
2994         }
2995       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2996         st->shift_jis.in_second_byte = 1;
2997     }
2998   return CODING_CATEGORY_SHIFT_JIS_MASK;
2999 }
3000
3001 /* Convert Shift-JIS data to internal format. */
3002
3003 static void
3004 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3005                     unsigned_char_dynarr *dst, Lstream_data_count n)
3006 {
3007   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3008   unsigned int flags  = str->flags;
3009   unsigned int ch     = str->ch;
3010   eol_type_t eol_type = str->eol_type;
3011
3012   while (n--)
3013     {
3014       unsigned char c = *(unsigned char *)src++;
3015
3016       if (ch)
3017         {
3018           /* Previous character was first byte of Shift-JIS Kanji char. */
3019           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3020             {
3021               unsigned char e1, e2;
3022
3023               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3024               DECODE_SJIS (ch, c, e1, e2);
3025               Dynarr_add (dst, e1);
3026               Dynarr_add (dst, e2);
3027             }
3028           else
3029             {
3030               DECODE_ADD_BINARY_CHAR (ch, dst);
3031               DECODE_ADD_BINARY_CHAR (c, dst);
3032             }
3033           ch = 0;
3034         }
3035       else
3036         {
3037           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3038           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3039             ch = c;
3040           else if (BYTE_SJIS_KATAKANA_P (c))
3041             {
3042               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3043               Dynarr_add (dst, c);
3044             }
3045           else
3046             DECODE_ADD_BINARY_CHAR (c, dst);
3047         }
3048     label_continue_loop:;
3049     }
3050
3051   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3052
3053   str->flags = flags;
3054   str->ch    = ch;
3055 }
3056
3057 /* Convert internally-formatted data to Shift-JIS. */
3058
3059 static void
3060 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3061                     unsigned_char_dynarr *dst, Lstream_data_count n)
3062 {
3063   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3064   unsigned int flags  = str->flags;
3065   unsigned int ch     = str->ch;
3066   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3067
3068   while (n--)
3069     {
3070       Bufbyte c = *src++;
3071       if (c == '\n')
3072         {
3073           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3074             Dynarr_add (dst, '\r');
3075           if (eol_type != EOL_CR)
3076             Dynarr_add (dst, '\n');
3077           ch = 0;
3078         }
3079       else if (BYTE_ASCII_P (c))
3080         {
3081           Dynarr_add (dst, c);
3082           ch = 0;
3083         }
3084       else if (BUFBYTE_LEADING_BYTE_P (c))
3085         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3086               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3087               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3088       else if (ch)
3089         {
3090           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3091             {
3092               Dynarr_add (dst, c);
3093               ch = 0;
3094             }
3095           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3096                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3097             ch = c;
3098           else
3099             {
3100               unsigned char j1, j2;
3101               ENCODE_SJIS (ch, c, j1, j2);
3102               Dynarr_add (dst, j1);
3103               Dynarr_add (dst, j2);
3104               ch = 0;
3105             }
3106         }
3107     }
3108
3109   str->flags = flags;
3110   str->ch    = ch;
3111 }
3112
3113 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3114 Decode a JISX0208 character of Shift-JIS coding-system.
3115 CODE is the character code in Shift-JIS as a cons of type bytes.
3116 Return the corresponding character.
3117 */
3118        (code))
3119 {
3120   unsigned char c1, c2, s1, s2;
3121
3122   CHECK_CONS (code);
3123   CHECK_INT (XCAR (code));
3124   CHECK_INT (XCDR (code));
3125   s1 = XINT (XCAR (code));
3126   s2 = XINT (XCDR (code));
3127   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3128       BYTE_SJIS_TWO_BYTE_2_P (s2))
3129     {
3130       DECODE_SJIS (s1, s2, c1, c2);
3131       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3132                                    c1 & 0x7F, c2 & 0x7F));
3133     }
3134   else
3135     return Qnil;
3136 }
3137
3138 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3139 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3140 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3141 */
3142        (character))
3143 {
3144   Lisp_Object charset;
3145   int c1, c2, s1, s2;
3146
3147   CHECK_CHAR_COERCE_INT (character);
3148   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3149   if (EQ (charset, Vcharset_japanese_jisx0208))
3150     {
3151       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3152       return Fcons (make_int (s1), make_int (s2));
3153     }
3154   else
3155     return Qnil;
3156 }
3157
3158 \f
3159 /************************************************************************/
3160 /*                            Big5 methods                              */
3161 /************************************************************************/
3162
3163 /* BIG5 is a coding system encoding two character sets: ASCII and
3164    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3165    character set and is encoded in two-byte.
3166
3167    --- CODE RANGE of BIG5 ---
3168    (character set)      (range)
3169    ASCII                0x00 .. 0x7F
3170    Big5 (1st byte)      0xA1 .. 0xFE
3171         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3172    --------------------------
3173
3174    Since the number of characters in Big5 is larger than maximum
3175    characters in Emacs' charset (96x96), it can't be handled as one
3176    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3177    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3178    contains frequently used characters and the latter contains less
3179    frequently used characters.  */
3180
3181 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3182   ((c) >= 0xA1 && (c) <= 0xFE)
3183
3184 /* Is this the second byte of a Shift-JIS two-byte char? */
3185
3186 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3187   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3188
3189 /* Number of Big5 characters which have the same code in 1st byte.  */
3190
3191 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3192
3193 /* Code conversion macros.  These are macros because they are used in
3194    inner loops during code conversion.
3195
3196    Note that temporary variables in macros introduce the classic
3197    dynamic-scoping problems with variable names.  We use capital-
3198    lettered variables in the assumption that XEmacs does not use
3199    capital letters in variables except in a very formalized way
3200    (e.g. Qstring). */
3201
3202 /* Convert Big5 code (b1, b2) into its internal string representation
3203    (lb, c1, c2). */
3204
3205 /* There is a much simpler way to split the Big5 charset into two.
3206    For the moment I'm going to leave the algorithm as-is because it
3207    claims to separate out the most-used characters into a single
3208    charset, which perhaps will lead to optimizations in various
3209    places.
3210
3211    The way the algorithm works is something like this:
3212
3213    Big5 can be viewed as a 94x157 charset, where the row is
3214    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3215    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3216    the split between low and high column numbers is apparently
3217    meaningless; ascending rows produce less and less frequent chars.
3218    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3219    the first charset, and the upper half (0xC9 .. 0xFE) to the
3220    second.  To do the conversion, we convert the character into
3221    a single number where 0 .. 156 is the first row, 157 .. 313
3222    is the second, etc.  That way, the characters are ordered by
3223    decreasing frequency.  Then we just chop the space in two
3224    and coerce the result into a 94x94 space.
3225    */
3226
3227 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3228 {                                                                       \
3229   int B1 = b1, B2 = b2;                                                 \
3230   unsigned int I                                                        \
3231     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3232                                                                         \
3233   if (B1 < 0xC9)                                                        \
3234     {                                                                   \
3235       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3236     }                                                                   \
3237   else                                                                  \
3238     {                                                                   \
3239       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3240       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3241     }                                                                   \
3242   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3243   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3244 } while (0)
3245
3246 /* Convert the internal string representation of a Big5 character
3247    (lb, c1, c2) into Big5 code (b1, b2). */
3248
3249 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3250 {                                                                       \
3251   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3252                                                                         \
3253   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3254     {                                                                   \
3255       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3256     }                                                                   \
3257   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3258   b2 = I % BIG5_SAME_ROW;                                               \
3259   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3260 } while (0)
3261
3262 static int
3263 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3264 {
3265   while (n--)
3266     {
3267       unsigned char c = *(unsigned char *)src++;
3268       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3269           (c >= 0x80 && c <= 0xA0))
3270         return 0;
3271       if (st->big5.in_second_byte)
3272         {
3273           st->big5.in_second_byte = 0;
3274           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3275             return 0;
3276         }
3277       else if (c >= 0xA1)
3278         st->big5.in_second_byte = 1;
3279     }
3280   return CODING_CATEGORY_BIG5_MASK;
3281 }
3282
3283 /* Convert Big5 data to internal format. */
3284
3285 static void
3286 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3287                     unsigned_char_dynarr *dst, Lstream_data_count n)
3288 {
3289   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3290   unsigned int flags  = str->flags;
3291   unsigned int ch     = str->ch;
3292   eol_type_t eol_type = str->eol_type;
3293
3294   while (n--)
3295     {
3296       unsigned char c = *(unsigned char *)src++;
3297       if (ch)
3298         {
3299           /* Previous character was first byte of Big5 char. */
3300           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3301             {
3302               unsigned char b1, b2, b3;
3303               DECODE_BIG5 (ch, c, b1, b2, b3);
3304               Dynarr_add (dst, b1);
3305               Dynarr_add (dst, b2);
3306               Dynarr_add (dst, b3);
3307             }
3308           else
3309             {
3310               DECODE_ADD_BINARY_CHAR (ch, dst);
3311               DECODE_ADD_BINARY_CHAR (c, dst);
3312             }
3313           ch = 0;
3314         }
3315       else
3316         {
3317           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3318           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3319             ch = c;
3320           else
3321             DECODE_ADD_BINARY_CHAR (c, dst);
3322         }
3323     label_continue_loop:;
3324     }
3325
3326   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3327
3328   str->flags = flags;
3329   str->ch    = ch;
3330 }
3331
3332 /* Convert internally-formatted data to Big5. */
3333
3334 static void
3335 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3336                     unsigned_char_dynarr *dst, Lstream_data_count n)
3337 {
3338   unsigned char c;
3339   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3340   unsigned int flags  = str->flags;
3341   unsigned int ch     = str->ch;
3342   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3343
3344   while (n--)
3345     {
3346       c = *src++;
3347       if (c == '\n')
3348         {
3349           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3350             Dynarr_add (dst, '\r');
3351           if (eol_type != EOL_CR)
3352             Dynarr_add (dst, '\n');
3353         }
3354       else if (BYTE_ASCII_P (c))
3355         {
3356           /* ASCII. */
3357           Dynarr_add (dst, c);
3358         }
3359       else if (BUFBYTE_LEADING_BYTE_P (c))
3360         {
3361           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3362               c == LEADING_BYTE_CHINESE_BIG5_2)
3363             {
3364               /* A recognized leading byte. */
3365               ch = c;
3366               continue; /* not done with this character. */
3367             }
3368           /* otherwise just ignore this character. */
3369         }
3370       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3371                ch == LEADING_BYTE_CHINESE_BIG5_2)
3372         {
3373           /* Previous char was a recognized leading byte. */
3374           ch = (ch << 8) | c;
3375           continue; /* not done with this character. */
3376         }
3377       else if (ch)
3378         {
3379           /* Encountering second byte of a Big5 character. */
3380           unsigned char b1, b2;
3381
3382           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3383           Dynarr_add (dst, b1);
3384           Dynarr_add (dst, b2);
3385         }
3386
3387       ch = 0;
3388     }
3389
3390   str->flags = flags;
3391   str->ch    = ch;
3392 }
3393
3394
3395 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3396 Decode a Big5 character CODE of BIG5 coding-system.
3397 CODE is the character code in BIG5, a cons of two integers.
3398 Return the corresponding character.
3399 */
3400        (code))
3401 {
3402   unsigned char c1, c2, b1, b2;
3403
3404   CHECK_CONS (code);
3405   CHECK_INT (XCAR (code));
3406   CHECK_INT (XCDR (code));
3407   b1 = XINT (XCAR (code));
3408   b2 = XINT (XCDR (code));
3409   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3410       BYTE_BIG5_TWO_BYTE_2_P (b2))
3411     {
3412       int leading_byte;
3413       Lisp_Object charset;
3414       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3415       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3416       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3417     }
3418   else
3419     return Qnil;
3420 }
3421
3422 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3423 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3424 Return the corresponding character code in Big5.
3425 */
3426        (character))
3427 {
3428   Lisp_Object charset;
3429   int c1, c2, b1, b2;
3430
3431   CHECK_CHAR_COERCE_INT (character);
3432   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3433   if (EQ (charset, Vcharset_chinese_big5_1) ||
3434       EQ (charset, Vcharset_chinese_big5_2))
3435     {
3436       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3437                    b1, b2);
3438       return Fcons (make_int (b1), make_int (b2));
3439     }
3440   else
3441     return Qnil;
3442 }
3443
3444 \f
3445 /************************************************************************/
3446 /*                           UCS-4 methods                              */
3447 /*                                                                      */
3448 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3449 /*                                                                      */
3450 /************************************************************************/
3451
3452
3453 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3454 Map UCS-4 code CODE to Mule character CHARACTER.
3455
3456 Return T on success, NIL on failure.
3457 */
3458        (code, character))
3459 {
3460   size_t c;
3461
3462   CHECK_CHAR (character);
3463   CHECK_NATNUM (code);
3464   c = XINT (code);
3465
3466   if (c < countof (fcd->ucs_to_mule_table))
3467     {
3468       fcd->ucs_to_mule_table[c] = character;
3469       return Qt;
3470     }
3471   else
3472     return Qnil;
3473 }
3474
3475 static Lisp_Object
3476 ucs_to_char (unsigned long code)
3477 {
3478   if (code < countof (fcd->ucs_to_mule_table))
3479     {
3480       return fcd->ucs_to_mule_table[code];
3481     }
3482   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3483     {
3484       unsigned int c;
3485
3486       code -= 0xe00000;
3487       c = code % (94 * 94);
3488       return make_char
3489         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3490                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3491                      CHARSET_LEFT_TO_RIGHT),
3492                     c / 94 + 33, c % 94 + 33));
3493     }
3494   else
3495     return Qnil;
3496 }
3497
3498 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3499 Return Mule character corresponding to UCS code CODE (a positive integer).
3500 */
3501        (code))
3502 {
3503   CHECK_NATNUM (code);
3504   return ucs_to_char (XINT (code));
3505 }
3506
3507 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3508 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3509 */
3510        (character, code))
3511 {
3512   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3513           Fset_char_ucs is more restrictive on index arg, but should
3514           check code arg in a char_table method. */
3515   CHECK_CHAR (character);
3516   CHECK_NATNUM (code);
3517   return Fput_char_table (character, code, mule_to_ucs_table);
3518 }
3519
3520 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3521 Return the UCS code (a positive integer) corresponding to CHARACTER.
3522 */
3523        (character))
3524 {
3525   return Fget_char_table (character, mule_to_ucs_table);
3526 }
3527
3528 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3529    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3530    is not found, instead.
3531    #### do something more appropriate (use blob?)
3532         Danger, Will Robinson!  Data loss.  Should we signal user? */
3533 static void
3534 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3535 {
3536   Lisp_Object chr = ucs_to_char (ch);
3537
3538   if (! NILP (chr))
3539     {
3540       Bufbyte work[MAX_EMCHAR_LEN];
3541       int len;
3542
3543       ch = XCHAR (chr);
3544       len = (ch < 128) ?
3545         simple_set_charptr_emchar (work, ch) :
3546         non_ascii_set_charptr_emchar (work, ch);
3547       Dynarr_add_many (dst, work, len);
3548     }
3549   else
3550     {
3551       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3552       Dynarr_add (dst, 34 + 128);
3553       Dynarr_add (dst, 46 + 128);
3554     }
3555 }
3556
3557 static unsigned long
3558 mule_char_to_ucs4 (Lisp_Object charset,
3559                    unsigned char h, unsigned char l)
3560 {
3561   Lisp_Object code
3562     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3563                        mule_to_ucs_table);
3564
3565   if (INTP (code))
3566     {
3567       return XINT (code);
3568     }
3569   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3570             (XCHARSET_CHARS (charset) == 94) )
3571     {
3572       unsigned char final = XCHARSET_FINAL (charset);
3573
3574       if ( ('@' <= final) && (final < 0x7f) )
3575         {
3576           return 0xe00000 + (final - '@') * 94 * 94
3577             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3578         }
3579       else
3580         {
3581           return '?';
3582         }
3583     }
3584   else
3585     {
3586       return '?';
3587     }
3588 }
3589
3590 static void
3591 encode_ucs4 (Lisp_Object charset,
3592              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3593 {
3594   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3595   Dynarr_add (dst,  code >> 24);
3596   Dynarr_add (dst, (code >> 16) & 255);
3597   Dynarr_add (dst, (code >>  8) & 255);
3598   Dynarr_add (dst,  code        & 255);
3599 }
3600
3601 static int
3602 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3603 {
3604   while (n--)
3605     {
3606       unsigned char c = *(unsigned char *)src++;
3607       switch (st->ucs4.in_byte)
3608         {
3609         case 0:
3610           if (c >= 128)
3611             return 0;
3612           else
3613             st->ucs4.in_byte++;
3614           break;
3615         case 3:
3616           st->ucs4.in_byte = 0;
3617           break;
3618         default:
3619           st->ucs4.in_byte++;
3620         }
3621     }
3622   return CODING_CATEGORY_UCS4_MASK;
3623 }
3624
3625 static void
3626 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3627                     unsigned_char_dynarr *dst, Lstream_data_count n)
3628 {
3629   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3630   unsigned int flags = str->flags;
3631   unsigned int ch    = str->ch;
3632   unsigned char counter = str->counter;
3633
3634   while (n--)
3635     {
3636       unsigned char c = *(unsigned char *)src++;
3637       switch (counter)
3638         {
3639         case 0:
3640           ch = c;
3641           counter = 3;
3642           break;
3643         case 1:
3644           decode_ucs4 ( ( ch << 8 ) | c, dst);
3645           ch = 0;
3646           counter = 0;
3647           break;
3648         default:
3649           ch = ( ch << 8 ) | c;
3650           counter--;
3651         }
3652     }
3653   if (counter & CODING_STATE_END)
3654     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3655
3656   str->flags = flags;
3657   str->ch    = ch;
3658   str->counter = counter;
3659 }
3660
3661 static void
3662 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3663                     unsigned_char_dynarr *dst, Lstream_data_count n)
3664 {
3665   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3666   unsigned int flags = str->flags;
3667   unsigned int ch = str->ch;
3668   unsigned char char_boundary = str->iso2022.current_char_boundary;
3669   Lisp_Object charset = str->iso2022.current_charset;
3670
3671 #ifdef ENABLE_COMPOSITE_CHARS
3672   /* flags for handling composite chars.  We do a little switcharoo
3673      on the source while we're outputting the composite char. */
3674   unsigned int saved_n = 0;
3675   const unsigned char *saved_src = NULL;
3676   int in_composite = 0;
3677
3678  back_to_square_n:
3679 #endif
3680
3681   while (n--)
3682     {
3683       unsigned char c = *src++;
3684
3685       if (BYTE_ASCII_P (c))
3686         {               /* Processing ASCII character */
3687           ch = 0;
3688           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3689           char_boundary = 1;
3690         }
3691       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3692         { /* Processing Leading Byte */
3693           ch = 0;
3694           charset = CHARSET_BY_LEADING_BYTE (c);
3695           if (LEADING_BYTE_PREFIX_P(c))
3696             ch = c;
3697           char_boundary = 0;
3698         }
3699       else
3700         {                       /* Processing Non-ASCII character */
3701           char_boundary = 1;
3702           if (EQ (charset, Vcharset_control_1))
3703             {
3704               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3705             }
3706           else
3707             {
3708               switch (XCHARSET_REP_BYTES (charset))
3709                 {
3710                 case 2:
3711                   encode_ucs4 (charset, c, 0, dst);
3712                   break;
3713                 case 3:
3714                   if (XCHARSET_PRIVATE_P (charset))
3715                     {
3716                       encode_ucs4 (charset, c, 0, dst);
3717                       ch = 0;
3718                     }
3719                   else if (ch)
3720                     {
3721 #ifdef ENABLE_COMPOSITE_CHARS
3722                       if (EQ (charset, Vcharset_composite))
3723                         {
3724                           if (in_composite)
3725                             {
3726                               /* #### Bother! We don't know how to
3727                                  handle this yet. */
3728                               Dynarr_add (dst, '\0');
3729                               Dynarr_add (dst, '\0');
3730                               Dynarr_add (dst, '\0');
3731                               Dynarr_add (dst, '~');
3732                             }
3733                           else
3734                             {
3735                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3736                                                        ch & 0x7F, c & 0x7F);
3737                               Lisp_Object lstr = composite_char_string (emch);
3738                               saved_n = n;
3739                               saved_src = src;
3740                               in_composite = 1;
3741                               src = XSTRING_DATA   (lstr);
3742                               n   = XSTRING_LENGTH (lstr);
3743                             }
3744                         }
3745                       else
3746 #endif /* ENABLE_COMPOSITE_CHARS */
3747                         {
3748                           encode_ucs4(charset, ch, c, dst);
3749                         }
3750                       ch = 0;
3751                     }
3752                   else
3753                     {
3754                       ch = c;
3755                       char_boundary = 0;
3756                     }
3757                   break;
3758                 case 4:
3759                   if (ch)
3760                     {
3761                       encode_ucs4 (charset, ch, c, dst);
3762                       ch = 0;
3763                     }
3764                   else
3765                     {
3766                       ch = c;
3767                       char_boundary = 0;
3768                     }
3769                   break;
3770                 default:
3771                   abort ();
3772                 }
3773             }
3774         }
3775     }
3776
3777 #ifdef ENABLE_COMPOSITE_CHARS
3778   if (in_composite)
3779     {
3780       n = saved_n;
3781       src = saved_src;
3782       in_composite = 0;
3783       goto back_to_square_n; /* Wheeeeeeeee ..... */
3784     }
3785 #endif /* ENABLE_COMPOSITE_CHARS */
3786
3787   str->flags = flags;
3788   str->ch = ch;
3789   str->iso2022.current_char_boundary = char_boundary;
3790   str->iso2022.current_charset = charset;
3791
3792   /* Verbum caro factum est! */
3793 }
3794
3795 \f
3796 /************************************************************************/
3797 /*                           UTF-8 methods                              */
3798 /************************************************************************/
3799
3800 static int
3801 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3802 {
3803   while (n--)
3804     {
3805       unsigned char c = *(unsigned char *)src++;
3806       switch (st->utf8.in_byte)
3807         {
3808         case 0:
3809           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3810             return 0;
3811           else if (c >= 0xfc)
3812             st->utf8.in_byte = 5;
3813           else if (c >= 0xf8)
3814             st->utf8.in_byte = 4;
3815           else if (c >= 0xf0)
3816             st->utf8.in_byte = 3;
3817           else if (c >= 0xe0)
3818             st->utf8.in_byte = 2;
3819           else if (c >= 0xc0)
3820             st->utf8.in_byte = 1;
3821           else if (c >= 0x80)
3822             return 0;
3823           break;
3824         default:
3825           if ((c & 0xc0) != 0x80)
3826             return 0;
3827           else
3828             st->utf8.in_byte--;
3829         }
3830     }
3831   return CODING_CATEGORY_UTF8_MASK;
3832 }
3833
3834 static void
3835 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3836                     unsigned_char_dynarr *dst, Lstream_data_count n)
3837 {
3838   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3839   unsigned int flags  = str->flags;
3840   unsigned int ch     = str->ch;
3841   eol_type_t eol_type = str->eol_type;
3842   unsigned char counter = str->counter;
3843
3844   while (n--)
3845     {
3846       unsigned char c = *(unsigned char *)src++;
3847       switch (counter)
3848         {
3849         case 0:
3850           if ( c >= 0xfc )
3851             {
3852               ch = c & 0x01;
3853               counter = 5;
3854             }
3855           else if ( c >= 0xf8 )
3856             {
3857               ch = c & 0x03;
3858               counter = 4;
3859             }
3860           else if ( c >= 0xf0 )
3861             {
3862               ch = c & 0x07;
3863               counter = 3;
3864             }
3865           else if ( c >= 0xe0 )
3866             {
3867               ch = c & 0x0f;
3868               counter = 2;
3869             }
3870           else if ( c >= 0xc0 )
3871             {
3872               ch = c & 0x1f;
3873               counter = 1;
3874             }
3875           else
3876             {
3877               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3878               decode_ucs4 (c, dst);
3879             }
3880           break;
3881         case 1:
3882           ch = ( ch << 6 ) | ( c & 0x3f );
3883           decode_ucs4 (ch, dst);
3884           ch = 0;
3885           counter = 0;
3886           break;
3887         default:
3888           ch = ( ch << 6 ) | ( c & 0x3f );
3889           counter--;
3890         }
3891     label_continue_loop:;
3892     }
3893
3894   if (flags & CODING_STATE_END)
3895     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3896
3897   str->flags = flags;
3898   str->ch    = ch;
3899   str->counter = counter;
3900 }
3901
3902 static void
3903 encode_utf8 (Lisp_Object charset,
3904              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3905 {
3906   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3907   if ( code <= 0x7f )
3908     {
3909       Dynarr_add (dst, code);
3910     }
3911   else if ( code <= 0x7ff )
3912     {
3913       Dynarr_add (dst, (code >> 6) | 0xc0);
3914       Dynarr_add (dst, (code & 0x3f) | 0x80);
3915     }
3916   else if ( code <= 0xffff )
3917     {
3918       Dynarr_add (dst,  (code >> 12) | 0xe0);
3919       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3920       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3921     }
3922   else if ( code <= 0x1fffff )
3923     {
3924       Dynarr_add (dst,  (code >> 18) | 0xf0);
3925       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3926       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3927       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3928     }
3929   else if ( code <= 0x3ffffff )
3930     {
3931       Dynarr_add (dst,  (code >> 24) | 0xf8);
3932       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3933       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3934       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3935       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3936     }
3937   else
3938     {
3939       Dynarr_add (dst,  (code >> 30) | 0xfc);
3940       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3941       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3942       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3943       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3944       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3945     }
3946 }
3947
3948 static void
3949 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
3950                     unsigned_char_dynarr *dst, Lstream_data_count n)
3951 {
3952   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3953   unsigned int flags  = str->flags;
3954   unsigned int ch     = str->ch;
3955   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3956   unsigned char char_boundary = str->iso2022.current_char_boundary;
3957   Lisp_Object charset = str->iso2022.current_charset;
3958
3959 #ifdef ENABLE_COMPOSITE_CHARS
3960   /* flags for handling composite chars.  We do a little switcharoo
3961      on the source while we're outputting the composite char. */
3962   unsigned int saved_n = 0;
3963   const unsigned char *saved_src = NULL;
3964   int in_composite = 0;
3965
3966  back_to_square_n:
3967 #endif /* ENABLE_COMPOSITE_CHARS */
3968
3969   while (n--)
3970     {
3971       unsigned char c = *src++;
3972
3973       if (BYTE_ASCII_P (c))
3974         {               /* Processing ASCII character */
3975           ch = 0;
3976           if (c == '\n')
3977             {
3978               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3979                 Dynarr_add (dst, '\r');
3980               if (eol_type != EOL_CR)
3981                 Dynarr_add (dst, c);
3982             }
3983           else
3984             encode_utf8 (Vcharset_ascii, c, 0, dst);
3985           char_boundary = 1;
3986         }
3987       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3988         { /* Processing Leading Byte */
3989           ch = 0;
3990           charset = CHARSET_BY_LEADING_BYTE (c);
3991           if (LEADING_BYTE_PREFIX_P(c))
3992             ch = c;
3993           char_boundary = 0;
3994         }
3995       else
3996         {                       /* Processing Non-ASCII character */
3997           char_boundary = 1;
3998           if (EQ (charset, Vcharset_control_1))
3999             {
4000               encode_utf8 (Vcharset_control_1, c, 0, dst);
4001             }
4002           else
4003             {
4004               switch (XCHARSET_REP_BYTES (charset))
4005                 {
4006                 case 2:
4007                   encode_utf8 (charset, c, 0, dst);
4008                   break;
4009                 case 3:
4010                   if (XCHARSET_PRIVATE_P (charset))
4011                     {
4012                       encode_utf8 (charset, c, 0, dst);
4013                       ch = 0;
4014                     }
4015                   else if (ch)
4016                     {
4017 #ifdef ENABLE_COMPOSITE_CHARS
4018                       if (EQ (charset, Vcharset_composite))
4019                         {
4020                           if (in_composite)
4021                             {
4022                               /* #### Bother! We don't know how to
4023                                  handle this yet. */
4024                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
4025                             }
4026                           else
4027                             {
4028                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4029                                                        ch & 0x7F, c & 0x7F);
4030                               Lisp_Object lstr = composite_char_string (emch);
4031                               saved_n = n;
4032                               saved_src = src;
4033                               in_composite = 1;
4034                               src = XSTRING_DATA   (lstr);
4035                               n   = XSTRING_LENGTH (lstr);
4036                             }
4037                         }
4038                       else
4039 #endif /* ENABLE_COMPOSITE_CHARS */
4040                         {
4041                           encode_utf8 (charset, ch, c, dst);
4042                         }
4043                       ch = 0;
4044                     }
4045                   else
4046                     {
4047                       ch = c;
4048                       char_boundary = 0;
4049                     }
4050                   break;
4051                 case 4:
4052                   if (ch)
4053                     {
4054                       encode_utf8 (charset, ch, c, dst);
4055                       ch = 0;
4056                     }
4057                   else
4058                     {
4059                       ch = c;
4060                       char_boundary = 0;
4061                     }
4062                   break;
4063                 default:
4064                   abort ();
4065                 }
4066             }
4067         }
4068     }
4069
4070 #ifdef ENABLE_COMPOSITE_CHARS
4071   if (in_composite)
4072     {
4073       n = saved_n;
4074       src = saved_src;
4075       in_composite = 0;
4076       goto back_to_square_n; /* Wheeeeeeeee ..... */
4077     }
4078 #endif
4079
4080   str->flags = flags;
4081   str->ch    = ch;
4082   str->iso2022.current_char_boundary = char_boundary;
4083   str->iso2022.current_charset = charset;
4084
4085   /* Verbum caro factum est! */
4086 }
4087
4088 \f
4089 /************************************************************************/
4090 /*                           ISO2022 methods                            */
4091 /************************************************************************/
4092
4093 /* The following note describes the coding system ISO2022 briefly.
4094    Since the intention of this note is to help understand the
4095    functions in this file, some parts are NOT ACCURATE or OVERLY
4096    SIMPLIFIED.  For thorough understanding, please refer to the
4097    original document of ISO2022.
4098
4099    ISO2022 provides many mechanisms to encode several character sets
4100    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4101    is encoded using bytes less than 128.  This may make the encoded
4102    text a little bit longer, but the text passes more easily through
4103    several gateways, some of which strip off MSB (Most Signigant Bit).
4104
4105    There are two kinds of character sets: control character set and
4106    graphic character set.  The former contains control characters such
4107    as `newline' and `escape' to provide control functions (control
4108    functions are also provided by escape sequences).  The latter
4109    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4110    two control character sets and many graphic character sets.
4111
4112    Graphic character sets are classified into one of the following
4113    four classes, according to the number of bytes (DIMENSION) and
4114    number of characters in one dimension (CHARS) of the set:
4115    - DIMENSION1_CHARS94
4116    - DIMENSION1_CHARS96
4117    - DIMENSION2_CHARS94
4118    - DIMENSION2_CHARS96
4119
4120    In addition, each character set is assigned an identification tag,
4121    unique for each set, called "final character" (denoted as <F>
4122    hereafter).  The <F> of each character set is decided by ECMA(*)
4123    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4124    (0x30..0x3F are for private use only).
4125
4126    Note (*): ECMA = European Computer Manufacturers Association
4127
4128    Here are examples of graphic character set [NAME(<F>)]:
4129         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4130         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4131         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4132         o DIMENSION2_CHARS96 -- none for the moment
4133
4134    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4135         C0 [0x00..0x1F] -- control character plane 0
4136         GL [0x20..0x7F] -- graphic character plane 0
4137         C1 [0x80..0x9F] -- control character plane 1
4138         GR [0xA0..0xFF] -- graphic character plane 1
4139
4140    A control character set is directly designated and invoked to C0 or
4141    C1 by an escape sequence.  The most common case is that:
4142    - ISO646's  control character set is designated/invoked to C0, and
4143    - ISO6429's control character set is designated/invoked to C1,
4144    and usually these designations/invocations are omitted in encoded
4145    text.  In a 7-bit environment, only C0 can be used, and a control
4146    character for C1 is encoded by an appropriate escape sequence to
4147    fit into the environment.  All control characters for C1 are
4148    defined to have corresponding escape sequences.
4149
4150    A graphic character set is at first designated to one of four
4151    graphic registers (G0 through G3), then these graphic registers are
4152    invoked to GL or GR.  These designations and invocations can be
4153    done independently.  The most common case is that G0 is invoked to
4154    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4155    these invocations and designations are omitted in encoded text.
4156    In a 7-bit environment, only GL can be used.
4157
4158    When a graphic character set of CHARS94 is invoked to GL, codes
4159    0x20 and 0x7F of the GL area work as control characters SPACE and
4160    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4161    be used.
4162
4163    There are two ways of invocation: locking-shift and single-shift.
4164    With locking-shift, the invocation lasts until the next different
4165    invocation, whereas with single-shift, the invocation affects the
4166    following character only and doesn't affect the locking-shift
4167    state.  Invocations are done by the following control characters or
4168    escape sequences:
4169
4170    ----------------------------------------------------------------------
4171    abbrev  function                  cntrl escape seq   description
4172    ----------------------------------------------------------------------
4173    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4174    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4175    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4176    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4177    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4178    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4179    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4180    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4181    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4182    ----------------------------------------------------------------------
4183    (*) These are not used by any known coding system.
4184
4185    Control characters for these functions are defined by macros
4186    ISO_CODE_XXX in `coding.h'.
4187
4188    Designations are done by the following escape sequences:
4189    ----------------------------------------------------------------------
4190    escape sequence      description
4191    ----------------------------------------------------------------------
4192    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4193    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4194    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4195    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4196    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4197    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4198    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4199    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4200    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4201    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4202    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4203    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4204    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4205    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4206    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4207    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4208    ----------------------------------------------------------------------
4209
4210    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4211    of dimension 1, chars 94, and final character <F>, etc...
4212
4213    Note (*): Although these designations are not allowed in ISO2022,
4214    Emacs accepts them on decoding, and produces them on encoding
4215    CHARS96 character sets in a coding system which is characterized as
4216    7-bit environment, non-locking-shift, and non-single-shift.
4217
4218    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4219    '(' can be omitted.  We refer to this as "short-form" hereafter.
4220
4221    Now you may notice that there are a lot of ways for encoding the
4222    same multilingual text in ISO2022.  Actually, there exist many
4223    coding systems such as Compound Text (used in X11's inter client
4224    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4225    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4226    localized platforms), and all of these are variants of ISO2022.
4227
4228    In addition to the above, Emacs handles two more kinds of escape
4229    sequences: ISO6429's direction specification and Emacs' private
4230    sequence for specifying character composition.
4231
4232    ISO6429's direction specification takes the following form:
4233         o CSI ']'      -- end of the current direction
4234         o CSI '0' ']'  -- end of the current direction
4235         o CSI '1' ']'  -- start of left-to-right text
4236         o CSI '2' ']'  -- start of right-to-left text
4237    The control character CSI (0x9B: control sequence introducer) is
4238    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4239
4240    Character composition specification takes the following form:
4241         o ESC '0' -- start character composition
4242         o ESC '1' -- end character composition
4243    Since these are not standard escape sequences of any ISO standard,
4244    their use with these meanings is restricted to Emacs only.  */
4245
4246 static void
4247 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4248 {
4249   int i;
4250
4251   for (i = 0; i < 4; i++)
4252     {
4253       if (!NILP (coding_system))
4254         iso->charset[i] =
4255           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4256       else
4257         iso->charset[i] = Qt;
4258       iso->invalid_designated[i] = 0;
4259     }
4260   iso->esc = ISO_ESC_NOTHING;
4261   iso->esc_bytes_index = 0;
4262   iso->register_left = 0;
4263   iso->register_right = 1;
4264   iso->switched_dir_and_no_valid_charset_yet = 0;
4265   iso->invalid_switch_dir = 0;
4266   iso->output_direction_sequence = 0;
4267   iso->output_literally = 0;
4268 #ifdef ENABLE_COMPOSITE_CHARS
4269   if (iso->composite_chars)
4270     Dynarr_reset (iso->composite_chars);
4271 #endif
4272 }
4273
4274 static int
4275 fit_to_be_escape_quoted (unsigned char c)
4276 {
4277   switch (c)
4278     {
4279     case ISO_CODE_ESC:
4280     case ISO_CODE_CSI:
4281     case ISO_CODE_SS2:
4282     case ISO_CODE_SS3:
4283     case ISO_CODE_SO:
4284     case ISO_CODE_SI:
4285       return 1;
4286
4287     default:
4288       return 0;
4289     }
4290 }
4291
4292 /* Parse one byte of an ISO2022 escape sequence.
4293    If the result is an invalid escape sequence, return 0 and
4294    do not change anything in STR.  Otherwise, if the result is
4295    an incomplete escape sequence, update ISO2022.ESC and
4296    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4297    all the state variables (but not ISO2022.ESC_BYTES) and
4298    return 1.
4299
4300    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4301    or invocation of an invalid character set and treat that as
4302    an unrecognized escape sequence.
4303
4304    ********************************************************************
4305
4306    #### Strategies for error annotation and coding orthogonalization
4307
4308    We really want to separate out a number of things.  Conceptually,
4309    there is a nested syntax.
4310
4311    At the top level is the ISO 2022 extension syntax, including charset
4312    designation and invocation, and certain auxiliary controls such as the
4313    ISO 6429 direction specification.  These are octet-oriented, with the
4314    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4315    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4316    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4317    (deprecated) special case in Unicode processing.
4318
4319    The middle layer is ISO 2022 character interpretation.  This will depend
4320    on the current state of the ISO 2022 registers, and assembles octets
4321    into the character's internal representation.
4322
4323    The lowest level is translating system control conventions.  At present
4324    this is restricted to newline translation, but one could imagine doing
4325    tab conversion or line wrapping here.  "Escape from Unicode" processing
4326    would be done at this level.
4327
4328    At each level the parser will verify the syntax.  In the case of a
4329    syntax error or warning (such as a redundant escape sequence that affects
4330    no characters), the parser will take some action, typically inserting the
4331    erroneous octets directly into the output and creating an annotation
4332    which can be used by higher level I/O to mark the affected region.
4333
4334    This should make it possible to do something sensible about separating
4335    newline convention processing from character construction, and about
4336    preventing ISO 2022 escape sequences from being recognized
4337    inappropriately.
4338
4339    The basic strategy will be to have octet classification tables, and
4340    switch processing according to the table entry.
4341
4342    It's possible that, by doing the processing with tables of functions or
4343    the like, the parser can be used for both detection and translation. */
4344
4345 static int
4346 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4347                    unsigned char c, unsigned int *flags,
4348                    int check_invalid_charsets)
4349 {
4350   /* (1) If we're at the end of a designation sequence, CS is the
4351      charset being designated and REG is the register to designate
4352      it to.
4353
4354      (2) If we're at the end of a locking-shift sequence, REG is
4355      the register to invoke and HALF (0 == left, 1 == right) is
4356      the half to invoke it into.
4357
4358      (3) If we're at the end of a single-shift sequence, REG is
4359      the register to invoke. */
4360   Lisp_Object cs = Qnil;
4361   int reg, half;
4362
4363   /* NOTE: This code does goto's all over the fucking place.
4364      The reason for this is that we're basically implementing
4365      a state machine here, and hierarchical languages like C
4366      don't really provide a clean way of doing this. */
4367
4368   if (! (*flags & CODING_STATE_ESCAPE))
4369     /* At beginning of escape sequence; we need to reset our
4370        escape-state variables. */
4371     iso->esc = ISO_ESC_NOTHING;
4372
4373   iso->output_literally = 0;
4374   iso->output_direction_sequence = 0;
4375
4376   switch (iso->esc)
4377     {
4378     case ISO_ESC_NOTHING:
4379       iso->esc_bytes_index = 0;
4380       switch (c)
4381         {
4382         case ISO_CODE_ESC:      /* Start escape sequence */
4383           *flags |= CODING_STATE_ESCAPE;
4384           iso->esc = ISO_ESC;
4385           goto not_done;
4386
4387         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4388           *flags |= CODING_STATE_ESCAPE;
4389           iso->esc = ISO_ESC_5_11;
4390           goto not_done;
4391
4392         case ISO_CODE_SO:       /* locking shift 1 */
4393           reg = 1; half = 0;
4394           goto locking_shift;
4395         case ISO_CODE_SI:       /* locking shift 0 */
4396           reg = 0; half = 0;
4397           goto locking_shift;
4398
4399         case ISO_CODE_SS2:      /* single shift */
4400           reg = 2;
4401           goto single_shift;
4402         case ISO_CODE_SS3:      /* single shift */
4403           reg = 3;
4404           goto single_shift;
4405
4406         default:                        /* Other control characters */
4407           return 0;
4408         }
4409
4410     case ISO_ESC:
4411       switch (c)
4412         {
4413           /**** single shift ****/
4414
4415         case 'N':       /* single shift 2 */
4416           reg = 2;
4417           goto single_shift;
4418         case 'O':       /* single shift 3 */
4419           reg = 3;
4420           goto single_shift;
4421
4422           /**** locking shift ****/
4423
4424         case '~':       /* locking shift 1 right */
4425           reg = 1; half = 1;
4426           goto locking_shift;
4427         case 'n':       /* locking shift 2 */
4428           reg = 2; half = 0;
4429           goto locking_shift;
4430         case '}':       /* locking shift 2 right */
4431           reg = 2; half = 1;
4432           goto locking_shift;
4433         case 'o':       /* locking shift 3 */
4434           reg = 3; half = 0;
4435           goto locking_shift;
4436         case '|':       /* locking shift 3 right */
4437           reg = 3; half = 1;
4438           goto locking_shift;
4439
4440 #ifdef ENABLE_COMPOSITE_CHARS
4441           /**** composite ****/
4442
4443         case '0':
4444           iso->esc = ISO_ESC_START_COMPOSITE;
4445           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4446             CODING_STATE_COMPOSITE;
4447           return 1;
4448
4449         case '1':
4450           iso->esc = ISO_ESC_END_COMPOSITE;
4451           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4452             ~CODING_STATE_COMPOSITE;
4453           return 1;
4454 #endif /* ENABLE_COMPOSITE_CHARS */
4455
4456           /**** directionality ****/
4457
4458         case '[':
4459           iso->esc = ISO_ESC_5_11;
4460           goto not_done;
4461
4462           /**** designation ****/
4463
4464         case '$':       /* multibyte charset prefix */
4465           iso->esc = ISO_ESC_2_4;
4466           goto not_done;
4467
4468         default:
4469           if (0x28 <= c && c <= 0x2F)
4470             {
4471               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4472               goto not_done;
4473             }
4474
4475           /* This function is called with CODESYS equal to nil when
4476              doing coding-system detection. */
4477           if (!NILP (codesys)
4478               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4479               && fit_to_be_escape_quoted (c))
4480             {
4481               iso->esc = ISO_ESC_LITERAL;
4482               *flags &= CODING_STATE_ISO2022_LOCK;
4483               return 1;
4484             }
4485
4486           /* bzzzt! */
4487           return 0;
4488         }
4489
4490
4491
4492       /**** directionality ****/
4493
4494     case ISO_ESC_5_11:          /* ISO6429 direction control */
4495       if (c == ']')
4496         {
4497           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4498           goto directionality;
4499         }
4500       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4501       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4502       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4503       else               return 0;
4504       goto not_done;
4505
4506     case ISO_ESC_5_11_0:
4507       if (c == ']')
4508         {
4509           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4510           goto directionality;
4511         }
4512       return 0;
4513
4514     case ISO_ESC_5_11_1:
4515       if (c == ']')
4516         {
4517           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4518           goto directionality;
4519         }
4520       return 0;
4521
4522     case ISO_ESC_5_11_2:
4523       if (c == ']')
4524         {
4525           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4526           goto directionality;
4527         }
4528       return 0;
4529
4530     directionality:
4531       iso->esc = ISO_ESC_DIRECTIONALITY;
4532       /* Various junk here to attempt to preserve the direction sequences
4533          literally in the text if they would otherwise be swallowed due
4534          to invalid designations that don't show up as actual charset
4535          changes in the text. */
4536       if (iso->invalid_switch_dir)
4537         {
4538           /* We already inserted a direction switch literally into the
4539              text.  We assume (#### this may not be right) that the
4540              next direction switch is the one going the other way,
4541              and we need to output that literally as well. */
4542           iso->output_literally = 1;
4543           iso->invalid_switch_dir = 0;
4544         }
4545       else
4546         {
4547           int jj;
4548
4549           /* If we are in the thrall of an invalid designation,
4550            then stick the directionality sequence literally into the
4551            output stream so it ends up in the original text again. */
4552           for (jj = 0; jj < 4; jj++)
4553             if (iso->invalid_designated[jj])
4554               break;
4555           if (jj < 4)
4556             {
4557               iso->output_literally = 1;
4558               iso->invalid_switch_dir = 1;
4559             }
4560           else
4561             /* Indicate that we haven't yet seen a valid designation,
4562                so that if a switch-dir is directly followed by an
4563                invalid designation, both get inserted literally. */
4564             iso->switched_dir_and_no_valid_charset_yet = 1;
4565         }
4566       return 1;
4567
4568
4569       /**** designation ****/
4570
4571     case ISO_ESC_2_4:
4572       if (0x28 <= c && c <= 0x2F)
4573         {
4574           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4575           goto not_done;
4576         }
4577       if (0x40 <= c && c <= 0x42)
4578         {
4579           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4580                                       *flags & CODING_STATE_R2L ?
4581                                       CHARSET_RIGHT_TO_LEFT :
4582                                       CHARSET_LEFT_TO_RIGHT);
4583           reg = 0;
4584           goto designated;
4585         }
4586       return 0;
4587
4588     default:
4589       {
4590         int type =-1;
4591
4592         if (c < '0' || c > '~')
4593           return 0; /* bad final byte */
4594
4595         if (iso->esc >= ISO_ESC_2_8 &&
4596             iso->esc <= ISO_ESC_2_15)
4597           {
4598             type = ((iso->esc >= ISO_ESC_2_12) ?
4599                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4600             reg = (iso->esc - ISO_ESC_2_8) & 3;
4601           }
4602         else if (iso->esc >= ISO_ESC_2_4_8 &&
4603                  iso->esc <= ISO_ESC_2_4_15)
4604           {
4605             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4606                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4607             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4608           }
4609         else
4610           {
4611             /* Can this ever be reached? -slb */
4612             abort();
4613             return 0;
4614           }
4615
4616         cs = CHARSET_BY_ATTRIBUTES (type, c,
4617                                     *flags & CODING_STATE_R2L ?
4618                                     CHARSET_RIGHT_TO_LEFT :
4619                                     CHARSET_LEFT_TO_RIGHT);
4620         goto designated;
4621       }
4622     }
4623
4624  not_done:
4625   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4626   return -1;
4627
4628  single_shift:
4629   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4630     /* can't invoke something that ain't there. */
4631     return 0;
4632   iso->esc = ISO_ESC_SINGLE_SHIFT;
4633   *flags &= CODING_STATE_ISO2022_LOCK;
4634   if (reg == 2)
4635     *flags |= CODING_STATE_SS2;
4636   else
4637     *flags |= CODING_STATE_SS3;
4638   return 1;
4639
4640  locking_shift:
4641   if (check_invalid_charsets &&
4642       !CHARSETP (iso->charset[reg]))
4643     /* can't invoke something that ain't there. */
4644     return 0;
4645   if (half)
4646     iso->register_right = reg;
4647   else
4648     iso->register_left = reg;
4649   *flags &= CODING_STATE_ISO2022_LOCK;
4650   iso->esc = ISO_ESC_LOCKING_SHIFT;
4651   return 1;
4652
4653  designated:
4654   if (NILP (cs) && check_invalid_charsets)
4655     {
4656       iso->invalid_designated[reg] = 1;
4657       iso->charset[reg] = Vcharset_ascii;
4658       iso->esc = ISO_ESC_DESIGNATE;
4659       *flags &= CODING_STATE_ISO2022_LOCK;
4660       iso->output_literally = 1;
4661       if (iso->switched_dir_and_no_valid_charset_yet)
4662         {
4663           /* We encountered a switch-direction followed by an
4664              invalid designation.  Ensure that the switch-direction
4665              gets outputted; otherwise it will probably get eaten
4666              when the text is written out again. */
4667           iso->switched_dir_and_no_valid_charset_yet = 0;
4668           iso->output_direction_sequence = 1;
4669           /* And make sure that the switch-dir going the other
4670              way gets outputted, as well. */
4671           iso->invalid_switch_dir = 1;
4672         }
4673       return 1;
4674     }
4675   /* This function is called with CODESYS equal to nil when
4676      doing coding-system detection. */
4677   if (!NILP (codesys))
4678     {
4679       charset_conversion_spec_dynarr *dyn =
4680         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4681
4682       if (dyn)
4683         {
4684           int i;
4685
4686           for (i = 0; i < Dynarr_length (dyn); i++)
4687             {
4688               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4689               if (EQ (cs, spec->from_charset))
4690                 cs = spec->to_charset;
4691             }
4692         }
4693     }
4694
4695   iso->charset[reg] = cs;
4696   iso->esc = ISO_ESC_DESIGNATE;
4697   *flags &= CODING_STATE_ISO2022_LOCK;
4698   if (iso->invalid_designated[reg])
4699     {
4700       iso->invalid_designated[reg] = 0;
4701       iso->output_literally = 1;
4702     }
4703   if (iso->switched_dir_and_no_valid_charset_yet)
4704     iso->switched_dir_and_no_valid_charset_yet = 0;
4705   return 1;
4706 }
4707
4708 static int
4709 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4710 {
4711   int mask;
4712
4713   /* #### There are serious deficiencies in the recognition mechanism
4714      here.  This needs to be much smarter if it's going to cut it.
4715      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4716      it should be detected as Latin-1.
4717      All the ISO2022 stuff in this file should be synced up with the
4718      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4719      Perhaps we should wait till R2L works in FSF Emacs? */
4720
4721   if (!st->iso2022.initted)
4722     {
4723       reset_iso2022 (Qnil, &st->iso2022.iso);
4724       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4725                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4726                           CODING_CATEGORY_ISO_8_1_MASK |
4727                           CODING_CATEGORY_ISO_8_2_MASK |
4728                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4729       st->iso2022.flags = 0;
4730       st->iso2022.high_byte_count = 0;
4731       st->iso2022.saw_single_shift = 0;
4732       st->iso2022.initted = 1;
4733     }
4734
4735   mask = st->iso2022.mask;
4736
4737   while (n--)
4738     {
4739       unsigned char c = *(unsigned char *)src++;
4740       if (c >= 0xA0)
4741         {
4742           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4743           st->iso2022.high_byte_count++;
4744         }
4745       else
4746         {
4747           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4748             {
4749               if (st->iso2022.high_byte_count & 1)
4750                 /* odd number of high bytes; assume not iso-8-2 */
4751                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4752             }
4753           st->iso2022.high_byte_count = 0;
4754           st->iso2022.saw_single_shift = 0;
4755           if (c > 0x80)
4756             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4757         }
4758       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4759           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4760         { /* control chars */
4761           switch (c)
4762             {
4763               /* Allow and ignore control characters that you might
4764                  reasonably see in a text file */
4765             case '\r':
4766             case '\n':
4767             case '\t':
4768             case  7: /* bell */
4769             case  8: /* backspace */
4770             case 11: /* vertical tab */
4771             case 12: /* form feed */
4772             case 26: /* MS-DOS C-z junk */
4773             case 31: /* '^_' -- for info */
4774               goto label_continue_loop;
4775
4776             default:
4777               break;
4778             }
4779         }
4780
4781       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4782           || BYTE_C1_P (c))
4783         {
4784           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4785                                  &st->iso2022.flags, 0))
4786             {
4787               switch (st->iso2022.iso.esc)
4788                 {
4789                 case ISO_ESC_DESIGNATE:
4790                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4791                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4792                   break;
4793                 case ISO_ESC_LOCKING_SHIFT:
4794                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4795                   goto ran_out_of_chars;
4796                 case ISO_ESC_SINGLE_SHIFT:
4797                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4798                   st->iso2022.saw_single_shift = 1;
4799                   break;
4800                 default:
4801                   break;
4802                 }
4803             }
4804           else
4805             {
4806               mask = 0;
4807               goto ran_out_of_chars;
4808             }
4809         }
4810     label_continue_loop:;
4811     }
4812
4813  ran_out_of_chars:
4814
4815   return mask;
4816 }
4817
4818 static int
4819 postprocess_iso2022_mask (int mask)
4820 {
4821   /* #### kind of cheesy */
4822   /* If seven-bit ISO is allowed, then assume that the encoding is
4823      entirely seven-bit and turn off the eight-bit ones. */
4824   if (mask & CODING_CATEGORY_ISO_7_MASK)
4825     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4826                CODING_CATEGORY_ISO_8_1_MASK |
4827                CODING_CATEGORY_ISO_8_2_MASK);
4828   return mask;
4829 }
4830
4831 /* If FLAGS is a null pointer or specifies right-to-left motion,
4832    output a switch-dir-to-left-to-right sequence to DST.
4833    Also update FLAGS if it is not a null pointer.
4834    If INTERNAL_P is set, we are outputting in internal format and
4835    need to handle the CSI differently. */
4836
4837 static void
4838 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4839                                  unsigned_char_dynarr *dst,
4840                                  unsigned int *flags,
4841                                  int internal_p)
4842 {
4843   if (!flags || (*flags & CODING_STATE_R2L))
4844     {
4845       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4846         {
4847           Dynarr_add (dst, ISO_CODE_ESC);
4848           Dynarr_add (dst, '[');
4849         }
4850       else if (internal_p)
4851         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4852       else
4853         Dynarr_add (dst, ISO_CODE_CSI);
4854       Dynarr_add (dst, '0');
4855       Dynarr_add (dst, ']');
4856       if (flags)
4857         *flags &= ~CODING_STATE_R2L;
4858     }
4859 }
4860
4861 /* If FLAGS is a null pointer or specifies a direction different from
4862    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4863    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4864    sequence to DST.  Also update FLAGS if it is not a null pointer.
4865    If INTERNAL_P is set, we are outputting in internal format and
4866    need to handle the CSI differently. */
4867
4868 static void
4869 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4870                           unsigned_char_dynarr *dst, unsigned int *flags,
4871                           int internal_p)
4872 {
4873   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4874       direction == CHARSET_LEFT_TO_RIGHT)
4875     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4876   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4877            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4878            direction == CHARSET_RIGHT_TO_LEFT)
4879     {
4880       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4881         {
4882           Dynarr_add (dst, ISO_CODE_ESC);
4883           Dynarr_add (dst, '[');
4884         }
4885       else if (internal_p)
4886         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4887       else
4888         Dynarr_add (dst, ISO_CODE_CSI);
4889       Dynarr_add (dst, '2');
4890       Dynarr_add (dst, ']');
4891       if (flags)
4892         *flags |= CODING_STATE_R2L;
4893     }
4894 }
4895
4896 /* Convert ISO2022-format data to internal format. */
4897
4898 static void
4899 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4900                        unsigned_char_dynarr *dst, Lstream_data_count n)
4901 {
4902   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4903   unsigned int flags  = str->flags;
4904   unsigned int ch     = str->ch;
4905   eol_type_t eol_type = str->eol_type;
4906 #ifdef ENABLE_COMPOSITE_CHARS
4907   unsigned_char_dynarr *real_dst = dst;
4908 #endif
4909   Lisp_Object coding_system;
4910
4911   XSETCODING_SYSTEM (coding_system, str->codesys);
4912
4913 #ifdef ENABLE_COMPOSITE_CHARS
4914   if (flags & CODING_STATE_COMPOSITE)
4915     dst = str->iso2022.composite_chars;
4916 #endif /* ENABLE_COMPOSITE_CHARS */
4917
4918   while (n--)
4919     {
4920       unsigned char c = *(unsigned char *)src++;
4921       if (flags & CODING_STATE_ESCAPE)
4922         {       /* Within ESC sequence */
4923           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4924                                           c, &flags, 1);
4925
4926           if (retval)
4927             {
4928               switch (str->iso2022.esc)
4929                 {
4930 #ifdef ENABLE_COMPOSITE_CHARS
4931                 case ISO_ESC_START_COMPOSITE:
4932                   if (str->iso2022.composite_chars)
4933                     Dynarr_reset (str->iso2022.composite_chars);
4934                   else
4935                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4936                   dst = str->iso2022.composite_chars;
4937                   break;
4938                 case ISO_ESC_END_COMPOSITE:
4939                   {
4940                     Bufbyte comstr[MAX_EMCHAR_LEN];
4941                     Bytecount len;
4942                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4943                                                          Dynarr_length (dst));
4944                     dst = real_dst;
4945                     len = set_charptr_emchar (comstr, emch);
4946                     Dynarr_add_many (dst, comstr, len);
4947                     break;
4948                   }
4949 #endif /* ENABLE_COMPOSITE_CHARS */
4950
4951                 case ISO_ESC_LITERAL:
4952                   DECODE_ADD_BINARY_CHAR (c, dst);
4953                   break;
4954
4955                 default:
4956                   /* Everything else handled already */
4957                   break;
4958                 }
4959             }
4960
4961           /* Attempted error recovery. */
4962           if (str->iso2022.output_direction_sequence)
4963             ensure_correct_direction (flags & CODING_STATE_R2L ?
4964                                       CHARSET_RIGHT_TO_LEFT :
4965                                       CHARSET_LEFT_TO_RIGHT,
4966                                       str->codesys, dst, 0, 1);
4967           /* More error recovery. */
4968           if (!retval || str->iso2022.output_literally)
4969             {
4970               /* Output the (possibly invalid) sequence */
4971               int i;
4972               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4973                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4974               flags &= CODING_STATE_ISO2022_LOCK;
4975               if (!retval)
4976                 n++, src--;/* Repeat the loop with the same character. */
4977               else
4978                 {
4979                   /* No sense in reprocessing the final byte of the
4980                      escape sequence; it could mess things up anyway.
4981                      Just add it now. */
4982                   DECODE_ADD_BINARY_CHAR (c, dst);
4983                 }
4984             }
4985           ch = 0;
4986         }
4987       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4988         { /* Control characters */
4989
4990           /***** Error-handling *****/
4991
4992           /* If we were in the middle of a character, dump out the
4993              partial character. */
4994           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4995
4996           /* If we just saw a single-shift character, dump it out.
4997              This may dump out the wrong sort of single-shift character,
4998              but least it will give an indication that something went
4999              wrong. */
5000           if (flags & CODING_STATE_SS2)
5001             {
5002               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5003               flags &= ~CODING_STATE_SS2;
5004             }
5005           if (flags & CODING_STATE_SS3)
5006             {
5007               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5008               flags &= ~CODING_STATE_SS3;
5009             }
5010
5011           /***** Now handle the control characters. *****/
5012
5013           /* Handle CR/LF */
5014           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5015
5016           flags &= CODING_STATE_ISO2022_LOCK;
5017
5018           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5019             DECODE_ADD_BINARY_CHAR (c, dst);
5020         }
5021       else
5022         {                       /* Graphic characters */
5023           Lisp_Object charset;
5024           int lb;
5025           int reg;
5026
5027           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5028
5029           /* Now determine the charset. */
5030           reg = ((flags & CODING_STATE_SS2) ? 2
5031                  : (flags & CODING_STATE_SS3) ? 3
5032                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5033                  : str->iso2022.register_left);
5034           charset = str->iso2022.charset[reg];
5035
5036           /* Error checking: */
5037           if (! CHARSETP (charset)
5038               || str->iso2022.invalid_designated[reg]
5039               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5040                   && XCHARSET_CHARS (charset) == 94))
5041             /* Mrmph.  We are trying to invoke a register that has no
5042                or an invalid charset in it, or trying to add a character
5043                outside the range of the charset.  Insert that char literally
5044                to preserve it for the output. */
5045             {
5046               DECODE_OUTPUT_PARTIAL_CHAR (ch);
5047               DECODE_ADD_BINARY_CHAR (c, dst);
5048             }
5049
5050           else
5051             {
5052               /* Things are probably hunky-dorey. */
5053
5054               /* Fetch reverse charset, maybe. */
5055               if (((flags & CODING_STATE_R2L) &&
5056                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5057                   ||
5058                   (!(flags & CODING_STATE_R2L) &&
5059                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5060                 {
5061                   Lisp_Object new_charset =
5062                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5063                   if (!NILP (new_charset))
5064                     charset = new_charset;
5065                 }
5066
5067               lb = XCHARSET_LEADING_BYTE (charset);
5068               switch (XCHARSET_REP_BYTES (charset))
5069                 {
5070                 case 1: /* ASCII */
5071                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5072                   Dynarr_add (dst, c & 0x7F);
5073                   break;
5074
5075                 case 2: /* one-byte official */
5076                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5077                   Dynarr_add (dst, lb);
5078                   Dynarr_add (dst, c | 0x80);
5079                   break;
5080
5081                 case 3: /* one-byte private or two-byte official */
5082                   if (XCHARSET_PRIVATE_P (charset))
5083                     {
5084                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5085                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5086                       Dynarr_add (dst, lb);
5087                       Dynarr_add (dst, c | 0x80);
5088                     }
5089                   else
5090                     {
5091                       if (ch)
5092                         {
5093                           Dynarr_add (dst, lb);
5094                           Dynarr_add (dst, ch | 0x80);
5095                           Dynarr_add (dst, c | 0x80);
5096                           ch = 0;
5097                         }
5098                       else
5099                         ch = c;
5100                     }
5101                   break;
5102
5103                 default:        /* two-byte private */
5104                   if (ch)
5105                     {
5106                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5107                       Dynarr_add (dst, lb);
5108                       Dynarr_add (dst, ch | 0x80);
5109                       Dynarr_add (dst, c | 0x80);
5110                       ch = 0;
5111                     }
5112                   else
5113                     ch = c;
5114                 }
5115             }
5116
5117           if (!ch)
5118             flags &= CODING_STATE_ISO2022_LOCK;
5119         }
5120
5121     label_continue_loop:;
5122     }
5123
5124   if (flags & CODING_STATE_END)
5125     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5126
5127   str->flags = flags;
5128   str->ch    = ch;
5129 }
5130
5131
5132 /***** ISO2022 encoder *****/
5133
5134 /* Designate CHARSET into register REG. */
5135
5136 static void
5137 iso2022_designate (Lisp_Object charset, unsigned char reg,
5138                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5139 {
5140   static const char inter94[] = "()*+";
5141   static const char inter96[] = ",-./";
5142   unsigned int type;
5143   unsigned char final;
5144   Lisp_Object old_charset = str->iso2022.charset[reg];
5145
5146   str->iso2022.charset[reg] = charset;
5147   if (!CHARSETP (charset))
5148     /* charset might be an initial nil or t. */
5149     return;
5150   type = XCHARSET_TYPE (charset);
5151   final = XCHARSET_FINAL (charset);
5152   if (!str->iso2022.force_charset_on_output[reg] &&
5153       CHARSETP (old_charset) &&
5154       XCHARSET_TYPE (old_charset) == type &&
5155       XCHARSET_FINAL (old_charset) == final)
5156     return;
5157
5158   str->iso2022.force_charset_on_output[reg] = 0;
5159
5160   {
5161     charset_conversion_spec_dynarr *dyn =
5162       str->codesys->iso2022.output_conv;
5163
5164     if (dyn)
5165       {
5166         int i;
5167
5168         for (i = 0; i < Dynarr_length (dyn); i++)
5169           {
5170             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5171             if (EQ (charset, spec->from_charset))
5172                 charset = spec->to_charset;
5173           }
5174       }
5175   }
5176
5177   Dynarr_add (dst, ISO_CODE_ESC);
5178   switch (type)
5179     {
5180     case CHARSET_TYPE_94:
5181       Dynarr_add (dst, inter94[reg]);
5182       break;
5183     case CHARSET_TYPE_96:
5184       Dynarr_add (dst, inter96[reg]);
5185       break;
5186     case CHARSET_TYPE_94X94:
5187       Dynarr_add (dst, '$');
5188       if (reg != 0
5189           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5190           || final < '@'
5191           || final > 'B')
5192         Dynarr_add (dst, inter94[reg]);
5193       break;
5194     case CHARSET_TYPE_96X96:
5195       Dynarr_add (dst, '$');
5196       Dynarr_add (dst, inter96[reg]);
5197       break;
5198     }
5199   Dynarr_add (dst, final);
5200 }
5201
5202 static void
5203 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5204 {
5205   if (str->iso2022.register_left != 0)
5206     {
5207       Dynarr_add (dst, ISO_CODE_SI);
5208       str->iso2022.register_left = 0;
5209     }
5210 }
5211
5212 static void
5213 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5214 {
5215   if (str->iso2022.register_left != 1)
5216     {
5217       Dynarr_add (dst, ISO_CODE_SO);
5218       str->iso2022.register_left = 1;
5219     }
5220 }
5221
5222 /* Convert internally-formatted data to ISO2022 format. */
5223
5224 static void
5225 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5226                        unsigned_char_dynarr *dst, Lstream_data_count n)
5227 {
5228   unsigned char charmask, c;
5229   unsigned char char_boundary;
5230   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5231   unsigned int flags          = str->flags;
5232   unsigned int ch             = str->ch;
5233   Lisp_Coding_System *codesys = str->codesys;
5234   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5235   int i;
5236   Lisp_Object charset;
5237   int half;
5238
5239 #ifdef ENABLE_COMPOSITE_CHARS
5240   /* flags for handling composite chars.  We do a little switcharoo
5241      on the source while we're outputting the composite char. */
5242   unsigned int saved_n = 0;
5243   const unsigned char *saved_src = NULL;
5244   int in_composite = 0;
5245 #endif /* ENABLE_COMPOSITE_CHARS */
5246
5247   char_boundary = str->iso2022.current_char_boundary;
5248   charset = str->iso2022.current_charset;
5249   half = str->iso2022.current_half;
5250
5251 #ifdef ENABLE_COMPOSITE_CHARS
5252  back_to_square_n:
5253 #endif
5254   while (n--)
5255     {
5256       c = *src++;
5257
5258       if (BYTE_ASCII_P (c))
5259         {               /* Processing ASCII character */
5260           ch = 0;
5261
5262           restore_left_to_right_direction (codesys, dst, &flags, 0);
5263
5264           /* Make sure G0 contains ASCII */
5265           if ((c > ' ' && c < ISO_CODE_DEL) ||
5266               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5267             {
5268               ensure_normal_shift (str, dst);
5269               iso2022_designate (Vcharset_ascii, 0, str, dst);
5270             }
5271
5272           /* If necessary, restore everything to the default state
5273              at end-of-line */
5274           if (c == '\n' &&
5275               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5276             {
5277               restore_left_to_right_direction (codesys, dst, &flags, 0);
5278
5279               ensure_normal_shift (str, dst);
5280
5281               for (i = 0; i < 4; i++)
5282                 {
5283                   Lisp_Object initial_charset =
5284                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5285                   iso2022_designate (initial_charset, i, str, dst);
5286                 }
5287             }
5288           if (c == '\n')
5289             {
5290               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5291                 Dynarr_add (dst, '\r');
5292               if (eol_type != EOL_CR)
5293                 Dynarr_add (dst, c);
5294             }
5295           else
5296             {
5297               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5298                   && fit_to_be_escape_quoted (c))
5299                 Dynarr_add (dst, ISO_CODE_ESC);
5300               Dynarr_add (dst, c);
5301             }
5302           char_boundary = 1;
5303         }
5304
5305       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5306         { /* Processing Leading Byte */
5307           ch = 0;
5308           charset = CHARSET_BY_LEADING_BYTE (c);
5309           if (LEADING_BYTE_PREFIX_P(c))
5310             ch = c;
5311           else if (!EQ (charset, Vcharset_control_1)
5312 #ifdef ENABLE_COMPOSITE_CHARS
5313                    && !EQ (charset, Vcharset_composite)
5314 #endif
5315                    )
5316             {
5317               int reg;
5318
5319               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5320                                         codesys, dst, &flags, 0);
5321
5322               /* Now determine which register to use. */
5323               reg = -1;
5324               for (i = 0; i < 4; i++)
5325                 {
5326                   if (EQ (charset, str->iso2022.charset[i]) ||
5327                       EQ (charset,
5328                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5329                     {
5330                       reg = i;
5331                       break;
5332                     }
5333                 }
5334
5335               if (reg == -1)
5336                 {
5337                   if (XCHARSET_GRAPHIC (charset) != 0)
5338                     {
5339                       if (!NILP (str->iso2022.charset[1]) &&
5340                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5341                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5342                         reg = 1;
5343                       else if (!NILP (str->iso2022.charset[2]))
5344                         reg = 2;
5345                       else if (!NILP (str->iso2022.charset[3]))
5346                         reg = 3;
5347                       else
5348                         reg = 0;
5349                     }
5350                   else
5351                     reg = 0;
5352                 }
5353
5354               iso2022_designate (charset, reg, str, dst);
5355
5356               /* Now invoke that register. */
5357               switch (reg)
5358                 {
5359                 case 0:
5360                   ensure_normal_shift (str, dst);
5361                   half = 0;
5362                   break;
5363
5364                 case 1:
5365                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5366                     {
5367                       ensure_shift_out (str, dst);
5368                       half = 0;
5369                     }
5370                   else
5371                     half = 1;
5372                   break;
5373
5374                 case 2:
5375                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5376                     {
5377                       Dynarr_add (dst, ISO_CODE_ESC);
5378                       Dynarr_add (dst, 'N');
5379                       half = 0;
5380                     }
5381                   else
5382                     {
5383                       Dynarr_add (dst, ISO_CODE_SS2);
5384                       half = 1;
5385                     }
5386                   break;
5387
5388                 case 3:
5389                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5390                     {
5391                       Dynarr_add (dst, ISO_CODE_ESC);
5392                       Dynarr_add (dst, 'O');
5393                       half = 0;
5394                     }
5395                   else
5396                     {
5397                       Dynarr_add (dst, ISO_CODE_SS3);
5398                       half = 1;
5399                     }
5400                   break;
5401
5402                 default:
5403                   abort ();
5404                 }
5405             }
5406           char_boundary = 0;
5407         }
5408       else
5409         {                       /* Processing Non-ASCII character */
5410           charmask = (half == 0 ? 0x7F : 0xFF);
5411           char_boundary = 1;
5412           if (EQ (charset, Vcharset_control_1))
5413             {
5414               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5415                   && fit_to_be_escape_quoted (c))
5416                 Dynarr_add (dst, ISO_CODE_ESC);
5417               /* you asked for it ... */
5418               Dynarr_add (dst, c - 0x20);
5419             }
5420           else
5421             {
5422               switch (XCHARSET_REP_BYTES (charset))
5423                 {
5424                 case 2:
5425                   Dynarr_add (dst, c & charmask);
5426                   break;
5427                 case 3:
5428                   if (XCHARSET_PRIVATE_P (charset))
5429                     {
5430                       Dynarr_add (dst, c & charmask);
5431                       ch = 0;
5432                     }
5433                   else if (ch)
5434                     {
5435 #ifdef ENABLE_COMPOSITE_CHARS
5436                       if (EQ (charset, Vcharset_composite))
5437                         {
5438                           if (in_composite)
5439                             {
5440                               /* #### Bother! We don't know how to
5441                                  handle this yet. */
5442                               Dynarr_add (dst, '~');
5443                             }
5444                           else
5445                             {
5446                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5447                                                        ch & 0x7F, c & 0x7F);
5448                               Lisp_Object lstr = composite_char_string (emch);
5449                               saved_n = n;
5450                               saved_src = src;
5451                               in_composite = 1;
5452                               src = XSTRING_DATA   (lstr);
5453                               n   = XSTRING_LENGTH (lstr);
5454                               Dynarr_add (dst, ISO_CODE_ESC);
5455                               Dynarr_add (dst, '0'); /* start composing */
5456                             }
5457                         }
5458                       else
5459 #endif /* ENABLE_COMPOSITE_CHARS */
5460                         {
5461                           Dynarr_add (dst, ch & charmask);
5462                           Dynarr_add (dst, c & charmask);
5463                         }
5464                       ch = 0;
5465                     }
5466                   else
5467                     {
5468                       ch = c;
5469                       char_boundary = 0;
5470                     }
5471                   break;
5472                 case 4:
5473                   if (ch)
5474                     {
5475                       Dynarr_add (dst, ch & charmask);
5476                       Dynarr_add (dst, c & charmask);
5477                       ch = 0;
5478                     }
5479                   else
5480                     {
5481                       ch = c;
5482                       char_boundary = 0;
5483                     }
5484                   break;
5485                 default:
5486                   abort ();
5487                 }
5488             }
5489         }
5490     }
5491
5492 #ifdef ENABLE_COMPOSITE_CHARS
5493   if (in_composite)
5494     {
5495       n = saved_n;
5496       src = saved_src;
5497       in_composite = 0;
5498       Dynarr_add (dst, ISO_CODE_ESC);
5499       Dynarr_add (dst, '1'); /* end composing */
5500       goto back_to_square_n; /* Wheeeeeeeee ..... */
5501     }
5502 #endif /* ENABLE_COMPOSITE_CHARS */
5503
5504   if (char_boundary && flags & CODING_STATE_END)
5505     {
5506       restore_left_to_right_direction (codesys, dst, &flags, 0);
5507       ensure_normal_shift (str, dst);
5508       for (i = 0; i < 4; i++)
5509         {
5510           Lisp_Object initial_charset =
5511             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5512           iso2022_designate (initial_charset, i, str, dst);
5513         }
5514     }
5515
5516   str->flags = flags;
5517   str->ch    = ch;
5518   str->iso2022.current_char_boundary = char_boundary;
5519   str->iso2022.current_charset = charset;
5520   str->iso2022.current_half = half;
5521
5522   /* Verbum caro factum est! */
5523 }
5524 #endif /* MULE */
5525 \f
5526 /************************************************************************/
5527 /*                     No-conversion methods                            */
5528 /************************************************************************/
5529
5530 /* This is used when reading in "binary" files -- i.e. files that may
5531    contain all 256 possible byte values and that are not to be
5532    interpreted as being in any particular decoding. */
5533 static void
5534 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5535                              unsigned_char_dynarr *dst, Lstream_data_count n)
5536 {
5537   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5538   unsigned int flags  = str->flags;
5539   unsigned int ch     = str->ch;
5540   eol_type_t eol_type = str->eol_type;
5541
5542   while (n--)
5543     {
5544       unsigned char c = *(unsigned char *)src++;
5545
5546       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5547       DECODE_ADD_BINARY_CHAR (c, dst);
5548     label_continue_loop:;
5549     }
5550
5551   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5552
5553   str->flags = flags;
5554   str->ch    = ch;
5555 }
5556
5557 static void
5558 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5559                              unsigned_char_dynarr *dst, Lstream_data_count n)
5560 {
5561   unsigned char c;
5562   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5563   unsigned int flags  = str->flags;
5564   unsigned int ch     = str->ch;
5565   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5566
5567   while (n--)
5568     {
5569       c = *src++;
5570       if (c == '\n')
5571         {
5572           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5573             Dynarr_add (dst, '\r');
5574           if (eol_type != EOL_CR)
5575             Dynarr_add (dst, '\n');
5576           ch = 0;
5577         }
5578       else if (BYTE_ASCII_P (c))
5579         {
5580           assert (ch == 0);
5581           Dynarr_add (dst, c);
5582         }
5583       else if (BUFBYTE_LEADING_BYTE_P (c))
5584         {
5585           assert (ch == 0);
5586           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5587               c == LEADING_BYTE_CONTROL_1)
5588             ch = c;
5589           else
5590             Dynarr_add (dst, '~'); /* untranslatable character */
5591         }
5592       else
5593         {
5594           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5595             Dynarr_add (dst, c);
5596           else if (ch == LEADING_BYTE_CONTROL_1)
5597             {
5598               assert (c < 0xC0);
5599               Dynarr_add (dst, c - 0x20);
5600             }
5601           /* else it should be the second or third byte of an
5602              untranslatable character, so ignore it */
5603           ch = 0;
5604         }
5605     }
5606
5607   str->flags = flags;
5608   str->ch    = ch;
5609 }
5610
5611 \f
5612
5613 /************************************************************************/
5614 /*                             Initialization                           */
5615 /************************************************************************/
5616
5617 void
5618 syms_of_file_coding (void)
5619 {
5620   INIT_LRECORD_IMPLEMENTATION (coding_system);
5621
5622   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
5623
5624   DEFSUBR (Fcoding_system_p);
5625   DEFSUBR (Ffind_coding_system);
5626   DEFSUBR (Fget_coding_system);
5627   DEFSUBR (Fcoding_system_list);
5628   DEFSUBR (Fcoding_system_name);
5629   DEFSUBR (Fmake_coding_system);
5630   DEFSUBR (Fcopy_coding_system);
5631   DEFSUBR (Fcoding_system_canonical_name_p);
5632   DEFSUBR (Fcoding_system_alias_p);
5633   DEFSUBR (Fcoding_system_aliasee);
5634   DEFSUBR (Fdefine_coding_system_alias);
5635   DEFSUBR (Fsubsidiary_coding_system);
5636
5637   DEFSUBR (Fcoding_system_type);
5638   DEFSUBR (Fcoding_system_doc_string);
5639 #ifdef MULE
5640   DEFSUBR (Fcoding_system_charset);
5641 #endif
5642   DEFSUBR (Fcoding_system_property);
5643
5644   DEFSUBR (Fcoding_category_list);
5645   DEFSUBR (Fset_coding_priority_list);
5646   DEFSUBR (Fcoding_priority_list);
5647   DEFSUBR (Fset_coding_category_system);
5648   DEFSUBR (Fcoding_category_system);
5649
5650   DEFSUBR (Fdetect_coding_region);
5651   DEFSUBR (Fdecode_coding_region);
5652   DEFSUBR (Fencode_coding_region);
5653 #ifdef MULE
5654   DEFSUBR (Fdecode_shift_jis_char);
5655   DEFSUBR (Fencode_shift_jis_char);
5656   DEFSUBR (Fdecode_big5_char);
5657   DEFSUBR (Fencode_big5_char);
5658   DEFSUBR (Fset_ucs_char);
5659   DEFSUBR (Fucs_char);
5660   DEFSUBR (Fset_char_ucs);
5661   DEFSUBR (Fchar_ucs);
5662 #endif /* MULE */
5663   defsymbol (&Qcoding_systemp, "coding-system-p");
5664   defsymbol (&Qno_conversion, "no-conversion");
5665   defsymbol (&Qraw_text, "raw-text");
5666 #ifdef MULE
5667   defsymbol (&Qbig5, "big5");
5668   defsymbol (&Qshift_jis, "shift-jis");
5669   defsymbol (&Qucs4, "ucs-4");
5670   defsymbol (&Qutf8, "utf-8");
5671   defsymbol (&Qccl, "ccl");
5672   defsymbol (&Qiso2022, "iso2022");
5673 #endif /* MULE */
5674   defsymbol (&Qmnemonic, "mnemonic");
5675   defsymbol (&Qeol_type, "eol-type");
5676   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5677   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5678
5679   defsymbol (&Qcr, "cr");
5680   defsymbol (&Qlf, "lf");
5681   defsymbol (&Qcrlf, "crlf");
5682   defsymbol (&Qeol_cr, "eol-cr");
5683   defsymbol (&Qeol_lf, "eol-lf");
5684   defsymbol (&Qeol_crlf, "eol-crlf");
5685 #ifdef MULE
5686   defsymbol (&Qcharset_g0, "charset-g0");
5687   defsymbol (&Qcharset_g1, "charset-g1");
5688   defsymbol (&Qcharset_g2, "charset-g2");
5689   defsymbol (&Qcharset_g3, "charset-g3");
5690   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5691   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5692   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5693   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5694   defsymbol (&Qno_iso6429, "no-iso6429");
5695   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5696   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5697
5698   defsymbol (&Qshort, "short");
5699   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5700   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5701   defsymbol (&Qseven, "seven");
5702   defsymbol (&Qlock_shift, "lock-shift");
5703   defsymbol (&Qescape_quoted, "escape-quoted");
5704 #endif /* MULE */
5705   defsymbol (&Qencode, "encode");
5706   defsymbol (&Qdecode, "decode");
5707
5708 #ifdef MULE
5709   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5710              "shift-jis");
5711   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5712              "big5");
5713   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5714              "ucs-4");
5715   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5716              "utf-8");
5717   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5718              "iso-7");
5719   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5720              "iso-8-designate");
5721   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5722              "iso-8-1");
5723   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5724              "iso-8-2");
5725   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5726              "iso-lock-shift");
5727 #endif /* MULE */
5728   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5729              "no-conversion");
5730 }
5731
5732 void
5733 lstream_type_create_file_coding (void)
5734 {
5735   LSTREAM_HAS_METHOD (decoding, reader);
5736   LSTREAM_HAS_METHOD (decoding, writer);
5737   LSTREAM_HAS_METHOD (decoding, rewinder);
5738   LSTREAM_HAS_METHOD (decoding, seekable_p);
5739   LSTREAM_HAS_METHOD (decoding, flusher);
5740   LSTREAM_HAS_METHOD (decoding, closer);
5741   LSTREAM_HAS_METHOD (decoding, marker);
5742
5743   LSTREAM_HAS_METHOD (encoding, reader);
5744   LSTREAM_HAS_METHOD (encoding, writer);
5745   LSTREAM_HAS_METHOD (encoding, rewinder);
5746   LSTREAM_HAS_METHOD (encoding, seekable_p);
5747   LSTREAM_HAS_METHOD (encoding, flusher);
5748   LSTREAM_HAS_METHOD (encoding, closer);
5749   LSTREAM_HAS_METHOD (encoding, marker);
5750 }
5751
5752 void
5753 vars_of_file_coding (void)
5754 {
5755   int i;
5756
5757   fcd = xnew (struct file_coding_dump);
5758   dump_add_root_struct_ptr (&fcd, &fcd_description);
5759
5760   /* Initialize to something reasonable ... */
5761   for (i = 0; i < CODING_CATEGORY_LAST; i++)
5762     {
5763       fcd->coding_category_system[i] = Qnil;
5764       fcd->coding_category_by_priority[i] = i;
5765     }
5766
5767   Fprovide (intern ("file-coding"));
5768
5769   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5770 Coding system used for TTY keyboard input.
5771 Not used under a windowing system.
5772 */ );
5773   Vkeyboard_coding_system = Qnil;
5774
5775   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5776 Coding system used for TTY display output.
5777 Not used under a windowing system.
5778 */ );
5779   Vterminal_coding_system = Qnil;
5780
5781   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5782 Overriding coding system used when reading from a file or process.
5783 You should bind this variable with `let', but do not set it globally.
5784 If this is non-nil, it specifies the coding system that will be used
5785 to decode input on read operations, such as from a file or process.
5786 It overrides `buffer-file-coding-system-for-read',
5787 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5788 this one for permanent changes to the environment.  */ );
5789   Vcoding_system_for_read = Qnil;
5790
5791   DEFVAR_LISP ("coding-system-for-write",
5792                &Vcoding_system_for_write /*
5793 Overriding coding system used when writing to a file or process.
5794 You should bind this variable with `let', but do not set it globally.
5795 If this is non-nil, it specifies the coding system that will be used
5796 to encode output for write operations, such as to a file or process.
5797 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5798 Use those variables instead of this one for permanent changes to the
5799 environment.  */ );
5800   Vcoding_system_for_write = Qnil;
5801
5802   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5803 Coding system used to convert pathnames when accessing files.
5804 */ );
5805   Vfile_name_coding_system = Qnil;
5806
5807   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5808 Non-nil means the buffer contents are regarded as multi-byte form
5809 of characters, not a binary code.  This affects the display, file I/O,
5810 and behaviors of various editing commands.
5811
5812 Setting this to nil does not do anything.
5813 */ );
5814   enable_multibyte_characters = 1;
5815 }
5816
5817 void
5818 complex_vars_of_file_coding (void)
5819 {
5820   staticpro (&Vcoding_system_hash_table);
5821   Vcoding_system_hash_table =
5822     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5823
5824   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5825   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5826
5827 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5828 {                                               \
5829   struct codesys_prop csp;                      \
5830   csp.sym = (Sym);                              \
5831   csp.prop_type = (Prop_Type);                  \
5832   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5833 } while (0)
5834
5835   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5836   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5837   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5838   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5839   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5840   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5841   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5842 #ifdef MULE
5843   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5844   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5845   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5846   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5847   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5848   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5849   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5850   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5851   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5852   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5853   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5854   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5855   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5856   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5857   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5858   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5859   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5860
5861   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5862   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5863 #endif /* MULE */
5864   /* Need to create this here or we're really screwed. */
5865   Fmake_coding_system
5866     (Qraw_text, Qno_conversion,
5867      build_string ("Raw text, which means it converts only line-break-codes."),
5868      list2 (Qmnemonic, build_string ("Raw")));
5869
5870   Fmake_coding_system
5871     (Qbinary, Qno_conversion,
5872      build_string ("Binary, which means it does not convert anything."),
5873      list4 (Qeol_type, Qlf,
5874             Qmnemonic, build_string ("Binary")));
5875
5876   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5877
5878   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5879
5880   Fdefine_coding_system_alias (Qterminal, Qbinary);
5881   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5882
5883   /* Need this for bootstrapping */
5884   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5885     Fget_coding_system (Qraw_text);
5886
5887 #ifdef MULE
5888   {
5889     size_t i;
5890
5891     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5892       fcd->ucs_to_mule_table[i] = Qnil;
5893   }
5894   staticpro (&mule_to_ucs_table);
5895   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5896 #endif /* MULE */
5897 }