src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #include "opaque.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  50
  51
  52
  53 struct file_coding_dump {
  54   /* Coding system currently associated with each coding category. */
  55   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  56
  57   /* Table of all coding categories in decreasing order of priority.
  58      This describes a permutation of the possible coding categories. */
  59   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  60
  61 #ifdef MULE
  62   Lisp_Object ucs_to_mule_table[65536];
  63 #endif
  64 } *fcd;
  65
  66 static const struct lrecord_description fcd_description_1[] = {
  67   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  68 #ifdef MULE
  69   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  70 #endif
  71   { XD_END }
  72 };
  73
  74 static const struct struct_description fcd_description = {
  75   sizeof (struct file_coding_dump),
  76   fcd_description_1
  77 };
  78
  79 Lisp_Object mule_to_ucs_table;
  80
  81 Lisp_Object Qcoding_systemp;
  82
  83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  84 /* Qinternal in general.c */
  85
  86 Lisp_Object Qmnemonic, Qeol_type;
  87 Lisp_Object Qcr, Qcrlf, Qlf;
  88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  89 Lisp_Object Qpost_read_conversion;
  90 Lisp_Object Qpre_write_conversion;
  91
  92 #ifdef MULE
  93 Lisp_Object Qucs4, Qutf8;
  94 Lisp_Object Qbig5, Qshift_jis;
  95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  98 Lisp_Object Qno_iso6429;
  99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 100 Lisp_Object Qescape_quoted;
 101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 102 #endif
 103 Lisp_Object Qencode, Qdecode;
 104
 105 Lisp_Object Vcoding_system_hash_table;
 106
 107 int enable_multibyte_characters;
 108
 109 #ifdef MULE
 110 /* Additional information used by the ISO2022 decoder and detector. */
 111 struct iso2022_decoder
 112 {
 113   /* CHARSET holds the character sets currently assigned to the G0
 114      through G3 variables.  It is initialized from the array
 115      INITIAL_CHARSET in CODESYS. */
 116   Lisp_Object charset[4];
 117
 118   /* Which registers are currently invoked into the left (GL) and
 119      right (GR) halves of the 8-bit encoding space? */
 120   int register_left, register_right;
 121
 122   /* ISO_ESC holds a value indicating part of an escape sequence
 123      that has already been seen. */
 124   enum iso_esc_flag esc;
 125
 126   /* This records the bytes we've seen so far in an escape sequence,
 127      in case the sequence is invalid (we spit out the bytes unchanged). */
 128   unsigned char esc_bytes[8];
 129
 130   /* Index for next byte to store in ISO escape sequence. */
 131   int esc_bytes_index;
 132
 133 #ifdef ENABLE_COMPOSITE_CHARS
 134   /* Stuff seen so far when composing a string. */
 135   unsigned_char_dynarr *composite_chars;
 136 #endif
 137
 138   /* If we saw an invalid designation sequence for a particular
 139      register, we flag it here and switch to ASCII.  The next time we
 140      see a valid designation for this register, we turn off the flag
 141      and do the designation normally, but pretend the sequence was
 142      invalid.  The effect of all this is that (most of the time) the
 143      escape sequences for both the switch to the unknown charset, and
 144      the switch back to the known charset, get inserted literally into
 145      the buffer and saved out as such.  The hope is that we can
 146      preserve the escape sequences so that the resulting written out
 147      file makes sense.  If we don't do any of this, the designation
 148      to the invalid charset will be preserved but that switch back
 149      to the known charset will probably get eaten because it was
 150      the same charset that was already present in the register. */
 151   unsigned char invalid_designated[4];
 152
 153   /* We try to do similar things as above for direction-switching
 154      sequences.  If we encountered a direction switch while an
 155      invalid designation was present, or an invalid designation
 156      just after a direction switch (i.e. no valid designation
 157      encountered yet), we insert the direction-switch escape
 158      sequence literally into the output stream, and later on
 159      insert the corresponding direction-restoring escape sequence
 160      literally also. */
 161   unsigned int switched_dir_and_no_valid_charset_yet :1;
 162   unsigned int invalid_switch_dir :1;
 163
 164   /* Tells the decoder to output the escape sequence literally
 165      even though it was valid.  Used in the games we play to
 166      avoid lossage when we encounter invalid designations. */
 167   unsigned int output_literally :1;
 168   /* We encountered a direction switch followed by an invalid
 169      designation.  We didn't output the direction switch
 170      literally because we didn't know about the invalid designation;
 171      but we have to do so now. */
 172   unsigned int output_direction_sequence :1;
 173 };
 174 #endif /* MULE */
 175 EXFUN (Fcopy_coding_system, 2);
 176 #ifdef MULE
 177 struct detection_state;
 178 static int detect_coding_sjis (struct detection_state *st,
 179                                const unsigned char *src,
 180                                unsigned int n);
 181 static void decode_coding_sjis (Lstream *decoding,
 182                                 const unsigned char *src,
 183                                 unsigned_char_dynarr *dst,
 184                                 unsigned int n);
 185 static void encode_coding_sjis (Lstream *encoding,
 186                                 const unsigned char *src,
 187                                 unsigned_char_dynarr *dst,
 188                                 unsigned int n);
 189 static int detect_coding_big5 (struct detection_state *st,
 190                                const unsigned char *src,
 191                                unsigned int n);
 192 static void decode_coding_big5 (Lstream *decoding,
 193                                 const unsigned char *src,
 194                                 unsigned_char_dynarr *dst, unsigned int n);
 195 static void encode_coding_big5 (Lstream *encoding,
 196                                 const unsigned char *src,
 197                                 unsigned_char_dynarr *dst, unsigned int n);
 198 static int detect_coding_ucs4 (struct detection_state *st,
 199                                const unsigned char *src,
 200                                unsigned int n);
 201 static void decode_coding_ucs4 (Lstream *decoding,
 202                                 const unsigned char *src,
 203                                 unsigned_char_dynarr *dst, unsigned int n);
 204 static void encode_coding_ucs4 (Lstream *encoding,
 205                                 const unsigned char *src,
 206                                 unsigned_char_dynarr *dst, unsigned int n);
 207 static int detect_coding_utf8 (struct detection_state *st,
 208                                const unsigned char *src,
 209                                unsigned int n);
 210 static void decode_coding_utf8 (Lstream *decoding,
 211                                 const unsigned char *src,
 212                                 unsigned_char_dynarr *dst, unsigned int n);
 213 static void encode_coding_utf8 (Lstream *encoding,
 214                                 const unsigned char *src,
 215                                 unsigned_char_dynarr *dst, unsigned int n);
 216 static int postprocess_iso2022_mask (int mask);
 217 static void reset_iso2022 (Lisp_Object coding_system,
 218                            struct iso2022_decoder *iso);
 219 static int detect_coding_iso2022 (struct detection_state *st,
 220                                   const unsigned char *src,
 221                                   unsigned int n);
 222 static void decode_coding_iso2022 (Lstream *decoding,
 223                                    const unsigned char *src,
 224                                    unsigned_char_dynarr *dst, unsigned int n);
 225 static void encode_coding_iso2022 (Lstream *encoding,
 226                                    const unsigned char *src,
 227                                    unsigned_char_dynarr *dst, unsigned int n);
 228 #endif /* MULE */
 229 static void decode_coding_no_conversion (Lstream *decoding,
 230                                          const unsigned char *src,
 231                                          unsigned_char_dynarr *dst,
 232                                          unsigned int n);
 233 static void encode_coding_no_conversion (Lstream *encoding,
 234                                          const unsigned char *src,
 235                                          unsigned_char_dynarr *dst,
 236                                          unsigned int n);
 237 static void mule_decode (Lstream *decoding, const unsigned char *src,
 238                          unsigned_char_dynarr *dst, unsigned int n);
 239 static void mule_encode (Lstream *encoding, const unsigned char *src,
 240                          unsigned_char_dynarr *dst, unsigned int n);
 241
 242 typedef struct codesys_prop codesys_prop;
 243 struct codesys_prop
 244 {
 245   Lisp_Object sym;
 246   int prop_type;
 247 };
 248
 249 typedef struct
 250 {
 251   Dynarr_declare (codesys_prop);
 252 } codesys_prop_dynarr;
 253
 254 static const struct lrecord_description codesys_prop_description_1[] = {
 255   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 256   { XD_END }
 257 };
 258
 259 static const struct struct_description codesys_prop_description = {
 260   sizeof (codesys_prop),
 261   codesys_prop_description_1
 262 };
 263
 264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 265   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 266   { XD_END }
 267 };
 268
 269 static const struct struct_description codesys_prop_dynarr_description = {
 270   sizeof (codesys_prop_dynarr),
 271   codesys_prop_dynarr_description_1
 272 };
 273
 274 codesys_prop_dynarr *the_codesys_prop_dynarr;
 275
 276 enum codesys_prop_enum
 277 {
 278   CODESYS_PROP_ALL_OK,
 279   CODESYS_PROP_ISO2022,
 280   CODESYS_PROP_CCL
 281 };
 282
 283 \f
 284 /************************************************************************/
 285 /*                       Coding system functions                        */
 286 /************************************************************************/
 287
 288 static Lisp_Object mark_coding_system (Lisp_Object);
 289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 290 static void finalize_coding_system (void *header, int for_disksave);
 291
 292 #ifdef MULE
 293 static const struct lrecord_description ccs_description_1[] = {
 294   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 295   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 296   { XD_END }
 297 };
 298
 299 static const struct struct_description ccs_description = {
 300   sizeof (charset_conversion_spec),
 301   ccs_description_1
 302 };
 303
 304 static const struct lrecord_description ccsd_description_1[] = {
 305   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 306   { XD_END }
 307 };
 308
 309 static const struct struct_description ccsd_description = {
 310   sizeof (charset_conversion_spec_dynarr),
 311   ccsd_description_1
 312 };
 313 #endif
 314
 315 static const struct lrecord_description coding_system_description[] = {
 316   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 317   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 318   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 319   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 320   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 321   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 322   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 323   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 324 #ifdef MULE
 325   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 326   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 327   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 328   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 329   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 330 #endif
 331   { XD_END }
 332 };
 333
 334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 335                                mark_coding_system, print_coding_system,
 336                                finalize_coding_system,
 337                                0, 0, coding_system_description,
 338                                Lisp_Coding_System);
 339
 340 static Lisp_Object
 341 mark_coding_system (Lisp_Object obj)
 342 {
 343   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 344
 345   mark_object (CODING_SYSTEM_NAME (codesys));
 346   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 347   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 348   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 349   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 350   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 351
 352   switch (CODING_SYSTEM_TYPE (codesys))
 353     {
 354 #ifdef MULE
 355       int i;
 356     case CODESYS_ISO2022:
 357       for (i = 0; i < 4; i++)
 358         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 359       if (codesys->iso2022.input_conv)
 360         {
 361           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 362             {
 363               struct charset_conversion_spec *ccs =
 364                 Dynarr_atp (codesys->iso2022.input_conv, i);
 365               mark_object (ccs->from_charset);
 366               mark_object (ccs->to_charset);
 367             }
 368         }
 369       if (codesys->iso2022.output_conv)
 370         {
 371           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 372             {
 373               struct charset_conversion_spec *ccs =
 374                 Dynarr_atp (codesys->iso2022.output_conv, i);
 375               mark_object (ccs->from_charset);
 376               mark_object (ccs->to_charset);
 377             }
 378         }
 379       break;
 380
 381     case CODESYS_CCL:
 382       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 383       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 384       break;
 385 #endif /* MULE */
 386     default:
 387       break;
 388     }
 389
 390   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 391   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 392 }
 393
 394 static void
 395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 396                      int escapeflag)
 397 {
 398   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 399   if (print_readably)
 400     error ("printing unreadable object #<coding_system 0x%x>",
 401            c->header.uid);
 402
 403   write_c_string ("#<coding_system ", printcharfun);
 404   print_internal (c->name, printcharfun, 1);
 405   write_c_string (">", printcharfun);
 406 }
 407
 408 static void
 409 finalize_coding_system (void *header, int for_disksave)
 410 {
 411   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 412   /* Since coding systems never go away, this function is not
 413      necessary.  But it would be necessary if we changed things
 414      so that coding systems could go away. */
 415   if (!for_disksave) /* see comment in lstream.c */
 416     {
 417       switch (CODING_SYSTEM_TYPE (c))
 418         {
 419 #ifdef MULE
 420         case CODESYS_ISO2022:
 421           if (c->iso2022.input_conv)
 422             {
 423               Dynarr_free (c->iso2022.input_conv);
 424               c->iso2022.input_conv = 0;
 425             }
 426           if (c->iso2022.output_conv)
 427             {
 428               Dynarr_free (c->iso2022.output_conv);
 429               c->iso2022.output_conv = 0;
 430             }
 431           break;
 432 #endif /* MULE */
 433         default:
 434           break;
 435         }
 436     }
 437 }
 438
 439 static eol_type_t
 440 symbol_to_eol_type (Lisp_Object symbol)
 441 {
 442   CHECK_SYMBOL (symbol);
 443   if (NILP (symbol))      return EOL_AUTODETECT;
 444   if (EQ (symbol, Qlf))   return EOL_LF;
 445   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 446   if (EQ (symbol, Qcr))   return EOL_CR;
 447
 448   signal_simple_error ("Unrecognized eol type", symbol);
 449   return EOL_AUTODETECT; /* not reached */
 450 }
 451
 452 static Lisp_Object
 453 eol_type_to_symbol (eol_type_t type)
 454 {
 455   switch (type)
 456     {
 457     default: abort ();
 458     case EOL_LF:         return Qlf;
 459     case EOL_CRLF:       return Qcrlf;
 460     case EOL_CR:         return Qcr;
 461     case EOL_AUTODETECT: return Qnil;
 462     }
 463 }
 464
 465 static void
 466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 467 {
 468   Lisp_Object codesys_obj;
 469   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 470   char *codesys_name = (char *) alloca (len + 7);
 471   int mlen = -1;
 472   char *codesys_mnemonic=0;
 473
 474   Lisp_Object codesys_name_sym, sub_codesys_obj;
 475
 476   /* kludge */
 477
 478   XSETCODING_SYSTEM (codesys_obj, codesys);
 479
 480   memcpy (codesys_name,
 481           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 482
 483   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 484     {
 485       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 486       codesys_mnemonic = (char *) alloca (mlen + 7);
 487       memcpy (codesys_mnemonic,
 488               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 489     }
 490
 491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 492   strcpy (codesys_name + len, "-" op_sys);                                      \
 493   if (mlen != -1)                                                               \
 494     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 495   codesys_name_sym = intern (codesys_name);                                     \
 496   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 497   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 498   if (mlen != -1)                                                               \
 499     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 500       build_string (codesys_mnemonic);                                          \
 501   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 502 } while (0)
 503
 504   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 505   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 506   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 507 }
 508
 509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 510 Return t if OBJECT is a coding system.
 511 A coding system is an object that defines how text containing multiple
 512 character sets is encoded into a stream of (typically 8-bit) bytes.
 513 The coding system is used to decode the stream into a series of
 514 characters (which may be from multiple charsets) when the text is read
 515 from a file or process, and is used to encode the text back into the
 516 same format when it is written out to a file or process.
 517
 518 For example, many ISO2022-compliant coding systems (such as Compound
 519 Text, which is used for inter-client data under the X Window System)
 520 use escape sequences to switch between different charsets -- Japanese
 521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 523 `make-coding-system' for more information.
 524
 525 Coding systems are normally identified using a symbol, and the
 526 symbol is accepted in place of the actual coding system object whenever
 527 a coding system is called for. (This is similar to how faces work.)
 528 */
 529        (object))
 530 {
 531   return CODING_SYSTEMP (object) ? Qt : Qnil;
 532 }
 533
 534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 535 Retrieve the coding system of the given name.
 536
 537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 538 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 539 If there is no such coding system, nil is returned.  Otherwise the
 540 associated coding system object is returned.
 541 */
 542        (coding_system_or_name))
 543 {
 544   if (NILP (coding_system_or_name))
 545     coding_system_or_name = Qbinary;
 546   else if (CODING_SYSTEMP (coding_system_or_name))
 547     return coding_system_or_name;
 548   else
 549     CHECK_SYMBOL (coding_system_or_name);
 550
 551   while (1)
 552     {
 553       coding_system_or_name =
 554         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 555
 556       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 557         return coding_system_or_name;
 558     }
 559 }
 560
 561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 562 Retrieve the coding system of the given name.
 563 Same as `find-coding-system' except that if there is no such
 564 coding system, an error is signaled instead of returning nil.
 565 */
 566        (name))
 567 {
 568   Lisp_Object coding_system = Ffind_coding_system (name);
 569
 570   if (NILP (coding_system))
 571     signal_simple_error ("No such coding system", name);
 572   return coding_system;
 573 }
 574
 575 /* We store the coding systems in hash tables with the names as the key and the
 576    actual coding system object as the value.  Occasionally we need to use them
 577    in a list format.  These routines provide us with that. */
 578 struct coding_system_list_closure
 579 {
 580   Lisp_Object *coding_system_list;
 581 };
 582
 583 static int
 584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 585                                   void *coding_system_list_closure)
 586 {
 587   /* This function can GC */
 588   struct coding_system_list_closure *cscl =
 589     (struct coding_system_list_closure *) coding_system_list_closure;
 590   Lisp_Object *coding_system_list = cscl->coding_system_list;
 591
 592   *coding_system_list = Fcons (key, *coding_system_list);
 593   return 0;
 594 }
 595
 596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 597 Return a list of the names of all defined coding systems.
 598 */
 599        ())
 600 {
 601   Lisp_Object coding_system_list = Qnil;
 602   struct gcpro gcpro1;
 603   struct coding_system_list_closure coding_system_list_closure;
 604
 605   GCPRO1 (coding_system_list);
 606   coding_system_list_closure.coding_system_list = &coding_system_list;
 607   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 608                  &coding_system_list_closure);
 609   UNGCPRO;
 610
 611   return coding_system_list;
 612 }
 613
 614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 615 Return the name of the given coding system.
 616 */
 617        (coding_system))
 618 {
 619   coding_system = Fget_coding_system (coding_system);
 620   return XCODING_SYSTEM_NAME (coding_system);
 621 }
 622
 623 static Lisp_Coding_System *
 624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 625 {
 626   Lisp_Coding_System *codesys =
 627     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 628
 629   zero_lcrecord (codesys);
 630   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 631   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 632   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 633   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 634   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 635   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 636   CODING_SYSTEM_TYPE     (codesys) = type;
 637   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 638 #ifdef MULE
 639   if (type == CODESYS_ISO2022)
 640     {
 641       int i;
 642       for (i = 0; i < 4; i++)
 643         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 644     }
 645   else if (type == CODESYS_CCL)
 646     {
 647       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 648       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 649     }
 650 #endif /* MULE */
 651   CODING_SYSTEM_NAME (codesys) = name;
 652
 653   return codesys;
 654 }
 655
 656 #ifdef MULE
 657 /* Given a list of charset conversion specs as specified in a Lisp
 658    program, parse it into STORE_HERE. */
 659
 660 static void
 661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 662                                 Lisp_Object spec_list)
 663 {
 664   Lisp_Object rest;
 665
 666   EXTERNAL_LIST_LOOP (rest, spec_list)
 667     {
 668       Lisp_Object car = XCAR (rest);
 669       Lisp_Object from, to;
 670       struct charset_conversion_spec spec;
 671
 672       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 673         signal_simple_error ("Invalid charset conversion spec", car);
 674       from = Fget_charset (XCAR (car));
 675       to = Fget_charset (XCAR (XCDR (car)));
 676       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 677         signal_simple_error_2
 678           ("Attempted conversion between different charset types",
 679            from, to);
 680       spec.from_charset = from;
 681       spec.to_charset = to;
 682
 683       Dynarr_add (store_here, spec);
 684     }
 685 }
 686
 687 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 688    specs, return the equivalent as the Lisp programmer would see it.
 689
 690    If LOAD_HERE is 0, return Qnil. */
 691
 692 static Lisp_Object
 693 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 694 {
 695   int i;
 696   Lisp_Object result;
 697
 698   if (!load_here)
 699     return Qnil;
 700   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 701     {
 702       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 703       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 704     }
 705
 706   return Fnreverse (result);
 707 }
 708
 709 #endif /* MULE */
 710
 711 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 712 Register symbol NAME as a coding system.
 713
 714 TYPE describes the conversion method used and should be one of
 715
 716 nil or 'undecided
 717      Automatic conversion.  XEmacs attempts to detect the coding system
 718      used in the file.
 719 'no-conversion
 720      No conversion.  Use this for binary files and such.  On output,
 721      graphic characters that are not in ASCII or Latin-1 will be
 722      replaced by a ?. (For a no-conversion-encoded buffer, these
 723      characters will only be present if you explicitly insert them.)
 724 'shift-jis
 725      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 726 'ucs-4
 727      ISO 10646 UCS-4 encoding.
 728 'utf-8
 729      ISO 10646 UTF-8 encoding.
 730 'iso2022
 731      Any ISO2022-compliant encoding.  Among other things, this includes
 732      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 733      standard Unix encoding for Japanese and other languages), and
 734      Compound Text (the encoding used in X11).  You can specify more
 735      specific information about the conversion with the PROPS argument.
 736 'big5
 737      Big5 (the encoding commonly used for Taiwanese).
 738 'ccl
 739      The conversion is performed using a user-written pseudo-code
 740      program.  CCL (Code Conversion Language) is the name of this
 741      pseudo-code.
 742 'internal
 743      Write out or read in the raw contents of the memory representing
 744      the buffer's text.  This is primarily useful for debugging
 745      purposes, and is only enabled when XEmacs has been compiled with
 746      DEBUG_XEMACS defined (via the --debug configure option).
 747      WARNING: Reading in a file using 'internal conversion can result
 748      in an internal inconsistency in the memory representing a
 749      buffer's text, which will produce unpredictable results and may
 750      cause XEmacs to crash.  Under normal circumstances you should
 751      never use 'internal conversion.
 752
 753 DOC-STRING is a string describing the coding system.
 754
 755 PROPS is a property list, describing the specific nature of the
 756 character set.  Recognized properties are:
 757
 758 'mnemonic
 759      String to be displayed in the modeline when this coding system is
 760      active.
 761
 762 'eol-type
 763      End-of-line conversion to be used.  It should be one of
 764
 765         nil
 766                 Automatically detect the end-of-line type (LF, CRLF,
 767                 or CR).  Also generate subsidiary coding systems named
 768                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 769                 identical to this coding system but have an EOL-TYPE
 770                 value of 'lf, 'crlf, and 'cr, respectively.
 771         'lf
 772                 The end of a line is marked externally using ASCII LF.
 773                 Since this is also the way that XEmacs represents an
 774                 end-of-line internally, specifying this option results
 775                 in no end-of-line conversion.  This is the standard
 776                 format for Unix text files.
 777         'crlf
 778                 The end of a line is marked externally using ASCII
 779                 CRLF.  This is the standard format for MS-DOS text
 780                 files.
 781         'cr
 782                 The end of a line is marked externally using ASCII CR.
 783                 This is the standard format for Macintosh text files.
 784         t
 785                 Automatically detect the end-of-line type but do not
 786                 generate subsidiary coding systems.  (This value is
 787                 converted to nil when stored internally, and
 788                 `coding-system-property' will return nil.)
 789
 790 'post-read-conversion
 791      Function called after a file has been read in, to perform the
 792      decoding.  Called with two arguments, BEG and END, denoting
 793      a region of the current buffer to be decoded.
 794
 795 'pre-write-conversion
 796      Function called before a file is written out, to perform the
 797      encoding.  Called with two arguments, BEG and END, denoting
 798      a region of the current buffer to be encoded.
 799
 800
 801 The following additional properties are recognized if TYPE is 'iso2022:
 802
 803 'charset-g0
 804 'charset-g1
 805 'charset-g2
 806 'charset-g3
 807      The character set initially designated to the G0 - G3 registers.
 808      The value should be one of
 809
 810           -- A charset object (designate that character set)
 811           -- nil (do not ever use this register)
 812           -- t (no character set is initially designated to
 813                 the register, but may be later on; this automatically
 814                 sets the corresponding `force-g*-on-output' property)
 815
 816 'force-g0-on-output
 817 'force-g1-on-output
 818 'force-g2-on-output
 819 'force-g2-on-output
 820      If non-nil, send an explicit designation sequence on output before
 821      using the specified register.
 822
 823 'short
 824      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 825      "ESC $ B" on output in place of the full designation sequences
 826      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 827
 828 'no-ascii-eol
 829      If non-nil, don't designate ASCII to G0 at each end of line on output.
 830      Setting this to non-nil also suppresses other state-resetting that
 831      normally happens at the end of a line.
 832
 833 'no-ascii-cntl
 834      If non-nil, don't designate ASCII to G0 before control chars on output.
 835
 836 'seven
 837      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 838      environment.
 839
 840 'lock-shift
 841      If non-nil, use locking-shift (SO/SI) instead of single-shift
 842      or designation by escape sequence.
 843
 844 'no-iso6429
 845      If non-nil, don't use ISO6429's direction specification.
 846
 847 'escape-quoted
 848      If non-nil, literal control characters that are the same as
 849      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 850      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 851      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 852      so that they can be properly distinguished from an escape sequence.
 853      (Note that doing this results in a non-portable encoding.) This
 854      encoding flag is used for byte-compiled files.  Note that ESC
 855      is a good choice for a quoting character because there are no
 856      escape sequences whose second byte is a character from the Control-0
 857      or Control-1 character sets; this is explicitly disallowed by the
 858      ISO2022 standard.
 859
 860 'input-charset-conversion
 861      A list of conversion specifications, specifying conversion of
 862      characters in one charset to another when decoding is performed.
 863      Each specification is a list of two elements: the source charset,
 864      and the destination charset.
 865
 866 'output-charset-conversion
 867      A list of conversion specifications, specifying conversion of
 868      characters in one charset to another when encoding is performed.
 869      The form of each specification is the same as for
 870      'input-charset-conversion.
 871
 872
 873 The following additional properties are recognized (and required)
 874 if TYPE is 'ccl:
 875
 876 'decode
 877      CCL program used for decoding (converting to internal format).
 878
 879 'encode
 880      CCL program used for encoding (converting to external format).
 881 */
 882        (name, type, doc_string, props))
 883 {
 884   Lisp_Coding_System *codesys;
 885   enum coding_system_type ty;
 886   int need_to_setup_eol_systems = 1;
 887
 888   /* Convert type to constant */
 889   if (NILP (type) || EQ (type, Qundecided))
 890                                       { ty = CODESYS_AUTODETECT; }
 891 #ifdef MULE
 892   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 893   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 894   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 895   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 896   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 897   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 898 #endif
 899   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 900 #ifdef DEBUG_XEMACS
 901   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 902 #endif
 903   else
 904     signal_simple_error ("Invalid coding system type", type);
 905
 906   CHECK_SYMBOL (name);
 907
 908   codesys = allocate_coding_system (ty, name);
 909
 910   if (NILP (doc_string))
 911     doc_string = build_string ("");
 912   else
 913     CHECK_STRING (doc_string);
 914   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 915
 916   {
 917     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 918       {
 919         if (EQ (key, Qmnemonic))
 920           {
 921             if (!NILP (value))
 922               CHECK_STRING (value);
 923             CODING_SYSTEM_MNEMONIC (codesys) = value;
 924           }
 925
 926         else if (EQ (key, Qeol_type))
 927           {
 928             need_to_setup_eol_systems = NILP (value);
 929             if (EQ (value, Qt))
 930               value = Qnil;
 931             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 932           }
 933
 934         else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 935         else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 936 #ifdef MULE
 937         else if (ty == CODESYS_ISO2022)
 938           {
 939 #define FROB_INITIAL_CHARSET(charset_num) \
 940   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 941     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 942
 943             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 944             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 945             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 946             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 947
 948 #define FROB_FORCE_CHARSET(charset_num) \
 949   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 950
 951             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 952             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 953             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 954             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 955
 956 #define FROB_BOOLEAN_PROPERTY(prop) \
 957   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 958
 959             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 960             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 961             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 962             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 963             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 964             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 965             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 966
 967             else if (EQ (key, Qinput_charset_conversion))
 968               {
 969                 codesys->iso2022.input_conv =
 970                   Dynarr_new (charset_conversion_spec);
 971                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 972                                                 value);
 973               }
 974             else if (EQ (key, Qoutput_charset_conversion))
 975               {
 976                 codesys->iso2022.output_conv =
 977                   Dynarr_new (charset_conversion_spec);
 978                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 979                                                 value);
 980               }
 981             else
 982               signal_simple_error ("Unrecognized property", key);
 983           }
 984         else if (EQ (type, Qccl))
 985           {
 986             if (EQ (key, Qdecode))
 987               {
 988                 CHECK_VECTOR (value);
 989                 CODING_SYSTEM_CCL_DECODE (codesys) = value;
 990               }
 991             else if (EQ (key, Qencode))
 992               {
 993                 CHECK_VECTOR (value);
 994                 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 995               }
 996             else
 997               signal_simple_error ("Unrecognized property", key);
 998           }
 999 #endif /* MULE */
1000         else
1001           signal_simple_error ("Unrecognized property", key);
1002       }
1003   }
1004
1005   if (need_to_setup_eol_systems)
1006     setup_eol_coding_systems (codesys);
1007
1008   {
1009     Lisp_Object codesys_obj;
1010     XSETCODING_SYSTEM (codesys_obj, codesys);
1011     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1012     return codesys_obj;
1013   }
1014 }
1015
1016 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1017 Copy OLD-CODING-SYSTEM to NEW-NAME.
1018 If NEW-NAME does not name an existing coding system, a new one will
1019 be created.
1020 */
1021        (old_coding_system, new_name))
1022 {
1023   Lisp_Object new_coding_system;
1024   old_coding_system = Fget_coding_system (old_coding_system);
1025   new_coding_system = Ffind_coding_system (new_name);
1026   if (NILP (new_coding_system))
1027     {
1028       XSETCODING_SYSTEM (new_coding_system,
1029                          allocate_coding_system
1030                          (XCODING_SYSTEM_TYPE (old_coding_system),
1031                           new_name));
1032       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1033     }
1034
1035   {
1036     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1037     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1038     memcpy (((char *) to  ) + sizeof (to->header),
1039             ((char *) from) + sizeof (from->header),
1040             sizeof (*from) - sizeof (from->header));
1041     to->name = new_name;
1042   }
1043   return new_coding_system;
1044 }
1045
1046 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1047 Return t if OBJECT names a coding system, and is not a coding system alias.
1048 */
1049        (object))
1050 {
1051   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1052     ? Qt : Qnil;
1053 }
1054
1055 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1056 Return t if OBJECT is a coding system alias.
1057 All coding system aliases are created by `define-coding-system-alias'.
1058 */
1059        (object))
1060 {
1061   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1062     ? Qt : Qnil;
1063 }
1064
1065 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1066 Return the coding-system symbol for which symbol ALIAS is an alias.
1067 */
1068        (alias))
1069 {
1070   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1071   if (SYMBOLP (aliasee))
1072     return aliasee;
1073   else
1074     signal_simple_error ("Symbol is not a coding system alias", alias);
1075   return Qnil;          /* To keep the compiler happy */
1076 }
1077
1078 static Lisp_Object
1079 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1080 {
1081   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1082                   Qnil);
1083 }
1084
1085 /* A maphash function, for removing dangling coding system aliases. */
1086 static int
1087 dangling_coding_system_alias_p (Lisp_Object alias,
1088                                 Lisp_Object aliasee,
1089                                 void *dangling_aliases)
1090 {
1091   if (SYMBOLP (aliasee)
1092       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1093     {
1094       (*(int *) dangling_aliases)++;
1095       return 1;
1096     }
1097   else
1098     return 0;
1099 }
1100
1101 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1102 Define symbol ALIAS as an alias for coding system ALIASEE.
1103
1104 You can use this function to redefine an alias that has already been defined,
1105 but you cannot redefine a name which is the canonical name for a coding system.
1106 \(a canonical name of a coding system is what is returned when you call
1107 `coding-system-name' on a coding system).
1108
1109 ALIASEE itself can be an alias, which allows you to define nested aliases.
1110
1111 You are forbidden, however, from creating alias loops or `dangling' aliases.
1112 These will be detected, and an error will be signaled if you attempt to do so.
1113
1114 If ALIASEE is nil, then ALIAS will simply be undefined.
1115
1116 See also `coding-system-alias-p', `coding-system-aliasee',
1117 and `coding-system-canonical-name-p'.
1118 */
1119        (alias, aliasee))
1120 {
1121   Lisp_Object real_coding_system, probe;
1122
1123   CHECK_SYMBOL (alias);
1124
1125   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1126     signal_simple_error
1127       ("Symbol is the canonical name of a coding system and cannot be redefined",
1128        alias);
1129
1130   if (NILP (aliasee))
1131     {
1132       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1133       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1134       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1135
1136       Fremhash (alias, Vcoding_system_hash_table);
1137
1138       /* Undefine subsidiary aliases,
1139          presumably created by a previous call to this function */
1140       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1141           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1142           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1143         {
1144           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1145           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1146           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1147         }
1148
1149       /* Undefine dangling coding system aliases. */
1150       {
1151         int dangling_aliases;
1152
1153         do {
1154           dangling_aliases = 0;
1155           elisp_map_remhash (dangling_coding_system_alias_p,
1156                              Vcoding_system_hash_table,
1157                              &dangling_aliases);
1158         } while (dangling_aliases > 0);
1159       }
1160
1161       return Qnil;
1162     }
1163
1164   if (CODING_SYSTEMP (aliasee))
1165     aliasee = XCODING_SYSTEM_NAME (aliasee);
1166
1167   /* Checks that aliasee names a coding-system */
1168   real_coding_system = Fget_coding_system (aliasee);
1169
1170   /* Check for coding system alias loops */
1171   if (EQ (alias, aliasee))
1172     alias_loop: signal_simple_error_2
1173       ("Attempt to create a coding system alias loop", alias, aliasee);
1174
1175   for (probe = aliasee;
1176        SYMBOLP (probe);
1177        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1178     {
1179       if (EQ (probe, alias))
1180         goto alias_loop;
1181     }
1182
1183   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1184
1185   /* Set up aliases for subsidiaries.
1186      #### There must be a better way to handle subsidiary coding systems. */
1187   {
1188     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1189     int i;
1190     for (i = 0; i < countof (suffixes); i++)
1191       {
1192         Lisp_Object alias_subsidiary =
1193           append_suffix_to_symbol (alias, suffixes[i]);
1194         Lisp_Object aliasee_subsidiary =
1195           append_suffix_to_symbol (aliasee, suffixes[i]);
1196
1197         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1198           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1199       }
1200   }
1201   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1202      but it doesn't look intentional, so I'd rather return something
1203      meaningful or nothing at all. */
1204   return Qnil;
1205 }
1206
1207 static Lisp_Object
1208 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1209 {
1210   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1211   Lisp_Object new_coding_system;
1212
1213   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1214     return coding_system;
1215
1216   switch (type)
1217     {
1218     case EOL_AUTODETECT: return coding_system;
1219     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1220     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1221     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1222     default:       abort (); return Qnil;
1223     }
1224
1225   return NILP (new_coding_system) ? coding_system : new_coding_system;
1226 }
1227
1228 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1229 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1230 */
1231        (coding_system, eol_type))
1232 {
1233   coding_system = Fget_coding_system (coding_system);
1234
1235   return subsidiary_coding_system (coding_system,
1236                                    symbol_to_eol_type (eol_type));
1237 }
1238
1239 \f
1240 /************************************************************************/
1241 /*                         Coding system accessors                      */
1242 /************************************************************************/
1243
1244 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1245 Return the doc string for CODING-SYSTEM.
1246 */
1247        (coding_system))
1248 {
1249   coding_system = Fget_coding_system (coding_system);
1250   return XCODING_SYSTEM_DOC_STRING (coding_system);
1251 }
1252
1253 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1254 Return the type of CODING-SYSTEM.
1255 */
1256        (coding_system))
1257 {
1258   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1259     {
1260     default: abort ();
1261     case CODESYS_AUTODETECT:    return Qundecided;
1262 #ifdef MULE
1263     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1264     case CODESYS_ISO2022:       return Qiso2022;
1265     case CODESYS_BIG5:          return Qbig5;
1266     case CODESYS_UCS4:          return Qucs4;
1267     case CODESYS_UTF8:          return Qutf8;
1268     case CODESYS_CCL:           return Qccl;
1269 #endif
1270     case CODESYS_NO_CONVERSION: return Qno_conversion;
1271 #ifdef DEBUG_XEMACS
1272     case CODESYS_INTERNAL:      return Qinternal;
1273 #endif
1274     }
1275 }
1276
1277 #ifdef MULE
1278 static
1279 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1280 {
1281   Lisp_Object cs
1282     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1283
1284   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1285 }
1286
1287 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1288 Return initial charset of CODING-SYSTEM designated to GNUM.
1289 GNUM allows 0 .. 3.
1290 */
1291        (coding_system, gnum))
1292 {
1293   coding_system = Fget_coding_system (coding_system);
1294   CHECK_INT (gnum);
1295
1296   return coding_system_charset (coding_system, XINT (gnum));
1297 }
1298 #endif /* MULE */
1299
1300 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1301 Return the PROP property of CODING-SYSTEM.
1302 */
1303        (coding_system, prop))
1304 {
1305   int i, ok = 0;
1306   enum coding_system_type type;
1307
1308   coding_system = Fget_coding_system (coding_system);
1309   CHECK_SYMBOL (prop);
1310   type = XCODING_SYSTEM_TYPE (coding_system);
1311
1312   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1313     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1314       {
1315         ok = 1;
1316         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1317           {
1318           case CODESYS_PROP_ALL_OK:
1319             break;
1320 #ifdef MULE
1321           case CODESYS_PROP_ISO2022:
1322             if (type != CODESYS_ISO2022)
1323               signal_simple_error
1324                 ("Property only valid in ISO2022 coding systems",
1325                  prop);
1326             break;
1327
1328           case CODESYS_PROP_CCL:
1329             if (type != CODESYS_CCL)
1330               signal_simple_error
1331                 ("Property only valid in CCL coding systems",
1332                  prop);
1333             break;
1334 #endif /* MULE */
1335           default:
1336             abort ();
1337           }
1338       }
1339
1340   if (!ok)
1341     signal_simple_error ("Unrecognized property", prop);
1342
1343   if (EQ (prop, Qname))
1344     return XCODING_SYSTEM_NAME (coding_system);
1345   else if (EQ (prop, Qtype))
1346     return Fcoding_system_type (coding_system);
1347   else if (EQ (prop, Qdoc_string))
1348     return XCODING_SYSTEM_DOC_STRING (coding_system);
1349   else if (EQ (prop, Qmnemonic))
1350     return XCODING_SYSTEM_MNEMONIC (coding_system);
1351   else if (EQ (prop, Qeol_type))
1352     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1353   else if (EQ (prop, Qeol_lf))
1354     return XCODING_SYSTEM_EOL_LF (coding_system);
1355   else if (EQ (prop, Qeol_crlf))
1356     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1357   else if (EQ (prop, Qeol_cr))
1358     return XCODING_SYSTEM_EOL_CR (coding_system);
1359   else if (EQ (prop, Qpost_read_conversion))
1360     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1361   else if (EQ (prop, Qpre_write_conversion))
1362     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1363 #ifdef MULE
1364   else if (type == CODESYS_ISO2022)
1365     {
1366       if (EQ (prop, Qcharset_g0))
1367         return coding_system_charset (coding_system, 0);
1368       else if (EQ (prop, Qcharset_g1))
1369         return coding_system_charset (coding_system, 1);
1370       else if (EQ (prop, Qcharset_g2))
1371         return coding_system_charset (coding_system, 2);
1372       else if (EQ (prop, Qcharset_g3))
1373         return coding_system_charset (coding_system, 3);
1374
1375 #define FORCE_CHARSET(charset_num) \
1376   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1377    (coding_system, charset_num) ? Qt : Qnil)
1378
1379       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1380       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1381       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1382       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1383
1384 #define LISP_BOOLEAN(prop) \
1385   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1386
1387       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1388       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1389       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1390       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1391       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1392       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1393       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1394
1395       else if (EQ (prop, Qinput_charset_conversion))
1396         return
1397           unparse_charset_conversion_specs
1398             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1399       else if (EQ (prop, Qoutput_charset_conversion))
1400         return
1401           unparse_charset_conversion_specs
1402             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1403       else
1404         abort ();
1405     }
1406   else if (type == CODESYS_CCL)
1407     {
1408       if (EQ (prop, Qdecode))
1409         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1410       else if (EQ (prop, Qencode))
1411         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1412       else
1413         abort ();
1414     }
1415 #endif /* MULE */
1416   else
1417     abort ();
1418
1419   return Qnil; /* not reached */
1420 }
1421
1422 \f
1423 /************************************************************************/
1424 /*                       Coding category functions                      */
1425 /************************************************************************/
1426
1427 static int
1428 decode_coding_category (Lisp_Object symbol)
1429 {
1430   int i;
1431
1432   CHECK_SYMBOL (symbol);
1433   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1434     if (EQ (coding_category_symbol[i], symbol))
1435       return i;
1436
1437   signal_simple_error ("Unrecognized coding category", symbol);
1438   return 0; /* not reached */
1439 }
1440
1441 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1442 Return a list of all recognized coding categories.
1443 */
1444        ())
1445 {
1446   int i;
1447   Lisp_Object list = Qnil;
1448
1449   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1450     list = Fcons (coding_category_symbol[i], list);
1451   return list;
1452 }
1453
1454 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1455 Change the priority order of the coding categories.
1456 LIST should be list of coding categories, in descending order of
1457 priority.  Unspecified coding categories will be lower in priority
1458 than all specified ones, in the same relative order they were in
1459 previously.
1460 */
1461        (list))
1462 {
1463   int category_to_priority[CODING_CATEGORY_LAST + 1];
1464   int i, j;
1465   Lisp_Object rest;
1466
1467   /* First generate a list that maps coding categories to priorities. */
1468
1469   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1470     category_to_priority[i] = -1;
1471
1472   /* Highest priority comes from the specified list. */
1473   i = 0;
1474   EXTERNAL_LIST_LOOP (rest, list)
1475     {
1476       int cat = decode_coding_category (XCAR (rest));
1477
1478       if (category_to_priority[cat] >= 0)
1479         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1480       category_to_priority[cat] = i++;
1481     }
1482
1483   /* Now go through the existing categories by priority to retrieve
1484      the categories not yet specified and preserve their priority
1485      order. */
1486   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1487     {
1488       int cat = fcd->coding_category_by_priority[j];
1489       if (category_to_priority[cat] < 0)
1490         category_to_priority[cat] = i++;
1491     }
1492
1493   /* Now we need to construct the inverse of the mapping we just
1494      constructed. */
1495
1496   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1497     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1498
1499   /* Phew!  That was confusing. */
1500   return Qnil;
1501 }
1502
1503 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1504 Return a list of coding categories in descending order of priority.
1505 */
1506        ())
1507 {
1508   int i;
1509   Lisp_Object list = Qnil;
1510
1511   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1512     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1513                   list);
1514   return list;
1515 }
1516
1517 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1518 Change the coding system associated with a coding category.
1519 */
1520        (coding_category, coding_system))
1521 {
1522   int cat = decode_coding_category (coding_category);
1523
1524   coding_system = Fget_coding_system (coding_system);
1525   fcd->coding_category_system[cat] = coding_system;
1526   return Qnil;
1527 }
1528
1529 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1530 Return the coding system associated with a coding category.
1531 */
1532        (coding_category))
1533 {
1534   int cat = decode_coding_category (coding_category);
1535   Lisp_Object sys = fcd->coding_category_system[cat];
1536
1537   if (!NILP (sys))
1538     return XCODING_SYSTEM_NAME (sys);
1539   return Qnil;
1540 }
1541
1542 \f
1543 /************************************************************************/
1544 /*                     Detecting the encoding of data                   */
1545 /************************************************************************/
1546
1547 struct detection_state
1548 {
1549   eol_type_t eol_type;
1550   int seen_non_ascii;
1551   int mask;
1552 #ifdef MULE
1553   struct
1554     {
1555       int mask;
1556       int in_second_byte;
1557     }
1558   big5;
1559
1560   struct
1561     {
1562       int mask;
1563       int in_second_byte;
1564     }
1565   shift_jis;
1566
1567   struct
1568     {
1569       int mask;
1570       int in_byte;
1571   }
1572   ucs4;
1573
1574   struct
1575     {
1576       int mask;
1577       int in_byte;
1578     }
1579   utf8;
1580
1581   struct
1582     {
1583       int mask;
1584       int initted;
1585       struct iso2022_decoder iso;
1586       unsigned int flags;
1587       int high_byte_count;
1588       unsigned int saw_single_shift:1;
1589     }
1590   iso2022;
1591 #endif
1592   struct
1593     {
1594       int seen_anything;
1595       int just_saw_cr;
1596     }
1597   eol;
1598 };
1599
1600 static int
1601 acceptable_control_char_p (int c)
1602 {
1603   switch (c)
1604     {
1605       /* Allow and ignore control characters that you might
1606          reasonably see in a text file */
1607     case '\r':
1608     case '\n':
1609     case '\t':
1610     case  7: /* bell */
1611     case  8: /* backspace */
1612     case 11: /* vertical tab */
1613     case 12: /* form feed */
1614     case 26: /* MS-DOS C-z junk */
1615     case 31: /* '^_' -- for info */
1616       return 1;
1617     default:
1618       return 0;
1619     }
1620 }
1621
1622 static int
1623 mask_has_at_most_one_bit_p (int mask)
1624 {
1625   /* Perhaps the only thing useful you learn from intensive Microsoft
1626      technical interviews */
1627   return (mask & (mask - 1)) == 0;
1628 }
1629
1630 static eol_type_t
1631 detect_eol_type (struct detection_state *st, const unsigned char *src,
1632                  unsigned int n)
1633 {
1634   int c;
1635
1636   while (n--)
1637     {
1638       c = *src++;
1639       if (c == '\n')
1640         {
1641           if (st->eol.just_saw_cr)
1642             return EOL_CRLF;
1643           else if (st->eol.seen_anything)
1644             return EOL_LF;
1645         }
1646       else if (st->eol.just_saw_cr)
1647         return EOL_CR;
1648       else if (c == '\r')
1649         st->eol.just_saw_cr = 1;
1650       else
1651         st->eol.just_saw_cr = 0;
1652       st->eol.seen_anything = 1;
1653     }
1654
1655   return EOL_AUTODETECT;
1656 }
1657
1658 /* Attempt to determine the encoding and EOL type of the given text.
1659    Before calling this function for the first type, you must initialize
1660    st->eol_type as appropriate and initialize st->mask to ~0.
1661
1662    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1663    not yet known.
1664
1665    st->mask holds the determined coding category mask, or ~0 if only
1666    ASCII has been seen so far.
1667
1668    Returns:
1669
1670    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1671         is present in st->mask
1672    1 == definitive answers are here for both st->eol_type and st->mask
1673 */
1674
1675 static int
1676 detect_coding_type (struct detection_state *st, const Extbyte *src,
1677                     unsigned int n, int just_do_eol)
1678 {
1679   int c;
1680
1681   if (st->eol_type == EOL_AUTODETECT)
1682     st->eol_type = detect_eol_type (st, src, n);
1683
1684   if (just_do_eol)
1685     return st->eol_type != EOL_AUTODETECT;
1686
1687   if (!st->seen_non_ascii)
1688     {
1689       for (; n; n--, src++)
1690         {
1691           c = *src;
1692           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1693             {
1694               st->seen_non_ascii = 1;
1695 #ifdef MULE
1696               st->shift_jis.mask = ~0;
1697               st->big5.mask = ~0;
1698               st->ucs4.mask = ~0;
1699               st->utf8.mask = ~0;
1700               st->iso2022.mask = ~0;
1701 #endif
1702               break;
1703             }
1704         }
1705     }
1706
1707   if (!n)
1708     return 0;
1709 #ifdef MULE
1710   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1711     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1712   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1713     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1714   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1715     st->big5.mask = detect_coding_big5 (st, src, n);
1716   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1717     st->utf8.mask = detect_coding_utf8 (st, src, n);
1718   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1719     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1720
1721   st->mask
1722     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1723     | st->utf8.mask | st->ucs4.mask;
1724 #endif
1725   {
1726     int retval = mask_has_at_most_one_bit_p (st->mask);
1727     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1728     return retval && st->eol_type != EOL_AUTODETECT;
1729   }
1730 }
1731
1732 static Lisp_Object
1733 coding_system_from_mask (int mask)
1734 {
1735   if (mask == ~0)
1736     {
1737       /* If the file was entirely or basically ASCII, use the
1738          default value of `buffer-file-coding-system'. */
1739       Lisp_Object retval =
1740         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1741       if (!NILP (retval))
1742         {
1743           retval = Ffind_coding_system (retval);
1744           if (NILP (retval))
1745             {
1746               warn_when_safe
1747                 (Qbad_variable, Qwarning,
1748                  "Invalid `default-buffer-file-coding-system', set to nil");
1749               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1750             }
1751         }
1752       if (NILP (retval))
1753         retval = Fget_coding_system (Qraw_text);
1754       return retval;
1755     }
1756   else
1757     {
1758       int i;
1759       int cat = -1;
1760 #ifdef MULE
1761       mask = postprocess_iso2022_mask (mask);
1762 #endif
1763       /* Look through the coding categories by priority and find
1764          the first one that is allowed. */
1765       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1766         {
1767           cat = fcd->coding_category_by_priority[i];
1768           if ((mask & (1 << cat)) &&
1769               !NILP (fcd->coding_category_system[cat]))
1770             break;
1771         }
1772       if (cat >= 0)
1773         return fcd->coding_category_system[cat];
1774       else
1775         return Fget_coding_system (Qraw_text);
1776     }
1777 }
1778
1779 /* Given a seekable read stream and potential coding system and EOL type
1780    as specified, do any autodetection that is called for.  If the
1781    coding system and/or EOL type are not `autodetect', they will be left
1782    alone; but this function will never return an autodetect coding system
1783    or EOL type.
1784
1785    This function does not automatically fetch subsidiary coding systems;
1786    that should be unnecessary with the explicit eol-type argument. */
1787
1788 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1789
1790 void
1791 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1792                               eol_type_t *eol_type_in_out)
1793 {
1794   struct detection_state decst;
1795
1796   if (*eol_type_in_out == EOL_AUTODETECT)
1797     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1798
1799   xzero (decst);
1800   decst.eol_type = *eol_type_in_out;
1801   decst.mask = ~0;
1802
1803   /* If autodetection is called for, do it now. */
1804   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1805       || *eol_type_in_out == EOL_AUTODETECT)
1806     {
1807       Extbyte buf[4096];
1808       Lisp_Object coding_system = Qnil;
1809       Extbyte *p;
1810       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1811       Extbyte *scan_end;
1812
1813       /* Look for initial "-*-"; mode line prefix */
1814       for (p = buf,
1815              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1816            p <= scan_end
1817              && *p != '\n'
1818              && *p != '\r';
1819            p++)
1820         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1821           {
1822             Extbyte *local_vars_beg = p + 3;
1823             /* Look for final "-*-"; mode line suffix */
1824             for (p = local_vars_beg,
1825                    scan_end = buf + nread - LENGTH ("-*-");
1826                  p <= scan_end
1827                    && *p != '\n'
1828                    && *p != '\r';
1829                  p++)
1830               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831                 {
1832                   Extbyte *suffix = p;
1833                   /* Look for "coding:" */
1834                   for (p = local_vars_beg,
1835                          scan_end = suffix - LENGTH ("coding:?");
1836                        p <= scan_end;
1837                        p++)
1838                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839                         && (p == local_vars_beg
1840                             || (*(p-1) == ' '  ||
1841                                 *(p-1) == '\t' ||
1842                                 *(p-1) == ';')))
1843                       {
1844                         Extbyte save;
1845                         int n;
1846                         p += LENGTH ("coding:");
1847                         while (*p == ' ' || *p == '\t') p++;
1848
1849                         /* Get coding system name */
1850                         save = *suffix; *suffix = '\0';
1851                         /* Characters valid in a MIME charset name (rfc 1521),
1852                            and in a Lisp symbol name. */
1853                         n = strspn ( (char *) p,
1854                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855                                     "abcdefghijklmnopqrstuvwxyz"
1856                                     "0123456789"
1857                                     "!$%&*+-.^_{|}~");
1858                         *suffix = save;
1859                         if (n > 0)
1860                           {
1861                             save = p[n]; p[n] = '\0';
1862                             coding_system =
1863                               Ffind_coding_system (intern ((char *) p));
1864                             p[n] = save;
1865                           }
1866                         break;
1867                       }
1868                   break;
1869                 }
1870             break;
1871           }
1872
1873       if (NILP (coding_system))
1874         do
1875           {
1876             if (detect_coding_type (&decst, buf, nread,
1877                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1878                                     != CODESYS_AUTODETECT))
1879               break;
1880             nread = Lstream_read (stream, buf, sizeof (buf));
1881             if (nread == 0)
1882               break;
1883           }
1884         while (1);
1885
1886       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1887                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1888         do
1889           {
1890             if (detect_coding_type (&decst, buf, nread, 1))
1891               break;
1892             nread = Lstream_read (stream, buf, sizeof (buf));
1893             if (!nread)
1894               break;
1895           }
1896         while (1);
1897
1898       *eol_type_in_out = decst.eol_type;
1899       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1900         {
1901           if (NILP (coding_system))
1902             *codesys_in_out = coding_system_from_mask (decst.mask);
1903           else
1904             *codesys_in_out = coding_system;
1905         }
1906     }
1907
1908   /* If we absolutely can't determine the EOL type, just assume LF. */
1909   if (*eol_type_in_out == EOL_AUTODETECT)
1910     *eol_type_in_out = EOL_LF;
1911
1912   Lstream_rewind (stream);
1913 }
1914
1915 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1916 Detect coding system of the text in the region between START and END.
1917 Returned a list of possible coding systems ordered by priority.
1918 If only ASCII characters are found, it returns 'undecided or one of
1919 its subsidiary coding systems according to a detected end-of-line
1920 type.  Optional arg BUFFER defaults to the current buffer.
1921 */
1922        (start, end, buffer))
1923 {
1924   Lisp_Object val = Qnil;
1925   struct buffer *buf = decode_buffer (buffer, 0);
1926   Bufpos b, e;
1927   Lisp_Object instream, lb_instream;
1928   Lstream *istr, *lb_istr;
1929   struct detection_state decst;
1930   struct gcpro gcpro1, gcpro2;
1931
1932   get_buffer_range_char (buf, start, end, &b, &e, 0);
1933   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1934   lb_istr = XLSTREAM (lb_instream);
1935   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1936   istr = XLSTREAM (instream);
1937   GCPRO2 (instream, lb_instream);
1938   xzero (decst);
1939   decst.eol_type = EOL_AUTODETECT;
1940   decst.mask = ~0;
1941   while (1)
1942     {
1943       unsigned char random_buffer[4096];
1944       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1945
1946       if (!nread)
1947         break;
1948       if (detect_coding_type (&decst, random_buffer, nread, 0))
1949         break;
1950     }
1951
1952   if (decst.mask == ~0)
1953     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1954                                     decst.eol_type);
1955   else
1956     {
1957       int i;
1958
1959       val = Qnil;
1960 #ifdef MULE
1961       decst.mask = postprocess_iso2022_mask (decst.mask);
1962 #endif
1963       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1964         {
1965           int sys = fcd->coding_category_by_priority[i];
1966           if (decst.mask & (1 << sys))
1967             {
1968               Lisp_Object codesys = fcd->coding_category_system[sys];
1969               if (!NILP (codesys))
1970                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1971               val = Fcons (codesys, val);
1972             }
1973         }
1974     }
1975   Lstream_close (istr);
1976   UNGCPRO;
1977   Lstream_delete (istr);
1978   Lstream_delete (lb_istr);
1979   return val;
1980 }
1981
1982 \f
1983 /************************************************************************/
1984 /*           Converting to internal Mule format ("decoding")            */
1985 /************************************************************************/
1986
1987 /* A decoding stream is a stream used for decoding text (i.e.
1988    converting from some external format to internal format).
1989    The decoding-stream object keeps track of the actual coding
1990    stream, the stream that is at the other end, and data that
1991    needs to be persistent across the lifetime of the stream. */
1992
1993 /* Handle the EOL stuff related to just-read-in character C.
1994    EOL_TYPE is the EOL type of the coding stream.
1995    FLAGS is the current value of FLAGS in the coding stream, and may
1996    be modified by this macro.  (The macro only looks at the
1997    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1998    bytes are to be written.  You need to also define a local goto
1999    label "label_continue_loop" that is at the end of the main
2000    character-reading loop.
2001
2002    If C is a CR character, then this macro handles it entirely and
2003    jumps to label_continue_loop.  Otherwise, this macro does not add
2004    anything to DST, and continues normally.  You should continue
2005    processing C normally after this macro. */
2006
2007 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2008 do {                                                            \
2009   if (c == '\r')                                                \
2010     {                                                           \
2011       if (eol_type == EOL_CR)                                   \
2012         Dynarr_add (dst, '\n');                                 \
2013       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2014         Dynarr_add (dst, c);                                    \
2015       else                                                      \
2016         flags |= CODING_STATE_CR;                               \
2017       goto label_continue_loop;                                 \
2018     }                                                           \
2019   else if (flags & CODING_STATE_CR)                             \
2020     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2021       if (c != '\n')                                            \
2022         Dynarr_add (dst, '\r');                                 \
2023       flags &= ~CODING_STATE_CR;                                \
2024     }                                                           \
2025 } while (0)
2026
2027 /* C should be a binary character in the range 0 - 255; convert
2028    to internal format and add to Dynarr DST. */
2029
2030 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2031 do {                                            \
2032   if (BYTE_ASCII_P (c))                         \
2033     Dynarr_add (dst, c);                        \
2034   else if (BYTE_C1_P (c))                       \
2035     {                                           \
2036       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2037       Dynarr_add (dst, c + 0x20);               \
2038     }                                           \
2039   else                                          \
2040     {                                           \
2041       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2042       Dynarr_add (dst, c);                      \
2043     }                                           \
2044 } while (0)
2045
2046 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2047 do {                                    \
2048   if (ch)                               \
2049     {                                   \
2050       DECODE_ADD_BINARY_CHAR (ch, dst); \
2051       ch = 0;                           \
2052     }                                   \
2053 } while (0)
2054
2055 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2056 do {                                    \
2057   if (flags & CODING_STATE_END)         \
2058     {                                   \
2059       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2060       if (flags & CODING_STATE_CR)      \
2061         Dynarr_add (dst, '\r');         \
2062     }                                   \
2063 } while (0)
2064
2065 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2066
2067 struct decoding_stream
2068 {
2069   /* Coding system that governs the conversion. */
2070   Lisp_Coding_System *codesys;
2071
2072   /* Stream that we read the encoded data from or
2073      write the decoded data to. */
2074   Lstream *other_end;
2075
2076   /* If we are reading, then we can return only a fixed amount of
2077      data, so if the conversion resulted in too much data, we store it
2078      here for retrieval the next time around. */
2079   unsigned_char_dynarr *runoff;
2080
2081   /* FLAGS holds flags indicating the current state of the decoding.
2082      Some of these flags are dependent on the coding system. */
2083   unsigned int flags;
2084
2085   /* CH holds a partially built-up character.  Since we only deal
2086      with one- and two-byte characters at the moment, we only use
2087      this to store the first byte of a two-byte character. */
2088   unsigned int ch;
2089
2090   /* EOL_TYPE specifies the type of end-of-line conversion that
2091      currently applies.  We need to keep this separate from the
2092      EOL type stored in CODESYS because the latter might indicate
2093      automatic EOL-type detection while the former will always
2094      indicate a particular EOL type. */
2095   eol_type_t eol_type;
2096 #ifdef MULE
2097   /* Additional ISO2022 information.  We define the structure above
2098      because it's also needed by the detection routines. */
2099   struct iso2022_decoder iso2022;
2100
2101   /* Additional information (the state of the running CCL program)
2102      used by the CCL decoder. */
2103   struct ccl_program ccl;
2104
2105   /* counter for UTF-8 or UCS-4 */
2106   unsigned char counter;
2107 #endif
2108   struct detection_state decst;
2109 };
2110
2111 static ssize_t decoding_reader (Lstream *stream,
2112                                 unsigned char *data, size_t size);
2113 static ssize_t decoding_writer (Lstream *stream,
2114                                 const unsigned char *data, size_t size);
2115 static int decoding_rewinder   (Lstream *stream);
2116 static int decoding_seekable_p (Lstream *stream);
2117 static int decoding_flusher    (Lstream *stream);
2118 static int decoding_closer     (Lstream *stream);
2119
2120 static Lisp_Object decoding_marker (Lisp_Object stream);
2121
2122 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2123                                sizeof (struct decoding_stream));
2124
2125 static Lisp_Object
2126 decoding_marker (Lisp_Object stream)
2127 {
2128   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2129   Lisp_Object str_obj;
2130
2131   /* We do not need to mark the coding systems or charsets stored
2132      within the stream because they are stored in a global list
2133      and automatically marked. */
2134
2135   XSETLSTREAM (str_obj, str);
2136   mark_object (str_obj);
2137   if (str->imp->marker)
2138     return (str->imp->marker) (str_obj);
2139   else
2140     return Qnil;
2141 }
2142
2143 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2144    so we read data from the other end, decode it, and store it into DATA. */
2145
2146 static ssize_t
2147 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2148 {
2149   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2150   unsigned char *orig_data = data;
2151   ssize_t read_size;
2152   int error_occurred = 0;
2153
2154   /* We need to interface to mule_decode(), which expects to take some
2155      amount of data and store the result into a Dynarr.  We have
2156      mule_decode() store into str->runoff, and take data from there
2157      as necessary. */
2158
2159   /* We loop until we have enough data, reading chunks from the other
2160      end and decoding it. */
2161   while (1)
2162     {
2163       /* Take data from the runoff if we can.  Make sure to take at
2164          most SIZE bytes, and delete the data from the runoff. */
2165       if (Dynarr_length (str->runoff) > 0)
2166         {
2167           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2168           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2169           Dynarr_delete_many (str->runoff, 0, chunk);
2170           data += chunk;
2171           size -= chunk;
2172         }
2173
2174       if (size == 0)
2175         break; /* No more room for data */
2176
2177       if (str->flags & CODING_STATE_END)
2178         /* This means that on the previous iteration, we hit the EOF on
2179            the other end.  We loop once more so that mule_decode() can
2180            output any final stuff it may be holding, or any "go back
2181            to a sane state" escape sequences. (This latter makes sense
2182            during encoding.) */
2183         break;
2184
2185       /* Exhausted the runoff, so get some more.  DATA has at least
2186          SIZE bytes left of storage in it, so it's OK to read directly
2187          into it.  (We'll be overwriting above, after we've decoded it
2188          into the runoff.) */
2189       read_size = Lstream_read (str->other_end, data, size);
2190       if (read_size < 0)
2191         {
2192           error_occurred = 1;
2193           break;
2194         }
2195       if (read_size == 0)
2196         /* There might be some more end data produced in the translation.
2197            See the comment above. */
2198         str->flags |= CODING_STATE_END;
2199       mule_decode (stream, data, str->runoff, read_size);
2200     }
2201
2202   if (data - orig_data == 0)
2203     return error_occurred ? -1 : 0;
2204   else
2205     return data - orig_data;
2206 }
2207
2208 static ssize_t
2209 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2210 {
2211   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2212   ssize_t retval;
2213
2214   /* Decode all our data into the runoff, and then attempt to write
2215      it all out to the other end.  Remove whatever chunk we succeeded
2216      in writing. */
2217   mule_decode (stream, data, str->runoff, size);
2218   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2219                           Dynarr_length (str->runoff));
2220   if (retval > 0)
2221     Dynarr_delete_many (str->runoff, 0, retval);
2222   /* Do NOT return retval.  The return value indicates how much
2223      of the incoming data was written, not how many bytes were
2224      written. */
2225   return size;
2226 }
2227
2228 static void
2229 reset_decoding_stream (struct decoding_stream *str)
2230 {
2231 #ifdef MULE
2232   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2233     {
2234       Lisp_Object coding_system;
2235       XSETCODING_SYSTEM (coding_system, str->codesys);
2236       reset_iso2022 (coding_system, &str->iso2022);
2237     }
2238   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2239     {
2240       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2241     }
2242   str->counter = 0;
2243 #endif /* MULE */
2244   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2245       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2246     {
2247       xzero (str->decst);
2248       str->decst.eol_type = EOL_AUTODETECT;
2249       str->decst.mask = ~0;
2250     }
2251   str->flags = str->ch = 0;
2252 }
2253
2254 static int
2255 decoding_rewinder (Lstream *stream)
2256 {
2257   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2258   reset_decoding_stream (str);
2259   Dynarr_reset (str->runoff);
2260   return Lstream_rewind (str->other_end);
2261 }
2262
2263 static int
2264 decoding_seekable_p (Lstream *stream)
2265 {
2266   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2267   return Lstream_seekable_p (str->other_end);
2268 }
2269
2270 static int
2271 decoding_flusher (Lstream *stream)
2272 {
2273   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2274   return Lstream_flush (str->other_end);
2275 }
2276
2277 static int
2278 decoding_closer (Lstream *stream)
2279 {
2280   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2281   if (stream->flags & LSTREAM_FL_WRITE)
2282     {
2283       str->flags |= CODING_STATE_END;
2284       decoding_writer (stream, 0, 0);
2285     }
2286   Dynarr_free (str->runoff);
2287 #ifdef MULE
2288 #ifdef ENABLE_COMPOSITE_CHARS
2289   if (str->iso2022.composite_chars)
2290     Dynarr_free (str->iso2022.composite_chars);
2291 #endif
2292 #endif
2293   return Lstream_close (str->other_end);
2294 }
2295
2296 Lisp_Object
2297 decoding_stream_coding_system (Lstream *stream)
2298 {
2299   Lisp_Object coding_system;
2300   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2301
2302   XSETCODING_SYSTEM (coding_system, str->codesys);
2303   return subsidiary_coding_system (coding_system, str->eol_type);
2304 }
2305
2306 void
2307 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2308 {
2309   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2310   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2311   str->codesys = cs;
2312   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2313     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2314   reset_decoding_stream (str);
2315 }
2316
2317 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2318    stream for writing, no automatic code detection will be performed.
2319    The reason for this is that automatic code detection requires a
2320    seekable input.  Things will also fail if you open a decoding
2321    stream for reading using a non-fully-specified coding system and
2322    a non-seekable input stream. */
2323
2324 static Lisp_Object
2325 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2326                         const char *mode)
2327 {
2328   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2329   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2330   Lisp_Object obj;
2331
2332   xzero (*str);
2333   str->other_end = stream;
2334   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2335   str->eol_type = EOL_AUTODETECT;
2336   if (!strcmp (mode, "r")
2337       && Lstream_seekable_p (stream))
2338     /* We can determine the coding system now. */
2339     determine_real_coding_system (stream, &codesys, &str->eol_type);
2340   set_decoding_stream_coding_system (lstr, codesys);
2341   str->decst.eol_type = str->eol_type;
2342   str->decst.mask = ~0;
2343   XSETLSTREAM (obj, lstr);
2344   return obj;
2345 }
2346
2347 Lisp_Object
2348 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2349 {
2350   return make_decoding_stream_1 (stream, codesys, "r");
2351 }
2352
2353 Lisp_Object
2354 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2355 {
2356   return make_decoding_stream_1 (stream, codesys, "w");
2357 }
2358
2359 /* Note: the decode_coding_* functions all take the same
2360    arguments as mule_decode(), which is to say some SRC data of
2361    size N, which is to be stored into dynamic array DST.
2362    DECODING is the stream within which the decoding is
2363    taking place, but no data is actually read from or
2364    written to that stream; that is handled in decoding_reader()
2365    or decoding_writer().  This allows the same functions to
2366    be used for both reading and writing. */
2367
2368 static void
2369 mule_decode (Lstream *decoding, const unsigned char *src,
2370              unsigned_char_dynarr *dst, unsigned int n)
2371 {
2372   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2373
2374   /* If necessary, do encoding-detection now.  We do this when
2375      we're a writing stream or a non-seekable reading stream,
2376      meaning that we can't just process the whole input,
2377      rewind, and start over. */
2378
2379   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2380       str->eol_type == EOL_AUTODETECT)
2381     {
2382       Lisp_Object codesys;
2383
2384       XSETCODING_SYSTEM (codesys, str->codesys);
2385       detect_coding_type (&str->decst, src, n,
2386                           CODING_SYSTEM_TYPE (str->codesys) !=
2387                           CODESYS_AUTODETECT);
2388       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2389           str->decst.mask != ~0)
2390         /* #### This is cheesy.  What we really ought to do is
2391            buffer up a certain amount of data so as to get a
2392            less random result. */
2393         codesys = coding_system_from_mask (str->decst.mask);
2394       str->eol_type = str->decst.eol_type;
2395       if (XCODING_SYSTEM (codesys) != str->codesys)
2396         {
2397           /* Preserve the CODING_STATE_END flag in case it was set.
2398              If we erase it, bad things might happen. */
2399           int was_end = str->flags & CODING_STATE_END;
2400           set_decoding_stream_coding_system (decoding, codesys);
2401           if (was_end)
2402             str->flags |= CODING_STATE_END;
2403         }
2404     }
2405
2406   switch (CODING_SYSTEM_TYPE (str->codesys))
2407     {
2408 #ifdef DEBUG_XEMACS
2409     case CODESYS_INTERNAL:
2410       Dynarr_add_many (dst, src, n);
2411       break;
2412 #endif
2413     case CODESYS_AUTODETECT:
2414       /* If we got this far and still haven't decided on the coding
2415          system, then do no conversion. */
2416     case CODESYS_NO_CONVERSION:
2417       decode_coding_no_conversion (decoding, src, dst, n);
2418       break;
2419 #ifdef MULE
2420     case CODESYS_SHIFT_JIS:
2421       decode_coding_sjis (decoding, src, dst, n);
2422       break;
2423     case CODESYS_BIG5:
2424       decode_coding_big5 (decoding, src, dst, n);
2425       break;
2426     case CODESYS_UCS4:
2427       decode_coding_ucs4 (decoding, src, dst, n);
2428       break;
2429     case CODESYS_UTF8:
2430       decode_coding_utf8 (decoding, src, dst, n);
2431       break;
2432     case CODESYS_CCL:
2433       str->ccl.last_block = str->flags & CODING_STATE_END;
2434       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2435       break;
2436     case CODESYS_ISO2022:
2437       decode_coding_iso2022 (decoding, src, dst, n);
2438       break;
2439 #endif /* MULE */
2440     default:
2441       abort ();
2442     }
2443 }
2444
2445 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2446 Decode the text between START and END which is encoded in CODING-SYSTEM.
2447 This is useful if you've read in encoded text from a file without decoding
2448 it (e.g. you read in a JIS-formatted file but used the `binary' or
2449 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2450 Return length of decoded text.
2451 BUFFER defaults to the current buffer if unspecified.
2452 */
2453        (start, end, coding_system, buffer))
2454 {
2455   Bufpos b, e;
2456   struct buffer *buf = decode_buffer (buffer, 0);
2457   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2458   Lstream *istr, *ostr;
2459   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2460
2461   get_buffer_range_char (buf, start, end, &b, &e, 0);
2462
2463   barf_if_buffer_read_only (buf, b, e);
2464
2465   coding_system = Fget_coding_system (coding_system);
2466   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2467   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2468   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2469                                               coding_system);
2470   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2471                                            Fget_coding_system (Qbinary));
2472   istr = XLSTREAM (instream);
2473   ostr = XLSTREAM (outstream);
2474   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2475
2476   /* The chain of streams looks like this:
2477
2478      [BUFFER] <----- send through
2479                      ------> [ENCODE AS BINARY]
2480                              ------> [DECODE AS SPECIFIED]
2481                                      ------> [BUFFER]
2482    */
2483
2484   while (1)
2485     {
2486       char tempbuf[1024]; /* some random amount */
2487       Bufpos newpos, even_newer_pos;
2488       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2489       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2490
2491       if (!size_in_bytes)
2492         break;
2493       newpos = lisp_buffer_stream_startpos (istr);
2494       Lstream_write (ostr, tempbuf, size_in_bytes);
2495       even_newer_pos = lisp_buffer_stream_startpos (istr);
2496       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2497                            even_newer_pos, 0);
2498     }
2499   Lstream_close (istr);
2500   Lstream_close (ostr);
2501   UNGCPRO;
2502   Lstream_delete (istr);
2503   Lstream_delete (ostr);
2504   Lstream_delete (XLSTREAM (de_outstream));
2505   Lstream_delete (XLSTREAM (lb_outstream));
2506   return Qnil;
2507 }
2508
2509 \f
2510 /************************************************************************/
2511 /*           Converting to an external encoding ("encoding")            */
2512 /************************************************************************/
2513
2514 /* An encoding stream is an output stream.  When you create the
2515    stream, you specify the coding system that governs the encoding
2516    and another stream that the resulting encoded data is to be
2517    sent to, and then start sending data to it. */
2518
2519 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2520
2521 struct encoding_stream
2522 {
2523   /* Coding system that governs the conversion. */
2524   Lisp_Coding_System *codesys;
2525
2526   /* Stream that we read the encoded data from or
2527      write the decoded data to. */
2528   Lstream *other_end;
2529
2530   /* If we are reading, then we can return only a fixed amount of
2531      data, so if the conversion resulted in too much data, we store it
2532      here for retrieval the next time around. */
2533   unsigned_char_dynarr *runoff;
2534
2535   /* FLAGS holds flags indicating the current state of the encoding.
2536      Some of these flags are dependent on the coding system. */
2537   unsigned int flags;
2538
2539   /* CH holds a partially built-up character.  Since we only deal
2540      with one- and two-byte characters at the moment, we only use
2541      this to store the first byte of a two-byte character. */
2542   unsigned int ch;
2543 #ifdef MULE
2544   /* Additional information used by the ISO2022 encoder. */
2545   struct
2546     {
2547       /* CHARSET holds the character sets currently assigned to the G0
2548          through G3 registers.  It is initialized from the array
2549          INITIAL_CHARSET in CODESYS. */
2550       Lisp_Object charset[4];
2551
2552       /* Which registers are currently invoked into the left (GL) and
2553          right (GR) halves of the 8-bit encoding space? */
2554       int register_left, register_right;
2555
2556       /* Whether we need to explicitly designate the charset in the
2557          G? register before using it.  It is initialized from the
2558          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2559       unsigned char force_charset_on_output[4];
2560
2561       /* Other state variables that need to be preserved across
2562          invocations. */
2563       Lisp_Object current_charset;
2564       int current_half;
2565       int current_char_boundary;
2566     } iso2022;
2567
2568   /* Additional information (the state of the running CCL program)
2569      used by the CCL encoder. */
2570   struct ccl_program ccl;
2571 #endif /* MULE */
2572 };
2573
2574 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2575 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2576                                 size_t size);
2577 static int encoding_rewinder   (Lstream *stream);
2578 static int encoding_seekable_p (Lstream *stream);
2579 static int encoding_flusher    (Lstream *stream);
2580 static int encoding_closer     (Lstream *stream);
2581
2582 static Lisp_Object encoding_marker (Lisp_Object stream);
2583
2584 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2585                                sizeof (struct encoding_stream));
2586
2587 static Lisp_Object
2588 encoding_marker (Lisp_Object stream)
2589 {
2590   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2591   Lisp_Object str_obj;
2592
2593   /* We do not need to mark the coding systems or charsets stored
2594      within the stream because they are stored in a global list
2595      and automatically marked. */
2596
2597   XSETLSTREAM (str_obj, str);
2598   mark_object (str_obj);
2599   if (str->imp->marker)
2600     return (str->imp->marker) (str_obj);
2601   else
2602     return Qnil;
2603 }
2604
2605 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2606    so we read data from the other end, encode it, and store it into DATA. */
2607
2608 static ssize_t
2609 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2610 {
2611   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2612   unsigned char *orig_data = data;
2613   ssize_t read_size;
2614   int error_occurred = 0;
2615
2616   /* We need to interface to mule_encode(), which expects to take some
2617      amount of data and store the result into a Dynarr.  We have
2618      mule_encode() store into str->runoff, and take data from there
2619      as necessary. */
2620
2621   /* We loop until we have enough data, reading chunks from the other
2622      end and encoding it. */
2623   while (1)
2624     {
2625       /* Take data from the runoff if we can.  Make sure to take at
2626          most SIZE bytes, and delete the data from the runoff. */
2627       if (Dynarr_length (str->runoff) > 0)
2628         {
2629           int chunk = min ((int) size, Dynarr_length (str->runoff));
2630           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2631           Dynarr_delete_many (str->runoff, 0, chunk);
2632           data += chunk;
2633           size -= chunk;
2634         }
2635
2636       if (size == 0)
2637         break; /* No more room for data */
2638
2639       if (str->flags & CODING_STATE_END)
2640         /* This means that on the previous iteration, we hit the EOF on
2641            the other end.  We loop once more so that mule_encode() can
2642            output any final stuff it may be holding, or any "go back
2643            to a sane state" escape sequences. (This latter makes sense
2644            during encoding.) */
2645         break;
2646
2647       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2648          left of storage in it, so it's OK to read directly into it.
2649          (We'll be overwriting above, after we've encoded it into the
2650          runoff.) */
2651       read_size = Lstream_read (str->other_end, data, size);
2652       if (read_size < 0)
2653         {
2654           error_occurred = 1;
2655           break;
2656         }
2657       if (read_size == 0)
2658         /* There might be some more end data produced in the translation.
2659            See the comment above. */
2660         str->flags |= CODING_STATE_END;
2661       mule_encode (stream, data, str->runoff, read_size);
2662     }
2663
2664   if (data == orig_data)
2665     return error_occurred ? -1 : 0;
2666   else
2667     return data - orig_data;
2668 }
2669
2670 static ssize_t
2671 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2672 {
2673   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2674   ssize_t retval;
2675
2676   /* Encode all our data into the runoff, and then attempt to write
2677      it all out to the other end.  Remove whatever chunk we succeeded
2678      in writing. */
2679   mule_encode (stream, data, str->runoff, size);
2680   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2681                           Dynarr_length (str->runoff));
2682   if (retval > 0)
2683     Dynarr_delete_many (str->runoff, 0, retval);
2684   /* Do NOT return retval.  The return value indicates how much
2685      of the incoming data was written, not how many bytes were
2686      written. */
2687   return size;
2688 }
2689
2690 static void
2691 reset_encoding_stream (struct encoding_stream *str)
2692 {
2693 #ifdef MULE
2694   switch (CODING_SYSTEM_TYPE (str->codesys))
2695     {
2696     case CODESYS_ISO2022:
2697       {
2698         int i;
2699
2700         for (i = 0; i < 4; i++)
2701           {
2702             str->iso2022.charset[i] =
2703               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2704             str->iso2022.force_charset_on_output[i] =
2705               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2706           }
2707         str->iso2022.register_left = 0;
2708         str->iso2022.register_right = 1;
2709         str->iso2022.current_charset = Qnil;
2710         str->iso2022.current_half = 0;
2711         str->iso2022.current_char_boundary = 1;
2712         break;
2713       }
2714     case CODESYS_CCL:
2715       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2716       break;
2717     default:
2718       break;
2719     }
2720 #endif /* MULE */
2721
2722   str->flags = str->ch = 0;
2723 }
2724
2725 static int
2726 encoding_rewinder (Lstream *stream)
2727 {
2728   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2729   reset_encoding_stream (str);
2730   Dynarr_reset (str->runoff);
2731   return Lstream_rewind (str->other_end);
2732 }
2733
2734 static int
2735 encoding_seekable_p (Lstream *stream)
2736 {
2737   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2738   return Lstream_seekable_p (str->other_end);
2739 }
2740
2741 static int
2742 encoding_flusher (Lstream *stream)
2743 {
2744   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2745   return Lstream_flush (str->other_end);
2746 }
2747
2748 static int
2749 encoding_closer (Lstream *stream)
2750 {
2751   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2752   if (stream->flags & LSTREAM_FL_WRITE)
2753     {
2754       str->flags |= CODING_STATE_END;
2755       encoding_writer (stream, 0, 0);
2756     }
2757   Dynarr_free (str->runoff);
2758   return Lstream_close (str->other_end);
2759 }
2760
2761 Lisp_Object
2762 encoding_stream_coding_system (Lstream *stream)
2763 {
2764   Lisp_Object coding_system;
2765   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2766
2767   XSETCODING_SYSTEM (coding_system, str->codesys);
2768   return coding_system;
2769 }
2770
2771 void
2772 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2773 {
2774   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2775   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2776   str->codesys = cs;
2777   reset_encoding_stream (str);
2778 }
2779
2780 static Lisp_Object
2781 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2782                         const char *mode)
2783 {
2784   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2785   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2786   Lisp_Object obj;
2787
2788   xzero (*str);
2789   str->runoff = Dynarr_new (unsigned_char);
2790   str->other_end = stream;
2791   set_encoding_stream_coding_system (lstr, codesys);
2792   XSETLSTREAM (obj, lstr);
2793   return obj;
2794 }
2795
2796 Lisp_Object
2797 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2798 {
2799   return make_encoding_stream_1 (stream, codesys, "r");
2800 }
2801
2802 Lisp_Object
2803 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2804 {
2805   return make_encoding_stream_1 (stream, codesys, "w");
2806 }
2807
2808 /* Convert N bytes of internally-formatted data stored in SRC to an
2809    external format, according to the encoding stream ENCODING.
2810    Store the encoded data into DST. */
2811
2812 static void
2813 mule_encode (Lstream *encoding, const unsigned char *src,
2814              unsigned_char_dynarr *dst, unsigned int n)
2815 {
2816   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2817
2818   switch (CODING_SYSTEM_TYPE (str->codesys))
2819     {
2820 #ifdef DEBUG_XEMACS
2821     case CODESYS_INTERNAL:
2822       Dynarr_add_many (dst, src, n);
2823       break;
2824 #endif
2825     case CODESYS_AUTODETECT:
2826       /* If we got this far and still haven't decided on the coding
2827          system, then do no conversion. */
2828     case CODESYS_NO_CONVERSION:
2829       encode_coding_no_conversion (encoding, src, dst, n);
2830       break;
2831 #ifdef MULE
2832     case CODESYS_SHIFT_JIS:
2833       encode_coding_sjis (encoding, src, dst, n);
2834       break;
2835     case CODESYS_BIG5:
2836       encode_coding_big5 (encoding, src, dst, n);
2837       break;
2838     case CODESYS_UCS4:
2839       encode_coding_ucs4 (encoding, src, dst, n);
2840       break;
2841     case CODESYS_UTF8:
2842       encode_coding_utf8 (encoding, src, dst, n);
2843       break;
2844     case CODESYS_CCL:
2845       str->ccl.last_block = str->flags & CODING_STATE_END;
2846       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2847       break;
2848     case CODESYS_ISO2022:
2849       encode_coding_iso2022 (encoding, src, dst, n);
2850       break;
2851 #endif /* MULE */
2852     default:
2853       abort ();
2854     }
2855 }
2856
2857 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2858 Encode the text between START and END using CODING-SYSTEM.
2859 This will, for example, convert Japanese characters into stuff such as
2860 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2861 text.  BUFFER defaults to the current buffer if unspecified.
2862 */
2863        (start, end, coding_system, buffer))
2864 {
2865   Bufpos b, e;
2866   struct buffer *buf = decode_buffer (buffer, 0);
2867   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2868   Lstream *istr, *ostr;
2869   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2870
2871   get_buffer_range_char (buf, start, end, &b, &e, 0);
2872
2873   barf_if_buffer_read_only (buf, b, e);
2874
2875   coding_system = Fget_coding_system (coding_system);
2876   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2877   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2878   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2879                                               Fget_coding_system (Qbinary));
2880   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2881                                            coding_system);
2882   istr = XLSTREAM (instream);
2883   ostr = XLSTREAM (outstream);
2884   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2885   /* The chain of streams looks like this:
2886
2887      [BUFFER] <----- send through
2888                      ------> [ENCODE AS SPECIFIED]
2889                              ------> [DECODE AS BINARY]
2890                                      ------> [BUFFER]
2891    */
2892   while (1)
2893     {
2894       char tempbuf[1024]; /* some random amount */
2895       Bufpos newpos, even_newer_pos;
2896       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2897       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2898
2899       if (!size_in_bytes)
2900         break;
2901       newpos = lisp_buffer_stream_startpos (istr);
2902       Lstream_write (ostr, tempbuf, size_in_bytes);
2903       even_newer_pos = lisp_buffer_stream_startpos (istr);
2904       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2905                            even_newer_pos, 0);
2906     }
2907
2908   {
2909     Charcount retlen =
2910       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2911     Lstream_close (istr);
2912     Lstream_close (ostr);
2913     UNGCPRO;
2914     Lstream_delete (istr);
2915     Lstream_delete (ostr);
2916     Lstream_delete (XLSTREAM (de_outstream));
2917     Lstream_delete (XLSTREAM (lb_outstream));
2918     return make_int (retlen);
2919   }
2920 }
2921
2922 #ifdef MULE
2923 \f
2924 /************************************************************************/
2925 /*                          Shift-JIS methods                           */
2926 /************************************************************************/
2927
2928 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2929    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2930    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2931    encoded by "position-code + 0x80".  A character of JISX0208
2932    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2933    position-codes are divided and shifted so that it fit in the range
2934    below.
2935
2936    --- CODE RANGE of Shift-JIS ---
2937    (character set)      (range)
2938    ASCII                0x00 .. 0x7F
2939    JISX0201-Kana        0xA0 .. 0xDF
2940    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2941             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2942    -------------------------------
2943
2944 */
2945
2946 /* Is this the first byte of a Shift-JIS two-byte char? */
2947
2948 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2949   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2950
2951 /* Is this the second byte of a Shift-JIS two-byte char? */
2952
2953 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2954   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2955
2956 #define BYTE_SJIS_KATAKANA_P(c) \
2957   ((c) >= 0xA1 && (c) <= 0xDF)
2958
2959 static int
2960 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
2961                     unsigned int n)
2962 {
2963   int c;
2964
2965   while (n--)
2966     {
2967       c = *src++;
2968       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2969         return 0;
2970       if (st->shift_jis.in_second_byte)
2971         {
2972           st->shift_jis.in_second_byte = 0;
2973           if (c < 0x40)
2974             return 0;
2975         }
2976       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2977         st->shift_jis.in_second_byte = 1;
2978     }
2979   return CODING_CATEGORY_SHIFT_JIS_MASK;
2980 }
2981
2982 /* Convert Shift-JIS data to internal format. */
2983
2984 static void
2985 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
2986                     unsigned_char_dynarr *dst, unsigned int n)
2987 {
2988   unsigned char c;
2989   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2990   unsigned int flags  = str->flags;
2991   unsigned int ch     = str->ch;
2992   eol_type_t eol_type = str->eol_type;
2993
2994   while (n--)
2995     {
2996       c = *src++;
2997
2998       if (ch)
2999         {
3000           /* Previous character was first byte of Shift-JIS Kanji char. */
3001           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3002             {
3003               unsigned char e1, e2;
3004
3005               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3006               DECODE_SJIS (ch, c, e1, e2);
3007               Dynarr_add (dst, e1);
3008               Dynarr_add (dst, e2);
3009             }
3010           else
3011             {
3012               DECODE_ADD_BINARY_CHAR (ch, dst);
3013               DECODE_ADD_BINARY_CHAR (c, dst);
3014             }
3015           ch = 0;
3016         }
3017       else
3018         {
3019           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3020           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3021             ch = c;
3022           else if (BYTE_SJIS_KATAKANA_P (c))
3023             {
3024               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3025               Dynarr_add (dst, c);
3026             }
3027           else
3028             DECODE_ADD_BINARY_CHAR (c, dst);
3029         }
3030     label_continue_loop:;
3031     }
3032
3033   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3034
3035   str->flags = flags;
3036   str->ch    = ch;
3037 }
3038
3039 /* Convert internally-formatted data to Shift-JIS. */
3040
3041 static void
3042 encode_coding_sjis (Lstream *encoding, const unsigned char *src,
3043                     unsigned_char_dynarr *dst, unsigned int n)
3044 {
3045   unsigned char c;
3046   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3047   unsigned int flags  = str->flags;
3048   unsigned int ch     = str->ch;
3049   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3050
3051   while (n--)
3052     {
3053       c = *src++;
3054       if (c == '\n')
3055         {
3056           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3057             Dynarr_add (dst, '\r');
3058           if (eol_type != EOL_CR)
3059             Dynarr_add (dst, '\n');
3060           ch = 0;
3061         }
3062       else if (BYTE_ASCII_P (c))
3063         {
3064           Dynarr_add (dst, c);
3065           ch = 0;
3066         }
3067       else if (BUFBYTE_LEADING_BYTE_P (c))
3068         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3069               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3070               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3071       else if (ch)
3072         {
3073           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3074             {
3075               Dynarr_add (dst, c);
3076               ch = 0;
3077             }
3078           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3079                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3080             ch = c;
3081           else
3082             {
3083               unsigned char j1, j2;
3084               ENCODE_SJIS (ch, c, j1, j2);
3085               Dynarr_add (dst, j1);
3086               Dynarr_add (dst, j2);
3087               ch = 0;
3088             }
3089         }
3090     }
3091
3092   str->flags = flags;
3093   str->ch    = ch;
3094 }
3095
3096 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3097 Decode a JISX0208 character of Shift-JIS coding-system.
3098 CODE is the character code in Shift-JIS as a cons of type bytes.
3099 Return the corresponding character.
3100 */
3101        (code))
3102 {
3103   unsigned char c1, c2, s1, s2;
3104
3105   CHECK_CONS (code);
3106   CHECK_INT (XCAR (code));
3107   CHECK_INT (XCDR (code));
3108   s1 = XINT (XCAR (code));
3109   s2 = XINT (XCDR (code));
3110   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3111       BYTE_SJIS_TWO_BYTE_2_P (s2))
3112     {
3113       DECODE_SJIS (s1, s2, c1, c2);
3114       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3115                                    c1 & 0x7F, c2 & 0x7F));
3116     }
3117   else
3118     return Qnil;
3119 }
3120
3121 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3122 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3123 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3124 */
3125        (ch))
3126 {
3127   Lisp_Object charset;
3128   int c1, c2, s1, s2;
3129
3130   CHECK_CHAR_COERCE_INT (ch);
3131   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3132   if (EQ (charset, Vcharset_japanese_jisx0208))
3133     {
3134       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3135       return Fcons (make_int (s1), make_int (s2));
3136     }
3137   else
3138     return Qnil;
3139 }
3140
3141 \f
3142 /************************************************************************/
3143 /*                            Big5 methods                              */
3144 /************************************************************************/
3145
3146 /* BIG5 is a coding system encoding two character sets: ASCII and
3147    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3148    character set and is encoded in two-byte.
3149
3150    --- CODE RANGE of BIG5 ---
3151    (character set)      (range)
3152    ASCII                0x00 .. 0x7F
3153    Big5 (1st byte)      0xA1 .. 0xFE
3154         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3155    --------------------------
3156
3157    Since the number of characters in Big5 is larger than maximum
3158    characters in Emacs' charset (96x96), it can't be handled as one
3159    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3160    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3161    contains frequently used characters and the latter contains less
3162    frequently used characters.  */
3163
3164 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3165   ((c) >= 0xA1 && (c) <= 0xFE)
3166
3167 /* Is this the second byte of a Shift-JIS two-byte char? */
3168
3169 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3170   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3171
3172 /* Number of Big5 characters which have the same code in 1st byte.  */
3173
3174 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3175
3176 /* Code conversion macros.  These are macros because they are used in
3177    inner loops during code conversion.
3178
3179    Note that temporary variables in macros introduce the classic
3180    dynamic-scoping problems with variable names.  We use capital-
3181    lettered variables in the assumption that XEmacs does not use
3182    capital letters in variables except in a very formalized way
3183    (e.g. Qstring). */
3184
3185 /* Convert Big5 code (b1, b2) into its internal string representation
3186    (lb, c1, c2). */
3187
3188 /* There is a much simpler way to split the Big5 charset into two.
3189    For the moment I'm going to leave the algorithm as-is because it
3190    claims to separate out the most-used characters into a single
3191    charset, which perhaps will lead to optimizations in various
3192    places.
3193
3194    The way the algorithm works is something like this:
3195
3196    Big5 can be viewed as a 94x157 charset, where the row is
3197    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3198    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3199    the split between low and high column numbers is apparently
3200    meaningless; ascending rows produce less and less frequent chars.
3201    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3202    the first charset, and the upper half (0xC9 .. 0xFE) to the
3203    second.  To do the conversion, we convert the character into
3204    a single number where 0 .. 156 is the first row, 157 .. 313
3205    is the second, etc.  That way, the characters are ordered by
3206    decreasing frequency.  Then we just chop the space in two
3207    and coerce the result into a 94x94 space.
3208    */
3209
3210 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3211 {                                                                       \
3212   int B1 = b1, B2 = b2;                                                 \
3213   unsigned int I                                                        \
3214     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3215                                                                         \
3216   if (B1 < 0xC9)                                                        \
3217     {                                                                   \
3218       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3219     }                                                                   \
3220   else                                                                  \
3221     {                                                                   \
3222       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3223       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3224     }                                                                   \
3225   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3226   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3227 } while (0)
3228
3229 /* Convert the internal string representation of a Big5 character
3230    (lb, c1, c2) into Big5 code (b1, b2). */
3231
3232 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3233 {                                                                       \
3234   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3235                                                                         \
3236   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3237     {                                                                   \
3238       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3239     }                                                                   \
3240   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3241   b2 = I % BIG5_SAME_ROW;                                               \
3242   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3243 } while (0)
3244
3245 static int
3246 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3247                     unsigned int n)
3248 {
3249   int c;
3250
3251   while (n--)
3252     {
3253       c = *src++;
3254       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3255           (c >= 0x80 && c <= 0xA0))
3256         return 0;
3257       if (st->big5.in_second_byte)
3258         {
3259           st->big5.in_second_byte = 0;
3260           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3261             return 0;
3262         }
3263       else if (c >= 0xA1)
3264         st->big5.in_second_byte = 1;
3265     }
3266   return CODING_CATEGORY_BIG5_MASK;
3267 }
3268
3269 /* Convert Big5 data to internal format. */
3270
3271 static void
3272 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3273                     unsigned_char_dynarr *dst, unsigned int n)
3274 {
3275   unsigned char c;
3276   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3277   unsigned int flags  = str->flags;
3278   unsigned int ch     = str->ch;
3279   eol_type_t eol_type = str->eol_type;
3280
3281   while (n--)
3282     {
3283       c = *src++;
3284       if (ch)
3285         {
3286           /* Previous character was first byte of Big5 char. */
3287           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3288             {
3289               unsigned char b1, b2, b3;
3290               DECODE_BIG5 (ch, c, b1, b2, b3);
3291               Dynarr_add (dst, b1);
3292               Dynarr_add (dst, b2);
3293               Dynarr_add (dst, b3);
3294             }
3295           else
3296             {
3297               DECODE_ADD_BINARY_CHAR (ch, dst);
3298               DECODE_ADD_BINARY_CHAR (c, dst);
3299             }
3300           ch = 0;
3301         }
3302       else
3303         {
3304           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3305           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3306             ch = c;
3307           else
3308             DECODE_ADD_BINARY_CHAR (c, dst);
3309         }
3310     label_continue_loop:;
3311     }
3312
3313   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3314
3315   str->flags = flags;
3316   str->ch    = ch;
3317 }
3318
3319 /* Convert internally-formatted data to Big5. */
3320
3321 static void
3322 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3323                     unsigned_char_dynarr *dst, unsigned int n)
3324 {
3325   unsigned char c;
3326   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3327   unsigned int flags  = str->flags;
3328   unsigned int ch     = str->ch;
3329   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3330
3331   while (n--)
3332     {
3333       c = *src++;
3334       if (c == '\n')
3335         {
3336           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3337             Dynarr_add (dst, '\r');
3338           if (eol_type != EOL_CR)
3339             Dynarr_add (dst, '\n');
3340         }
3341       else if (BYTE_ASCII_P (c))
3342         {
3343           /* ASCII. */
3344           Dynarr_add (dst, c);
3345         }
3346       else if (BUFBYTE_LEADING_BYTE_P (c))
3347         {
3348           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3349               c == LEADING_BYTE_CHINESE_BIG5_2)
3350             {
3351               /* A recognized leading byte. */
3352               ch = c;
3353               continue; /* not done with this character. */
3354             }
3355           /* otherwise just ignore this character. */
3356         }
3357       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3358                ch == LEADING_BYTE_CHINESE_BIG5_2)
3359         {
3360           /* Previous char was a recognized leading byte. */
3361           ch = (ch << 8) | c;
3362           continue; /* not done with this character. */
3363         }
3364       else if (ch)
3365         {
3366           /* Encountering second byte of a Big5 character. */
3367           unsigned char b1, b2;
3368
3369           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3370           Dynarr_add (dst, b1);
3371           Dynarr_add (dst, b2);
3372         }
3373
3374       ch = 0;
3375     }
3376
3377   str->flags = flags;
3378   str->ch    = ch;
3379 }
3380
3381
3382 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3383 Decode a Big5 character CODE of BIG5 coding-system.
3384 CODE is the character code in BIG5, a cons of two integers.
3385 Return the corresponding character.
3386 */
3387        (code))
3388 {
3389   unsigned char c1, c2, b1, b2;
3390
3391   CHECK_CONS (code);
3392   CHECK_INT (XCAR (code));
3393   CHECK_INT (XCDR (code));
3394   b1 = XINT (XCAR (code));
3395   b2 = XINT (XCDR (code));
3396   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3397       BYTE_BIG5_TWO_BYTE_2_P (b2))
3398     {
3399       int leading_byte;
3400       Lisp_Object charset;
3401       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3402       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3403       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3404     }
3405   else
3406     return Qnil;
3407 }
3408
3409 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3410 Encode the Big5 character CH to BIG5 coding-system.
3411 Return the corresponding character code in Big5.
3412 */
3413        (ch))
3414 {
3415   Lisp_Object charset;
3416   int c1, c2, b1, b2;
3417
3418   CHECK_CHAR_COERCE_INT (ch);
3419   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3420   if (EQ (charset, Vcharset_chinese_big5_1) ||
3421       EQ (charset, Vcharset_chinese_big5_2))
3422     {
3423       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3424                    b1, b2);
3425       return Fcons (make_int (b1), make_int (b2));
3426     }
3427   else
3428     return Qnil;
3429 }
3430
3431 \f
3432 /************************************************************************/
3433 /*                           UCS-4 methods                              */
3434 /*                                                                      */
3435 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3436 /*                                                                      */
3437 /************************************************************************/
3438
3439
3440 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3441 Map UCS-4 code CODE to Mule character CHARACTER.
3442
3443 Return T on success, NIL on failure.
3444 */
3445        (code, character))
3446 {
3447   size_t c;
3448
3449   CHECK_CHAR (character);
3450   CHECK_NATNUM (code);
3451   c = XINT (code);
3452
3453   if (c < countof (fcd->ucs_to_mule_table))
3454     {
3455       fcd->ucs_to_mule_table[c] = character;
3456       return Qt;
3457     }
3458   else
3459     return Qnil;
3460 }
3461
3462 static Lisp_Object
3463 ucs_to_char (unsigned long code)
3464 {
3465   if (code < countof (fcd->ucs_to_mule_table))
3466     {
3467       return fcd->ucs_to_mule_table[code];
3468     }
3469   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3470     {
3471       unsigned int c;
3472
3473       code -= 0xe00000;
3474       c = code % (94 * 94);
3475       return make_char
3476         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3477                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3478                      CHARSET_LEFT_TO_RIGHT),
3479                     c / 94 + 33, c % 94 + 33));
3480     }
3481   else
3482     return Qnil;
3483 }
3484
3485 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3486 Return Mule character corresponding to UCS code CODE (a positive integer).
3487 */
3488        (code))
3489 {
3490   CHECK_NATNUM (code);
3491   return ucs_to_char (XINT (code));
3492 }
3493
3494 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3495 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3496 */
3497        (character, code))
3498 {
3499   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3500           Fset_char_ucs is more restrictive on index arg, but should
3501           check code arg in a char_table method. */
3502   CHECK_CHAR (character);
3503   CHECK_NATNUM (code);
3504   return Fput_char_table (character, code, mule_to_ucs_table);
3505 }
3506
3507 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3508 Return the UCS code (a positive integer) corresponding to CHARACTER.
3509 */
3510        (character))
3511 {
3512   return Fget_char_table (character, mule_to_ucs_table);
3513 }
3514
3515 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3516    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3517    is not found, instead.
3518    #### do something more appropriate (use blob?)
3519         Danger, Will Robinson!  Data loss.  Should we signal user? */
3520 static void
3521 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3522 {
3523   Lisp_Object chr = ucs_to_char (ch);
3524
3525   if (! NILP (chr))
3526     {
3527       Bufbyte work[MAX_EMCHAR_LEN];
3528       int len;
3529
3530       ch = XCHAR (chr);
3531       len = (ch < 128) ?
3532         simple_set_charptr_emchar (work, ch) :
3533         non_ascii_set_charptr_emchar (work, ch);
3534       Dynarr_add_many (dst, work, len);
3535     }
3536   else
3537     {
3538       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3539       Dynarr_add (dst, 34 + 128);
3540       Dynarr_add (dst, 46 + 128);
3541     }
3542 }
3543
3544 static unsigned long
3545 mule_char_to_ucs4 (Lisp_Object charset,
3546                    unsigned char h, unsigned char l)
3547 {
3548   Lisp_Object code
3549     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3550                        mule_to_ucs_table);
3551
3552   if (INTP (code))
3553     {
3554       return XINT (code);
3555     }
3556   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3557             (XCHARSET_CHARS (charset) == 94) )
3558     {
3559       unsigned char final = XCHARSET_FINAL (charset);
3560
3561       if ( ('@' <= final) && (final < 0x7f) )
3562         {
3563           return 0xe00000 + (final - '@') * 94 * 94
3564             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3565         }
3566       else
3567         {
3568           return '?';
3569         }
3570     }
3571   else
3572     {
3573       return '?';
3574     }
3575 }
3576
3577 static void
3578 encode_ucs4 (Lisp_Object charset,
3579              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3580 {
3581   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3582   Dynarr_add (dst,  code >> 24);
3583   Dynarr_add (dst, (code >> 16) & 255);
3584   Dynarr_add (dst, (code >>  8) & 255);
3585   Dynarr_add (dst,  code        & 255);
3586 }
3587
3588 static int
3589 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3590                     unsigned int n)
3591 {
3592   while (n--)
3593     {
3594       int c = *src++;
3595       switch (st->ucs4.in_byte)
3596         {
3597         case 0:
3598           if (c >= 128)
3599             return 0;
3600           else
3601             st->ucs4.in_byte++;
3602           break;
3603         case 3:
3604           st->ucs4.in_byte = 0;
3605           break;
3606         default:
3607           st->ucs4.in_byte++;
3608         }
3609     }
3610   return CODING_CATEGORY_UCS4_MASK;
3611 }
3612
3613 static void
3614 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3615                     unsigned_char_dynarr *dst, unsigned int n)
3616 {
3617   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3618   unsigned int flags = str->flags;
3619   unsigned int ch    = str->ch;
3620   unsigned char counter = str->counter;
3621
3622   while (n--)
3623     {
3624       unsigned char c = *src++;
3625       switch (counter)
3626         {
3627         case 0:
3628           ch = c;
3629           counter = 3;
3630           break;
3631         case 1:
3632           decode_ucs4 ( ( ch << 8 ) | c, dst);
3633           ch = 0;
3634           counter = 0;
3635           break;
3636         default:
3637           ch = ( ch << 8 ) | c;
3638           counter--;
3639         }
3640     }
3641   if (counter & CODING_STATE_END)
3642     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3643
3644   str->flags = flags;
3645   str->ch    = ch;
3646   str->counter = counter;
3647 }
3648
3649 static void
3650 encode_coding_ucs4 (Lstream *encoding, const unsigned char *src,
3651                     unsigned_char_dynarr *dst, unsigned int n)
3652 {
3653   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3654   unsigned int flags = str->flags;
3655   unsigned int ch = str->ch;
3656   unsigned char char_boundary = str->iso2022.current_char_boundary;
3657   Lisp_Object charset = str->iso2022.current_charset;
3658
3659 #ifdef ENABLE_COMPOSITE_CHARS
3660   /* flags for handling composite chars.  We do a little switcharoo
3661      on the source while we're outputting the composite char. */
3662   unsigned int saved_n = 0;
3663   const unsigned char *saved_src = NULL;
3664   int in_composite = 0;
3665
3666  back_to_square_n:
3667 #endif
3668
3669   while (n--)
3670     {
3671       unsigned char c = *src++;
3672
3673       if (BYTE_ASCII_P (c))
3674         {               /* Processing ASCII character */
3675           ch = 0;
3676           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3677           char_boundary = 1;
3678         }
3679       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3680         { /* Processing Leading Byte */
3681           ch = 0;
3682           charset = CHARSET_BY_LEADING_BYTE (c);
3683           if (LEADING_BYTE_PREFIX_P(c))
3684             ch = c;
3685           char_boundary = 0;
3686         }
3687       else
3688         {                       /* Processing Non-ASCII character */
3689           char_boundary = 1;
3690           if (EQ (charset, Vcharset_control_1))
3691             {
3692               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3693             }
3694           else
3695             {
3696               switch (XCHARSET_REP_BYTES (charset))
3697                 {
3698                 case 2:
3699                   encode_ucs4 (charset, c, 0, dst);
3700                   break;
3701                 case 3:
3702                   if (XCHARSET_PRIVATE_P (charset))
3703                     {
3704                       encode_ucs4 (charset, c, 0, dst);
3705                       ch = 0;
3706                     }
3707                   else if (ch)
3708                     {
3709 #ifdef ENABLE_COMPOSITE_CHARS
3710                       if (EQ (charset, Vcharset_composite))
3711                         {
3712                           if (in_composite)
3713                             {
3714                               /* #### Bother! We don't know how to
3715                                  handle this yet. */
3716                               Dynarr_add (dst, 0);
3717                               Dynarr_add (dst, 0);
3718                               Dynarr_add (dst, 0);
3719                               Dynarr_add (dst, '~');
3720                             }
3721                           else
3722                             {
3723                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3724                                                        ch & 0x7F, c & 0x7F);
3725                               Lisp_Object lstr = composite_char_string (emch);
3726                               saved_n = n;
3727                               saved_src = src;
3728                               in_composite = 1;
3729                               src = XSTRING_DATA   (lstr);
3730                               n   = XSTRING_LENGTH (lstr);
3731                             }
3732                         }
3733                       else
3734 #endif /* ENABLE_COMPOSITE_CHARS */
3735                         {
3736                           encode_ucs4(charset, ch, c, dst);
3737                         }
3738                       ch = 0;
3739                     }
3740                   else
3741                     {
3742                       ch = c;
3743                       char_boundary = 0;
3744                     }
3745                   break;
3746                 case 4:
3747                   if (ch)
3748                     {
3749                       encode_ucs4 (charset, ch, c, dst);
3750                       ch = 0;
3751                     }
3752                   else
3753                     {
3754                       ch = c;
3755                       char_boundary = 0;
3756                     }
3757                   break;
3758                 default:
3759                   abort ();
3760                 }
3761             }
3762         }
3763     }
3764
3765 #ifdef ENABLE_COMPOSITE_CHARS
3766   if (in_composite)
3767     {
3768       n = saved_n;
3769       src = saved_src;
3770       in_composite = 0;
3771       goto back_to_square_n; /* Wheeeeeeeee ..... */
3772     }
3773 #endif /* ENABLE_COMPOSITE_CHARS */
3774
3775   str->flags = flags;
3776   str->ch = ch;
3777   str->iso2022.current_char_boundary = char_boundary;
3778   str->iso2022.current_charset = charset;
3779
3780   /* Verbum caro factum est! */
3781 }
3782
3783 \f
3784 /************************************************************************/
3785 /*                           UTF-8 methods                              */
3786 /************************************************************************/
3787
3788 static int
3789 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3790                     unsigned int n)
3791 {
3792   while (n--)
3793     {
3794       unsigned char c = *src++;
3795       switch (st->utf8.in_byte)
3796         {
3797         case 0:
3798           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3799             return 0;
3800           else if (c >= 0xfc)
3801             st->utf8.in_byte = 5;
3802           else if (c >= 0xf8)
3803             st->utf8.in_byte = 4;
3804           else if (c >= 0xf0)
3805             st->utf8.in_byte = 3;
3806           else if (c >= 0xe0)
3807             st->utf8.in_byte = 2;
3808           else if (c >= 0xc0)
3809             st->utf8.in_byte = 1;
3810           else if (c >= 0x80)
3811             return 0;
3812           break;
3813         default:
3814           if ((c & 0xc0) != 0x80)
3815             return 0;
3816           else
3817             st->utf8.in_byte--;
3818         }
3819     }
3820   return CODING_CATEGORY_UTF8_MASK;
3821 }
3822
3823 static void
3824 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3825                     unsigned_char_dynarr *dst, unsigned int n)
3826 {
3827   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3828   unsigned int flags  = str->flags;
3829   unsigned int ch     = str->ch;
3830   eol_type_t eol_type = str->eol_type;
3831   unsigned char counter = str->counter;
3832
3833   while (n--)
3834     {
3835       unsigned char c = *src++;
3836       switch (counter)
3837         {
3838         case 0:
3839           if ( c >= 0xfc )
3840             {
3841               ch = c & 0x01;
3842               counter = 5;
3843             }
3844           else if ( c >= 0xf8 )
3845             {
3846               ch = c & 0x03;
3847               counter = 4;
3848             }
3849           else if ( c >= 0xf0 )
3850             {
3851               ch = c & 0x07;
3852               counter = 3;
3853             }
3854           else if ( c >= 0xe0 )
3855             {
3856               ch = c & 0x0f;
3857               counter = 2;
3858             }
3859           else if ( c >= 0xc0 )
3860             {
3861               ch = c & 0x1f;
3862               counter = 1;
3863             }
3864           else
3865             {
3866               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3867               decode_ucs4 (c, dst);
3868             }
3869           break;
3870         case 1:
3871           ch = ( ch << 6 ) | ( c & 0x3f );
3872           decode_ucs4 (ch, dst);
3873           ch = 0;
3874           counter = 0;
3875           break;
3876         default:
3877           ch = ( ch << 6 ) | ( c & 0x3f );
3878           counter--;
3879         }
3880     label_continue_loop:;
3881     }
3882
3883   if (flags & CODING_STATE_END)
3884     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3885
3886   str->flags = flags;
3887   str->ch    = ch;
3888   str->counter = counter;
3889 }
3890
3891 static void
3892 encode_utf8 (Lisp_Object charset,
3893              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3894 {
3895   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3896   if ( code <= 0x7f )
3897     {
3898       Dynarr_add (dst, code);
3899     }
3900   else if ( code <= 0x7ff )
3901     {
3902       Dynarr_add (dst, (code >> 6) | 0xc0);
3903       Dynarr_add (dst, (code & 0x3f) | 0x80);
3904     }
3905   else if ( code <= 0xffff )
3906     {
3907       Dynarr_add (dst,  (code >> 12) | 0xe0);
3908       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3909       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3910     }
3911   else if ( code <= 0x1fffff )
3912     {
3913       Dynarr_add (dst,  (code >> 18) | 0xf0);
3914       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3915       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3916       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3917     }
3918   else if ( code <= 0x3ffffff )
3919     {
3920       Dynarr_add (dst,  (code >> 24) | 0xf8);
3921       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3922       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3923       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3924       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3925     }
3926   else
3927     {
3928       Dynarr_add (dst,  (code >> 30) | 0xfc);
3929       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3930       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3931       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3932       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3933       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3934     }
3935 }
3936
3937 static void
3938 encode_coding_utf8 (Lstream *encoding, const unsigned char *src,
3939                     unsigned_char_dynarr *dst, unsigned int n)
3940 {
3941   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3942   unsigned int flags  = str->flags;
3943   unsigned int ch     = str->ch;
3944   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3945   unsigned char char_boundary = str->iso2022.current_char_boundary;
3946   Lisp_Object charset = str->iso2022.current_charset;
3947
3948 #ifdef ENABLE_COMPOSITE_CHARS
3949   /* flags for handling composite chars.  We do a little switcharoo
3950      on the source while we're outputting the composite char. */
3951   unsigned int saved_n = 0;
3952   const unsigned char *saved_src = NULL;
3953   int in_composite = 0;
3954
3955  back_to_square_n:
3956 #endif /* ENABLE_COMPOSITE_CHARS */
3957
3958   while (n--)
3959     {
3960       unsigned char c = *src++;
3961
3962       if (BYTE_ASCII_P (c))
3963         {               /* Processing ASCII character */
3964           ch = 0;
3965           if (c == '\n')
3966             {
3967               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3968                 Dynarr_add (dst, '\r');
3969               if (eol_type != EOL_CR)
3970                 Dynarr_add (dst, c);
3971             }
3972           else
3973             encode_utf8 (Vcharset_ascii, c, 0, dst);
3974           char_boundary = 1;
3975         }
3976       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3977         { /* Processing Leading Byte */
3978           ch = 0;
3979           charset = CHARSET_BY_LEADING_BYTE (c);
3980           if (LEADING_BYTE_PREFIX_P(c))
3981             ch = c;
3982           char_boundary = 0;
3983         }
3984       else
3985         {                       /* Processing Non-ASCII character */
3986           char_boundary = 1;
3987           if (EQ (charset, Vcharset_control_1))
3988             {
3989               encode_utf8 (Vcharset_control_1, c, 0, dst);
3990             }
3991           else
3992             {
3993               switch (XCHARSET_REP_BYTES (charset))
3994                 {
3995                 case 2:
3996                   encode_utf8 (charset, c, 0, dst);
3997                   break;
3998                 case 3:
3999                   if (XCHARSET_PRIVATE_P (charset))
4000                     {
4001                       encode_utf8 (charset, c, 0, dst);
4002                       ch = 0;
4003                     }
4004                   else if (ch)
4005                     {
4006 #ifdef ENABLE_COMPOSITE_CHARS
4007                       if (EQ (charset, Vcharset_composite))
4008                         {
4009                           if (in_composite)
4010                             {
4011                               /* #### Bother! We don't know how to
4012                                  handle this yet. */
4013                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
4014                             }
4015                           else
4016                             {
4017                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4018                                                        ch & 0x7F, c & 0x7F);
4019                               Lisp_Object lstr = composite_char_string (emch);
4020                               saved_n = n;
4021                               saved_src = src;
4022                               in_composite = 1;
4023                               src = XSTRING_DATA   (lstr);
4024                               n   = XSTRING_LENGTH (lstr);
4025                             }
4026                         }
4027                       else
4028 #endif /* ENABLE_COMPOSITE_CHARS */
4029                         {
4030                           encode_utf8 (charset, ch, c, dst);
4031                         }
4032                       ch = 0;
4033                     }
4034                   else
4035                     {
4036                       ch = c;
4037                       char_boundary = 0;
4038                     }
4039                   break;
4040                 case 4:
4041                   if (ch)
4042                     {
4043                       encode_utf8 (charset, ch, c, dst);
4044                       ch = 0;
4045                     }
4046                   else
4047                     {
4048                       ch = c;
4049                       char_boundary = 0;
4050                     }
4051                   break;
4052                 default:
4053                   abort ();
4054                 }
4055             }
4056         }
4057     }
4058
4059 #ifdef ENABLE_COMPOSITE_CHARS
4060   if (in_composite)
4061     {
4062       n = saved_n;
4063       src = saved_src;
4064       in_composite = 0;
4065       goto back_to_square_n; /* Wheeeeeeeee ..... */
4066     }
4067 #endif
4068
4069   str->flags = flags;
4070   str->ch    = ch;
4071   str->iso2022.current_char_boundary = char_boundary;
4072   str->iso2022.current_charset = charset;
4073
4074   /* Verbum caro factum est! */
4075 }
4076
4077 \f
4078 /************************************************************************/
4079 /*                           ISO2022 methods                            */
4080 /************************************************************************/
4081
4082 /* The following note describes the coding system ISO2022 briefly.
4083    Since the intention of this note is to help understand the
4084    functions in this file, some parts are NOT ACCURATE or OVERLY
4085    SIMPLIFIED.  For thorough understanding, please refer to the
4086    original document of ISO2022.
4087
4088    ISO2022 provides many mechanisms to encode several character sets
4089    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4090    is encoded using bytes less than 128.  This may make the encoded
4091    text a little bit longer, but the text passes more easily through
4092    several gateways, some of which strip off MSB (Most Signigant Bit).
4093
4094    There are two kinds of character sets: control character set and
4095    graphic character set.  The former contains control characters such
4096    as `newline' and `escape' to provide control functions (control
4097    functions are also provided by escape sequences).  The latter
4098    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4099    two control character sets and many graphic character sets.
4100
4101    Graphic character sets are classified into one of the following
4102    four classes, according to the number of bytes (DIMENSION) and
4103    number of characters in one dimension (CHARS) of the set:
4104    - DIMENSION1_CHARS94
4105    - DIMENSION1_CHARS96
4106    - DIMENSION2_CHARS94
4107    - DIMENSION2_CHARS96
4108
4109    In addition, each character set is assigned an identification tag,
4110    unique for each set, called "final character" (denoted as <F>
4111    hereafter).  The <F> of each character set is decided by ECMA(*)
4112    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4113    (0x30..0x3F are for private use only).
4114
4115    Note (*): ECMA = European Computer Manufacturers Association
4116
4117    Here are examples of graphic character set [NAME(<F>)]:
4118         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4119         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4120         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4121         o DIMENSION2_CHARS96 -- none for the moment
4122
4123    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4124         C0 [0x00..0x1F] -- control character plane 0
4125         GL [0x20..0x7F] -- graphic character plane 0
4126         C1 [0x80..0x9F] -- control character plane 1
4127         GR [0xA0..0xFF] -- graphic character plane 1
4128
4129    A control character set is directly designated and invoked to C0 or
4130    C1 by an escape sequence.  The most common case is that:
4131    - ISO646's  control character set is designated/invoked to C0, and
4132    - ISO6429's control character set is designated/invoked to C1,
4133    and usually these designations/invocations are omitted in encoded
4134    text.  In a 7-bit environment, only C0 can be used, and a control
4135    character for C1 is encoded by an appropriate escape sequence to
4136    fit into the environment.  All control characters for C1 are
4137    defined to have corresponding escape sequences.
4138
4139    A graphic character set is at first designated to one of four
4140    graphic registers (G0 through G3), then these graphic registers are
4141    invoked to GL or GR.  These designations and invocations can be
4142    done independently.  The most common case is that G0 is invoked to
4143    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4144    these invocations and designations are omitted in encoded text.
4145    In a 7-bit environment, only GL can be used.
4146
4147    When a graphic character set of CHARS94 is invoked to GL, codes
4148    0x20 and 0x7F of the GL area work as control characters SPACE and
4149    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4150    be used.
4151
4152    There are two ways of invocation: locking-shift and single-shift.
4153    With locking-shift, the invocation lasts until the next different
4154    invocation, whereas with single-shift, the invocation affects the
4155    following character only and doesn't affect the locking-shift
4156    state.  Invocations are done by the following control characters or
4157    escape sequences:
4158
4159    ----------------------------------------------------------------------
4160    abbrev  function                  cntrl escape seq   description
4161    ----------------------------------------------------------------------
4162    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4163    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4164    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4165    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4166    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4167    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4168    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4169    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4170    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4171    ----------------------------------------------------------------------
4172    (*) These are not used by any known coding system.
4173
4174    Control characters for these functions are defined by macros
4175    ISO_CODE_XXX in `coding.h'.
4176
4177    Designations are done by the following escape sequences:
4178    ----------------------------------------------------------------------
4179    escape sequence      description
4180    ----------------------------------------------------------------------
4181    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4182    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4183    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4184    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4185    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4186    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4187    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4188    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4189    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4190    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4191    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4192    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4193    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4194    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4195    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4196    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4197    ----------------------------------------------------------------------
4198
4199    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4200    of dimension 1, chars 94, and final character <F>, etc...
4201
4202    Note (*): Although these designations are not allowed in ISO2022,
4203    Emacs accepts them on decoding, and produces them on encoding
4204    CHARS96 character sets in a coding system which is characterized as
4205    7-bit environment, non-locking-shift, and non-single-shift.
4206
4207    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4208    '(' can be omitted.  We refer to this as "short-form" hereafter.
4209
4210    Now you may notice that there are a lot of ways for encoding the
4211    same multilingual text in ISO2022.  Actually, there exist many
4212    coding systems such as Compound Text (used in X11's inter client
4213    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4214    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4215    localized platforms), and all of these are variants of ISO2022.
4216
4217    In addition to the above, Emacs handles two more kinds of escape
4218    sequences: ISO6429's direction specification and Emacs' private
4219    sequence for specifying character composition.
4220
4221    ISO6429's direction specification takes the following form:
4222         o CSI ']'      -- end of the current direction
4223         o CSI '0' ']'  -- end of the current direction
4224         o CSI '1' ']'  -- start of left-to-right text
4225         o CSI '2' ']'  -- start of right-to-left text
4226    The control character CSI (0x9B: control sequence introducer) is
4227    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4228
4229    Character composition specification takes the following form:
4230         o ESC '0' -- start character composition
4231         o ESC '1' -- end character composition
4232    Since these are not standard escape sequences of any ISO standard,
4233    their use with these meanings is restricted to Emacs only.  */
4234
4235 static void
4236 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4237 {
4238   int i;
4239
4240   for (i = 0; i < 4; i++)
4241     {
4242       if (!NILP (coding_system))
4243         iso->charset[i] =
4244           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4245       else
4246         iso->charset[i] = Qt;
4247       iso->invalid_designated[i] = 0;
4248     }
4249   iso->esc = ISO_ESC_NOTHING;
4250   iso->esc_bytes_index = 0;
4251   iso->register_left = 0;
4252   iso->register_right = 1;
4253   iso->switched_dir_and_no_valid_charset_yet = 0;
4254   iso->invalid_switch_dir = 0;
4255   iso->output_direction_sequence = 0;
4256   iso->output_literally = 0;
4257 #ifdef ENABLE_COMPOSITE_CHARS
4258   if (iso->composite_chars)
4259     Dynarr_reset (iso->composite_chars);
4260 #endif
4261 }
4262
4263 static int
4264 fit_to_be_escape_quoted (unsigned char c)
4265 {
4266   switch (c)
4267     {
4268     case ISO_CODE_ESC:
4269     case ISO_CODE_CSI:
4270     case ISO_CODE_SS2:
4271     case ISO_CODE_SS3:
4272     case ISO_CODE_SO:
4273     case ISO_CODE_SI:
4274       return 1;
4275
4276     default:
4277       return 0;
4278     }
4279 }
4280
4281 /* Parse one byte of an ISO2022 escape sequence.
4282    If the result is an invalid escape sequence, return 0 and
4283    do not change anything in STR.  Otherwise, if the result is
4284    an incomplete escape sequence, update ISO2022.ESC and
4285    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4286    all the state variables (but not ISO2022.ESC_BYTES) and
4287    return 1.
4288
4289    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4290    or invocation of an invalid character set and treat that as
4291    an unrecognized escape sequence. */
4292
4293 static int
4294 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4295                    unsigned char c, unsigned int *flags,
4296                    int check_invalid_charsets)
4297 {
4298   /* (1) If we're at the end of a designation sequence, CS is the
4299      charset being designated and REG is the register to designate
4300      it to.
4301
4302      (2) If we're at the end of a locking-shift sequence, REG is
4303      the register to invoke and HALF (0 == left, 1 == right) is
4304      the half to invoke it into.
4305
4306      (3) If we're at the end of a single-shift sequence, REG is
4307      the register to invoke. */
4308   Lisp_Object cs = Qnil;
4309   int reg, half;
4310
4311   /* NOTE: This code does goto's all over the fucking place.
4312      The reason for this is that we're basically implementing
4313      a state machine here, and hierarchical languages like C
4314      don't really provide a clean way of doing this. */
4315
4316   if (! (*flags & CODING_STATE_ESCAPE))
4317     /* At beginning of escape sequence; we need to reset our
4318        escape-state variables. */
4319     iso->esc = ISO_ESC_NOTHING;
4320
4321   iso->output_literally = 0;
4322   iso->output_direction_sequence = 0;
4323
4324   switch (iso->esc)
4325     {
4326     case ISO_ESC_NOTHING:
4327       iso->esc_bytes_index = 0;
4328       switch (c)
4329         {
4330         case ISO_CODE_ESC:      /* Start escape sequence */
4331           *flags |= CODING_STATE_ESCAPE;
4332           iso->esc = ISO_ESC;
4333           goto not_done;
4334
4335         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4336           *flags |= CODING_STATE_ESCAPE;
4337           iso->esc = ISO_ESC_5_11;
4338           goto not_done;
4339
4340         case ISO_CODE_SO:       /* locking shift 1 */
4341           reg = 1; half = 0;
4342           goto locking_shift;
4343         case ISO_CODE_SI:       /* locking shift 0 */
4344           reg = 0; half = 0;
4345           goto locking_shift;
4346
4347         case ISO_CODE_SS2:      /* single shift */
4348           reg = 2;
4349           goto single_shift;
4350         case ISO_CODE_SS3:      /* single shift */
4351           reg = 3;
4352           goto single_shift;
4353
4354         default:                        /* Other control characters */
4355           return 0;
4356         }
4357
4358     case ISO_ESC:
4359       switch (c)
4360         {
4361           /**** single shift ****/
4362
4363         case 'N':       /* single shift 2 */
4364           reg = 2;
4365           goto single_shift;
4366         case 'O':       /* single shift 3 */
4367           reg = 3;
4368           goto single_shift;
4369
4370           /**** locking shift ****/
4371
4372         case '~':       /* locking shift 1 right */
4373           reg = 1; half = 1;
4374           goto locking_shift;
4375         case 'n':       /* locking shift 2 */
4376           reg = 2; half = 0;
4377           goto locking_shift;
4378         case '}':       /* locking shift 2 right */
4379           reg = 2; half = 1;
4380           goto locking_shift;
4381         case 'o':       /* locking shift 3 */
4382           reg = 3; half = 0;
4383           goto locking_shift;
4384         case '|':       /* locking shift 3 right */
4385           reg = 3; half = 1;
4386           goto locking_shift;
4387
4388 #ifdef ENABLE_COMPOSITE_CHARS
4389           /**** composite ****/
4390
4391         case '0':
4392           iso->esc = ISO_ESC_START_COMPOSITE;
4393           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4394             CODING_STATE_COMPOSITE;
4395           return 1;
4396
4397         case '1':
4398           iso->esc = ISO_ESC_END_COMPOSITE;
4399           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4400             ~CODING_STATE_COMPOSITE;
4401           return 1;
4402 #endif /* ENABLE_COMPOSITE_CHARS */
4403
4404           /**** directionality ****/
4405
4406         case '[':
4407           iso->esc = ISO_ESC_5_11;
4408           goto not_done;
4409
4410           /**** designation ****/
4411
4412         case '$':       /* multibyte charset prefix */
4413           iso->esc = ISO_ESC_2_4;
4414           goto not_done;
4415
4416         default:
4417           if (0x28 <= c && c <= 0x2F)
4418             {
4419               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4420               goto not_done;
4421             }
4422
4423           /* This function is called with CODESYS equal to nil when
4424              doing coding-system detection. */
4425           if (!NILP (codesys)
4426               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4427               && fit_to_be_escape_quoted (c))
4428             {
4429               iso->esc = ISO_ESC_LITERAL;
4430               *flags &= CODING_STATE_ISO2022_LOCK;
4431               return 1;
4432             }
4433
4434           /* bzzzt! */
4435           return 0;
4436         }
4437
4438
4439
4440       /**** directionality ****/
4441
4442     case ISO_ESC_5_11:          /* ISO6429 direction control */
4443       if (c == ']')
4444         {
4445           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4446           goto directionality;
4447         }
4448       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4449       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4450       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4451       else               return 0;
4452       goto not_done;
4453
4454     case ISO_ESC_5_11_0:
4455       if (c == ']')
4456         {
4457           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4458           goto directionality;
4459         }
4460       return 0;
4461
4462     case ISO_ESC_5_11_1:
4463       if (c == ']')
4464         {
4465           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4466           goto directionality;
4467         }
4468       return 0;
4469
4470     case ISO_ESC_5_11_2:
4471       if (c == ']')
4472         {
4473           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4474           goto directionality;
4475         }
4476       return 0;
4477
4478     directionality:
4479       iso->esc = ISO_ESC_DIRECTIONALITY;
4480       /* Various junk here to attempt to preserve the direction sequences
4481          literally in the text if they would otherwise be swallowed due
4482          to invalid designations that don't show up as actual charset
4483          changes in the text. */
4484       if (iso->invalid_switch_dir)
4485         {
4486           /* We already inserted a direction switch literally into the
4487              text.  We assume (#### this may not be right) that the
4488              next direction switch is the one going the other way,
4489              and we need to output that literally as well. */
4490           iso->output_literally = 1;
4491           iso->invalid_switch_dir = 0;
4492         }
4493       else
4494         {
4495           int jj;
4496
4497           /* If we are in the thrall of an invalid designation,
4498            then stick the directionality sequence literally into the
4499            output stream so it ends up in the original text again. */
4500           for (jj = 0; jj < 4; jj++)
4501             if (iso->invalid_designated[jj])
4502               break;
4503           if (jj < 4)
4504             {
4505               iso->output_literally = 1;
4506               iso->invalid_switch_dir = 1;
4507             }
4508           else
4509             /* Indicate that we haven't yet seen a valid designation,
4510                so that if a switch-dir is directly followed by an
4511                invalid designation, both get inserted literally. */
4512             iso->switched_dir_and_no_valid_charset_yet = 1;
4513         }
4514       return 1;
4515
4516
4517       /**** designation ****/
4518
4519     case ISO_ESC_2_4:
4520       if (0x28 <= c && c <= 0x2F)
4521         {
4522           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4523           goto not_done;
4524         }
4525       if (0x40 <= c && c <= 0x42)
4526         {
4527           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4528                                       *flags & CODING_STATE_R2L ?
4529                                       CHARSET_RIGHT_TO_LEFT :
4530                                       CHARSET_LEFT_TO_RIGHT);
4531           reg = 0;
4532           goto designated;
4533         }
4534       return 0;
4535
4536     default:
4537       {
4538         int type =-1;
4539
4540         if (c < '0' || c > '~')
4541           return 0; /* bad final byte */
4542
4543         if (iso->esc >= ISO_ESC_2_8 &&
4544             iso->esc <= ISO_ESC_2_15)
4545           {
4546             type = ((iso->esc >= ISO_ESC_2_12) ?
4547                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4548             reg = (iso->esc - ISO_ESC_2_8) & 3;
4549           }
4550         else if (iso->esc >= ISO_ESC_2_4_8 &&
4551                  iso->esc <= ISO_ESC_2_4_15)
4552           {
4553             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4554                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4555             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4556           }
4557         else
4558           {
4559             /* Can this ever be reached? -slb */
4560             abort();
4561             return 0;
4562           }
4563
4564         cs = CHARSET_BY_ATTRIBUTES (type, c,
4565                                     *flags & CODING_STATE_R2L ?
4566                                     CHARSET_RIGHT_TO_LEFT :
4567                                     CHARSET_LEFT_TO_RIGHT);
4568         goto designated;
4569       }
4570     }
4571
4572  not_done:
4573   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4574   return -1;
4575
4576  single_shift:
4577   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4578     /* can't invoke something that ain't there. */
4579     return 0;
4580   iso->esc = ISO_ESC_SINGLE_SHIFT;
4581   *flags &= CODING_STATE_ISO2022_LOCK;
4582   if (reg == 2)
4583     *flags |= CODING_STATE_SS2;
4584   else
4585     *flags |= CODING_STATE_SS3;
4586   return 1;
4587
4588  locking_shift:
4589   if (check_invalid_charsets &&
4590       !CHARSETP (iso->charset[reg]))
4591     /* can't invoke something that ain't there. */
4592     return 0;
4593   if (half)
4594     iso->register_right = reg;
4595   else
4596     iso->register_left = reg;
4597   *flags &= CODING_STATE_ISO2022_LOCK;
4598   iso->esc = ISO_ESC_LOCKING_SHIFT;
4599   return 1;
4600
4601  designated:
4602   if (NILP (cs) && check_invalid_charsets)
4603     {
4604       iso->invalid_designated[reg] = 1;
4605       iso->charset[reg] = Vcharset_ascii;
4606       iso->esc = ISO_ESC_DESIGNATE;
4607       *flags &= CODING_STATE_ISO2022_LOCK;
4608       iso->output_literally = 1;
4609       if (iso->switched_dir_and_no_valid_charset_yet)
4610         {
4611           /* We encountered a switch-direction followed by an
4612              invalid designation.  Ensure that the switch-direction
4613              gets outputted; otherwise it will probably get eaten
4614              when the text is written out again. */
4615           iso->switched_dir_and_no_valid_charset_yet = 0;
4616           iso->output_direction_sequence = 1;
4617           /* And make sure that the switch-dir going the other
4618              way gets outputted, as well. */
4619           iso->invalid_switch_dir = 1;
4620         }
4621       return 1;
4622     }
4623   /* This function is called with CODESYS equal to nil when
4624      doing coding-system detection. */
4625   if (!NILP (codesys))
4626     {
4627       charset_conversion_spec_dynarr *dyn =
4628         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4629
4630       if (dyn)
4631         {
4632           int i;
4633
4634           for (i = 0; i < Dynarr_length (dyn); i++)
4635             {
4636               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4637               if (EQ (cs, spec->from_charset))
4638                 cs = spec->to_charset;
4639             }
4640         }
4641     }
4642
4643   iso->charset[reg] = cs;
4644   iso->esc = ISO_ESC_DESIGNATE;
4645   *flags &= CODING_STATE_ISO2022_LOCK;
4646   if (iso->invalid_designated[reg])
4647     {
4648       iso->invalid_designated[reg] = 0;
4649       iso->output_literally = 1;
4650     }
4651   if (iso->switched_dir_and_no_valid_charset_yet)
4652     iso->switched_dir_and_no_valid_charset_yet = 0;
4653   return 1;
4654 }
4655
4656 static int
4657 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4658                        unsigned int n)
4659 {
4660   int mask;
4661
4662   /* #### There are serious deficiencies in the recognition mechanism
4663      here.  This needs to be much smarter if it's going to cut it.
4664      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4665      it should be detected as Latin-1.
4666      All the ISO2022 stuff in this file should be synced up with the
4667      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4668      Perhaps we should wait till R2L works in FSF Emacs? */
4669
4670   if (!st->iso2022.initted)
4671     {
4672       reset_iso2022 (Qnil, &st->iso2022.iso);
4673       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4674                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4675                           CODING_CATEGORY_ISO_8_1_MASK |
4676                           CODING_CATEGORY_ISO_8_2_MASK |
4677                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4678       st->iso2022.flags = 0;
4679       st->iso2022.high_byte_count = 0;
4680       st->iso2022.saw_single_shift = 0;
4681       st->iso2022.initted = 1;
4682     }
4683
4684   mask = st->iso2022.mask;
4685
4686   while (n--)
4687     {
4688       int c = *src++;
4689       if (c >= 0xA0)
4690         {
4691           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4692           st->iso2022.high_byte_count++;
4693         }
4694       else
4695         {
4696           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4697             {
4698               if (st->iso2022.high_byte_count & 1)
4699                 /* odd number of high bytes; assume not iso-8-2 */
4700                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4701             }
4702           st->iso2022.high_byte_count = 0;
4703           st->iso2022.saw_single_shift = 0;
4704           if (c > 0x80)
4705             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4706         }
4707       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4708           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4709         { /* control chars */
4710           switch (c)
4711             {
4712               /* Allow and ignore control characters that you might
4713                  reasonably see in a text file */
4714             case '\r':
4715             case '\n':
4716             case '\t':
4717             case  7: /* bell */
4718             case  8: /* backspace */
4719             case 11: /* vertical tab */
4720             case 12: /* form feed */
4721             case 26: /* MS-DOS C-z junk */
4722             case 31: /* '^_' -- for info */
4723               goto label_continue_loop;
4724
4725             default:
4726               break;
4727             }
4728         }
4729
4730       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4731           || BYTE_C1_P (c))
4732         {
4733           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4734                                  &st->iso2022.flags, 0))
4735             {
4736               switch (st->iso2022.iso.esc)
4737                 {
4738                 case ISO_ESC_DESIGNATE:
4739                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4740                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4741                   break;
4742                 case ISO_ESC_LOCKING_SHIFT:
4743                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4744                   goto ran_out_of_chars;
4745                 case ISO_ESC_SINGLE_SHIFT:
4746                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4747                   st->iso2022.saw_single_shift = 1;
4748                   break;
4749                 default:
4750                   break;
4751                 }
4752             }
4753           else
4754             {
4755               mask = 0;
4756               goto ran_out_of_chars;
4757             }
4758         }
4759     label_continue_loop:;
4760     }
4761
4762  ran_out_of_chars:
4763
4764   return mask;
4765 }
4766
4767 static int
4768 postprocess_iso2022_mask (int mask)
4769 {
4770   /* #### kind of cheesy */
4771   /* If seven-bit ISO is allowed, then assume that the encoding is
4772      entirely seven-bit and turn off the eight-bit ones. */
4773   if (mask & CODING_CATEGORY_ISO_7_MASK)
4774     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4775                CODING_CATEGORY_ISO_8_1_MASK |
4776                CODING_CATEGORY_ISO_8_2_MASK);
4777   return mask;
4778 }
4779
4780 /* If FLAGS is a null pointer or specifies right-to-left motion,
4781    output a switch-dir-to-left-to-right sequence to DST.
4782    Also update FLAGS if it is not a null pointer.
4783    If INTERNAL_P is set, we are outputting in internal format and
4784    need to handle the CSI differently. */
4785
4786 static void
4787 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4788                                  unsigned_char_dynarr *dst,
4789                                  unsigned int *flags,
4790                                  int internal_p)
4791 {
4792   if (!flags || (*flags & CODING_STATE_R2L))
4793     {
4794       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4795         {
4796           Dynarr_add (dst, ISO_CODE_ESC);
4797           Dynarr_add (dst, '[');
4798         }
4799       else if (internal_p)
4800         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4801       else
4802         Dynarr_add (dst, ISO_CODE_CSI);
4803       Dynarr_add (dst, '0');
4804       Dynarr_add (dst, ']');
4805       if (flags)
4806         *flags &= ~CODING_STATE_R2L;
4807     }
4808 }
4809
4810 /* If FLAGS is a null pointer or specifies a direction different from
4811    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4812    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4813    sequence to DST.  Also update FLAGS if it is not a null pointer.
4814    If INTERNAL_P is set, we are outputting in internal format and
4815    need to handle the CSI differently. */
4816
4817 static void
4818 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4819                           unsigned_char_dynarr *dst, unsigned int *flags,
4820                           int internal_p)
4821 {
4822   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4823       direction == CHARSET_LEFT_TO_RIGHT)
4824     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4825   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4826            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4827            direction == CHARSET_RIGHT_TO_LEFT)
4828     {
4829       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4830         {
4831           Dynarr_add (dst, ISO_CODE_ESC);
4832           Dynarr_add (dst, '[');
4833         }
4834       else if (internal_p)
4835         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4836       else
4837         Dynarr_add (dst, ISO_CODE_CSI);
4838       Dynarr_add (dst, '2');
4839       Dynarr_add (dst, ']');
4840       if (flags)
4841         *flags |= CODING_STATE_R2L;
4842     }
4843 }
4844
4845 /* Convert ISO2022-format data to internal format. */
4846
4847 static void
4848 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4849                        unsigned_char_dynarr *dst, unsigned int n)
4850 {
4851   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4852   unsigned int flags  = str->flags;
4853   unsigned int ch     = str->ch;
4854   eol_type_t eol_type = str->eol_type;
4855 #ifdef ENABLE_COMPOSITE_CHARS
4856   unsigned_char_dynarr *real_dst = dst;
4857 #endif
4858   Lisp_Object coding_system;
4859
4860   XSETCODING_SYSTEM (coding_system, str->codesys);
4861
4862 #ifdef ENABLE_COMPOSITE_CHARS
4863   if (flags & CODING_STATE_COMPOSITE)
4864     dst = str->iso2022.composite_chars;
4865 #endif /* ENABLE_COMPOSITE_CHARS */
4866
4867   while (n--)
4868     {
4869       unsigned char c = *src++;
4870       if (flags & CODING_STATE_ESCAPE)
4871         {       /* Within ESC sequence */
4872           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4873                                           c, &flags, 1);
4874
4875           if (retval)
4876             {
4877               switch (str->iso2022.esc)
4878                 {
4879 #ifdef ENABLE_COMPOSITE_CHARS
4880                 case ISO_ESC_START_COMPOSITE:
4881                   if (str->iso2022.composite_chars)
4882                     Dynarr_reset (str->iso2022.composite_chars);
4883                   else
4884                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4885                   dst = str->iso2022.composite_chars;
4886                   break;
4887                 case ISO_ESC_END_COMPOSITE:
4888                   {
4889                     Bufbyte comstr[MAX_EMCHAR_LEN];
4890                     Bytecount len;
4891                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4892                                                          Dynarr_length (dst));
4893                     dst = real_dst;
4894                     len = set_charptr_emchar (comstr, emch);
4895                     Dynarr_add_many (dst, comstr, len);
4896                     break;
4897                   }
4898 #endif /* ENABLE_COMPOSITE_CHARS */
4899
4900                 case ISO_ESC_LITERAL:
4901                   DECODE_ADD_BINARY_CHAR (c, dst);
4902                   break;
4903
4904                 default:
4905                   /* Everything else handled already */
4906                   break;
4907                 }
4908             }
4909
4910           /* Attempted error recovery. */
4911           if (str->iso2022.output_direction_sequence)
4912             ensure_correct_direction (flags & CODING_STATE_R2L ?
4913                                       CHARSET_RIGHT_TO_LEFT :
4914                                       CHARSET_LEFT_TO_RIGHT,
4915                                       str->codesys, dst, 0, 1);
4916           /* More error recovery. */
4917           if (!retval || str->iso2022.output_literally)
4918             {
4919               /* Output the (possibly invalid) sequence */
4920               int i;
4921               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4922                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4923               flags &= CODING_STATE_ISO2022_LOCK;
4924               if (!retval)
4925                 n++, src--;/* Repeat the loop with the same character. */
4926               else
4927                 {
4928                   /* No sense in reprocessing the final byte of the
4929                      escape sequence; it could mess things up anyway.
4930                      Just add it now. */
4931                   DECODE_ADD_BINARY_CHAR (c, dst);
4932                 }
4933             }
4934           ch = 0;
4935         }
4936       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4937         { /* Control characters */
4938
4939           /***** Error-handling *****/
4940
4941           /* If we were in the middle of a character, dump out the
4942              partial character. */
4943           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4944
4945           /* If we just saw a single-shift character, dump it out.
4946              This may dump out the wrong sort of single-shift character,
4947              but least it will give an indication that something went
4948              wrong. */
4949           if (flags & CODING_STATE_SS2)
4950             {
4951               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4952               flags &= ~CODING_STATE_SS2;
4953             }
4954           if (flags & CODING_STATE_SS3)
4955             {
4956               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4957               flags &= ~CODING_STATE_SS3;
4958             }
4959
4960           /***** Now handle the control characters. *****/
4961
4962           /* Handle CR/LF */
4963           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4964
4965           flags &= CODING_STATE_ISO2022_LOCK;
4966
4967           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4968             DECODE_ADD_BINARY_CHAR (c, dst);
4969         }
4970       else
4971         {                       /* Graphic characters */
4972           Lisp_Object charset;
4973           int lb;
4974           int reg;
4975
4976           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4977
4978           /* Now determine the charset. */
4979           reg = ((flags & CODING_STATE_SS2) ? 2
4980                  : (flags & CODING_STATE_SS3) ? 3
4981                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4982                  : str->iso2022.register_left);
4983           charset = str->iso2022.charset[reg];
4984
4985           /* Error checking: */
4986           if (! CHARSETP (charset)
4987               || str->iso2022.invalid_designated[reg]
4988               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4989                   && XCHARSET_CHARS (charset) == 94))
4990             /* Mrmph.  We are trying to invoke a register that has no
4991                or an invalid charset in it, or trying to add a character
4992                outside the range of the charset.  Insert that char literally
4993                to preserve it for the output. */
4994             {
4995               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4996               DECODE_ADD_BINARY_CHAR (c, dst);
4997             }
4998
4999           else
5000             {
5001               /* Things are probably hunky-dorey. */
5002
5003               /* Fetch reverse charset, maybe. */
5004               if (((flags & CODING_STATE_R2L) &&
5005                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5006                   ||
5007                   (!(flags & CODING_STATE_R2L) &&
5008                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5009                 {
5010                   Lisp_Object new_charset =
5011                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5012                   if (!NILP (new_charset))
5013                     charset = new_charset;
5014                 }
5015
5016               lb = XCHARSET_LEADING_BYTE (charset);
5017               switch (XCHARSET_REP_BYTES (charset))
5018                 {
5019                 case 1: /* ASCII */
5020                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5021                   Dynarr_add (dst, c & 0x7F);
5022                   break;
5023
5024                 case 2: /* one-byte official */
5025                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5026                   Dynarr_add (dst, lb);
5027                   Dynarr_add (dst, c | 0x80);
5028                   break;
5029
5030                 case 3: /* one-byte private or two-byte official */
5031                   if (XCHARSET_PRIVATE_P (charset))
5032                     {
5033                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5034                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5035                       Dynarr_add (dst, lb);
5036                       Dynarr_add (dst, c | 0x80);
5037                     }
5038                   else
5039                     {
5040                       if (ch)
5041                         {
5042                           Dynarr_add (dst, lb);
5043                           Dynarr_add (dst, ch | 0x80);
5044                           Dynarr_add (dst, c | 0x80);
5045                           ch = 0;
5046                         }
5047                       else
5048                         ch = c;
5049                     }
5050                   break;
5051
5052                 default:        /* two-byte private */
5053                   if (ch)
5054                     {
5055                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5056                       Dynarr_add (dst, lb);
5057                       Dynarr_add (dst, ch | 0x80);
5058                       Dynarr_add (dst, c | 0x80);
5059                       ch = 0;
5060                     }
5061                   else
5062                     ch = c;
5063                 }
5064             }
5065
5066           if (!ch)
5067             flags &= CODING_STATE_ISO2022_LOCK;
5068         }
5069
5070     label_continue_loop:;
5071     }
5072
5073   if (flags & CODING_STATE_END)
5074     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5075
5076   str->flags = flags;
5077   str->ch    = ch;
5078 }
5079
5080
5081 /***** ISO2022 encoder *****/
5082
5083 /* Designate CHARSET into register REG. */
5084
5085 static void
5086 iso2022_designate (Lisp_Object charset, unsigned char reg,
5087                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5088 {
5089   static const char inter94[] = "()*+";
5090   static const char inter96[] = ",-./";
5091   unsigned int type;
5092   unsigned char final;
5093   Lisp_Object old_charset = str->iso2022.charset[reg];
5094
5095   str->iso2022.charset[reg] = charset;
5096   if (!CHARSETP (charset))
5097     /* charset might be an initial nil or t. */
5098     return;
5099   type = XCHARSET_TYPE (charset);
5100   final = XCHARSET_FINAL (charset);
5101   if (!str->iso2022.force_charset_on_output[reg] &&
5102       CHARSETP (old_charset) &&
5103       XCHARSET_TYPE (old_charset) == type &&
5104       XCHARSET_FINAL (old_charset) == final)
5105     return;
5106
5107   str->iso2022.force_charset_on_output[reg] = 0;
5108
5109   {
5110     charset_conversion_spec_dynarr *dyn =
5111       str->codesys->iso2022.output_conv;
5112
5113     if (dyn)
5114       {
5115         int i;
5116
5117         for (i = 0; i < Dynarr_length (dyn); i++)
5118           {
5119             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5120             if (EQ (charset, spec->from_charset))
5121                 charset = spec->to_charset;
5122           }
5123       }
5124   }
5125
5126   Dynarr_add (dst, ISO_CODE_ESC);
5127   switch (type)
5128     {
5129     case CHARSET_TYPE_94:
5130       Dynarr_add (dst, inter94[reg]);
5131       break;
5132     case CHARSET_TYPE_96:
5133       Dynarr_add (dst, inter96[reg]);
5134       break;
5135     case CHARSET_TYPE_94X94:
5136       Dynarr_add (dst, '$');
5137       if (reg != 0
5138           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5139           || final < '@'
5140           || final > 'B')
5141         Dynarr_add (dst, inter94[reg]);
5142       break;
5143     case CHARSET_TYPE_96X96:
5144       Dynarr_add (dst, '$');
5145       Dynarr_add (dst, inter96[reg]);
5146       break;
5147     }
5148   Dynarr_add (dst, final);
5149 }
5150
5151 static void
5152 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5153 {
5154   if (str->iso2022.register_left != 0)
5155     {
5156       Dynarr_add (dst, ISO_CODE_SI);
5157       str->iso2022.register_left = 0;
5158     }
5159 }
5160
5161 static void
5162 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5163 {
5164   if (str->iso2022.register_left != 1)
5165     {
5166       Dynarr_add (dst, ISO_CODE_SO);
5167       str->iso2022.register_left = 1;
5168     }
5169 }
5170
5171 /* Convert internally-formatted data to ISO2022 format. */
5172
5173 static void
5174 encode_coding_iso2022 (Lstream *encoding, const unsigned char *src,
5175                        unsigned_char_dynarr *dst, unsigned int n)
5176 {
5177   unsigned char charmask, c;
5178   unsigned char char_boundary;
5179   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5180   unsigned int flags          = str->flags;
5181   unsigned int ch             = str->ch;
5182   Lisp_Coding_System *codesys = str->codesys;
5183   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5184   int i;
5185   Lisp_Object charset;
5186   int half;
5187
5188 #ifdef ENABLE_COMPOSITE_CHARS
5189   /* flags for handling composite chars.  We do a little switcharoo
5190      on the source while we're outputting the composite char. */
5191   unsigned int saved_n = 0;
5192   const unsigned char *saved_src = NULL;
5193   int in_composite = 0;
5194 #endif /* ENABLE_COMPOSITE_CHARS */
5195
5196   char_boundary = str->iso2022.current_char_boundary;
5197   charset = str->iso2022.current_charset;
5198   half = str->iso2022.current_half;
5199
5200 #ifdef ENABLE_COMPOSITE_CHARS
5201  back_to_square_n:
5202 #endif
5203   while (n--)
5204     {
5205       c = *src++;
5206
5207       if (BYTE_ASCII_P (c))
5208         {               /* Processing ASCII character */
5209           ch = 0;
5210
5211           restore_left_to_right_direction (codesys, dst, &flags, 0);
5212
5213           /* Make sure G0 contains ASCII */
5214           if ((c > ' ' && c < ISO_CODE_DEL) ||
5215               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5216             {
5217               ensure_normal_shift (str, dst);
5218               iso2022_designate (Vcharset_ascii, 0, str, dst);
5219             }
5220
5221           /* If necessary, restore everything to the default state
5222              at end-of-line */
5223           if (c == '\n' &&
5224               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5225             {
5226               restore_left_to_right_direction (codesys, dst, &flags, 0);
5227
5228               ensure_normal_shift (str, dst);
5229
5230               for (i = 0; i < 4; i++)
5231                 {
5232                   Lisp_Object initial_charset =
5233                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5234                   iso2022_designate (initial_charset, i, str, dst);
5235                 }
5236             }
5237           if (c == '\n')
5238             {
5239               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5240                 Dynarr_add (dst, '\r');
5241               if (eol_type != EOL_CR)
5242                 Dynarr_add (dst, c);
5243             }
5244           else
5245             {
5246               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5247                   && fit_to_be_escape_quoted (c))
5248                 Dynarr_add (dst, ISO_CODE_ESC);
5249               Dynarr_add (dst, c);
5250             }
5251           char_boundary = 1;
5252         }
5253
5254       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5255         { /* Processing Leading Byte */
5256           ch = 0;
5257           charset = CHARSET_BY_LEADING_BYTE (c);
5258           if (LEADING_BYTE_PREFIX_P(c))
5259             ch = c;
5260           else if (!EQ (charset, Vcharset_control_1)
5261 #ifdef ENABLE_COMPOSITE_CHARS
5262                    && !EQ (charset, Vcharset_composite)
5263 #endif
5264                    )
5265             {
5266               int reg;
5267
5268               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5269                                         codesys, dst, &flags, 0);
5270
5271               /* Now determine which register to use. */
5272               reg = -1;
5273               for (i = 0; i < 4; i++)
5274                 {
5275                   if (EQ (charset, str->iso2022.charset[i]) ||
5276                       EQ (charset,
5277                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5278                     {
5279                       reg = i;
5280                       break;
5281                     }
5282                 }
5283
5284               if (reg == -1)
5285                 {
5286                   if (XCHARSET_GRAPHIC (charset) != 0)
5287                     {
5288                       if (!NILP (str->iso2022.charset[1]) &&
5289                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5290                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5291                         reg = 1;
5292                       else if (!NILP (str->iso2022.charset[2]))
5293                         reg = 2;
5294                       else if (!NILP (str->iso2022.charset[3]))
5295                         reg = 3;
5296                       else
5297                         reg = 0;
5298                     }
5299                   else
5300                     reg = 0;
5301                 }
5302
5303               iso2022_designate (charset, reg, str, dst);
5304
5305               /* Now invoke that register. */
5306               switch (reg)
5307                 {
5308                 case 0:
5309                   ensure_normal_shift (str, dst);
5310                   half = 0;
5311                   break;
5312
5313                 case 1:
5314                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5315                     {
5316                       ensure_shift_out (str, dst);
5317                       half = 0;
5318                     }
5319                   else
5320                     half = 1;
5321                   break;
5322
5323                 case 2:
5324                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5325                     {
5326                       Dynarr_add (dst, ISO_CODE_ESC);
5327                       Dynarr_add (dst, 'N');
5328                       half = 0;
5329                     }
5330                   else
5331                     {
5332                       Dynarr_add (dst, ISO_CODE_SS2);
5333                       half = 1;
5334                     }
5335                   break;
5336
5337                 case 3:
5338                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5339                     {
5340                       Dynarr_add (dst, ISO_CODE_ESC);
5341                       Dynarr_add (dst, 'O');
5342                       half = 0;
5343                     }
5344                   else
5345                     {
5346                       Dynarr_add (dst, ISO_CODE_SS3);
5347                       half = 1;
5348                     }
5349                   break;
5350
5351                 default:
5352                   abort ();
5353                 }
5354             }
5355           char_boundary = 0;
5356         }
5357       else
5358         {                       /* Processing Non-ASCII character */
5359           charmask = (half == 0 ? 0x7F : 0xFF);
5360           char_boundary = 1;
5361           if (EQ (charset, Vcharset_control_1))
5362             {
5363               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5364                   && fit_to_be_escape_quoted (c))
5365                 Dynarr_add (dst, ISO_CODE_ESC);
5366               /* you asked for it ... */
5367               Dynarr_add (dst, c - 0x20);
5368             }
5369           else
5370             {
5371               switch (XCHARSET_REP_BYTES (charset))
5372                 {
5373                 case 2:
5374                   Dynarr_add (dst, c & charmask);
5375                   break;
5376                 case 3:
5377                   if (XCHARSET_PRIVATE_P (charset))
5378                     {
5379                       Dynarr_add (dst, c & charmask);
5380                       ch = 0;
5381                     }
5382                   else if (ch)
5383                     {
5384 #ifdef ENABLE_COMPOSITE_CHARS
5385                       if (EQ (charset, Vcharset_composite))
5386                         {
5387                           if (in_composite)
5388                             {
5389                               /* #### Bother! We don't know how to
5390                                  handle this yet. */
5391                               Dynarr_add (dst, '~');
5392                             }
5393                           else
5394                             {
5395                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5396                                                        ch & 0x7F, c & 0x7F);
5397                               Lisp_Object lstr = composite_char_string (emch);
5398                               saved_n = n;
5399                               saved_src = src;
5400                               in_composite = 1;
5401                               src = XSTRING_DATA   (lstr);
5402                               n   = XSTRING_LENGTH (lstr);
5403                               Dynarr_add (dst, ISO_CODE_ESC);
5404                               Dynarr_add (dst, '0'); /* start composing */
5405                             }
5406                         }
5407                       else
5408 #endif /* ENABLE_COMPOSITE_CHARS */
5409                         {
5410                           Dynarr_add (dst, ch & charmask);
5411                           Dynarr_add (dst, c & charmask);
5412                         }
5413                       ch = 0;
5414                     }
5415                   else
5416                     {
5417                       ch = c;
5418                       char_boundary = 0;
5419                     }
5420                   break;
5421                 case 4:
5422                   if (ch)
5423                     {
5424                       Dynarr_add (dst, ch & charmask);
5425                       Dynarr_add (dst, c & charmask);
5426                       ch = 0;
5427                     }
5428                   else
5429                     {
5430                       ch = c;
5431                       char_boundary = 0;
5432                     }
5433                   break;
5434                 default:
5435                   abort ();
5436                 }
5437             }
5438         }
5439     }
5440
5441 #ifdef ENABLE_COMPOSITE_CHARS
5442   if (in_composite)
5443     {
5444       n = saved_n;
5445       src = saved_src;
5446       in_composite = 0;
5447       Dynarr_add (dst, ISO_CODE_ESC);
5448       Dynarr_add (dst, '1'); /* end composing */
5449       goto back_to_square_n; /* Wheeeeeeeee ..... */
5450     }
5451 #endif /* ENABLE_COMPOSITE_CHARS */
5452
5453   if (char_boundary && flags & CODING_STATE_END)
5454     {
5455       restore_left_to_right_direction (codesys, dst, &flags, 0);
5456       ensure_normal_shift (str, dst);
5457       for (i = 0; i < 4; i++)
5458         {
5459           Lisp_Object initial_charset =
5460             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5461           iso2022_designate (initial_charset, i, str, dst);
5462         }
5463     }
5464
5465   str->flags = flags;
5466   str->ch    = ch;
5467   str->iso2022.current_char_boundary = char_boundary;
5468   str->iso2022.current_charset = charset;
5469   str->iso2022.current_half = half;
5470
5471   /* Verbum caro factum est! */
5472 }
5473 #endif /* MULE */
5474 \f
5475 /************************************************************************/
5476 /*                     No-conversion methods                            */
5477 /************************************************************************/
5478
5479 /* This is used when reading in "binary" files -- i.e. files that may
5480    contain all 256 possible byte values and that are not to be
5481    interpreted as being in any particular decoding. */
5482 static void
5483 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5484                              unsigned_char_dynarr *dst, unsigned int n)
5485 {
5486   unsigned char c;
5487   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5488   unsigned int flags  = str->flags;
5489   unsigned int ch     = str->ch;
5490   eol_type_t eol_type = str->eol_type;
5491
5492   while (n--)
5493     {
5494       c = *src++;
5495
5496       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5497       DECODE_ADD_BINARY_CHAR (c, dst);
5498     label_continue_loop:;
5499     }
5500
5501   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5502
5503   str->flags = flags;
5504   str->ch    = ch;
5505 }
5506
5507 static void
5508 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5509                              unsigned_char_dynarr *dst, unsigned int n)
5510 {
5511   unsigned char c;
5512   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5513   unsigned int flags  = str->flags;
5514   unsigned int ch     = str->ch;
5515   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5516
5517   while (n--)
5518     {
5519       c = *src++;
5520       if (c == '\n')
5521         {
5522           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5523             Dynarr_add (dst, '\r');
5524           if (eol_type != EOL_CR)
5525             Dynarr_add (dst, '\n');
5526           ch = 0;
5527         }
5528       else if (BYTE_ASCII_P (c))
5529         {
5530           assert (ch == 0);
5531           Dynarr_add (dst, c);
5532         }
5533       else if (BUFBYTE_LEADING_BYTE_P (c))
5534         {
5535           assert (ch == 0);
5536           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5537               c == LEADING_BYTE_CONTROL_1)
5538             ch = c;
5539           else
5540             Dynarr_add (dst, '~'); /* untranslatable character */
5541         }
5542       else
5543         {
5544           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5545             Dynarr_add (dst, c);
5546           else if (ch == LEADING_BYTE_CONTROL_1)
5547             {
5548               assert (c < 0xC0);
5549               Dynarr_add (dst, c - 0x20);
5550             }
5551           /* else it should be the second or third byte of an
5552              untranslatable character, so ignore it */
5553           ch = 0;
5554         }
5555     }
5556
5557   str->flags = flags;
5558   str->ch    = ch;
5559 }
5560
5561 \f
5562
5563 /************************************************************************/
5564 /*                             Initialization                           */
5565 /************************************************************************/
5566
5567 void
5568 syms_of_file_coding (void)
5569 {
5570   INIT_LRECORD_IMPLEMENTATION (coding_system);
5571
5572   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
5573
5574   DEFSUBR (Fcoding_system_p);
5575   DEFSUBR (Ffind_coding_system);
5576   DEFSUBR (Fget_coding_system);
5577   DEFSUBR (Fcoding_system_list);
5578   DEFSUBR (Fcoding_system_name);
5579   DEFSUBR (Fmake_coding_system);
5580   DEFSUBR (Fcopy_coding_system);
5581   DEFSUBR (Fcoding_system_canonical_name_p);
5582   DEFSUBR (Fcoding_system_alias_p);
5583   DEFSUBR (Fcoding_system_aliasee);
5584   DEFSUBR (Fdefine_coding_system_alias);
5585   DEFSUBR (Fsubsidiary_coding_system);
5586
5587   DEFSUBR (Fcoding_system_type);
5588   DEFSUBR (Fcoding_system_doc_string);
5589 #ifdef MULE
5590   DEFSUBR (Fcoding_system_charset);
5591 #endif
5592   DEFSUBR (Fcoding_system_property);
5593
5594   DEFSUBR (Fcoding_category_list);
5595   DEFSUBR (Fset_coding_priority_list);
5596   DEFSUBR (Fcoding_priority_list);
5597   DEFSUBR (Fset_coding_category_system);
5598   DEFSUBR (Fcoding_category_system);
5599
5600   DEFSUBR (Fdetect_coding_region);
5601   DEFSUBR (Fdecode_coding_region);
5602   DEFSUBR (Fencode_coding_region);
5603 #ifdef MULE
5604   DEFSUBR (Fdecode_shift_jis_char);
5605   DEFSUBR (Fencode_shift_jis_char);
5606   DEFSUBR (Fdecode_big5_char);
5607   DEFSUBR (Fencode_big5_char);
5608   DEFSUBR (Fset_ucs_char);
5609   DEFSUBR (Fucs_char);
5610   DEFSUBR (Fset_char_ucs);
5611   DEFSUBR (Fchar_ucs);
5612 #endif /* MULE */
5613   defsymbol (&Qcoding_systemp, "coding-system-p");
5614   defsymbol (&Qno_conversion, "no-conversion");
5615   defsymbol (&Qraw_text, "raw-text");
5616 #ifdef MULE
5617   defsymbol (&Qbig5, "big5");
5618   defsymbol (&Qshift_jis, "shift-jis");
5619   defsymbol (&Qucs4, "ucs-4");
5620   defsymbol (&Qutf8, "utf-8");
5621   defsymbol (&Qccl, "ccl");
5622   defsymbol (&Qiso2022, "iso2022");
5623 #endif /* MULE */
5624   defsymbol (&Qmnemonic, "mnemonic");
5625   defsymbol (&Qeol_type, "eol-type");
5626   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5627   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5628
5629   defsymbol (&Qcr, "cr");
5630   defsymbol (&Qlf, "lf");
5631   defsymbol (&Qcrlf, "crlf");
5632   defsymbol (&Qeol_cr, "eol-cr");
5633   defsymbol (&Qeol_lf, "eol-lf");
5634   defsymbol (&Qeol_crlf, "eol-crlf");
5635 #ifdef MULE
5636   defsymbol (&Qcharset_g0, "charset-g0");
5637   defsymbol (&Qcharset_g1, "charset-g1");
5638   defsymbol (&Qcharset_g2, "charset-g2");
5639   defsymbol (&Qcharset_g3, "charset-g3");
5640   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5641   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5642   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5643   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5644   defsymbol (&Qno_iso6429, "no-iso6429");
5645   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5646   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5647
5648   defsymbol (&Qshort, "short");
5649   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5650   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5651   defsymbol (&Qseven, "seven");
5652   defsymbol (&Qlock_shift, "lock-shift");
5653   defsymbol (&Qescape_quoted, "escape-quoted");
5654 #endif /* MULE */
5655   defsymbol (&Qencode, "encode");
5656   defsymbol (&Qdecode, "decode");
5657
5658 #ifdef MULE
5659   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5660              "shift-jis");
5661   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5662              "big5");
5663   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5664              "ucs-4");
5665   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5666              "utf-8");
5667   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5668              "iso-7");
5669   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5670              "iso-8-designate");
5671   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5672              "iso-8-1");
5673   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5674              "iso-8-2");
5675   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5676              "iso-lock-shift");
5677 #endif /* MULE */
5678   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5679              "no-conversion");
5680 }
5681
5682 void
5683 lstream_type_create_file_coding (void)
5684 {
5685   LSTREAM_HAS_METHOD (decoding, reader);
5686   LSTREAM_HAS_METHOD (decoding, writer);
5687   LSTREAM_HAS_METHOD (decoding, rewinder);
5688   LSTREAM_HAS_METHOD (decoding, seekable_p);
5689   LSTREAM_HAS_METHOD (decoding, flusher);
5690   LSTREAM_HAS_METHOD (decoding, closer);
5691   LSTREAM_HAS_METHOD (decoding, marker);
5692
5693   LSTREAM_HAS_METHOD (encoding, reader);
5694   LSTREAM_HAS_METHOD (encoding, writer);
5695   LSTREAM_HAS_METHOD (encoding, rewinder);
5696   LSTREAM_HAS_METHOD (encoding, seekable_p);
5697   LSTREAM_HAS_METHOD (encoding, flusher);
5698   LSTREAM_HAS_METHOD (encoding, closer);
5699   LSTREAM_HAS_METHOD (encoding, marker);
5700 }
5701
5702 void
5703 vars_of_file_coding (void)
5704 {
5705   int i;
5706
5707   fcd = xnew (struct file_coding_dump);
5708   dumpstruct (&fcd, &fcd_description);
5709
5710   /* Initialize to something reasonable ... */
5711   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5712     {
5713       fcd->coding_category_system[i] = Qnil;
5714       fcd->coding_category_by_priority[i] = i;
5715     }
5716
5717   Fprovide (intern ("file-coding"));
5718
5719   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5720 Coding system used for TTY keyboard input.
5721 Not used under a windowing system.
5722 */ );
5723   Vkeyboard_coding_system = Qnil;
5724
5725   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5726 Coding system used for TTY display output.
5727 Not used under a windowing system.
5728 */ );
5729   Vterminal_coding_system = Qnil;
5730
5731   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5732 Overriding coding system used when reading from a file or process.
5733 You should bind this variable with `let', but do not set it globally.
5734 If this is non-nil, it specifies the coding system that will be used
5735 to decode input on read operations, such as from a file or process.
5736 It overrides `buffer-file-coding-system-for-read',
5737 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5738 this one for permanent changes to the environment.  */ );
5739   Vcoding_system_for_read = Qnil;
5740
5741   DEFVAR_LISP ("coding-system-for-write",
5742                &Vcoding_system_for_write /*
5743 Overriding coding system used when writing to a file or process.
5744 You should bind this variable with `let', but do not set it globally.
5745 If this is non-nil, it specifies the coding system that will be used
5746 to encode output for write operations, such as to a file or process.
5747 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5748 Use those variables instead of this one for permanent changes to the
5749 environment.  */ );
5750   Vcoding_system_for_write = Qnil;
5751
5752   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5753 Coding system used to convert pathnames when accessing files.
5754 */ );
5755   Vfile_name_coding_system = Qnil;
5756
5757   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5758 Non-nil means the buffer contents are regarded as multi-byte form
5759 of characters, not a binary code.  This affects the display, file I/O,
5760 and behaviors of various editing commands.
5761
5762 Setting this to nil does not do anything.
5763 */ );
5764   enable_multibyte_characters = 1;
5765 }
5766
5767 void
5768 complex_vars_of_file_coding (void)
5769 {
5770   staticpro (&Vcoding_system_hash_table);
5771   Vcoding_system_hash_table =
5772     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5773
5774   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5775   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5776
5777 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5778 {                                               \
5779   struct codesys_prop csp;                      \
5780   csp.sym = (Sym);                              \
5781   csp.prop_type = (Prop_Type);                  \
5782   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5783 } while (0)
5784
5785   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5786   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5787   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5788   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5789   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5790   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5791   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5792 #ifdef MULE
5793   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5794   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5795   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5796   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5797   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5798   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5799   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5800   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5801   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5802   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5803   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5804   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5805   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5806   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5807   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5808   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5809   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5810
5811   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5812   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5813 #endif /* MULE */
5814   /* Need to create this here or we're really screwed. */
5815   Fmake_coding_system
5816     (Qraw_text, Qno_conversion,
5817      build_string ("Raw text, which means it converts only line-break-codes."),
5818      list2 (Qmnemonic, build_string ("Raw")));
5819
5820   Fmake_coding_system
5821     (Qbinary, Qno_conversion,
5822      build_string ("Binary, which means it does not convert anything."),
5823      list4 (Qeol_type, Qlf,
5824             Qmnemonic, build_string ("Binary")));
5825
5826   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5827
5828   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5829
5830   Fdefine_coding_system_alias (Qterminal, Qbinary);
5831   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5832
5833   /* Need this for bootstrapping */
5834   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5835     Fget_coding_system (Qraw_text);
5836
5837 #ifdef MULE
5838   {
5839     size_t i;
5840
5841     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5842       fcd->ucs_to_mule_table[i] = Qnil;
5843   }
5844   staticpro (&mule_to_ucs_table);
5845   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5846 #endif /* MULE */
5847 }