src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 185                      unsigned_char_dynarr *dst, size_t n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const Extbyte *src, size_t n);
 189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 190                                 unsigned_char_dynarr *dst, size_t n);
 191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 192                             unsigned_char_dynarr *dst, unsigned int *flags);
 193 void char_finish_shift_jis (struct encoding_stream *str,
 194                             unsigned_char_dynarr *dst, unsigned int *flags);
 195
 196 static int detect_coding_big5 (struct detection_state *st,
 197                                const Extbyte *src, size_t n);
 198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, size_t n);
 200 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
 201                                 unsigned_char_dynarr *dst, size_t n);
 202
 203 static int detect_coding_ucs4 (struct detection_state *st,
 204                                const Extbyte *src, size_t n);
 205 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 206                                 unsigned_char_dynarr *dst, size_t n);
 207 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 208                        unsigned_char_dynarr *dst, unsigned int *flags);
 209 void char_finish_ucs4 (struct encoding_stream *str,
 210                        unsigned_char_dynarr *dst, unsigned int *flags);
 211
 212 static int detect_coding_utf8 (struct detection_state *st,
 213                                const Extbyte *src, size_t n);
 214 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 215                                 unsigned_char_dynarr *dst, size_t n);
 216 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 217                        unsigned_char_dynarr *dst, unsigned int *flags);
 218 void char_finish_utf8 (struct encoding_stream *str,
 219                        unsigned_char_dynarr *dst, unsigned int *flags);
 220
 221 static int postprocess_iso2022_mask (int mask);
 222 static void reset_iso2022 (Lisp_Object coding_system,
 223                            struct iso2022_decoder *iso);
 224 static int detect_coding_iso2022 (struct detection_state *st,
 225                                   const Extbyte *src, size_t n);
 226 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 227                                    unsigned_char_dynarr *dst, size_t n);
 228 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 229                           unsigned_char_dynarr *dst, unsigned int *flags);
 230 void char_finish_iso2022 (struct encoding_stream *str,
 231                           unsigned_char_dynarr *dst, unsigned int *flags);
 232 #endif /* MULE */
 233 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 234                                          unsigned_char_dynarr *dst, size_t n);
 235 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 236                                          unsigned_char_dynarr *dst, size_t n);
 237 static void mule_decode (Lstream *decoding, const Extbyte *src,
 238                          unsigned_char_dynarr *dst, size_t n);
 239 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 240                          unsigned_char_dynarr *dst, size_t n);
 241
 242 typedef struct codesys_prop codesys_prop;
 243 struct codesys_prop
 244 {
 245   Lisp_Object sym;
 246   int prop_type;
 247 };
 248
 249 typedef struct
 250 {
 251   Dynarr_declare (codesys_prop);
 252 } codesys_prop_dynarr;
 253
 254 static const struct lrecord_description codesys_prop_description_1[] = {
 255   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 256   { XD_END }
 257 };
 258
 259 static const struct struct_description codesys_prop_description = {
 260   sizeof (codesys_prop),
 261   codesys_prop_description_1
 262 };
 263
 264 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 265   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 266   { XD_END }
 267 };
 268
 269 static const struct struct_description codesys_prop_dynarr_description = {
 270   sizeof (codesys_prop_dynarr),
 271   codesys_prop_dynarr_description_1
 272 };
 273
 274 codesys_prop_dynarr *the_codesys_prop_dynarr;
 275
 276 enum codesys_prop_enum
 277 {
 278   CODESYS_PROP_ALL_OK,
 279   CODESYS_PROP_ISO2022,
 280   CODESYS_PROP_CCL
 281 };
 282
 283 \f
 284 /************************************************************************/
 285 /*                       Coding system functions                        */
 286 /************************************************************************/
 287
 288 static Lisp_Object mark_coding_system (Lisp_Object);
 289 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 290 static void finalize_coding_system (void *header, int for_disksave);
 291
 292 #ifdef MULE
 293 static const struct lrecord_description ccs_description_1[] = {
 294   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 295   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 296   { XD_END }
 297 };
 298
 299 static const struct struct_description ccs_description = {
 300   sizeof (charset_conversion_spec),
 301   ccs_description_1
 302 };
 303
 304 static const struct lrecord_description ccsd_description_1[] = {
 305   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 306   { XD_END }
 307 };
 308
 309 static const struct struct_description ccsd_description = {
 310   sizeof (charset_conversion_spec_dynarr),
 311   ccsd_description_1
 312 };
 313 #endif
 314
 315 static const struct lrecord_description coding_system_description[] = {
 316   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 317   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 318   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 319   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 320   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 321   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 322   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 323   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 324 #ifdef MULE
 325   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 326   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 327   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 328   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 329   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 330 #endif
 331   { XD_END }
 332 };
 333
 334 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 335                                mark_coding_system, print_coding_system,
 336                                finalize_coding_system,
 337                                0, 0, coding_system_description,
 338                                Lisp_Coding_System);
 339
 340 static Lisp_Object
 341 mark_coding_system (Lisp_Object obj)
 342 {
 343   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 344
 345   mark_object (CODING_SYSTEM_NAME (codesys));
 346   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 347   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 348   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 349   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 350   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 351
 352   switch (CODING_SYSTEM_TYPE (codesys))
 353     {
 354 #ifdef MULE
 355       int i;
 356     case CODESYS_ISO2022:
 357       for (i = 0; i < 4; i++)
 358         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 359       if (codesys->iso2022.input_conv)
 360         {
 361           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 362             {
 363               struct charset_conversion_spec *ccs =
 364                 Dynarr_atp (codesys->iso2022.input_conv, i);
 365               mark_object (ccs->from_charset);
 366               mark_object (ccs->to_charset);
 367             }
 368         }
 369       if (codesys->iso2022.output_conv)
 370         {
 371           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 372             {
 373               struct charset_conversion_spec *ccs =
 374                 Dynarr_atp (codesys->iso2022.output_conv, i);
 375               mark_object (ccs->from_charset);
 376               mark_object (ccs->to_charset);
 377             }
 378         }
 379       break;
 380
 381     case CODESYS_CCL:
 382       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 383       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 384       break;
 385 #endif /* MULE */
 386     default:
 387       break;
 388     }
 389
 390   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 391   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 392 }
 393
 394 static void
 395 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 396                      int escapeflag)
 397 {
 398   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 399   if (print_readably)
 400     error ("printing unreadable object #<coding_system 0x%x>",
 401            c->header.uid);
 402
 403   write_c_string ("#<coding_system ", printcharfun);
 404   print_internal (c->name, printcharfun, 1);
 405   write_c_string (">", printcharfun);
 406 }
 407
 408 static void
 409 finalize_coding_system (void *header, int for_disksave)
 410 {
 411   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 412   /* Since coding systems never go away, this function is not
 413      necessary.  But it would be necessary if we changed things
 414      so that coding systems could go away. */
 415   if (!for_disksave) /* see comment in lstream.c */
 416     {
 417       switch (CODING_SYSTEM_TYPE (c))
 418         {
 419 #ifdef MULE
 420         case CODESYS_ISO2022:
 421           if (c->iso2022.input_conv)
 422             {
 423               Dynarr_free (c->iso2022.input_conv);
 424               c->iso2022.input_conv = 0;
 425             }
 426           if (c->iso2022.output_conv)
 427             {
 428               Dynarr_free (c->iso2022.output_conv);
 429               c->iso2022.output_conv = 0;
 430             }
 431           break;
 432 #endif /* MULE */
 433         default:
 434           break;
 435         }
 436     }
 437 }
 438
 439 static eol_type_t
 440 symbol_to_eol_type (Lisp_Object symbol)
 441 {
 442   CHECK_SYMBOL (symbol);
 443   if (NILP (symbol))      return EOL_AUTODETECT;
 444   if (EQ (symbol, Qlf))   return EOL_LF;
 445   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 446   if (EQ (symbol, Qcr))   return EOL_CR;
 447
 448   signal_simple_error ("Unrecognized eol type", symbol);
 449   return EOL_AUTODETECT; /* not reached */
 450 }
 451
 452 static Lisp_Object
 453 eol_type_to_symbol (eol_type_t type)
 454 {
 455   switch (type)
 456     {
 457     default: abort ();
 458     case EOL_LF:         return Qlf;
 459     case EOL_CRLF:       return Qcrlf;
 460     case EOL_CR:         return Qcr;
 461     case EOL_AUTODETECT: return Qnil;
 462     }
 463 }
 464
 465 static void
 466 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 467 {
 468   Lisp_Object codesys_obj;
 469   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 470   char *codesys_name = (char *) alloca (len + 7);
 471   int mlen = -1;
 472   char *codesys_mnemonic=0;
 473
 474   Lisp_Object codesys_name_sym, sub_codesys_obj;
 475
 476   /* kludge */
 477
 478   XSETCODING_SYSTEM (codesys_obj, codesys);
 479
 480   memcpy (codesys_name,
 481           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 482
 483   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 484     {
 485       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 486       codesys_mnemonic = (char *) alloca (mlen + 7);
 487       memcpy (codesys_mnemonic,
 488               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 489     }
 490
 491 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 492   strcpy (codesys_name + len, "-" op_sys);                                      \
 493   if (mlen != -1)                                                               \
 494     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 495   codesys_name_sym = intern (codesys_name);                                     \
 496   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 497   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 498   if (mlen != -1)                                                               \
 499     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 500       build_string (codesys_mnemonic);                                          \
 501   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 502 } while (0)
 503
 504   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 505   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 506   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 507 }
 508
 509 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 510 Return t if OBJECT is a coding system.
 511 A coding system is an object that defines how text containing multiple
 512 character sets is encoded into a stream of (typically 8-bit) bytes.
 513 The coding system is used to decode the stream into a series of
 514 characters (which may be from multiple charsets) when the text is read
 515 from a file or process, and is used to encode the text back into the
 516 same format when it is written out to a file or process.
 517
 518 For example, many ISO2022-compliant coding systems (such as Compound
 519 Text, which is used for inter-client data under the X Window System)
 520 use escape sequences to switch between different charsets -- Japanese
 521 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 522 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 523 `make-coding-system' for more information.
 524
 525 Coding systems are normally identified using a symbol, and the
 526 symbol is accepted in place of the actual coding system object whenever
 527 a coding system is called for. (This is similar to how faces work.)
 528 */
 529        (object))
 530 {
 531   return CODING_SYSTEMP (object) ? Qt : Qnil;
 532 }
 533
 534 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 535 Retrieve the coding system of the given name.
 536
 537 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 538 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 539 If there is no such coding system, nil is returned.  Otherwise the
 540 associated coding system object is returned.
 541 */
 542        (coding_system_or_name))
 543 {
 544   if (NILP (coding_system_or_name))
 545     coding_system_or_name = Qbinary;
 546   else if (CODING_SYSTEMP (coding_system_or_name))
 547     return coding_system_or_name;
 548   else
 549     CHECK_SYMBOL (coding_system_or_name);
 550
 551   while (1)
 552     {
 553       coding_system_or_name =
 554         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 555
 556       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 557         return coding_system_or_name;
 558     }
 559 }
 560
 561 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 562 Retrieve the coding system of the given name.
 563 Same as `find-coding-system' except that if there is no such
 564 coding system, an error is signaled instead of returning nil.
 565 */
 566        (name))
 567 {
 568   Lisp_Object coding_system = Ffind_coding_system (name);
 569
 570   if (NILP (coding_system))
 571     signal_simple_error ("No such coding system", name);
 572   return coding_system;
 573 }
 574
 575 /* We store the coding systems in hash tables with the names as the key and the
 576    actual coding system object as the value.  Occasionally we need to use them
 577    in a list format.  These routines provide us with that. */
 578 struct coding_system_list_closure
 579 {
 580   Lisp_Object *coding_system_list;
 581 };
 582
 583 static int
 584 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 585                                   void *coding_system_list_closure)
 586 {
 587   /* This function can GC */
 588   struct coding_system_list_closure *cscl =
 589     (struct coding_system_list_closure *) coding_system_list_closure;
 590   Lisp_Object *coding_system_list = cscl->coding_system_list;
 591
 592   *coding_system_list = Fcons (key, *coding_system_list);
 593   return 0;
 594 }
 595
 596 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 597 Return a list of the names of all defined coding systems.
 598 */
 599        ())
 600 {
 601   Lisp_Object coding_system_list = Qnil;
 602   struct gcpro gcpro1;
 603   struct coding_system_list_closure coding_system_list_closure;
 604
 605   GCPRO1 (coding_system_list);
 606   coding_system_list_closure.coding_system_list = &coding_system_list;
 607   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 608                  &coding_system_list_closure);
 609   UNGCPRO;
 610
 611   return coding_system_list;
 612 }
 613
 614 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 615 Return the name of the given coding system.
 616 */
 617        (coding_system))
 618 {
 619   coding_system = Fget_coding_system (coding_system);
 620   return XCODING_SYSTEM_NAME (coding_system);
 621 }
 622
 623 static Lisp_Coding_System *
 624 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 625 {
 626   Lisp_Coding_System *codesys =
 627     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 628
 629   zero_lcrecord (codesys);
 630   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 631   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 632   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 633   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 634   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 635   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 636   CODING_SYSTEM_TYPE     (codesys) = type;
 637   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 638 #ifdef MULE
 639   if (type == CODESYS_ISO2022)
 640     {
 641       int i;
 642       for (i = 0; i < 4; i++)
 643         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 644     }
 645   else if (type == CODESYS_CCL)
 646     {
 647       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 648       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 649     }
 650 #endif /* MULE */
 651   CODING_SYSTEM_NAME (codesys) = name;
 652
 653   return codesys;
 654 }
 655
 656 #ifdef MULE
 657 /* Given a list of charset conversion specs as specified in a Lisp
 658    program, parse it into STORE_HERE. */
 659
 660 static void
 661 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 662                                 Lisp_Object spec_list)
 663 {
 664   Lisp_Object rest;
 665
 666   EXTERNAL_LIST_LOOP (rest, spec_list)
 667     {
 668       Lisp_Object car = XCAR (rest);
 669       Lisp_Object from, to;
 670       struct charset_conversion_spec spec;
 671
 672       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 673         signal_simple_error ("Invalid charset conversion spec", car);
 674       from = Fget_charset (XCAR (car));
 675       to = Fget_charset (XCAR (XCDR (car)));
 676       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 677            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 678         signal_simple_error_2
 679           ("Attempted conversion between different charset types",
 680            from, to);
 681       spec.from_charset = from;
 682       spec.to_charset = to;
 683
 684       Dynarr_add (store_here, spec);
 685     }
 686 }
 687
 688 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 689    specs, return the equivalent as the Lisp programmer would see it.
 690
 691    If LOAD_HERE is 0, return Qnil. */
 692
 693 static Lisp_Object
 694 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 695 {
 696   int i;
 697   Lisp_Object result;
 698
 699   if (!load_here)
 700     return Qnil;
 701   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 702     {
 703       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 704       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 705     }
 706
 707   return Fnreverse (result);
 708 }
 709
 710 #endif /* MULE */
 711
 712 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 713 Register symbol NAME as a coding system.
 714
 715 TYPE describes the conversion method used and should be one of
 716
 717 nil or 'undecided
 718      Automatic conversion.  XEmacs attempts to detect the coding system
 719      used in the file.
 720 'no-conversion
 721      No conversion.  Use this for binary files and such.  On output,
 722      graphic characters that are not in ASCII or Latin-1 will be
 723      replaced by a ?. (For a no-conversion-encoded buffer, these
 724      characters will only be present if you explicitly insert them.)
 725 'shift-jis
 726      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 727 'ucs-4
 728      ISO 10646 UCS-4 encoding.
 729 'utf-8
 730      ISO 10646 UTF-8 encoding.
 731 'iso2022
 732      Any ISO2022-compliant encoding.  Among other things, this includes
 733      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 734      standard Unix encoding for Japanese and other languages), and
 735      Compound Text (the encoding used in X11).  You can specify more
 736      specific information about the conversion with the PROPS argument.
 737 'big5
 738      Big5 (the encoding commonly used for Taiwanese).
 739 'ccl
 740      The conversion is performed using a user-written pseudo-code
 741      program.  CCL (Code Conversion Language) is the name of this
 742      pseudo-code.
 743 'internal
 744      Write out or read in the raw contents of the memory representing
 745      the buffer's text.  This is primarily useful for debugging
 746      purposes, and is only enabled when XEmacs has been compiled with
 747      DEBUG_XEMACS defined (via the --debug configure option).
 748      WARNING: Reading in a file using 'internal conversion can result
 749      in an internal inconsistency in the memory representing a
 750      buffer's text, which will produce unpredictable results and may
 751      cause XEmacs to crash.  Under normal circumstances you should
 752      never use 'internal conversion.
 753
 754 DOC-STRING is a string describing the coding system.
 755
 756 PROPS is a property list, describing the specific nature of the
 757 character set.  Recognized properties are:
 758
 759 'mnemonic
 760      String to be displayed in the modeline when this coding system is
 761      active.
 762
 763 'eol-type
 764      End-of-line conversion to be used.  It should be one of
 765
 766         nil
 767                 Automatically detect the end-of-line type (LF, CRLF,
 768                 or CR).  Also generate subsidiary coding systems named
 769                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 770                 identical to this coding system but have an EOL-TYPE
 771                 value of 'lf, 'crlf, and 'cr, respectively.
 772         'lf
 773                 The end of a line is marked externally using ASCII LF.
 774                 Since this is also the way that XEmacs represents an
 775                 end-of-line internally, specifying this option results
 776                 in no end-of-line conversion.  This is the standard
 777                 format for Unix text files.
 778         'crlf
 779                 The end of a line is marked externally using ASCII
 780                 CRLF.  This is the standard format for MS-DOS text
 781                 files.
 782         'cr
 783                 The end of a line is marked externally using ASCII CR.
 784                 This is the standard format for Macintosh text files.
 785         t
 786                 Automatically detect the end-of-line type but do not
 787                 generate subsidiary coding systems.  (This value is
 788                 converted to nil when stored internally, and
 789                 `coding-system-property' will return nil.)
 790
 791 'disable-composition
 792      If non-nil, composition/decomposition for combining characters
 793      are disabled.
 794
 795 'post-read-conversion
 796      Function called after a file has been read in, to perform the
 797      decoding.  Called with two arguments, START and END, denoting
 798      a region of the current buffer to be decoded.
 799
 800 'pre-write-conversion
 801      Function called before a file is written out, to perform the
 802      encoding.  Called with two arguments, START and END, denoting
 803      a region of the current buffer to be encoded.
 804
 805
 806 The following additional properties are recognized if TYPE is 'iso2022:
 807
 808 'charset-g0
 809 'charset-g1
 810 'charset-g2
 811 'charset-g3
 812      The character set initially designated to the G0 - G3 registers.
 813      The value should be one of
 814
 815           -- A charset object (designate that character set)
 816           -- nil (do not ever use this register)
 817           -- t (no character set is initially designated to
 818                 the register, but may be later on; this automatically
 819                 sets the corresponding `force-g*-on-output' property)
 820
 821 'force-g0-on-output
 822 'force-g1-on-output
 823 'force-g2-on-output
 824 'force-g2-on-output
 825      If non-nil, send an explicit designation sequence on output before
 826      using the specified register.
 827
 828 'short
 829      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 830      "ESC $ B" on output in place of the full designation sequences
 831      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 832
 833 'no-ascii-eol
 834      If non-nil, don't designate ASCII to G0 at each end of line on output.
 835      Setting this to non-nil also suppresses other state-resetting that
 836      normally happens at the end of a line.
 837
 838 'no-ascii-cntl
 839      If non-nil, don't designate ASCII to G0 before control chars on output.
 840
 841 'seven
 842      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 843      environment.
 844
 845 'lock-shift
 846      If non-nil, use locking-shift (SO/SI) instead of single-shift
 847      or designation by escape sequence.
 848
 849 'no-iso6429
 850      If non-nil, don't use ISO6429's direction specification.
 851
 852 'escape-quoted
 853      If non-nil, literal control characters that are the same as
 854      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 855      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 856      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 857      so that they can be properly distinguished from an escape sequence.
 858      (Note that doing this results in a non-portable encoding.) This
 859      encoding flag is used for byte-compiled files.  Note that ESC
 860      is a good choice for a quoting character because there are no
 861      escape sequences whose second byte is a character from the Control-0
 862      or Control-1 character sets; this is explicitly disallowed by the
 863      ISO2022 standard.
 864
 865 'input-charset-conversion
 866      A list of conversion specifications, specifying conversion of
 867      characters in one charset to another when decoding is performed.
 868      Each specification is a list of two elements: the source charset,
 869      and the destination charset.
 870
 871 'output-charset-conversion
 872      A list of conversion specifications, specifying conversion of
 873      characters in one charset to another when encoding is performed.
 874      The form of each specification is the same as for
 875      'input-charset-conversion.
 876
 877
 878 The following additional properties are recognized (and required)
 879 if TYPE is 'ccl:
 880
 881 'decode
 882      CCL program used for decoding (converting to internal format).
 883
 884 'encode
 885      CCL program used for encoding (converting to external format).
 886 */
 887        (name, type, doc_string, props))
 888 {
 889   Lisp_Coding_System *codesys;
 890   enum coding_system_type ty;
 891   int need_to_setup_eol_systems = 1;
 892
 893   /* Convert type to constant */
 894   if (NILP (type) || EQ (type, Qundecided))
 895                                       { ty = CODESYS_AUTODETECT; }
 896 #ifdef MULE
 897   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 898   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 899   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 900   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 901   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 902   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 903 #endif
 904   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 905 #ifdef DEBUG_XEMACS
 906   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 907 #endif
 908   else
 909     signal_simple_error ("Invalid coding system type", type);
 910
 911   CHECK_SYMBOL (name);
 912
 913   codesys = allocate_coding_system (ty, name);
 914
 915   if (NILP (doc_string))
 916     doc_string = build_string ("");
 917   else
 918     CHECK_STRING (doc_string);
 919   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 920
 921   {
 922     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 923       {
 924         if (EQ (key, Qmnemonic))
 925           {
 926             if (!NILP (value))
 927               CHECK_STRING (value);
 928             CODING_SYSTEM_MNEMONIC (codesys) = value;
 929           }
 930
 931         else if (EQ (key, Qeol_type))
 932           {
 933             need_to_setup_eol_systems = NILP (value);
 934             if (EQ (value, Qt))
 935               value = Qnil;
 936             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 937           }
 938
 939         else if (EQ (key, Qpost_read_conversion))
 940           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 941         else if (EQ (key, Qpre_write_conversion))
 942           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 943 #ifdef UTF2000
 944         else if (EQ (key, Qdisable_composition))
 945           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 946 #endif
 947 #ifdef MULE
 948         else if (ty == CODESYS_ISO2022)
 949           {
 950 #define FROB_INITIAL_CHARSET(charset_num) \
 951   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 952     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 953
 954             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 955             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 956             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 957             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 958
 959 #define FROB_FORCE_CHARSET(charset_num) \
 960   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 961
 962             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 963             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 964             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 965             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 966
 967 #define FROB_BOOLEAN_PROPERTY(prop) \
 968   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 969
 970             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 971             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 972             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 973             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 974             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 975             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 976             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 977
 978             else if (EQ (key, Qinput_charset_conversion))
 979               {
 980                 codesys->iso2022.input_conv =
 981                   Dynarr_new (charset_conversion_spec);
 982                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 983                                                 value);
 984               }
 985             else if (EQ (key, Qoutput_charset_conversion))
 986               {
 987                 codesys->iso2022.output_conv =
 988                   Dynarr_new (charset_conversion_spec);
 989                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 990                                                 value);
 991               }
 992             else
 993               signal_simple_error ("Unrecognized property", key);
 994           }
 995         else if (EQ (type, Qccl))
 996           {
 997             Lisp_Object sym;
 998             struct ccl_program test_ccl;
 999             Extbyte *suffix;
1000
1001             /* Check key first.  */
1002             if (EQ (key, Qdecode))
1003               suffix = "-ccl-decode";
1004             else if (EQ (key, Qencode))
1005               suffix = "-ccl-encode";
1006             else
1007               signal_simple_error ("Unrecognized property", key);
1008
1009             /* If value is vector, register it as a ccl program
1010                associated with an newly created symbol for
1011                backward compatibility.  */
1012             if (VECTORP (value))
1013               {
1014                 sym = Fintern (concat2 (Fsymbol_name (name),
1015                                         build_string (suffix)),
1016                                Qnil);
1017                 Fregister_ccl_program (sym, value);
1018               }
1019             else
1020               {
1021                 CHECK_SYMBOL (value);
1022                 sym = value;
1023               }
1024             /* check if the given ccl programs are valid.  */
1025             if (setup_ccl_program (&test_ccl, sym) < 0)
1026               signal_simple_error ("Invalid CCL program", value);
1027
1028             if (EQ (key, Qdecode))
1029               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1030             else if (EQ (key, Qencode))
1031               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1032
1033           }
1034 #endif /* MULE */
1035         else
1036           signal_simple_error ("Unrecognized property", key);
1037       }
1038   }
1039
1040   if (need_to_setup_eol_systems)
1041     setup_eol_coding_systems (codesys);
1042
1043   {
1044     Lisp_Object codesys_obj;
1045     XSETCODING_SYSTEM (codesys_obj, codesys);
1046     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1047     return codesys_obj;
1048   }
1049 }
1050
1051 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1052 Copy OLD-CODING-SYSTEM to NEW-NAME.
1053 If NEW-NAME does not name an existing coding system, a new one will
1054 be created.
1055 */
1056        (old_coding_system, new_name))
1057 {
1058   Lisp_Object new_coding_system;
1059   old_coding_system = Fget_coding_system (old_coding_system);
1060   new_coding_system = Ffind_coding_system (new_name);
1061   if (NILP (new_coding_system))
1062     {
1063       XSETCODING_SYSTEM (new_coding_system,
1064                          allocate_coding_system
1065                          (XCODING_SYSTEM_TYPE (old_coding_system),
1066                           new_name));
1067       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1068     }
1069
1070   {
1071     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1072     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1073     memcpy (((char *) to  ) + sizeof (to->header),
1074             ((char *) from) + sizeof (from->header),
1075             sizeof (*from) - sizeof (from->header));
1076     to->name = new_name;
1077   }
1078   return new_coding_system;
1079 }
1080
1081 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1082 Return t if OBJECT names a coding system, and is not a coding system alias.
1083 */
1084        (object))
1085 {
1086   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1087     ? Qt : Qnil;
1088 }
1089
1090 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1091 Return t if OBJECT is a coding system alias.
1092 All coding system aliases are created by `define-coding-system-alias'.
1093 */
1094        (object))
1095 {
1096   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1097     ? Qt : Qnil;
1098 }
1099
1100 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1101 Return the coding-system symbol for which symbol ALIAS is an alias.
1102 */
1103        (alias))
1104 {
1105   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1106   if (SYMBOLP (aliasee))
1107     return aliasee;
1108   else
1109     signal_simple_error ("Symbol is not a coding system alias", alias);
1110   return Qnil;          /* To keep the compiler happy */
1111 }
1112
1113 static Lisp_Object
1114 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1115 {
1116   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1117                   Qnil);
1118 }
1119
1120 /* A maphash function, for removing dangling coding system aliases. */
1121 static int
1122 dangling_coding_system_alias_p (Lisp_Object alias,
1123                                 Lisp_Object aliasee,
1124                                 void *dangling_aliases)
1125 {
1126   if (SYMBOLP (aliasee)
1127       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1128     {
1129       (*(int *) dangling_aliases)++;
1130       return 1;
1131     }
1132   else
1133     return 0;
1134 }
1135
1136 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1137 Define symbol ALIAS as an alias for coding system ALIASEE.
1138
1139 You can use this function to redefine an alias that has already been defined,
1140 but you cannot redefine a name which is the canonical name for a coding system.
1141 \(a canonical name of a coding system is what is returned when you call
1142 `coding-system-name' on a coding system).
1143
1144 ALIASEE itself can be an alias, which allows you to define nested aliases.
1145
1146 You are forbidden, however, from creating alias loops or `dangling' aliases.
1147 These will be detected, and an error will be signaled if you attempt to do so.
1148
1149 If ALIASEE is nil, then ALIAS will simply be undefined.
1150
1151 See also `coding-system-alias-p', `coding-system-aliasee',
1152 and `coding-system-canonical-name-p'.
1153 */
1154        (alias, aliasee))
1155 {
1156   Lisp_Object real_coding_system, probe;
1157
1158   CHECK_SYMBOL (alias);
1159
1160   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1161     signal_simple_error
1162       ("Symbol is the canonical name of a coding system and cannot be redefined",
1163        alias);
1164
1165   if (NILP (aliasee))
1166     {
1167       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1168       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1169       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1170
1171       Fremhash (alias, Vcoding_system_hash_table);
1172
1173       /* Undefine subsidiary aliases,
1174          presumably created by a previous call to this function */
1175       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1176           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1177           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1178         {
1179           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1180           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1181           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1182         }
1183
1184       /* Undefine dangling coding system aliases. */
1185       {
1186         int dangling_aliases;
1187
1188         do {
1189           dangling_aliases = 0;
1190           elisp_map_remhash (dangling_coding_system_alias_p,
1191                              Vcoding_system_hash_table,
1192                              &dangling_aliases);
1193         } while (dangling_aliases > 0);
1194       }
1195
1196       return Qnil;
1197     }
1198
1199   if (CODING_SYSTEMP (aliasee))
1200     aliasee = XCODING_SYSTEM_NAME (aliasee);
1201
1202   /* Checks that aliasee names a coding-system */
1203   real_coding_system = Fget_coding_system (aliasee);
1204
1205   /* Check for coding system alias loops */
1206   if (EQ (alias, aliasee))
1207     alias_loop: signal_simple_error_2
1208       ("Attempt to create a coding system alias loop", alias, aliasee);
1209
1210   for (probe = aliasee;
1211        SYMBOLP (probe);
1212        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1213     {
1214       if (EQ (probe, alias))
1215         goto alias_loop;
1216     }
1217
1218   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1219
1220   /* Set up aliases for subsidiaries.
1221      #### There must be a better way to handle subsidiary coding systems. */
1222   {
1223     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1224     int i;
1225     for (i = 0; i < countof (suffixes); i++)
1226       {
1227         Lisp_Object alias_subsidiary =
1228           append_suffix_to_symbol (alias, suffixes[i]);
1229         Lisp_Object aliasee_subsidiary =
1230           append_suffix_to_symbol (aliasee, suffixes[i]);
1231
1232         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1233           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1234       }
1235   }
1236   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1237      but it doesn't look intentional, so I'd rather return something
1238      meaningful or nothing at all. */
1239   return Qnil;
1240 }
1241
1242 static Lisp_Object
1243 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1244 {
1245   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1246   Lisp_Object new_coding_system;
1247
1248   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1249     return coding_system;
1250
1251   switch (type)
1252     {
1253     case EOL_AUTODETECT: return coding_system;
1254     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1255     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1256     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1257     default:       abort (); return Qnil;
1258     }
1259
1260   return NILP (new_coding_system) ? coding_system : new_coding_system;
1261 }
1262
1263 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1264 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1265 */
1266        (coding_system, eol_type))
1267 {
1268   coding_system = Fget_coding_system (coding_system);
1269
1270   return subsidiary_coding_system (coding_system,
1271                                    symbol_to_eol_type (eol_type));
1272 }
1273
1274 \f
1275 /************************************************************************/
1276 /*                         Coding system accessors                      */
1277 /************************************************************************/
1278
1279 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1280 Return the doc string for CODING-SYSTEM.
1281 */
1282        (coding_system))
1283 {
1284   coding_system = Fget_coding_system (coding_system);
1285   return XCODING_SYSTEM_DOC_STRING (coding_system);
1286 }
1287
1288 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1289 Return the type of CODING-SYSTEM.
1290 */
1291        (coding_system))
1292 {
1293   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1294     {
1295     default: abort ();
1296     case CODESYS_AUTODETECT:    return Qundecided;
1297 #ifdef MULE
1298     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1299     case CODESYS_ISO2022:       return Qiso2022;
1300     case CODESYS_BIG5:          return Qbig5;
1301     case CODESYS_UCS4:          return Qucs4;
1302     case CODESYS_UTF8:          return Qutf8;
1303     case CODESYS_CCL:           return Qccl;
1304 #endif
1305     case CODESYS_NO_CONVERSION: return Qno_conversion;
1306 #ifdef DEBUG_XEMACS
1307     case CODESYS_INTERNAL:      return Qinternal;
1308 #endif
1309     }
1310 }
1311
1312 #ifdef MULE
1313 static
1314 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1315 {
1316   Lisp_Object cs
1317     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1318
1319   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1320 }
1321
1322 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1323 Return initial charset of CODING-SYSTEM designated to GNUM.
1324 GNUM allows 0 .. 3.
1325 */
1326        (coding_system, gnum))
1327 {
1328   coding_system = Fget_coding_system (coding_system);
1329   CHECK_INT (gnum);
1330
1331   return coding_system_charset (coding_system, XINT (gnum));
1332 }
1333 #endif /* MULE */
1334
1335 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1336 Return the PROP property of CODING-SYSTEM.
1337 */
1338        (coding_system, prop))
1339 {
1340   int i, ok = 0;
1341   enum coding_system_type type;
1342
1343   coding_system = Fget_coding_system (coding_system);
1344   CHECK_SYMBOL (prop);
1345   type = XCODING_SYSTEM_TYPE (coding_system);
1346
1347   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1348     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1349       {
1350         ok = 1;
1351         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1352           {
1353           case CODESYS_PROP_ALL_OK:
1354             break;
1355 #ifdef MULE
1356           case CODESYS_PROP_ISO2022:
1357             if (type != CODESYS_ISO2022)
1358               signal_simple_error
1359                 ("Property only valid in ISO2022 coding systems",
1360                  prop);
1361             break;
1362
1363           case CODESYS_PROP_CCL:
1364             if (type != CODESYS_CCL)
1365               signal_simple_error
1366                 ("Property only valid in CCL coding systems",
1367                  prop);
1368             break;
1369 #endif /* MULE */
1370           default:
1371             abort ();
1372           }
1373       }
1374
1375   if (!ok)
1376     signal_simple_error ("Unrecognized property", prop);
1377
1378   if (EQ (prop, Qname))
1379     return XCODING_SYSTEM_NAME (coding_system);
1380   else if (EQ (prop, Qtype))
1381     return Fcoding_system_type (coding_system);
1382   else if (EQ (prop, Qdoc_string))
1383     return XCODING_SYSTEM_DOC_STRING (coding_system);
1384   else if (EQ (prop, Qmnemonic))
1385     return XCODING_SYSTEM_MNEMONIC (coding_system);
1386   else if (EQ (prop, Qeol_type))
1387     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1388   else if (EQ (prop, Qeol_lf))
1389     return XCODING_SYSTEM_EOL_LF (coding_system);
1390   else if (EQ (prop, Qeol_crlf))
1391     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1392   else if (EQ (prop, Qeol_cr))
1393     return XCODING_SYSTEM_EOL_CR (coding_system);
1394   else if (EQ (prop, Qpost_read_conversion))
1395     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1396   else if (EQ (prop, Qpre_write_conversion))
1397     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1398 #ifdef MULE
1399   else if (type == CODESYS_ISO2022)
1400     {
1401       if (EQ (prop, Qcharset_g0))
1402         return coding_system_charset (coding_system, 0);
1403       else if (EQ (prop, Qcharset_g1))
1404         return coding_system_charset (coding_system, 1);
1405       else if (EQ (prop, Qcharset_g2))
1406         return coding_system_charset (coding_system, 2);
1407       else if (EQ (prop, Qcharset_g3))
1408         return coding_system_charset (coding_system, 3);
1409
1410 #define FORCE_CHARSET(charset_num) \
1411   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1412    (coding_system, charset_num) ? Qt : Qnil)
1413
1414       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1415       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1416       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1417       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1418
1419 #define LISP_BOOLEAN(prop) \
1420   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1421
1422       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1423       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1424       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1425       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1426       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1427       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1428       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1429
1430       else if (EQ (prop, Qinput_charset_conversion))
1431         return
1432           unparse_charset_conversion_specs
1433             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1434       else if (EQ (prop, Qoutput_charset_conversion))
1435         return
1436           unparse_charset_conversion_specs
1437             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1438       else
1439         abort ();
1440     }
1441   else if (type == CODESYS_CCL)
1442     {
1443       if (EQ (prop, Qdecode))
1444         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1445       else if (EQ (prop, Qencode))
1446         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1447       else
1448         abort ();
1449     }
1450 #endif /* MULE */
1451   else
1452     abort ();
1453
1454   return Qnil; /* not reached */
1455 }
1456
1457 \f
1458 /************************************************************************/
1459 /*                       Coding category functions                      */
1460 /************************************************************************/
1461
1462 static int
1463 decode_coding_category (Lisp_Object symbol)
1464 {
1465   int i;
1466
1467   CHECK_SYMBOL (symbol);
1468   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1469     if (EQ (coding_category_symbol[i], symbol))
1470       return i;
1471
1472   signal_simple_error ("Unrecognized coding category", symbol);
1473   return 0; /* not reached */
1474 }
1475
1476 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1477 Return a list of all recognized coding categories.
1478 */
1479        ())
1480 {
1481   int i;
1482   Lisp_Object list = Qnil;
1483
1484   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1485     list = Fcons (coding_category_symbol[i], list);
1486   return list;
1487 }
1488
1489 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1490 Change the priority order of the coding categories.
1491 LIST should be list of coding categories, in descending order of
1492 priority.  Unspecified coding categories will be lower in priority
1493 than all specified ones, in the same relative order they were in
1494 previously.
1495 */
1496        (list))
1497 {
1498   int category_to_priority[CODING_CATEGORY_LAST + 1];
1499   int i, j;
1500   Lisp_Object rest;
1501
1502   /* First generate a list that maps coding categories to priorities. */
1503
1504   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1505     category_to_priority[i] = -1;
1506
1507   /* Highest priority comes from the specified list. */
1508   i = 0;
1509   EXTERNAL_LIST_LOOP (rest, list)
1510     {
1511       int cat = decode_coding_category (XCAR (rest));
1512
1513       if (category_to_priority[cat] >= 0)
1514         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1515       category_to_priority[cat] = i++;
1516     }
1517
1518   /* Now go through the existing categories by priority to retrieve
1519      the categories not yet specified and preserve their priority
1520      order. */
1521   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1522     {
1523       int cat = fcd->coding_category_by_priority[j];
1524       if (category_to_priority[cat] < 0)
1525         category_to_priority[cat] = i++;
1526     }
1527
1528   /* Now we need to construct the inverse of the mapping we just
1529      constructed. */
1530
1531   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1532     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1533
1534   /* Phew!  That was confusing. */
1535   return Qnil;
1536 }
1537
1538 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1539 Return a list of coding categories in descending order of priority.
1540 */
1541        ())
1542 {
1543   int i;
1544   Lisp_Object list = Qnil;
1545
1546   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1547     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1548                   list);
1549   return list;
1550 }
1551
1552 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1553 Change the coding system associated with a coding category.
1554 */
1555        (coding_category, coding_system))
1556 {
1557   int cat = decode_coding_category (coding_category);
1558
1559   coding_system = Fget_coding_system (coding_system);
1560   fcd->coding_category_system[cat] = coding_system;
1561   return Qnil;
1562 }
1563
1564 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1565 Return the coding system associated with a coding category.
1566 */
1567        (coding_category))
1568 {
1569   int cat = decode_coding_category (coding_category);
1570   Lisp_Object sys = fcd->coding_category_system[cat];
1571
1572   if (!NILP (sys))
1573     return XCODING_SYSTEM_NAME (sys);
1574   return Qnil;
1575 }
1576
1577 \f
1578 /************************************************************************/
1579 /*                     Detecting the encoding of data                   */
1580 /************************************************************************/
1581
1582 struct detection_state
1583 {
1584   eol_type_t eol_type;
1585   int seen_non_ascii;
1586   int mask;
1587 #ifdef MULE
1588   struct
1589     {
1590       int mask;
1591       int in_second_byte;
1592     }
1593   big5;
1594
1595   struct
1596     {
1597       int mask;
1598       int in_second_byte;
1599     }
1600   shift_jis;
1601
1602   struct
1603     {
1604       int mask;
1605       int in_byte;
1606   }
1607   ucs4;
1608
1609   struct
1610     {
1611       int mask;
1612       int in_byte;
1613     }
1614   utf8;
1615
1616   struct
1617     {
1618       int mask;
1619       int initted;
1620       struct iso2022_decoder iso;
1621       unsigned int flags;
1622       int high_byte_count;
1623       unsigned int saw_single_shift:1;
1624     }
1625   iso2022;
1626 #endif
1627   struct
1628     {
1629       int seen_anything;
1630       int just_saw_cr;
1631     }
1632   eol;
1633 };
1634
1635 static int
1636 acceptable_control_char_p (int c)
1637 {
1638   switch (c)
1639     {
1640       /* Allow and ignore control characters that you might
1641          reasonably see in a text file */
1642     case '\r':
1643     case '\n':
1644     case '\t':
1645     case  7: /* bell */
1646     case  8: /* backspace */
1647     case 11: /* vertical tab */
1648     case 12: /* form feed */
1649     case 26: /* MS-DOS C-z junk */
1650     case 31: /* '^_' -- for info */
1651       return 1;
1652     default:
1653       return 0;
1654     }
1655 }
1656
1657 static int
1658 mask_has_at_most_one_bit_p (int mask)
1659 {
1660   /* Perhaps the only thing useful you learn from intensive Microsoft
1661      technical interviews */
1662   return (mask & (mask - 1)) == 0;
1663 }
1664
1665 static eol_type_t
1666 detect_eol_type (struct detection_state *st, const Extbyte *src,
1667                  size_t n)
1668 {
1669   while (n--)
1670     {
1671       unsigned char c = *(unsigned char *)src++;
1672       if (c == '\n')
1673         {
1674           if (st->eol.just_saw_cr)
1675             return EOL_CRLF;
1676           else if (st->eol.seen_anything)
1677             return EOL_LF;
1678         }
1679       else if (st->eol.just_saw_cr)
1680         return EOL_CR;
1681       else if (c == '\r')
1682         st->eol.just_saw_cr = 1;
1683       else
1684         st->eol.just_saw_cr = 0;
1685       st->eol.seen_anything = 1;
1686     }
1687
1688   return EOL_AUTODETECT;
1689 }
1690
1691 /* Attempt to determine the encoding and EOL type of the given text.
1692    Before calling this function for the first type, you must initialize
1693    st->eol_type as appropriate and initialize st->mask to ~0.
1694
1695    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1696    not yet known.
1697
1698    st->mask holds the determined coding category mask, or ~0 if only
1699    ASCII has been seen so far.
1700
1701    Returns:
1702
1703    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1704         is present in st->mask
1705    1 == definitive answers are here for both st->eol_type and st->mask
1706 */
1707
1708 static int
1709 detect_coding_type (struct detection_state *st, const Extbyte *src,
1710                     size_t n, int just_do_eol)
1711 {
1712   if (st->eol_type == EOL_AUTODETECT)
1713     st->eol_type = detect_eol_type (st, src, n);
1714
1715   if (just_do_eol)
1716     return st->eol_type != EOL_AUTODETECT;
1717
1718   if (!st->seen_non_ascii)
1719     {
1720       for (; n; n--, src++)
1721         {
1722           unsigned char c = *(unsigned char *) src;
1723           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1724             {
1725               st->seen_non_ascii = 1;
1726 #ifdef MULE
1727               st->shift_jis.mask = ~0;
1728               st->big5.mask = ~0;
1729               st->ucs4.mask = ~0;
1730               st->utf8.mask = ~0;
1731               st->iso2022.mask = ~0;
1732 #endif
1733               break;
1734             }
1735         }
1736     }
1737
1738   if (!n)
1739     return 0;
1740 #ifdef MULE
1741   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1742     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1743   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1744     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1745   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1746     st->big5.mask = detect_coding_big5 (st, src, n);
1747   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1748     st->utf8.mask = detect_coding_utf8 (st, src, n);
1749   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1750     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1751
1752   st->mask
1753     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1754     | st->utf8.mask | st->ucs4.mask;
1755 #endif
1756   {
1757     int retval = mask_has_at_most_one_bit_p (st->mask);
1758     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1759     return retval && st->eol_type != EOL_AUTODETECT;
1760   }
1761 }
1762
1763 static Lisp_Object
1764 coding_system_from_mask (int mask)
1765 {
1766   if (mask == ~0)
1767     {
1768       /* If the file was entirely or basically ASCII, use the
1769          default value of `buffer-file-coding-system'. */
1770       Lisp_Object retval =
1771         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1772       if (!NILP (retval))
1773         {
1774           retval = Ffind_coding_system (retval);
1775           if (NILP (retval))
1776             {
1777               warn_when_safe
1778                 (Qbad_variable, Qwarning,
1779                  "Invalid `default-buffer-file-coding-system', set to nil");
1780               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1781             }
1782         }
1783       if (NILP (retval))
1784         retval = Fget_coding_system (Qraw_text);
1785       return retval;
1786     }
1787   else
1788     {
1789       int i;
1790       int cat = -1;
1791 #ifdef MULE
1792       mask = postprocess_iso2022_mask (mask);
1793 #endif
1794       /* Look through the coding categories by priority and find
1795          the first one that is allowed. */
1796       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1797         {
1798           cat = fcd->coding_category_by_priority[i];
1799           if ((mask & (1 << cat)) &&
1800               !NILP (fcd->coding_category_system[cat]))
1801             break;
1802         }
1803       if (cat >= 0)
1804         return fcd->coding_category_system[cat];
1805       else
1806         return Fget_coding_system (Qraw_text);
1807     }
1808 }
1809
1810 /* Given a seekable read stream and potential coding system and EOL type
1811    as specified, do any autodetection that is called for.  If the
1812    coding system and/or EOL type are not `autodetect', they will be left
1813    alone; but this function will never return an autodetect coding system
1814    or EOL type.
1815
1816    This function does not automatically fetch subsidiary coding systems;
1817    that should be unnecessary with the explicit eol-type argument. */
1818
1819 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1820
1821 void
1822 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1823                               eol_type_t *eol_type_in_out)
1824 {
1825   struct detection_state decst;
1826
1827   if (*eol_type_in_out == EOL_AUTODETECT)
1828     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1829
1830   xzero (decst);
1831   decst.eol_type = *eol_type_in_out;
1832   decst.mask = ~0;
1833
1834   /* If autodetection is called for, do it now. */
1835   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1836       || *eol_type_in_out == EOL_AUTODETECT)
1837     {
1838       Extbyte buf[4096];
1839       Lisp_Object coding_system = Qnil;
1840       Extbyte *p;
1841       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1842       Extbyte *scan_end;
1843
1844       /* Look for initial "-*-"; mode line prefix */
1845       for (p = buf,
1846              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1847            p <= scan_end
1848              && *p != '\n'
1849              && *p != '\r';
1850            p++)
1851         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1852           {
1853             Extbyte *local_vars_beg = p + 3;
1854             /* Look for final "-*-"; mode line suffix */
1855             for (p = local_vars_beg,
1856                    scan_end = buf + nread - LENGTH ("-*-");
1857                  p <= scan_end
1858                    && *p != '\n'
1859                    && *p != '\r';
1860                  p++)
1861               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1862                 {
1863                   Extbyte *suffix = p;
1864                   /* Look for "coding:" */
1865                   for (p = local_vars_beg,
1866                          scan_end = suffix - LENGTH ("coding:?");
1867                        p <= scan_end;
1868                        p++)
1869                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1870                         && (p == local_vars_beg
1871                             || (*(p-1) == ' '  ||
1872                                 *(p-1) == '\t' ||
1873                                 *(p-1) == ';')))
1874                       {
1875                         Extbyte save;
1876                         int n;
1877                         p += LENGTH ("coding:");
1878                         while (*p == ' ' || *p == '\t') p++;
1879
1880                         /* Get coding system name */
1881                         save = *suffix; *suffix = '\0';
1882                         /* Characters valid in a MIME charset name (rfc 1521),
1883                            and in a Lisp symbol name. */
1884                         n = strspn ( (char *) p,
1885                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1886                                     "abcdefghijklmnopqrstuvwxyz"
1887                                     "0123456789"
1888                                     "!$%&*+-.^_{|}~");
1889                         *suffix = save;
1890                         if (n > 0)
1891                           {
1892                             save = p[n]; p[n] = '\0';
1893                             coding_system =
1894                               Ffind_coding_system (intern ((char *) p));
1895                             p[n] = save;
1896                           }
1897                         break;
1898                       }
1899                   break;
1900                 }
1901             break;
1902           }
1903
1904       if (NILP (coding_system))
1905         do
1906           {
1907             if (detect_coding_type (&decst, buf, nread,
1908                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1909                                     != CODESYS_AUTODETECT))
1910               break;
1911             nread = Lstream_read (stream, buf, sizeof (buf));
1912             if (nread == 0)
1913               break;
1914           }
1915         while (1);
1916
1917       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1918                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1919         do
1920           {
1921             if (detect_coding_type (&decst, buf, nread, 1))
1922               break;
1923             nread = Lstream_read (stream, buf, sizeof (buf));
1924             if (!nread)
1925               break;
1926           }
1927         while (1);
1928
1929       *eol_type_in_out = decst.eol_type;
1930       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1931         {
1932           if (NILP (coding_system))
1933             *codesys_in_out = coding_system_from_mask (decst.mask);
1934           else
1935             *codesys_in_out = coding_system;
1936         }
1937     }
1938
1939   /* If we absolutely can't determine the EOL type, just assume LF. */
1940   if (*eol_type_in_out == EOL_AUTODETECT)
1941     *eol_type_in_out = EOL_LF;
1942
1943   Lstream_rewind (stream);
1944 }
1945
1946 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1947 Detect coding system of the text in the region between START and END.
1948 Return a list of possible coding systems ordered by priority.
1949 If only ASCII characters are found, return 'undecided or one of
1950 its subsidiary coding systems according to a detected end-of-line
1951 type.  Optional arg BUFFER defaults to the current buffer.
1952 */
1953        (start, end, buffer))
1954 {
1955   Lisp_Object val = Qnil;
1956   struct buffer *buf = decode_buffer (buffer, 0);
1957   Bufpos b, e;
1958   Lisp_Object instream, lb_instream;
1959   Lstream *istr, *lb_istr;
1960   struct detection_state decst;
1961   struct gcpro gcpro1, gcpro2;
1962
1963   get_buffer_range_char (buf, start, end, &b, &e, 0);
1964   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1965   lb_istr = XLSTREAM (lb_instream);
1966   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1967   istr = XLSTREAM (instream);
1968   GCPRO2 (instream, lb_instream);
1969   xzero (decst);
1970   decst.eol_type = EOL_AUTODETECT;
1971   decst.mask = ~0;
1972   while (1)
1973     {
1974       Extbyte random_buffer[4096];
1975       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1976
1977       if (!nread)
1978         break;
1979       if (detect_coding_type (&decst, random_buffer, nread, 0))
1980         break;
1981     }
1982
1983   if (decst.mask == ~0)
1984     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1985                                     decst.eol_type);
1986   else
1987     {
1988       int i;
1989
1990       val = Qnil;
1991 #ifdef MULE
1992       decst.mask = postprocess_iso2022_mask (decst.mask);
1993 #endif
1994       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1995         {
1996           int sys = fcd->coding_category_by_priority[i];
1997           if (decst.mask & (1 << sys))
1998             {
1999               Lisp_Object codesys = fcd->coding_category_system[sys];
2000               if (!NILP (codesys))
2001                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2002               val = Fcons (codesys, val);
2003             }
2004         }
2005     }
2006   Lstream_close (istr);
2007   UNGCPRO;
2008   Lstream_delete (istr);
2009   Lstream_delete (lb_istr);
2010   return val;
2011 }
2012
2013 \f
2014 /************************************************************************/
2015 /*           Converting to internal Mule format ("decoding")            */
2016 /************************************************************************/
2017
2018 /* A decoding stream is a stream used for decoding text (i.e.
2019    converting from some external format to internal format).
2020    The decoding-stream object keeps track of the actual coding
2021    stream, the stream that is at the other end, and data that
2022    needs to be persistent across the lifetime of the stream. */
2023
2024 /* Handle the EOL stuff related to just-read-in character C.
2025    EOL_TYPE is the EOL type of the coding stream.
2026    FLAGS is the current value of FLAGS in the coding stream, and may
2027    be modified by this macro.  (The macro only looks at the
2028    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2029    bytes are to be written.  You need to also define a local goto
2030    label "label_continue_loop" that is at the end of the main
2031    character-reading loop.
2032
2033    If C is a CR character, then this macro handles it entirely and
2034    jumps to label_continue_loop.  Otherwise, this macro does not add
2035    anything to DST, and continues normally.  You should continue
2036    processing C normally after this macro. */
2037
2038 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2039 do {                                                            \
2040   if (c == '\r')                                                \
2041     {                                                           \
2042       if (eol_type == EOL_CR)                                   \
2043         Dynarr_add (dst, '\n');                                 \
2044       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2045         Dynarr_add (dst, c);                                    \
2046       else                                                      \
2047         flags |= CODING_STATE_CR;                               \
2048       goto label_continue_loop;                                 \
2049     }                                                           \
2050   else if (flags & CODING_STATE_CR)                             \
2051     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2052       if (c != '\n')                                            \
2053         Dynarr_add (dst, '\r');                                 \
2054       flags &= ~CODING_STATE_CR;                                \
2055     }                                                           \
2056 } while (0)
2057
2058 /* C should be a binary character in the range 0 - 255; convert
2059    to internal format and add to Dynarr DST. */
2060
2061 #ifdef UTF2000
2062 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2063 do {                                            \
2064   if (BYTE_ASCII_P (c))                         \
2065     Dynarr_add (dst, c);                        \
2066   else                                          \
2067     {                                           \
2068       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2069       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2070     }                                           \
2071 } while (0)
2072
2073 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2074 INLINE_HEADER void
2075 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2076 {
2077   if ( c <= 0x7f )
2078     {
2079       Dynarr_add (dst, c);
2080     }
2081   else if ( c <= 0x7ff )
2082     {
2083       Dynarr_add (dst, (c >> 6) | 0xc0);
2084       Dynarr_add (dst, (c & 0x3f) | 0x80);
2085     }
2086   else if ( c <= 0xffff )
2087     {
2088       Dynarr_add (dst,  (c >> 12) | 0xe0);
2089       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2090       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2091     }
2092   else if ( c <= 0x1fffff )
2093     {
2094       Dynarr_add (dst,  (c >> 18) | 0xf0);
2095       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2096       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2097       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2098     }
2099   else if ( c <= 0x3ffffff )
2100     {
2101       Dynarr_add (dst,  (c >> 24) | 0xf8);
2102       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2103       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2104       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2105       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2106     }
2107   else
2108     {
2109       Dynarr_add (dst,  (c >> 30) | 0xfc);
2110       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2111       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2112       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2113       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2114       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2115     }
2116 }
2117 #else
2118 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2119 do {                                            \
2120   if (BYTE_ASCII_P (c))                         \
2121     Dynarr_add (dst, c);                        \
2122   else if (BYTE_C1_P (c))                       \
2123     {                                           \
2124       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2125       Dynarr_add (dst, c + 0x20);               \
2126     }                                           \
2127   else                                          \
2128     {                                           \
2129       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2130       Dynarr_add (dst, c);                      \
2131     }                                           \
2132 } while (0)
2133 #endif
2134
2135 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2136 do {                                    \
2137   if (ch)                               \
2138     {                                   \
2139       DECODE_ADD_BINARY_CHAR (ch, dst); \
2140       ch = 0;                           \
2141     }                                   \
2142 } while (0)
2143
2144 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2145 do {                                    \
2146   if (flags & CODING_STATE_END)         \
2147     {                                   \
2148       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2149       if (flags & CODING_STATE_CR)      \
2150         Dynarr_add (dst, '\r');         \
2151     }                                   \
2152 } while (0)
2153
2154 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2155
2156 struct decoding_stream
2157 {
2158   /* Coding system that governs the conversion. */
2159   Lisp_Coding_System *codesys;
2160
2161   /* Stream that we read the encoded data from or
2162      write the decoded data to. */
2163   Lstream *other_end;
2164
2165   /* If we are reading, then we can return only a fixed amount of
2166      data, so if the conversion resulted in too much data, we store it
2167      here for retrieval the next time around. */
2168   unsigned_char_dynarr *runoff;
2169
2170   /* FLAGS holds flags indicating the current state of the decoding.
2171      Some of these flags are dependent on the coding system. */
2172   unsigned int flags;
2173
2174   /* CPOS holds a partially built-up code-point of character. */
2175   unsigned int cpos;
2176
2177   /* EOL_TYPE specifies the type of end-of-line conversion that
2178      currently applies.  We need to keep this separate from the
2179      EOL type stored in CODESYS because the latter might indicate
2180      automatic EOL-type detection while the former will always
2181      indicate a particular EOL type. */
2182   eol_type_t eol_type;
2183 #ifdef MULE
2184   /* Additional ISO2022 information.  We define the structure above
2185      because it's also needed by the detection routines. */
2186   struct iso2022_decoder iso2022;
2187
2188   /* Additional information (the state of the running CCL program)
2189      used by the CCL decoder. */
2190   struct ccl_program ccl;
2191
2192   /* counter for UTF-8 or UCS-4 */
2193   unsigned char counter;
2194 #endif
2195 #ifdef UTF2000
2196   unsigned combined_char_count;
2197   Emchar combined_chars[16];
2198   Lisp_Object combining_table;
2199 #endif
2200   struct detection_state decst;
2201 };
2202
2203 #ifdef UTF2000
2204 extern Lisp_Object Vcharacter_composition_table;
2205
2206 INLINE_HEADER void
2207 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2208 INLINE_HEADER void
2209 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2210 {
2211   unsigned i;
2212
2213   for (i = 0; i < str->combined_char_count; i++)
2214     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2215   str->combined_char_count = 0;
2216   str->combining_table = Qnil;
2217 }
2218
2219 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2220                       unsigned_char_dynarr* dst);
2221 void
2222 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2223                  Emchar character, unsigned_char_dynarr* dst)
2224 {
2225   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2226     DECODE_ADD_UCS_CHAR (character, dst);
2227   else if (!CHAR_ID_TABLE_P (str->combining_table))
2228     {
2229       Lisp_Object ret
2230         = get_char_id_table (character, Vcharacter_composition_table);
2231
2232       if (NILP (ret))
2233         DECODE_ADD_UCS_CHAR (character, dst);
2234       else
2235         {
2236           str->combined_chars[0] = character;
2237           str->combined_char_count = 1;
2238           str->combining_table = ret;
2239         }
2240     }
2241   else
2242     {
2243       Lisp_Object ret
2244         = get_char_id_table (character, str->combining_table);
2245
2246       if (CHARP (ret))
2247         {
2248           Emchar char2 = XCHARVAL (ret);
2249           ret = get_char_id_table (char2, Vcharacter_composition_table);
2250           if (NILP (ret))
2251             {
2252               DECODE_ADD_UCS_CHAR (char2, dst);
2253               str->combined_char_count = 0;
2254               str->combining_table = Qnil;
2255             }
2256           else
2257             {
2258               str->combined_chars[0] = char2;
2259               str->combined_char_count = 1;
2260               str->combining_table = ret;
2261             }
2262         }
2263       else if (CHAR_ID_TABLE_P (ret))
2264         {
2265           str->combined_chars[str->combined_char_count++] = character;
2266           str->combining_table = ret;
2267         }
2268       else
2269         {
2270           COMPOSE_FLUSH_CHARS (str, dst);
2271           DECODE_ADD_UCS_CHAR (character, dst);
2272         }
2273     }
2274 }
2275 #else /* not UTF2000 */
2276 #define COMPOSE_FLUSH_CHARS(str, dst)
2277 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2278 #endif /* UTF2000 */
2279
2280 static ssize_t decoding_reader (Lstream *stream,
2281                                 unsigned char *data, size_t size);
2282 static ssize_t decoding_writer (Lstream *stream,
2283                                 const unsigned char *data, size_t size);
2284 static int decoding_rewinder   (Lstream *stream);
2285 static int decoding_seekable_p (Lstream *stream);
2286 static int decoding_flusher    (Lstream *stream);
2287 static int decoding_closer     (Lstream *stream);
2288
2289 static Lisp_Object decoding_marker (Lisp_Object stream);
2290
2291 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2292                                sizeof (struct decoding_stream));
2293
2294 static Lisp_Object
2295 decoding_marker (Lisp_Object stream)
2296 {
2297   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2298   Lisp_Object str_obj;
2299
2300   /* We do not need to mark the coding systems or charsets stored
2301      within the stream because they are stored in a global list
2302      and automatically marked. */
2303
2304   XSETLSTREAM (str_obj, str);
2305   mark_object (str_obj);
2306   if (str->imp->marker)
2307     return (str->imp->marker) (str_obj);
2308   else
2309     return Qnil;
2310 }
2311
2312 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2313    so we read data from the other end, decode it, and store it into DATA. */
2314
2315 static ssize_t
2316 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2317 {
2318   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2319   unsigned char *orig_data = data;
2320   ssize_t read_size;
2321   int error_occurred = 0;
2322
2323   /* We need to interface to mule_decode(), which expects to take some
2324      amount of data and store the result into a Dynarr.  We have
2325      mule_decode() store into str->runoff, and take data from there
2326      as necessary. */
2327
2328   /* We loop until we have enough data, reading chunks from the other
2329      end and decoding it. */
2330   while (1)
2331     {
2332       /* Take data from the runoff if we can.  Make sure to take at
2333          most SIZE bytes, and delete the data from the runoff. */
2334       if (Dynarr_length (str->runoff) > 0)
2335         {
2336           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2337           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2338           Dynarr_delete_many (str->runoff, 0, chunk);
2339           data += chunk;
2340           size -= chunk;
2341         }
2342
2343       if (size == 0)
2344         break; /* No more room for data */
2345
2346       if (str->flags & CODING_STATE_END)
2347         /* This means that on the previous iteration, we hit the EOF on
2348            the other end.  We loop once more so that mule_decode() can
2349            output any final stuff it may be holding, or any "go back
2350            to a sane state" escape sequences. (This latter makes sense
2351            during encoding.) */
2352         break;
2353
2354       /* Exhausted the runoff, so get some more.  DATA has at least
2355          SIZE bytes left of storage in it, so it's OK to read directly
2356          into it.  (We'll be overwriting above, after we've decoded it
2357          into the runoff.) */
2358       read_size = Lstream_read (str->other_end, data, size);
2359       if (read_size < 0)
2360         {
2361           error_occurred = 1;
2362           break;
2363         }
2364       if (read_size == 0)
2365         /* There might be some more end data produced in the translation.
2366            See the comment above. */
2367         str->flags |= CODING_STATE_END;
2368       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2369     }
2370
2371   if (data - orig_data == 0)
2372     return error_occurred ? -1 : 0;
2373   else
2374     return data - orig_data;
2375 }
2376
2377 static ssize_t
2378 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2379 {
2380   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2381   ssize_t retval;
2382
2383   /* Decode all our data into the runoff, and then attempt to write
2384      it all out to the other end.  Remove whatever chunk we succeeded
2385      in writing. */
2386   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2387   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2388                           Dynarr_length (str->runoff));
2389   if (retval > 0)
2390     Dynarr_delete_many (str->runoff, 0, retval);
2391   /* Do NOT return retval.  The return value indicates how much
2392      of the incoming data was written, not how many bytes were
2393      written. */
2394   return size;
2395 }
2396
2397 static void
2398 reset_decoding_stream (struct decoding_stream *str)
2399 {
2400 #ifdef MULE
2401   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2402     {
2403       Lisp_Object coding_system;
2404       XSETCODING_SYSTEM (coding_system, str->codesys);
2405       reset_iso2022 (coding_system, &str->iso2022);
2406     }
2407   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2408     {
2409       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2410     }
2411   str->counter = 0;
2412 #endif /* MULE */
2413 #ifdef UTF2000
2414   str->combined_char_count = 0;
2415   str->combining_table = Qnil;
2416 #endif
2417   str->flags = str->cpos = 0;
2418 }
2419
2420 static int
2421 decoding_rewinder (Lstream *stream)
2422 {
2423   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2424   reset_decoding_stream (str);
2425   Dynarr_reset (str->runoff);
2426   return Lstream_rewind (str->other_end);
2427 }
2428
2429 static int
2430 decoding_seekable_p (Lstream *stream)
2431 {
2432   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2433   return Lstream_seekable_p (str->other_end);
2434 }
2435
2436 static int
2437 decoding_flusher (Lstream *stream)
2438 {
2439   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2440   return Lstream_flush (str->other_end);
2441 }
2442
2443 static int
2444 decoding_closer (Lstream *stream)
2445 {
2446   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2447   if (stream->flags & LSTREAM_FL_WRITE)
2448     {
2449       str->flags |= CODING_STATE_END;
2450       decoding_writer (stream, 0, 0);
2451     }
2452   Dynarr_free (str->runoff);
2453 #ifdef MULE
2454 #ifdef ENABLE_COMPOSITE_CHARS
2455   if (str->iso2022.composite_chars)
2456     Dynarr_free (str->iso2022.composite_chars);
2457 #endif
2458 #endif
2459   return Lstream_close (str->other_end);
2460 }
2461
2462 Lisp_Object
2463 decoding_stream_coding_system (Lstream *stream)
2464 {
2465   Lisp_Object coding_system;
2466   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2467
2468   XSETCODING_SYSTEM (coding_system, str->codesys);
2469   return subsidiary_coding_system (coding_system, str->eol_type);
2470 }
2471
2472 void
2473 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2474 {
2475   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2476   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2477   str->codesys = cs;
2478   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2479     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2480   reset_decoding_stream (str);
2481 }
2482
2483 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2484    stream for writing, no automatic code detection will be performed.
2485    The reason for this is that automatic code detection requires a
2486    seekable input.  Things will also fail if you open a decoding
2487    stream for reading using a non-fully-specified coding system and
2488    a non-seekable input stream. */
2489
2490 static Lisp_Object
2491 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2492                         const char *mode)
2493 {
2494   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2495   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2496   Lisp_Object obj;
2497
2498   xzero (*str);
2499   str->other_end = stream;
2500   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2501   str->eol_type = EOL_AUTODETECT;
2502   if (!strcmp (mode, "r")
2503       && Lstream_seekable_p (stream))
2504     /* We can determine the coding system now. */
2505     determine_real_coding_system (stream, &codesys, &str->eol_type);
2506   set_decoding_stream_coding_system (lstr, codesys);
2507   str->decst.eol_type = str->eol_type;
2508   str->decst.mask = ~0;
2509   XSETLSTREAM (obj, lstr);
2510   return obj;
2511 }
2512
2513 Lisp_Object
2514 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2515 {
2516   return make_decoding_stream_1 (stream, codesys, "r");
2517 }
2518
2519 Lisp_Object
2520 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2521 {
2522   return make_decoding_stream_1 (stream, codesys, "w");
2523 }
2524
2525 /* Note: the decode_coding_* functions all take the same
2526    arguments as mule_decode(), which is to say some SRC data of
2527    size N, which is to be stored into dynamic array DST.
2528    DECODING is the stream within which the decoding is
2529    taking place, but no data is actually read from or
2530    written to that stream; that is handled in decoding_reader()
2531    or decoding_writer().  This allows the same functions to
2532    be used for both reading and writing. */
2533
2534 static void
2535 mule_decode (Lstream *decoding, const Extbyte *src,
2536              unsigned_char_dynarr *dst, size_t n)
2537 {
2538   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2539
2540   /* If necessary, do encoding-detection now.  We do this when
2541      we're a writing stream or a non-seekable reading stream,
2542      meaning that we can't just process the whole input,
2543      rewind, and start over. */
2544
2545   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2546       str->eol_type == EOL_AUTODETECT)
2547     {
2548       Lisp_Object codesys;
2549
2550       XSETCODING_SYSTEM (codesys, str->codesys);
2551       detect_coding_type (&str->decst, src, n,
2552                           CODING_SYSTEM_TYPE (str->codesys) !=
2553                           CODESYS_AUTODETECT);
2554       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2555           str->decst.mask != ~0)
2556         /* #### This is cheesy.  What we really ought to do is
2557            buffer up a certain amount of data so as to get a
2558            less random result. */
2559         codesys = coding_system_from_mask (str->decst.mask);
2560       str->eol_type = str->decst.eol_type;
2561       if (XCODING_SYSTEM (codesys) != str->codesys)
2562         {
2563           /* Preserve the CODING_STATE_END flag in case it was set.
2564              If we erase it, bad things might happen. */
2565           int was_end = str->flags & CODING_STATE_END;
2566           set_decoding_stream_coding_system (decoding, codesys);
2567           if (was_end)
2568             str->flags |= CODING_STATE_END;
2569         }
2570     }
2571
2572   switch (CODING_SYSTEM_TYPE (str->codesys))
2573     {
2574 #ifdef DEBUG_XEMACS
2575     case CODESYS_INTERNAL:
2576       Dynarr_add_many (dst, src, n);
2577       break;
2578 #endif
2579     case CODESYS_AUTODETECT:
2580       /* If we got this far and still haven't decided on the coding
2581          system, then do no conversion. */
2582     case CODESYS_NO_CONVERSION:
2583       decode_coding_no_conversion (decoding, src, dst, n);
2584       break;
2585 #ifdef MULE
2586     case CODESYS_SHIFT_JIS:
2587       decode_coding_sjis (decoding, src, dst, n);
2588       break;
2589     case CODESYS_BIG5:
2590       decode_coding_big5 (decoding, src, dst, n);
2591       break;
2592     case CODESYS_UCS4:
2593       decode_coding_ucs4 (decoding, src, dst, n);
2594       break;
2595     case CODESYS_UTF8:
2596       decode_coding_utf8 (decoding, src, dst, n);
2597       break;
2598     case CODESYS_CCL:
2599       str->ccl.last_block = str->flags & CODING_STATE_END;
2600       /* When applying ccl program to stream, MUST NOT set NULL
2601          pointer to src.  */
2602       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2603                   dst, n, 0, CCL_MODE_DECODING);
2604       break;
2605     case CODESYS_ISO2022:
2606       decode_coding_iso2022 (decoding, src, dst, n);
2607       break;
2608 #endif /* MULE */
2609     default:
2610       abort ();
2611     }
2612 }
2613
2614 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2615 Decode the text between START and END which is encoded in CODING-SYSTEM.
2616 This is useful if you've read in encoded text from a file without decoding
2617 it (e.g. you read in a JIS-formatted file but used the `binary' or
2618 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2619 Return length of decoded text.
2620 BUFFER defaults to the current buffer if unspecified.
2621 */
2622        (start, end, coding_system, buffer))
2623 {
2624   Bufpos b, e;
2625   struct buffer *buf = decode_buffer (buffer, 0);
2626   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2627   Lstream *istr, *ostr;
2628   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2629
2630   get_buffer_range_char (buf, start, end, &b, &e, 0);
2631
2632   barf_if_buffer_read_only (buf, b, e);
2633
2634   coding_system = Fget_coding_system (coding_system);
2635   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2636   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2637   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2638                                               coding_system);
2639   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2640                                            Fget_coding_system (Qbinary));
2641   istr = XLSTREAM (instream);
2642   ostr = XLSTREAM (outstream);
2643   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2644
2645   /* The chain of streams looks like this:
2646
2647      [BUFFER] <----- send through
2648                      ------> [ENCODE AS BINARY]
2649                              ------> [DECODE AS SPECIFIED]
2650                                      ------> [BUFFER]
2651    */
2652
2653   while (1)
2654     {
2655       char tempbuf[1024]; /* some random amount */
2656       Bufpos newpos, even_newer_pos;
2657       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2658       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2659
2660       if (!size_in_bytes)
2661         break;
2662       newpos = lisp_buffer_stream_startpos (istr);
2663       Lstream_write (ostr, tempbuf, size_in_bytes);
2664       even_newer_pos = lisp_buffer_stream_startpos (istr);
2665       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2666                            even_newer_pos, 0);
2667     }
2668   Lstream_close (istr);
2669   Lstream_close (ostr);
2670   UNGCPRO;
2671   Lstream_delete (istr);
2672   Lstream_delete (ostr);
2673   Lstream_delete (XLSTREAM (de_outstream));
2674   Lstream_delete (XLSTREAM (lb_outstream));
2675   return Qnil;
2676 }
2677
2678 \f
2679 /************************************************************************/
2680 /*           Converting to an external encoding ("encoding")            */
2681 /************************************************************************/
2682
2683 /* An encoding stream is an output stream.  When you create the
2684    stream, you specify the coding system that governs the encoding
2685    and another stream that the resulting encoded data is to be
2686    sent to, and then start sending data to it. */
2687
2688 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2689
2690 struct encoding_stream
2691 {
2692   /* Coding system that governs the conversion. */
2693   Lisp_Coding_System *codesys;
2694
2695   /* Stream that we read the encoded data from or
2696      write the decoded data to. */
2697   Lstream *other_end;
2698
2699   /* If we are reading, then we can return only a fixed amount of
2700      data, so if the conversion resulted in too much data, we store it
2701      here for retrieval the next time around. */
2702   unsigned_char_dynarr *runoff;
2703
2704   /* FLAGS holds flags indicating the current state of the encoding.
2705      Some of these flags are dependent on the coding system. */
2706   unsigned int flags;
2707
2708   /* CH holds a partially built-up character.  Since we only deal
2709      with one- and two-byte characters at the moment, we only use
2710      this to store the first byte of a two-byte character. */
2711   unsigned int ch;
2712 #ifdef MULE
2713   /* Additional information used by the ISO2022 encoder. */
2714   struct
2715     {
2716       /* CHARSET holds the character sets currently assigned to the G0
2717          through G3 registers.  It is initialized from the array
2718          INITIAL_CHARSET in CODESYS. */
2719       Lisp_Object charset[4];
2720
2721       /* Which registers are currently invoked into the left (GL) and
2722          right (GR) halves of the 8-bit encoding space? */
2723       int register_left, register_right;
2724
2725       /* Whether we need to explicitly designate the charset in the
2726          G? register before using it.  It is initialized from the
2727          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2728       unsigned char force_charset_on_output[4];
2729
2730       /* Other state variables that need to be preserved across
2731          invocations. */
2732       Lisp_Object current_charset;
2733       int current_half;
2734       int current_char_boundary;
2735     } iso2022;
2736
2737   void (*encode_char) (struct encoding_stream *str, Emchar c,
2738                        unsigned_char_dynarr *dst, unsigned int *flags);
2739   void (*finish) (struct encoding_stream *str,
2740                   unsigned_char_dynarr *dst, unsigned int *flags);
2741
2742   /* Additional information (the state of the running CCL program)
2743      used by the CCL encoder. */
2744   struct ccl_program ccl;
2745 #endif /* MULE */
2746 };
2747
2748 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2749 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2750                                 size_t size);
2751 static int encoding_rewinder   (Lstream *stream);
2752 static int encoding_seekable_p (Lstream *stream);
2753 static int encoding_flusher    (Lstream *stream);
2754 static int encoding_closer     (Lstream *stream);
2755
2756 static Lisp_Object encoding_marker (Lisp_Object stream);
2757
2758 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2759                                sizeof (struct encoding_stream));
2760
2761 static Lisp_Object
2762 encoding_marker (Lisp_Object stream)
2763 {
2764   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2765   Lisp_Object str_obj;
2766
2767   /* We do not need to mark the coding systems or charsets stored
2768      within the stream because they are stored in a global list
2769      and automatically marked. */
2770
2771   XSETLSTREAM (str_obj, str);
2772   mark_object (str_obj);
2773   if (str->imp->marker)
2774     return (str->imp->marker) (str_obj);
2775   else
2776     return Qnil;
2777 }
2778
2779 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2780    so we read data from the other end, encode it, and store it into DATA. */
2781
2782 static ssize_t
2783 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2784 {
2785   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2786   unsigned char *orig_data = data;
2787   ssize_t read_size;
2788   int error_occurred = 0;
2789
2790   /* We need to interface to mule_encode(), which expects to take some
2791      amount of data and store the result into a Dynarr.  We have
2792      mule_encode() store into str->runoff, and take data from there
2793      as necessary. */
2794
2795   /* We loop until we have enough data, reading chunks from the other
2796      end and encoding it. */
2797   while (1)
2798     {
2799       /* Take data from the runoff if we can.  Make sure to take at
2800          most SIZE bytes, and delete the data from the runoff. */
2801       if (Dynarr_length (str->runoff) > 0)
2802         {
2803           int chunk = min ((int) size, Dynarr_length (str->runoff));
2804           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2805           Dynarr_delete_many (str->runoff, 0, chunk);
2806           data += chunk;
2807           size -= chunk;
2808         }
2809
2810       if (size == 0)
2811         break; /* No more room for data */
2812
2813       if (str->flags & CODING_STATE_END)
2814         /* This means that on the previous iteration, we hit the EOF on
2815            the other end.  We loop once more so that mule_encode() can
2816            output any final stuff it may be holding, or any "go back
2817            to a sane state" escape sequences. (This latter makes sense
2818            during encoding.) */
2819         break;
2820
2821       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2822          left of storage in it, so it's OK to read directly into it.
2823          (We'll be overwriting above, after we've encoded it into the
2824          runoff.) */
2825       read_size = Lstream_read (str->other_end, data, size);
2826       if (read_size < 0)
2827         {
2828           error_occurred = 1;
2829           break;
2830         }
2831       if (read_size == 0)
2832         /* There might be some more end data produced in the translation.
2833            See the comment above. */
2834         str->flags |= CODING_STATE_END;
2835       mule_encode (stream, data, str->runoff, read_size);
2836     }
2837
2838   if (data == orig_data)
2839     return error_occurred ? -1 : 0;
2840   else
2841     return data - orig_data;
2842 }
2843
2844 static ssize_t
2845 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2846 {
2847   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2848   ssize_t retval;
2849
2850   /* Encode all our data into the runoff, and then attempt to write
2851      it all out to the other end.  Remove whatever chunk we succeeded
2852      in writing. */
2853   mule_encode (stream, data, str->runoff, size);
2854   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2855                           Dynarr_length (str->runoff));
2856   if (retval > 0)
2857     Dynarr_delete_many (str->runoff, 0, retval);
2858   /* Do NOT return retval.  The return value indicates how much
2859      of the incoming data was written, not how many bytes were
2860      written. */
2861   return size;
2862 }
2863
2864 static void
2865 reset_encoding_stream (struct encoding_stream *str)
2866 {
2867 #ifdef MULE
2868   switch (CODING_SYSTEM_TYPE (str->codesys))
2869     {
2870     case CODESYS_ISO2022:
2871       {
2872         int i;
2873
2874         str->encode_char = &char_encode_iso2022;
2875         str->finish = &char_finish_iso2022;
2876         for (i = 0; i < 4; i++)
2877           {
2878             str->iso2022.charset[i] =
2879               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2880             str->iso2022.force_charset_on_output[i] =
2881               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2882           }
2883         str->iso2022.register_left = 0;
2884         str->iso2022.register_right = 1;
2885         str->iso2022.current_charset = Qnil;
2886         str->iso2022.current_half = 0;
2887         break;
2888       }
2889     case CODESYS_CCL:
2890       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2891       break;
2892     case CODESYS_UTF8:
2893       str->encode_char = &char_encode_utf8;
2894       str->finish = &char_finish_utf8;
2895       break;
2896     case CODESYS_UCS4:
2897       str->encode_char = &char_encode_ucs4;
2898       str->finish = &char_finish_ucs4;
2899       break;
2900     case CODESYS_SHIFT_JIS:
2901       str->encode_char = &char_encode_shift_jis;
2902       str->finish = &char_finish_shift_jis;
2903       break;
2904     default:
2905       break;
2906     }
2907 #endif /* MULE */
2908   str->iso2022.current_char_boundary = 0;
2909   str->flags = str->ch = 0;
2910 }
2911
2912 static int
2913 encoding_rewinder (Lstream *stream)
2914 {
2915   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2916   reset_encoding_stream (str);
2917   Dynarr_reset (str->runoff);
2918   return Lstream_rewind (str->other_end);
2919 }
2920
2921 static int
2922 encoding_seekable_p (Lstream *stream)
2923 {
2924   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2925   return Lstream_seekable_p (str->other_end);
2926 }
2927
2928 static int
2929 encoding_flusher (Lstream *stream)
2930 {
2931   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2932   return Lstream_flush (str->other_end);
2933 }
2934
2935 static int
2936 encoding_closer (Lstream *stream)
2937 {
2938   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2939   if (stream->flags & LSTREAM_FL_WRITE)
2940     {
2941       str->flags |= CODING_STATE_END;
2942       encoding_writer (stream, 0, 0);
2943     }
2944   Dynarr_free (str->runoff);
2945   return Lstream_close (str->other_end);
2946 }
2947
2948 Lisp_Object
2949 encoding_stream_coding_system (Lstream *stream)
2950 {
2951   Lisp_Object coding_system;
2952   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2953
2954   XSETCODING_SYSTEM (coding_system, str->codesys);
2955   return coding_system;
2956 }
2957
2958 void
2959 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2960 {
2961   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2962   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2963   str->codesys = cs;
2964   reset_encoding_stream (str);
2965 }
2966
2967 static Lisp_Object
2968 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2969                         const char *mode)
2970 {
2971   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2972   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2973   Lisp_Object obj;
2974
2975   xzero (*str);
2976   str->runoff = Dynarr_new (unsigned_char);
2977   str->other_end = stream;
2978   set_encoding_stream_coding_system (lstr, codesys);
2979   XSETLSTREAM (obj, lstr);
2980   return obj;
2981 }
2982
2983 Lisp_Object
2984 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2985 {
2986   return make_encoding_stream_1 (stream, codesys, "r");
2987 }
2988
2989 Lisp_Object
2990 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2991 {
2992   return make_encoding_stream_1 (stream, codesys, "w");
2993 }
2994
2995 /* Convert N bytes of internally-formatted data stored in SRC to an
2996    external format, according to the encoding stream ENCODING.
2997    Store the encoded data into DST. */
2998
2999 static void
3000 mule_encode (Lstream *encoding, const Bufbyte *src,
3001              unsigned_char_dynarr *dst, size_t n)
3002 {
3003   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3004
3005   switch (CODING_SYSTEM_TYPE (str->codesys))
3006     {
3007 #ifdef DEBUG_XEMACS
3008     case CODESYS_INTERNAL:
3009       Dynarr_add_many (dst, src, n);
3010       break;
3011 #endif
3012     case CODESYS_AUTODETECT:
3013       /* If we got this far and still haven't decided on the coding
3014          system, then do no conversion. */
3015     case CODESYS_NO_CONVERSION:
3016       encode_coding_no_conversion (encoding, src, dst, n);
3017       break;
3018 #ifdef MULE
3019     case CODESYS_BIG5:
3020       encode_coding_big5 (encoding, src, dst, n);
3021       break;
3022     case CODESYS_CCL:
3023       str->ccl.last_block = str->flags & CODING_STATE_END;
3024       /* When applying ccl program to stream, MUST NOT set NULL
3025          pointer to src.  */
3026       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3027                   dst, n, 0, CCL_MODE_ENCODING);
3028       break;
3029 #endif /* MULE */
3030     default:
3031       text_encode_generic (encoding, src, dst, n);
3032     }
3033 }
3034
3035 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3036 Encode the text between START and END using CODING-SYSTEM.
3037 This will, for example, convert Japanese characters into stuff such as
3038 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3039 text.  BUFFER defaults to the current buffer if unspecified.
3040 */
3041        (start, end, coding_system, buffer))
3042 {
3043   Bufpos b, e;
3044   struct buffer *buf = decode_buffer (buffer, 0);
3045   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3046   Lstream *istr, *ostr;
3047   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3048
3049   get_buffer_range_char (buf, start, end, &b, &e, 0);
3050
3051   barf_if_buffer_read_only (buf, b, e);
3052
3053   coding_system = Fget_coding_system (coding_system);
3054   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3055   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3056   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3057                                               Fget_coding_system (Qbinary));
3058   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3059                                            coding_system);
3060   istr = XLSTREAM (instream);
3061   ostr = XLSTREAM (outstream);
3062   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3063   /* The chain of streams looks like this:
3064
3065      [BUFFER] <----- send through
3066                      ------> [ENCODE AS SPECIFIED]
3067                              ------> [DECODE AS BINARY]
3068                                      ------> [BUFFER]
3069    */
3070   while (1)
3071     {
3072       char tempbuf[1024]; /* some random amount */
3073       Bufpos newpos, even_newer_pos;
3074       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3075       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3076
3077       if (!size_in_bytes)
3078         break;
3079       newpos = lisp_buffer_stream_startpos (istr);
3080       Lstream_write (ostr, tempbuf, size_in_bytes);
3081       even_newer_pos = lisp_buffer_stream_startpos (istr);
3082       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3083                            even_newer_pos, 0);
3084     }
3085
3086   {
3087     Charcount retlen =
3088       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3089     Lstream_close (istr);
3090     Lstream_close (ostr);
3091     UNGCPRO;
3092     Lstream_delete (istr);
3093     Lstream_delete (ostr);
3094     Lstream_delete (XLSTREAM (de_outstream));
3095     Lstream_delete (XLSTREAM (lb_outstream));
3096     return make_int (retlen);
3097   }
3098 }
3099
3100 #ifdef MULE
3101 \f
3102 static void
3103 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3104                      unsigned_char_dynarr *dst, size_t n)
3105 {
3106   unsigned char c;
3107   unsigned char char_boundary;
3108   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3109   unsigned int flags          = str->flags;
3110   Emchar ch                   = str->ch;
3111
3112   char_boundary = str->iso2022.current_char_boundary;
3113
3114   while (n--)
3115     {
3116       c = *src++;
3117
3118       if (char_boundary == 0)
3119         {
3120           if (c >= 0xfc)
3121             {
3122               ch = c & 0x01;
3123               char_boundary = 5;
3124             }
3125           else if (c >= 0xf8)
3126             {
3127               ch = c & 0x03;
3128               char_boundary = 4;
3129             }
3130           else if (c >= 0xf0)
3131             {
3132               ch = c & 0x07;
3133               char_boundary = 3;
3134             }
3135           else if (c >= 0xe0)
3136             {
3137               ch = c & 0x0f;
3138               char_boundary = 2;
3139             }
3140           else if (c >= 0xc0)
3141             {
3142               ch = c & 0x1f;
3143               char_boundary = 1;
3144             }
3145           else
3146             (*str->encode_char) (str, c, dst, &flags);
3147         }
3148       else if (char_boundary == 1)
3149         {
3150           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3151           ch =0;
3152           char_boundary = 0;
3153         }
3154       else
3155         {
3156           ch = (ch << 6) | (c & 0x3f);
3157           char_boundary--;
3158         }
3159     }
3160
3161   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3162     {
3163       (*str->finish) (str, dst, &flags);
3164     }
3165
3166   str->flags = flags;
3167   str->ch    = ch;
3168   str->iso2022.current_char_boundary = char_boundary;
3169 }
3170
3171 \f
3172 /************************************************************************/
3173 /*                          Shift-JIS methods                           */
3174 /************************************************************************/
3175
3176 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3177    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3178    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3179    encoded by "position-code + 0x80".  A character of JISX0208
3180    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3181    position-codes are divided and shifted so that it fit in the range
3182    below.
3183
3184    --- CODE RANGE of Shift-JIS ---
3185    (character set)      (range)
3186    ASCII                0x00 .. 0x7F
3187    JISX0201-Kana        0xA0 .. 0xDF
3188    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3189             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3190    -------------------------------
3191
3192 */
3193
3194 /* Is this the first byte of a Shift-JIS two-byte char? */
3195
3196 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3197   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3198
3199 /* Is this the second byte of a Shift-JIS two-byte char? */
3200
3201 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3202   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3203
3204 #define BYTE_SJIS_KATAKANA_P(c) \
3205   ((c) >= 0xA1 && (c) <= 0xDF)
3206
3207 static int
3208 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3209 {
3210   while (n--)
3211     {
3212       unsigned char c = *(unsigned char *)src++;
3213       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3214         return 0;
3215       if (st->shift_jis.in_second_byte)
3216         {
3217           st->shift_jis.in_second_byte = 0;
3218           if (c < 0x40)
3219             return 0;
3220         }
3221       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3222         st->shift_jis.in_second_byte = 1;
3223     }
3224   return CODING_CATEGORY_SHIFT_JIS_MASK;
3225 }
3226
3227 /* Convert Shift-JIS data to internal format. */
3228
3229 static void
3230 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3231                     unsigned_char_dynarr *dst, size_t n)
3232 {
3233   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3234   unsigned int flags  = str->flags;
3235   unsigned int cpos   = str->cpos;
3236   eol_type_t eol_type = str->eol_type;
3237
3238   while (n--)
3239     {
3240       unsigned char c = *(unsigned char *)src++;
3241
3242       if (cpos)
3243         {
3244           /* Previous character was first byte of Shift-JIS Kanji char. */
3245           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3246             {
3247               unsigned char e1, e2;
3248
3249               DECODE_SJIS (cpos, c, e1, e2);
3250 #ifdef UTF2000
3251               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3252                                             e1 & 0x7F,
3253                                             e2 & 0x7F), dst);
3254 #else
3255               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3256               Dynarr_add (dst, e1);
3257               Dynarr_add (dst, e2);
3258 #endif
3259             }
3260           else
3261             {
3262               DECODE_ADD_BINARY_CHAR (cpos, dst);
3263               DECODE_ADD_BINARY_CHAR (c, dst);
3264             }
3265           cpos = 0;
3266         }
3267       else
3268         {
3269           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3270           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3271             cpos = c;
3272           else if (BYTE_SJIS_KATAKANA_P (c))
3273             {
3274 #ifdef UTF2000
3275               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3276                                             c & 0x7F, 0), dst);
3277 #else
3278               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3279               Dynarr_add (dst, c);
3280 #endif
3281             }
3282 #ifdef UTF2000
3283           else if (c > 32)
3284             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3285                                           c, 0), dst);
3286 #endif
3287           else
3288             DECODE_ADD_BINARY_CHAR (c, dst);
3289         }
3290     label_continue_loop:;
3291     }
3292
3293   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3294
3295   str->flags = flags;
3296   str->cpos  = cpos;
3297 }
3298
3299 /* Convert internal character representation to Shift_JIS. */
3300
3301 void
3302 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3303                        unsigned_char_dynarr *dst, unsigned int *flags)
3304 {
3305   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3306
3307   if (ch == '\n')
3308     {
3309       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3310         Dynarr_add (dst, '\r');
3311       if (eol_type != EOL_CR)
3312         Dynarr_add (dst, ch);
3313     }
3314   else
3315     {
3316       unsigned int s1, s2;
3317 #ifdef UTF2000
3318       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3319
3320       if (code_point >= 0)
3321         Dynarr_add (dst, code_point);
3322       else if ((code_point
3323                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3324                >= 0)
3325         {
3326           ENCODE_SJIS ((code_point >> 8) | 0x80,
3327                        (code_point & 0xFF) | 0x80, s1, s2);
3328           Dynarr_add (dst, s1);
3329           Dynarr_add (dst, s2);
3330         }
3331       else if ((code_point
3332                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3333                >= 0)
3334         Dynarr_add (dst, code_point | 0x80);
3335       else if ((code_point
3336                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3337                >= 0)
3338         {
3339           ENCODE_SJIS ((code_point >> 8) | 0x80,
3340                        (code_point & 0xFF) | 0x80, s1, s2);
3341           Dynarr_add (dst, s1);
3342           Dynarr_add (dst, s2);
3343         }
3344       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3345                >= 0)
3346         Dynarr_add (dst, code_point);
3347       else
3348         Dynarr_add (dst, '?');
3349 #else
3350       Lisp_Object charset;
3351       unsigned int c1, c2;
3352
3353       BREAKUP_CHAR (ch, charset, c1, c2);
3354
3355       if (EQ(charset, Vcharset_katakana_jisx0201))
3356         {
3357           Dynarr_add (dst, c1 | 0x80);
3358         }
3359       else if (c2 == 0)
3360         {
3361           Dynarr_add (dst, c1);
3362         }
3363       else if (EQ(charset, Vcharset_japanese_jisx0208))
3364         {
3365           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3366           Dynarr_add (dst, s1);
3367           Dynarr_add (dst, s2);
3368         }
3369       else
3370         Dynarr_add (dst, '?');
3371 #endif
3372     }
3373 }
3374
3375 void
3376 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3377                        unsigned int *flags)
3378 {
3379 }
3380
3381 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3382 Decode a JISX0208 character of Shift-JIS coding-system.
3383 CODE is the character code in Shift-JIS as a cons of type bytes.
3384 Return the corresponding character.
3385 */
3386        (code))
3387 {
3388   unsigned char c1, c2, s1, s2;
3389
3390   CHECK_CONS (code);
3391   CHECK_INT (XCAR (code));
3392   CHECK_INT (XCDR (code));
3393   s1 = XINT (XCAR (code));
3394   s2 = XINT (XCDR (code));
3395   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3396       BYTE_SJIS_TWO_BYTE_2_P (s2))
3397     {
3398       DECODE_SJIS (s1, s2, c1, c2);
3399       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3400                                    c1 & 0x7F, c2 & 0x7F));
3401     }
3402   else
3403     return Qnil;
3404 }
3405
3406 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3407 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3408 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3409 */
3410        (character))
3411 {
3412   Lisp_Object charset;
3413   int c1, c2, s1, s2;
3414
3415   CHECK_CHAR_COERCE_INT (character);
3416   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3417   if (EQ (charset, Vcharset_japanese_jisx0208))
3418     {
3419       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3420       return Fcons (make_int (s1), make_int (s2));
3421     }
3422   else
3423     return Qnil;
3424 }
3425
3426 \f
3427 /************************************************************************/
3428 /*                            Big5 methods                              */
3429 /************************************************************************/
3430
3431 /* BIG5 is a coding system encoding two character sets: ASCII and
3432    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3433    character set and is encoded in two-byte.
3434
3435    --- CODE RANGE of BIG5 ---
3436    (character set)      (range)
3437    ASCII                0x00 .. 0x7F
3438    Big5 (1st byte)      0xA1 .. 0xFE
3439         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3440    --------------------------
3441
3442    Since the number of characters in Big5 is larger than maximum
3443    characters in Emacs' charset (96x96), it can't be handled as one
3444    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3445    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3446    contains frequently used characters and the latter contains less
3447    frequently used characters.  */
3448
3449 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3450   ((c) >= 0xA1 && (c) <= 0xFE)
3451
3452 /* Is this the second byte of a Shift-JIS two-byte char? */
3453
3454 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3455   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3456
3457 /* Number of Big5 characters which have the same code in 1st byte.  */
3458
3459 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3460
3461 /* Code conversion macros.  These are macros because they are used in
3462    inner loops during code conversion.
3463
3464    Note that temporary variables in macros introduce the classic
3465    dynamic-scoping problems with variable names.  We use capital-
3466    lettered variables in the assumption that XEmacs does not use
3467    capital letters in variables except in a very formalized way
3468    (e.g. Qstring). */
3469
3470 /* Convert Big5 code (b1, b2) into its internal string representation
3471    (lb, c1, c2). */
3472
3473 /* There is a much simpler way to split the Big5 charset into two.
3474    For the moment I'm going to leave the algorithm as-is because it
3475    claims to separate out the most-used characters into a single
3476    charset, which perhaps will lead to optimizations in various
3477    places.
3478
3479    The way the algorithm works is something like this:
3480
3481    Big5 can be viewed as a 94x157 charset, where the row is
3482    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3483    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3484    the split between low and high column numbers is apparently
3485    meaningless; ascending rows produce less and less frequent chars.
3486    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3487    the first charset, and the upper half (0xC9 .. 0xFE) to the
3488    second.  To do the conversion, we convert the character into
3489    a single number where 0 .. 156 is the first row, 157 .. 313
3490    is the second, etc.  That way, the characters are ordered by
3491    decreasing frequency.  Then we just chop the space in two
3492    and coerce the result into a 94x94 space.
3493    */
3494
3495 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3496 {                                                                       \
3497   int B1 = b1, B2 = b2;                                                 \
3498   unsigned int I                                                        \
3499     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3500                                                                         \
3501   if (B1 < 0xC9)                                                        \
3502     {                                                                   \
3503       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3504     }                                                                   \
3505   else                                                                  \
3506     {                                                                   \
3507       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3508       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3509     }                                                                   \
3510   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3511   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3512 } while (0)
3513
3514 /* Convert the internal string representation of a Big5 character
3515    (lb, c1, c2) into Big5 code (b1, b2). */
3516
3517 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3518 {                                                                       \
3519   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3520                                                                         \
3521   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3522     {                                                                   \
3523       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3524     }                                                                   \
3525   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3526   b2 = I % BIG5_SAME_ROW;                                               \
3527   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3528 } while (0)
3529
3530 static int
3531 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3532 {
3533   while (n--)
3534     {
3535       unsigned char c = *(unsigned char *)src++;
3536       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3537           (c >= 0x80 && c <= 0xA0))
3538         return 0;
3539       if (st->big5.in_second_byte)
3540         {
3541           st->big5.in_second_byte = 0;
3542           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3543             return 0;
3544         }
3545       else if (c >= 0xA1)
3546         st->big5.in_second_byte = 1;
3547     }
3548   return CODING_CATEGORY_BIG5_MASK;
3549 }
3550
3551 /* Convert Big5 data to internal format. */
3552
3553 static void
3554 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3555                     unsigned_char_dynarr *dst, size_t n)
3556 {
3557   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3558   unsigned int flags  = str->flags;
3559   unsigned int cpos   = str->cpos;
3560   eol_type_t eol_type = str->eol_type;
3561
3562   while (n--)
3563     {
3564       unsigned char c = *(unsigned char *)src++;
3565       if (cpos)
3566         {
3567           /* Previous character was first byte of Big5 char. */
3568           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3569             {
3570 #ifdef UTF2000
3571               DECODE_ADD_UCS_CHAR
3572                 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3573                  dst);
3574 #else
3575               unsigned char b1, b2, b3;
3576               DECODE_BIG5 (cpos, c, b1, b2, b3);
3577               Dynarr_add (dst, b1);
3578               Dynarr_add (dst, b2);
3579               Dynarr_add (dst, b3);
3580 #endif
3581             }
3582           else
3583             {
3584               DECODE_ADD_BINARY_CHAR (cpos, dst);
3585               DECODE_ADD_BINARY_CHAR (c, dst);
3586             }
3587           cpos = 0;
3588         }
3589       else
3590         {
3591           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3592           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3593             cpos = c;
3594           else
3595             DECODE_ADD_BINARY_CHAR (c, dst);
3596         }
3597     label_continue_loop:;
3598     }
3599
3600   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3601
3602   str->flags = flags;
3603   str->cpos  = cpos;
3604 }
3605
3606 /* Convert internally-formatted data to Big5. */
3607
3608 static void
3609 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3610                     unsigned_char_dynarr *dst, size_t n)
3611 {
3612 #ifndef UTF2000
3613   unsigned char c;
3614   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3615   unsigned int flags  = str->flags;
3616   unsigned int ch     = str->ch;
3617   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3618
3619   while (n--)
3620     {
3621       c = *src++;
3622       if (c == '\n')
3623         {
3624           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3625             Dynarr_add (dst, '\r');
3626           if (eol_type != EOL_CR)
3627             Dynarr_add (dst, '\n');
3628         }
3629       else if (BYTE_ASCII_P (c))
3630         {
3631           /* ASCII. */
3632           Dynarr_add (dst, c);
3633         }
3634       else if (BUFBYTE_LEADING_BYTE_P (c))
3635         {
3636           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3637               c == LEADING_BYTE_CHINESE_BIG5_2)
3638             {
3639               /* A recognized leading byte. */
3640               ch = c;
3641               continue; /* not done with this character. */
3642             }
3643           /* otherwise just ignore this character. */
3644         }
3645       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3646                ch == LEADING_BYTE_CHINESE_BIG5_2)
3647         {
3648           /* Previous char was a recognized leading byte. */
3649           ch = (ch << 8) | c;
3650           continue; /* not done with this character. */
3651         }
3652       else if (ch)
3653         {
3654           /* Encountering second byte of a Big5 character. */
3655           unsigned char b1, b2;
3656
3657           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3658           Dynarr_add (dst, b1);
3659           Dynarr_add (dst, b2);
3660         }
3661
3662       ch = 0;
3663     }
3664
3665   str->flags = flags;
3666   str->ch    = ch;
3667 #endif
3668 }
3669
3670
3671 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3672 Decode a Big5 character CODE of BIG5 coding-system.
3673 CODE is the character code in BIG5, a cons of two integers.
3674 Return the corresponding character.
3675 */
3676        (code))
3677 {
3678   unsigned char c1, c2, b1, b2;
3679
3680   CHECK_CONS (code);
3681   CHECK_INT (XCAR (code));
3682   CHECK_INT (XCDR (code));
3683   b1 = XINT (XCAR (code));
3684   b2 = XINT (XCDR (code));
3685   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3686       BYTE_BIG5_TWO_BYTE_2_P (b2))
3687     {
3688       Charset_ID leading_byte;
3689       Lisp_Object charset;
3690       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3691       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3692       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3693     }
3694   else
3695     return Qnil;
3696 }
3697
3698 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3699 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3700 Return the corresponding character code in Big5.
3701 */
3702        (character))
3703 {
3704   Lisp_Object charset;
3705   int c1, c2, b1, b2;
3706
3707   CHECK_CHAR_COERCE_INT (character);
3708   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3709   if (EQ (charset, Vcharset_chinese_big5_1) ||
3710       EQ (charset, Vcharset_chinese_big5_2))
3711     {
3712       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3713                    b1, b2);
3714       return Fcons (make_int (b1), make_int (b2));
3715     }
3716   else
3717     return Qnil;
3718 }
3719
3720 \f
3721 /************************************************************************/
3722 /*                           UCS-4 methods                              */
3723 /************************************************************************/
3724
3725 static int
3726 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3727 {
3728   while (n--)
3729     {
3730       unsigned char c = *(unsigned char *)src++;
3731       switch (st->ucs4.in_byte)
3732         {
3733         case 0:
3734           if (c >= 128)
3735             return 0;
3736           else
3737             st->ucs4.in_byte++;
3738           break;
3739         case 3:
3740           st->ucs4.in_byte = 0;
3741           break;
3742         default:
3743           st->ucs4.in_byte++;
3744         }
3745     }
3746   return CODING_CATEGORY_UCS4_MASK;
3747 }
3748
3749 static void
3750 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3751                     unsigned_char_dynarr *dst, size_t n)
3752 {
3753   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3754   unsigned int flags = str->flags;
3755   unsigned int cpos  = str->cpos;
3756   unsigned char counter = str->counter;
3757
3758   while (n--)
3759     {
3760       unsigned char c = *(unsigned char *)src++;
3761       switch (counter)
3762         {
3763         case 0:
3764           cpos = c;
3765           counter = 3;
3766           break;
3767         case 1:
3768           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3769           cpos = 0;
3770           counter = 0;
3771           break;
3772         default:
3773           cpos = ( cpos << 8 ) | c;
3774           counter--;
3775         }
3776     }
3777   if (counter & CODING_STATE_END)
3778     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3779
3780   str->flags    = flags;
3781   str->cpos     = cpos;
3782   str->counter  = counter;
3783 }
3784
3785 void
3786 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3787                   unsigned_char_dynarr *dst, unsigned int *flags)
3788 {
3789   Dynarr_add (dst, ch >> 24);
3790   Dynarr_add (dst, ch >> 16);
3791   Dynarr_add (dst, ch >>  8);
3792   Dynarr_add (dst, ch      );
3793 }
3794
3795 void
3796 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3797                   unsigned int *flags)
3798 {
3799 }
3800
3801 \f
3802 /************************************************************************/
3803 /*                           UTF-8 methods                              */
3804 /************************************************************************/
3805
3806 static int
3807 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3808 {
3809   while (n--)
3810     {
3811       unsigned char c = *(unsigned char *)src++;
3812       switch (st->utf8.in_byte)
3813         {
3814         case 0:
3815           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3816             return 0;
3817           else if (c >= 0xfc)
3818             st->utf8.in_byte = 5;
3819           else if (c >= 0xf8)
3820             st->utf8.in_byte = 4;
3821           else if (c >= 0xf0)
3822             st->utf8.in_byte = 3;
3823           else if (c >= 0xe0)
3824             st->utf8.in_byte = 2;
3825           else if (c >= 0xc0)
3826             st->utf8.in_byte = 1;
3827           else if (c >= 0x80)
3828             return 0;
3829           break;
3830         default:
3831           if ((c & 0xc0) != 0x80)
3832             return 0;
3833           else
3834             st->utf8.in_byte--;
3835         }
3836     }
3837   return CODING_CATEGORY_UTF8_MASK;
3838 }
3839
3840 static void
3841 decode_output_utf8_partial_char (unsigned char counter,
3842                                  unsigned int cpos,
3843                                  unsigned_char_dynarr *dst)
3844 {
3845   if (counter == 5)
3846     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3847   else if (counter == 4)
3848     {
3849       if (cpos < (1 << 6))
3850         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3851       else
3852         {
3853           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3854           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3855         }
3856     }
3857   else if (counter == 3)
3858     {
3859       if (cpos < (1 << 6))
3860         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3861       else if (cpos < (1 << 12))
3862         {
3863           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3864           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3865         }
3866       else
3867         {
3868           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3869           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3870           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3871         }
3872     }
3873   else if (counter == 2)
3874     {
3875       if (cpos < (1 << 6))
3876         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3877       else if (cpos < (1 << 12))
3878         {
3879           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3880           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3881         }
3882       else if (cpos < (1 << 18))
3883         {
3884           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3885           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3886           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
3887         }
3888       else
3889         {
3890           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3891           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3892           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3893           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3894         }
3895     }
3896   else
3897     {
3898       if (cpos < (1 << 6))
3899         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3900       else if (cpos < (1 << 12))
3901         {
3902           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3903           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3904         }
3905       else if (cpos < (1 << 18))
3906         {
3907           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3908           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3909           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3910         }
3911       else if (cpos < (1 << 24))
3912         {
3913           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3914           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3915           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3916           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3917         }
3918       else
3919         {
3920           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3921           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3922           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3923           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3924           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3925         }
3926     }
3927 }
3928
3929 static void
3930 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3931                     unsigned_char_dynarr *dst, size_t n)
3932 {
3933   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3934   unsigned int flags    = str->flags;
3935   unsigned int cpos     = str->cpos;
3936   eol_type_t eol_type   = str->eol_type;
3937   unsigned char counter = str->counter;
3938
3939   while (n--)
3940     {
3941       unsigned char c = *(unsigned char *)src++;
3942       if (counter == 0)
3943         {
3944           if ( c < 0xC0 )
3945             {
3946               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3947               DECODE_ADD_UCS_CHAR (c, dst);
3948             }
3949           else if ( c < 0xE0 )
3950             {
3951               cpos = c & 0x1f;
3952               counter = 1;
3953             }
3954           else if ( c < 0xF0 )
3955             {
3956               cpos = c & 0x0f;
3957               counter = 2;
3958             }
3959           else if ( c < 0xF8 )
3960             {
3961               cpos = c & 0x07;
3962               counter = 3;
3963             }
3964           else if ( c < 0xFC )
3965             {
3966               cpos = c & 0x03;
3967               counter = 4;
3968             }
3969           else
3970             {
3971               cpos = c & 0x01;
3972               counter = 5;
3973             }
3974         }
3975       else if ( (c & 0xC0) == 0x80 )
3976         {
3977           cpos = ( cpos << 6 ) | ( c & 0x3f );
3978           if (counter == 1)
3979             {
3980               DECODE_ADD_UCS_CHAR (cpos, dst);
3981               cpos = 0;
3982               counter = 0;
3983             }
3984           else
3985             counter--;
3986         }
3987       else
3988         {
3989           decode_output_utf8_partial_char (counter, cpos, dst);
3990           DECODE_ADD_BINARY_CHAR (c, dst);
3991           cpos = 0;
3992           counter = 0;
3993         }
3994     label_continue_loop:;
3995     }
3996
3997   if (flags & CODING_STATE_END)
3998     if (counter > 0)
3999       {
4000         decode_output_utf8_partial_char (counter, cpos, dst);
4001         cpos = 0;
4002         counter = 0;
4003       }
4004   str->flags    = flags;
4005   str->cpos     = cpos;
4006   str->counter  = counter;
4007 }
4008
4009 void
4010 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4011                   unsigned_char_dynarr *dst, unsigned int *flags)
4012 {
4013   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4014
4015   if (ch == '\n')
4016     {
4017       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4018         Dynarr_add (dst, '\r');
4019       if (eol_type != EOL_CR)
4020         Dynarr_add (dst, ch);
4021     }
4022   else if (ch <= 0x7f)
4023     {
4024       Dynarr_add (dst, ch);
4025     }
4026   else if (ch <= 0x7ff)
4027     {
4028       Dynarr_add (dst, (ch >> 6) | 0xc0);
4029       Dynarr_add (dst, (ch & 0x3f) | 0x80);
4030     }
4031   else if (ch <= 0xffff)
4032     {
4033       Dynarr_add (dst,  (ch >> 12) | 0xe0);
4034       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4035       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4036     }
4037   else if (ch <= 0x1fffff)
4038     {
4039       Dynarr_add (dst,  (ch >> 18) | 0xf0);
4040       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4041       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4042       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4043     }
4044   else if (ch <= 0x3ffffff)
4045     {
4046       Dynarr_add (dst,  (ch >> 24) | 0xf8);
4047       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4048       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4049       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4050       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4051     }
4052   else
4053     {
4054       Dynarr_add (dst,  (ch >> 30) | 0xfc);
4055       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4056       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4057       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4058       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4059       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4060     }
4061 }
4062
4063 void
4064 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4065                   unsigned int *flags)
4066 {
4067 }
4068
4069 \f
4070 /************************************************************************/
4071 /*                           ISO2022 methods                            */
4072 /************************************************************************/
4073
4074 /* The following note describes the coding system ISO2022 briefly.
4075    Since the intention of this note is to help understand the
4076    functions in this file, some parts are NOT ACCURATE or OVERLY
4077    SIMPLIFIED.  For thorough understanding, please refer to the
4078    original document of ISO2022.
4079
4080    ISO2022 provides many mechanisms to encode several character sets
4081    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4082    is encoded using bytes less than 128.  This may make the encoded
4083    text a little bit longer, but the text passes more easily through
4084    several gateways, some of which strip off MSB (Most Signigant Bit).
4085
4086    There are two kinds of character sets: control character set and
4087    graphic character set.  The former contains control characters such
4088    as `newline' and `escape' to provide control functions (control
4089    functions are also provided by escape sequences).  The latter
4090    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4091    two control character sets and many graphic character sets.
4092
4093    Graphic character sets are classified into one of the following
4094    four classes, according to the number of bytes (DIMENSION) and
4095    number of characters in one dimension (CHARS) of the set:
4096    - DIMENSION1_CHARS94
4097    - DIMENSION1_CHARS96
4098    - DIMENSION2_CHARS94
4099    - DIMENSION2_CHARS96
4100
4101    In addition, each character set is assigned an identification tag,
4102    unique for each set, called "final character" (denoted as <F>
4103    hereafter).  The <F> of each character set is decided by ECMA(*)
4104    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4105    (0x30..0x3F are for private use only).
4106
4107    Note (*): ECMA = European Computer Manufacturers Association
4108
4109    Here are examples of graphic character set [NAME(<F>)]:
4110         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4111         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4112         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4113         o DIMENSION2_CHARS96 -- none for the moment
4114
4115    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4116         C0 [0x00..0x1F] -- control character plane 0
4117         GL [0x20..0x7F] -- graphic character plane 0
4118         C1 [0x80..0x9F] -- control character plane 1
4119         GR [0xA0..0xFF] -- graphic character plane 1
4120
4121    A control character set is directly designated and invoked to C0 or
4122    C1 by an escape sequence.  The most common case is that:
4123    - ISO646's  control character set is designated/invoked to C0, and
4124    - ISO6429's control character set is designated/invoked to C1,
4125    and usually these designations/invocations are omitted in encoded
4126    text.  In a 7-bit environment, only C0 can be used, and a control
4127    character for C1 is encoded by an appropriate escape sequence to
4128    fit into the environment.  All control characters for C1 are
4129    defined to have corresponding escape sequences.
4130
4131    A graphic character set is at first designated to one of four
4132    graphic registers (G0 through G3), then these graphic registers are
4133    invoked to GL or GR.  These designations and invocations can be
4134    done independently.  The most common case is that G0 is invoked to
4135    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4136    these invocations and designations are omitted in encoded text.
4137    In a 7-bit environment, only GL can be used.
4138
4139    When a graphic character set of CHARS94 is invoked to GL, codes
4140    0x20 and 0x7F of the GL area work as control characters SPACE and
4141    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4142    be used.
4143
4144    There are two ways of invocation: locking-shift and single-shift.
4145    With locking-shift, the invocation lasts until the next different
4146    invocation, whereas with single-shift, the invocation affects the
4147    following character only and doesn't affect the locking-shift
4148    state.  Invocations are done by the following control characters or
4149    escape sequences:
4150
4151    ----------------------------------------------------------------------
4152    abbrev  function                  cntrl escape seq   description
4153    ----------------------------------------------------------------------
4154    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4155    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4156    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4157    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4158    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4159    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4160    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4161    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4162    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4163    ----------------------------------------------------------------------
4164    (*) These are not used by any known coding system.
4165
4166    Control characters for these functions are defined by macros
4167    ISO_CODE_XXX in `coding.h'.
4168
4169    Designations are done by the following escape sequences:
4170    ----------------------------------------------------------------------
4171    escape sequence      description
4172    ----------------------------------------------------------------------
4173    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4174    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4175    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4176    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4177    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4178    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4179    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4180    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4181    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4182    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4183    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4184    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4185    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4186    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4187    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4188    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4189    ----------------------------------------------------------------------
4190
4191    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4192    of dimension 1, chars 94, and final character <F>, etc...
4193
4194    Note (*): Although these designations are not allowed in ISO2022,
4195    Emacs accepts them on decoding, and produces them on encoding
4196    CHARS96 character sets in a coding system which is characterized as
4197    7-bit environment, non-locking-shift, and non-single-shift.
4198
4199    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4200    '(' can be omitted.  We refer to this as "short-form" hereafter.
4201
4202    Now you may notice that there are a lot of ways for encoding the
4203    same multilingual text in ISO2022.  Actually, there exist many
4204    coding systems such as Compound Text (used in X11's inter client
4205    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4206    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4207    localized platforms), and all of these are variants of ISO2022.
4208
4209    In addition to the above, Emacs handles two more kinds of escape
4210    sequences: ISO6429's direction specification and Emacs' private
4211    sequence for specifying character composition.
4212
4213    ISO6429's direction specification takes the following form:
4214         o CSI ']'      -- end of the current direction
4215         o CSI '0' ']'  -- end of the current direction
4216         o CSI '1' ']'  -- start of left-to-right text
4217         o CSI '2' ']'  -- start of right-to-left text
4218    The control character CSI (0x9B: control sequence introducer) is
4219    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4220
4221    Character composition specification takes the following form:
4222         o ESC '0' -- start character composition
4223         o ESC '1' -- end character composition
4224    Since these are not standard escape sequences of any ISO standard,
4225    their use with these meanings is restricted to Emacs only.  */
4226
4227 static void
4228 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4229 {
4230   int i;
4231
4232   for (i = 0; i < 4; i++)
4233     {
4234       if (!NILP (coding_system))
4235         iso->charset[i] =
4236           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4237       else
4238         iso->charset[i] = Qt;
4239       iso->invalid_designated[i] = 0;
4240     }
4241   iso->esc = ISO_ESC_NOTHING;
4242   iso->esc_bytes_index = 0;
4243   iso->register_left = 0;
4244   iso->register_right = 1;
4245   iso->switched_dir_and_no_valid_charset_yet = 0;
4246   iso->invalid_switch_dir = 0;
4247   iso->output_direction_sequence = 0;
4248   iso->output_literally = 0;
4249 #ifdef ENABLE_COMPOSITE_CHARS
4250   if (iso->composite_chars)
4251     Dynarr_reset (iso->composite_chars);
4252 #endif
4253 }
4254
4255 static int
4256 fit_to_be_escape_quoted (unsigned char c)
4257 {
4258   switch (c)
4259     {
4260     case ISO_CODE_ESC:
4261     case ISO_CODE_CSI:
4262     case ISO_CODE_SS2:
4263     case ISO_CODE_SS3:
4264     case ISO_CODE_SO:
4265     case ISO_CODE_SI:
4266       return 1;
4267
4268     default:
4269       return 0;
4270     }
4271 }
4272
4273 /* Parse one byte of an ISO2022 escape sequence.
4274    If the result is an invalid escape sequence, return 0 and
4275    do not change anything in STR.  Otherwise, if the result is
4276    an incomplete escape sequence, update ISO2022.ESC and
4277    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4278    all the state variables (but not ISO2022.ESC_BYTES) and
4279    return 1.
4280
4281    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4282    or invocation of an invalid character set and treat that as
4283    an unrecognized escape sequence. */
4284
4285 static int
4286 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4287                    unsigned char c, unsigned int *flags,
4288                    int check_invalid_charsets)
4289 {
4290   /* (1) If we're at the end of a designation sequence, CS is the
4291      charset being designated and REG is the register to designate
4292      it to.
4293
4294      (2) If we're at the end of a locking-shift sequence, REG is
4295      the register to invoke and HALF (0 == left, 1 == right) is
4296      the half to invoke it into.
4297
4298      (3) If we're at the end of a single-shift sequence, REG is
4299      the register to invoke. */
4300   Lisp_Object cs = Qnil;
4301   int reg, half;
4302
4303   /* NOTE: This code does goto's all over the fucking place.
4304      The reason for this is that we're basically implementing
4305      a state machine here, and hierarchical languages like C
4306      don't really provide a clean way of doing this. */
4307
4308   if (! (*flags & CODING_STATE_ESCAPE))
4309     /* At beginning of escape sequence; we need to reset our
4310        escape-state variables. */
4311     iso->esc = ISO_ESC_NOTHING;
4312
4313   iso->output_literally = 0;
4314   iso->output_direction_sequence = 0;
4315
4316   switch (iso->esc)
4317     {
4318     case ISO_ESC_NOTHING:
4319       iso->esc_bytes_index = 0;
4320       switch (c)
4321         {
4322         case ISO_CODE_ESC:      /* Start escape sequence */
4323           *flags |= CODING_STATE_ESCAPE;
4324           iso->esc = ISO_ESC;
4325           goto not_done;
4326
4327         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4328           *flags |= CODING_STATE_ESCAPE;
4329           iso->esc = ISO_ESC_5_11;
4330           goto not_done;
4331
4332         case ISO_CODE_SO:       /* locking shift 1 */
4333           reg = 1; half = 0;
4334           goto locking_shift;
4335         case ISO_CODE_SI:       /* locking shift 0 */
4336           reg = 0; half = 0;
4337           goto locking_shift;
4338
4339         case ISO_CODE_SS2:      /* single shift */
4340           reg = 2;
4341           goto single_shift;
4342         case ISO_CODE_SS3:      /* single shift */
4343           reg = 3;
4344           goto single_shift;
4345
4346         default:                        /* Other control characters */
4347           return 0;
4348         }
4349
4350     case ISO_ESC:
4351       switch (c)
4352         {
4353           /**** single shift ****/
4354
4355         case 'N':       /* single shift 2 */
4356           reg = 2;
4357           goto single_shift;
4358         case 'O':       /* single shift 3 */
4359           reg = 3;
4360           goto single_shift;
4361
4362           /**** locking shift ****/
4363
4364         case '~':       /* locking shift 1 right */
4365           reg = 1; half = 1;
4366           goto locking_shift;
4367         case 'n':       /* locking shift 2 */
4368           reg = 2; half = 0;
4369           goto locking_shift;
4370         case '}':       /* locking shift 2 right */
4371           reg = 2; half = 1;
4372           goto locking_shift;
4373         case 'o':       /* locking shift 3 */
4374           reg = 3; half = 0;
4375           goto locking_shift;
4376         case '|':       /* locking shift 3 right */
4377           reg = 3; half = 1;
4378           goto locking_shift;
4379
4380 #ifdef ENABLE_COMPOSITE_CHARS
4381           /**** composite ****/
4382
4383         case '0':
4384           iso->esc = ISO_ESC_START_COMPOSITE;
4385           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4386             CODING_STATE_COMPOSITE;
4387           return 1;
4388
4389         case '1':
4390           iso->esc = ISO_ESC_END_COMPOSITE;
4391           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4392             ~CODING_STATE_COMPOSITE;
4393           return 1;
4394 #endif /* ENABLE_COMPOSITE_CHARS */
4395
4396           /**** directionality ****/
4397
4398         case '[':
4399           iso->esc = ISO_ESC_5_11;
4400           goto not_done;
4401
4402           /**** designation ****/
4403
4404         case '$':       /* multibyte charset prefix */
4405           iso->esc = ISO_ESC_2_4;
4406           goto not_done;
4407
4408         default:
4409           if (0x28 <= c && c <= 0x2F)
4410             {
4411               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4412               goto not_done;
4413             }
4414
4415           /* This function is called with CODESYS equal to nil when
4416              doing coding-system detection. */
4417           if (!NILP (codesys)
4418               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4419               && fit_to_be_escape_quoted (c))
4420             {
4421               iso->esc = ISO_ESC_LITERAL;
4422               *flags &= CODING_STATE_ISO2022_LOCK;
4423               return 1;
4424             }
4425
4426           /* bzzzt! */
4427           return 0;
4428         }
4429
4430
4431
4432       /**** directionality ****/
4433
4434     case ISO_ESC_5_11:          /* ISO6429 direction control */
4435       if (c == ']')
4436         {
4437           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4438           goto directionality;
4439         }
4440       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4441       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4442       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4443       else               return 0;
4444       goto not_done;
4445
4446     case ISO_ESC_5_11_0:
4447       if (c == ']')
4448         {
4449           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4450           goto directionality;
4451         }
4452       return 0;
4453
4454     case ISO_ESC_5_11_1:
4455       if (c == ']')
4456         {
4457           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4458           goto directionality;
4459         }
4460       return 0;
4461
4462     case ISO_ESC_5_11_2:
4463       if (c == ']')
4464         {
4465           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4466           goto directionality;
4467         }
4468       return 0;
4469
4470     directionality:
4471       iso->esc = ISO_ESC_DIRECTIONALITY;
4472       /* Various junk here to attempt to preserve the direction sequences
4473          literally in the text if they would otherwise be swallowed due
4474          to invalid designations that don't show up as actual charset
4475          changes in the text. */
4476       if (iso->invalid_switch_dir)
4477         {
4478           /* We already inserted a direction switch literally into the
4479              text.  We assume (#### this may not be right) that the
4480              next direction switch is the one going the other way,
4481              and we need to output that literally as well. */
4482           iso->output_literally = 1;
4483           iso->invalid_switch_dir = 0;
4484         }
4485       else
4486         {
4487           int jj;
4488
4489           /* If we are in the thrall of an invalid designation,
4490            then stick the directionality sequence literally into the
4491            output stream so it ends up in the original text again. */
4492           for (jj = 0; jj < 4; jj++)
4493             if (iso->invalid_designated[jj])
4494               break;
4495           if (jj < 4)
4496             {
4497               iso->output_literally = 1;
4498               iso->invalid_switch_dir = 1;
4499             }
4500           else
4501             /* Indicate that we haven't yet seen a valid designation,
4502                so that if a switch-dir is directly followed by an
4503                invalid designation, both get inserted literally. */
4504             iso->switched_dir_and_no_valid_charset_yet = 1;
4505         }
4506       return 1;
4507
4508
4509       /**** designation ****/
4510
4511     case ISO_ESC_2_4:
4512       if (0x28 <= c && c <= 0x2F)
4513         {
4514           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4515           goto not_done;
4516         }
4517       if (0x40 <= c && c <= 0x42)
4518         {
4519           /* 94^n-set */
4520           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4521                                       *flags & CODING_STATE_R2L ?
4522                                       CHARSET_RIGHT_TO_LEFT :
4523                                       CHARSET_LEFT_TO_RIGHT);
4524           reg = 0;
4525           goto designated;
4526         }
4527       return 0;
4528
4529     default:
4530       {
4531         int chars = 0;
4532         int single = 0;
4533
4534         if (c < '0' || c > '~')
4535           return 0; /* bad final byte */
4536
4537         if (iso->esc >= ISO_ESC_2_8 &&
4538             iso->esc <= ISO_ESC_2_15)
4539           {
4540             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4541             single = 1; /* single-byte */
4542             reg = (iso->esc - ISO_ESC_2_8) & 3;
4543           }
4544         else if (iso->esc >= ISO_ESC_2_4_8 &&
4545                  iso->esc <= ISO_ESC_2_4_15)
4546           {
4547             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4548             single = -1; /* multi-byte */
4549             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4550           }
4551         else
4552           {
4553             /* Can this ever be reached? -slb */
4554             abort();
4555           }
4556
4557         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4558                                     *flags & CODING_STATE_R2L ?
4559                                     CHARSET_RIGHT_TO_LEFT :
4560                                     CHARSET_LEFT_TO_RIGHT);
4561         goto designated;
4562       }
4563     }
4564
4565  not_done:
4566   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4567   return -1;
4568
4569  single_shift:
4570   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4571     /* can't invoke something that ain't there. */
4572     return 0;
4573   iso->esc = ISO_ESC_SINGLE_SHIFT;
4574   *flags &= CODING_STATE_ISO2022_LOCK;
4575   if (reg == 2)
4576     *flags |= CODING_STATE_SS2;
4577   else
4578     *flags |= CODING_STATE_SS3;
4579   return 1;
4580
4581  locking_shift:
4582   if (check_invalid_charsets &&
4583       !CHARSETP (iso->charset[reg]))
4584     /* can't invoke something that ain't there. */
4585     return 0;
4586   if (half)
4587     iso->register_right = reg;
4588   else
4589     iso->register_left = reg;
4590   *flags &= CODING_STATE_ISO2022_LOCK;
4591   iso->esc = ISO_ESC_LOCKING_SHIFT;
4592   return 1;
4593
4594  designated:
4595   if (NILP (cs) && check_invalid_charsets)
4596     {
4597       iso->invalid_designated[reg] = 1;
4598       iso->charset[reg] = Vcharset_ascii;
4599       iso->esc = ISO_ESC_DESIGNATE;
4600       *flags &= CODING_STATE_ISO2022_LOCK;
4601       iso->output_literally = 1;
4602       if (iso->switched_dir_and_no_valid_charset_yet)
4603         {
4604           /* We encountered a switch-direction followed by an
4605              invalid designation.  Ensure that the switch-direction
4606              gets outputted; otherwise it will probably get eaten
4607              when the text is written out again. */
4608           iso->switched_dir_and_no_valid_charset_yet = 0;
4609           iso->output_direction_sequence = 1;
4610           /* And make sure that the switch-dir going the other
4611              way gets outputted, as well. */
4612           iso->invalid_switch_dir = 1;
4613         }
4614       return 1;
4615     }
4616   /* This function is called with CODESYS equal to nil when
4617      doing coding-system detection. */
4618   if (!NILP (codesys))
4619     {
4620       charset_conversion_spec_dynarr *dyn =
4621         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4622
4623       if (dyn)
4624         {
4625           int i;
4626
4627           for (i = 0; i < Dynarr_length (dyn); i++)
4628             {
4629               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4630               if (EQ (cs, spec->from_charset))
4631                 cs = spec->to_charset;
4632             }
4633         }
4634     }
4635
4636   iso->charset[reg] = cs;
4637   iso->esc = ISO_ESC_DESIGNATE;
4638   *flags &= CODING_STATE_ISO2022_LOCK;
4639   if (iso->invalid_designated[reg])
4640     {
4641       iso->invalid_designated[reg] = 0;
4642       iso->output_literally = 1;
4643     }
4644   if (iso->switched_dir_and_no_valid_charset_yet)
4645     iso->switched_dir_and_no_valid_charset_yet = 0;
4646   return 1;
4647 }
4648
4649 static int
4650 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4651 {
4652   int mask;
4653
4654   /* #### There are serious deficiencies in the recognition mechanism
4655      here.  This needs to be much smarter if it's going to cut it.
4656      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4657      it should be detected as Latin-1.
4658      All the ISO2022 stuff in this file should be synced up with the
4659      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4660      Perhaps we should wait till R2L works in FSF Emacs? */
4661
4662   if (!st->iso2022.initted)
4663     {
4664       reset_iso2022 (Qnil, &st->iso2022.iso);
4665       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4666                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4667                           CODING_CATEGORY_ISO_8_1_MASK |
4668                           CODING_CATEGORY_ISO_8_2_MASK |
4669                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4670       st->iso2022.flags = 0;
4671       st->iso2022.high_byte_count = 0;
4672       st->iso2022.saw_single_shift = 0;
4673       st->iso2022.initted = 1;
4674     }
4675
4676   mask = st->iso2022.mask;
4677
4678   while (n--)
4679     {
4680       unsigned char c = *(unsigned char *)src++;
4681       if (c >= 0xA0)
4682         {
4683           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4684           st->iso2022.high_byte_count++;
4685         }
4686       else
4687         {
4688           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4689             {
4690               if (st->iso2022.high_byte_count & 1)
4691                 /* odd number of high bytes; assume not iso-8-2 */
4692                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4693             }
4694           st->iso2022.high_byte_count = 0;
4695           st->iso2022.saw_single_shift = 0;
4696           if (c > 0x80)
4697             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4698         }
4699       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4700           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4701         { /* control chars */
4702           switch (c)
4703             {
4704               /* Allow and ignore control characters that you might
4705                  reasonably see in a text file */
4706             case '\r':
4707             case '\n':
4708             case '\t':
4709             case  7: /* bell */
4710             case  8: /* backspace */
4711             case 11: /* vertical tab */
4712             case 12: /* form feed */
4713             case 26: /* MS-DOS C-z junk */
4714             case 31: /* '^_' -- for info */
4715               goto label_continue_loop;
4716
4717             default:
4718               break;
4719             }
4720         }
4721
4722       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4723           || BYTE_C1_P (c))
4724         {
4725           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4726                                  &st->iso2022.flags, 0))
4727             {
4728               switch (st->iso2022.iso.esc)
4729                 {
4730                 case ISO_ESC_DESIGNATE:
4731                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4732                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4733                   break;
4734                 case ISO_ESC_LOCKING_SHIFT:
4735                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4736                   goto ran_out_of_chars;
4737                 case ISO_ESC_SINGLE_SHIFT:
4738                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4739                   st->iso2022.saw_single_shift = 1;
4740                   break;
4741                 default:
4742                   break;
4743                 }
4744             }
4745           else
4746             {
4747               mask = 0;
4748               goto ran_out_of_chars;
4749             }
4750         }
4751     label_continue_loop:;
4752     }
4753
4754  ran_out_of_chars:
4755
4756   return mask;
4757 }
4758
4759 static int
4760 postprocess_iso2022_mask (int mask)
4761 {
4762   /* #### kind of cheesy */
4763   /* If seven-bit ISO is allowed, then assume that the encoding is
4764      entirely seven-bit and turn off the eight-bit ones. */
4765   if (mask & CODING_CATEGORY_ISO_7_MASK)
4766     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4767                CODING_CATEGORY_ISO_8_1_MASK |
4768                CODING_CATEGORY_ISO_8_2_MASK);
4769   return mask;
4770 }
4771
4772 /* If FLAGS is a null pointer or specifies right-to-left motion,
4773    output a switch-dir-to-left-to-right sequence to DST.
4774    Also update FLAGS if it is not a null pointer.
4775    If INTERNAL_P is set, we are outputting in internal format and
4776    need to handle the CSI differently. */
4777
4778 static void
4779 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4780                                  unsigned_char_dynarr *dst,
4781                                  unsigned int *flags,
4782                                  int internal_p)
4783 {
4784   if (!flags || (*flags & CODING_STATE_R2L))
4785     {
4786       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4787         {
4788           Dynarr_add (dst, ISO_CODE_ESC);
4789           Dynarr_add (dst, '[');
4790         }
4791       else if (internal_p)
4792         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4793       else
4794         Dynarr_add (dst, ISO_CODE_CSI);
4795       Dynarr_add (dst, '0');
4796       Dynarr_add (dst, ']');
4797       if (flags)
4798         *flags &= ~CODING_STATE_R2L;
4799     }
4800 }
4801
4802 /* If FLAGS is a null pointer or specifies a direction different from
4803    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4804    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4805    sequence to DST.  Also update FLAGS if it is not a null pointer.
4806    If INTERNAL_P is set, we are outputting in internal format and
4807    need to handle the CSI differently. */
4808
4809 static void
4810 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4811                           unsigned_char_dynarr *dst, unsigned int *flags,
4812                           int internal_p)
4813 {
4814   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4815       direction == CHARSET_LEFT_TO_RIGHT)
4816     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4817   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4818            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4819            direction == CHARSET_RIGHT_TO_LEFT)
4820     {
4821       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4822         {
4823           Dynarr_add (dst, ISO_CODE_ESC);
4824           Dynarr_add (dst, '[');
4825         }
4826       else if (internal_p)
4827         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4828       else
4829         Dynarr_add (dst, ISO_CODE_CSI);
4830       Dynarr_add (dst, '2');
4831       Dynarr_add (dst, ']');
4832       if (flags)
4833         *flags |= CODING_STATE_R2L;
4834     }
4835 }
4836
4837 /* Convert ISO2022-format data to internal format. */
4838
4839 static void
4840 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4841                        unsigned_char_dynarr *dst, size_t n)
4842 {
4843   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4844   unsigned int flags    = str->flags;
4845   unsigned int cpos     = str->cpos;
4846   unsigned char counter = str->counter;
4847   eol_type_t eol_type   = str->eol_type;
4848 #ifdef ENABLE_COMPOSITE_CHARS
4849   unsigned_char_dynarr *real_dst = dst;
4850 #endif
4851   Lisp_Object coding_system;
4852
4853   XSETCODING_SYSTEM (coding_system, str->codesys);
4854
4855 #ifdef ENABLE_COMPOSITE_CHARS
4856   if (flags & CODING_STATE_COMPOSITE)
4857     dst = str->iso2022.composite_chars;
4858 #endif /* ENABLE_COMPOSITE_CHARS */
4859
4860   while (n--)
4861     {
4862       unsigned char c = *(unsigned char *)src++;
4863       if (flags & CODING_STATE_ESCAPE)
4864         {       /* Within ESC sequence */
4865           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4866                                           c, &flags, 1);
4867
4868           if (retval)
4869             {
4870               switch (str->iso2022.esc)
4871                 {
4872 #ifdef ENABLE_COMPOSITE_CHARS
4873                 case ISO_ESC_START_COMPOSITE:
4874                   if (str->iso2022.composite_chars)
4875                     Dynarr_reset (str->iso2022.composite_chars);
4876                   else
4877                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4878                   dst = str->iso2022.composite_chars;
4879                   break;
4880                 case ISO_ESC_END_COMPOSITE:
4881                   {
4882                     Bufbyte comstr[MAX_EMCHAR_LEN];
4883                     Bytecount len;
4884                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4885                                                          Dynarr_length (dst));
4886                     dst = real_dst;
4887                     len = set_charptr_emchar (comstr, emch);
4888                     Dynarr_add_many (dst, comstr, len);
4889                     break;
4890                   }
4891 #endif /* ENABLE_COMPOSITE_CHARS */
4892
4893                 case ISO_ESC_LITERAL:
4894                   COMPOSE_FLUSH_CHARS (str, dst);
4895                   DECODE_ADD_BINARY_CHAR (c, dst);
4896                   break;
4897
4898                 default:
4899                   /* Everything else handled already */
4900                   break;
4901                 }
4902             }
4903
4904           /* Attempted error recovery. */
4905           if (str->iso2022.output_direction_sequence)
4906             ensure_correct_direction (flags & CODING_STATE_R2L ?
4907                                       CHARSET_RIGHT_TO_LEFT :
4908                                       CHARSET_LEFT_TO_RIGHT,
4909                                       str->codesys, dst, 0, 1);
4910           /* More error recovery. */
4911           if (!retval || str->iso2022.output_literally)
4912             {
4913               /* Output the (possibly invalid) sequence */
4914               int i;
4915               COMPOSE_FLUSH_CHARS (str, dst);
4916               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4917                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4918               flags &= CODING_STATE_ISO2022_LOCK;
4919               if (!retval)
4920                 n++, src--;/* Repeat the loop with the same character. */
4921               else
4922                 {
4923                   /* No sense in reprocessing the final byte of the
4924                      escape sequence; it could mess things up anyway.
4925                      Just add it now. */
4926                   COMPOSE_FLUSH_CHARS (str, dst);
4927                   DECODE_ADD_BINARY_CHAR (c, dst);
4928                 }
4929             }
4930           cpos = 0;
4931           counter = 0;
4932         }
4933       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4934         { /* Control characters */
4935
4936           /***** Error-handling *****/
4937
4938           /* If we were in the middle of a character, dump out the
4939              partial character. */
4940           if (counter)
4941             {
4942               COMPOSE_FLUSH_CHARS (str, dst);
4943               while (counter > 0)
4944                 {
4945                   counter--;
4946                   DECODE_ADD_BINARY_CHAR
4947                     ((unsigned char)(cpos >> (counter * 8)), dst);
4948                 }
4949               cpos = 0;
4950             }
4951
4952           /* If we just saw a single-shift character, dump it out.
4953              This may dump out the wrong sort of single-shift character,
4954              but least it will give an indication that something went
4955              wrong. */
4956           if (flags & CODING_STATE_SS2)
4957             {
4958               COMPOSE_FLUSH_CHARS (str, dst);
4959               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4960               flags &= ~CODING_STATE_SS2;
4961             }
4962           if (flags & CODING_STATE_SS3)
4963             {
4964               COMPOSE_FLUSH_CHARS (str, dst);
4965               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4966               flags &= ~CODING_STATE_SS3;
4967             }
4968
4969           /***** Now handle the control characters. *****/
4970
4971           /* Handle CR/LF */
4972 #ifdef UTF2000
4973           if (c == '\r')
4974             {
4975               COMPOSE_FLUSH_CHARS (str, dst);
4976               if (eol_type == EOL_CR)
4977                 Dynarr_add (dst, '\n');
4978               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4979                 Dynarr_add (dst, c);
4980               else
4981                 flags |= CODING_STATE_CR;
4982               goto label_continue_loop;
4983             }
4984           else if (flags & CODING_STATE_CR)
4985             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4986               if (c != '\n')
4987                 Dynarr_add (dst, '\r');
4988               flags &= ~CODING_STATE_CR;
4989             }
4990 #else
4991           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4992 #endif
4993
4994           flags &= CODING_STATE_ISO2022_LOCK;
4995
4996           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4997             {
4998               COMPOSE_FLUSH_CHARS (str, dst);
4999               DECODE_ADD_BINARY_CHAR (c, dst);
5000             }
5001         }
5002       else
5003         {                       /* Graphic characters */
5004           Lisp_Object charset;
5005 #ifndef UTF2000
5006           Charset_ID lb;
5007 #endif
5008           int reg;
5009
5010 #ifdef UTF2000
5011           if (c == '\r')
5012             {
5013               COMPOSE_FLUSH_CHARS (str, dst);
5014               if (eol_type == EOL_CR)
5015                 Dynarr_add (dst, '\n');
5016               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5017                 Dynarr_add (dst, c);
5018               else
5019                 flags |= CODING_STATE_CR;
5020               goto label_continue_loop;
5021             }
5022           else if (flags & CODING_STATE_CR)
5023             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5024               if (c != '\n')
5025                 Dynarr_add (dst, '\r');
5026               flags &= ~CODING_STATE_CR;
5027             }
5028 #else
5029           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5030 #endif
5031
5032           /* Now determine the charset. */
5033           reg = ((flags & CODING_STATE_SS2) ? 2
5034                  : (flags & CODING_STATE_SS3) ? 3
5035                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5036                  : str->iso2022.register_left);
5037           charset = str->iso2022.charset[reg];
5038
5039           /* Error checking: */
5040           if (! CHARSETP (charset)
5041               || str->iso2022.invalid_designated[reg]
5042               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5043                   && XCHARSET_CHARS (charset) == 94))
5044             /* Mrmph.  We are trying to invoke a register that has no
5045                or an invalid charset in it, or trying to add a character
5046                outside the range of the charset.  Insert that char literally
5047                to preserve it for the output. */
5048             {
5049               COMPOSE_FLUSH_CHARS (str, dst);
5050               while (counter > 0)
5051                 {
5052                   counter--;
5053                   DECODE_ADD_BINARY_CHAR
5054                     ((unsigned char)(cpos >> (counter * 8)), dst);
5055                 }
5056               cpos = 0;
5057               DECODE_ADD_BINARY_CHAR (c, dst);
5058             }
5059
5060           else
5061             {
5062               /* Things are probably hunky-dorey. */
5063
5064               /* Fetch reverse charset, maybe. */
5065               if (((flags & CODING_STATE_R2L) &&
5066                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5067                   ||
5068                   (!(flags & CODING_STATE_R2L) &&
5069                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5070                 {
5071                   Lisp_Object new_charset =
5072                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5073                   if (!NILP (new_charset))
5074                     charset = new_charset;
5075                 }
5076
5077 #ifdef UTF2000
5078               counter++;
5079               if (XCHARSET_DIMENSION (charset) == counter)
5080                 {
5081                   COMPOSE_ADD_CHAR (str,
5082                                     DECODE_CHAR (charset,
5083                                                  ((cpos & 0x7F7F7F) << 8)
5084                                                  | (c & 0x7F)),
5085                                     dst);
5086                   cpos = 0;
5087                   counter = 0;
5088                 }
5089               else
5090                 cpos = (cpos << 8) | c;
5091 #else
5092               lb = XCHARSET_LEADING_BYTE (charset);
5093               switch (XCHARSET_REP_BYTES (charset))
5094                 {
5095                 case 1: /* ASCII */
5096                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5097                   Dynarr_add (dst, c & 0x7F);
5098                   break;
5099
5100                 case 2: /* one-byte official */
5101                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5102                   Dynarr_add (dst, lb);
5103                   Dynarr_add (dst, c | 0x80);
5104                   break;
5105
5106                 case 3: /* one-byte private or two-byte official */
5107                   if (XCHARSET_PRIVATE_P (charset))
5108                     {
5109                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5110                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5111                       Dynarr_add (dst, lb);
5112                       Dynarr_add (dst, c | 0x80);
5113                     }
5114                   else
5115                     {
5116                       if (ch)
5117                         {
5118                           Dynarr_add (dst, lb);
5119                           Dynarr_add (dst, ch | 0x80);
5120                           Dynarr_add (dst, c | 0x80);
5121                           ch = 0;
5122                         }
5123                       else
5124                         ch = c;
5125                     }
5126                   break;
5127
5128                 default:        /* two-byte private */
5129                   if (ch)
5130                     {
5131                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5132                       Dynarr_add (dst, lb);
5133                       Dynarr_add (dst, ch | 0x80);
5134                       Dynarr_add (dst, c | 0x80);
5135                       ch = 0;
5136                     }
5137                   else
5138                     ch = c;
5139                 }
5140 #endif
5141             }
5142
5143           if (!cpos)
5144             flags &= CODING_STATE_ISO2022_LOCK;
5145         }
5146
5147     label_continue_loop:;
5148     }
5149
5150   if (flags & CODING_STATE_END)
5151     {
5152       COMPOSE_FLUSH_CHARS (str, dst);
5153       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5154     }
5155   str->flags   = flags;
5156   str->cpos    = cpos;
5157   str->counter = counter;
5158 }
5159
5160
5161 /***** ISO2022 encoder *****/
5162
5163 /* Designate CHARSET into register REG. */
5164
5165 static void
5166 iso2022_designate (Lisp_Object charset, unsigned char reg,
5167                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5168 {
5169   static const char inter94[] = "()*+";
5170   static const char inter96[] = ",-./";
5171   unsigned short chars;
5172   unsigned char dimension;
5173   unsigned char final;
5174   Lisp_Object old_charset = str->iso2022.charset[reg];
5175
5176   str->iso2022.charset[reg] = charset;
5177   if (!CHARSETP (charset))
5178     /* charset might be an initial nil or t. */
5179     return;
5180   chars = XCHARSET_CHARS (charset);
5181   dimension = XCHARSET_DIMENSION (charset);
5182   final = XCHARSET_FINAL (charset);
5183   if (!str->iso2022.force_charset_on_output[reg] &&
5184       CHARSETP (old_charset) &&
5185       XCHARSET_CHARS (old_charset) == chars &&
5186       XCHARSET_DIMENSION (old_charset) == dimension &&
5187       XCHARSET_FINAL (old_charset) == final)
5188     return;
5189
5190   str->iso2022.force_charset_on_output[reg] = 0;
5191
5192   {
5193     charset_conversion_spec_dynarr *dyn =
5194       str->codesys->iso2022.output_conv;
5195
5196     if (dyn)
5197       {
5198         int i;
5199
5200         for (i = 0; i < Dynarr_length (dyn); i++)
5201           {
5202             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5203             if (EQ (charset, spec->from_charset))
5204                 charset = spec->to_charset;
5205           }
5206       }
5207   }
5208
5209   Dynarr_add (dst, ISO_CODE_ESC);
5210   switch (chars)
5211     {
5212     case 94:
5213       if (dimension == 1)
5214         Dynarr_add (dst, inter94[reg]);
5215       else
5216         {
5217           Dynarr_add (dst, '$');
5218           if (reg != 0
5219               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5220               || final < '@'
5221               || final > 'B')
5222             Dynarr_add (dst, inter94[reg]);
5223         }
5224       break;
5225     case 96:
5226       if (dimension == 1)
5227         Dynarr_add (dst, inter96[reg]);
5228       else
5229         {
5230           Dynarr_add (dst, '$');
5231           Dynarr_add (dst, inter96[reg]);
5232         }
5233       break;
5234     }
5235   Dynarr_add (dst, final);
5236 }
5237
5238 static void
5239 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5240 {
5241   if (str->iso2022.register_left != 0)
5242     {
5243       Dynarr_add (dst, ISO_CODE_SI);
5244       str->iso2022.register_left = 0;
5245     }
5246 }
5247
5248 static void
5249 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5250 {
5251   if (str->iso2022.register_left != 1)
5252     {
5253       Dynarr_add (dst, ISO_CODE_SO);
5254       str->iso2022.register_left = 1;
5255     }
5256 }
5257
5258 void
5259 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5260                      unsigned_char_dynarr *dst, unsigned int *flags)
5261 {
5262   unsigned char charmask;
5263   Lisp_Coding_System* codesys = str->codesys;
5264   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5265   int i;
5266   Lisp_Object charset = str->iso2022.current_charset;
5267   int half = str->iso2022.current_half;
5268   int code_point = -1;
5269
5270   if (ch <= 0x7F)
5271     {
5272       restore_left_to_right_direction (codesys, dst, flags, 0);
5273
5274       /* Make sure G0 contains ASCII */
5275       if ((ch > ' ' && ch < ISO_CODE_DEL)
5276           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5277         {
5278           ensure_normal_shift (str, dst);
5279           iso2022_designate (Vcharset_ascii, 0, str, dst);
5280         }
5281
5282       /* If necessary, restore everything to the default state
5283          at end-of-line */
5284       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5285         {
5286           restore_left_to_right_direction (codesys, dst, flags, 0);
5287
5288           ensure_normal_shift (str, dst);
5289
5290           for (i = 0; i < 4; i++)
5291             {
5292               Lisp_Object initial_charset =
5293                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5294               iso2022_designate (initial_charset, i, str, dst);
5295             }
5296         }
5297       if (ch == '\n')
5298         {
5299           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5300             Dynarr_add (dst, '\r');
5301           if (eol_type != EOL_CR)
5302             Dynarr_add (dst, ch);
5303         }
5304       else
5305         {
5306           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5307               && fit_to_be_escape_quoted (ch))
5308             Dynarr_add (dst, ISO_CODE_ESC);
5309           Dynarr_add (dst, ch);
5310         }
5311     }
5312   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5313     {
5314       charmask = (half == 0 ? 0x00 : 0x80);
5315
5316       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5317           && fit_to_be_escape_quoted (ch))
5318         Dynarr_add (dst, ISO_CODE_ESC);
5319       /* you asked for it ... */
5320       Dynarr_add (dst, ch);
5321     }
5322   else
5323     {
5324       int reg;
5325
5326       /* Now determine which register to use. */
5327       reg = -1;
5328       for (i = 0; i < 4; i++)
5329         {
5330           if ((CHARSETP (charset = str->iso2022.charset[i])
5331                && ((code_point = charset_code_point (charset, ch)) >= 0))
5332               ||
5333               (CHARSETP
5334                (charset
5335                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5336                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5337             {
5338               reg = i;
5339               break;
5340             }
5341         }
5342       if (reg == -1)
5343         {
5344           Lisp_Object original_default_coded_charset_priority_list
5345             = Vdefault_coded_charset_priority_list;
5346
5347           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5348             {
5349               code_point = ENCODE_CHAR (ch, charset);
5350               if (XCHARSET_FINAL (charset))
5351                 goto found;
5352               Vdefault_coded_charset_priority_list
5353                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5354                                Vdefault_coded_charset_priority_list));
5355             }
5356           code_point = ENCODE_CHAR (ch, charset);
5357           if (!XCHARSET_FINAL (charset))
5358             {
5359               charset = Vcharset_ascii;
5360               code_point = '~';
5361             }
5362         found:
5363           Vdefault_coded_charset_priority_list
5364             = original_default_coded_charset_priority_list;
5365         }
5366       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5367                                 codesys, dst, flags, 0);
5368
5369       if (reg == -1)
5370         {
5371           if (XCHARSET_GRAPHIC (charset) != 0)
5372             {
5373               if (!NILP (str->iso2022.charset[1]) &&
5374                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5375                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5376                 reg = 1;
5377               else if (!NILP (str->iso2022.charset[2]))
5378                 reg = 2;
5379               else if (!NILP (str->iso2022.charset[3]))
5380                 reg = 3;
5381               else
5382                 reg = 0;
5383             }
5384           else
5385             reg = 0;
5386         }
5387
5388       iso2022_designate (charset, reg, str, dst);
5389
5390       /* Now invoke that register. */
5391       switch (reg)
5392         {
5393         case 0:
5394           ensure_normal_shift (str, dst);
5395           half = 0;
5396           break;
5397         case 1:
5398           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5399             {
5400               ensure_shift_out (str, dst);
5401               half = 0;
5402             }
5403           else
5404             half = 1;
5405           break;
5406         case 2:
5407           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5408             {
5409               Dynarr_add (dst, ISO_CODE_ESC);
5410               Dynarr_add (dst, 'N');
5411               half = 0;
5412             }
5413           else
5414             {
5415               Dynarr_add (dst, ISO_CODE_SS2);
5416               half = 1;
5417             }
5418           break;
5419         case 3:
5420           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5421             {
5422               Dynarr_add (dst, ISO_CODE_ESC);
5423               Dynarr_add (dst, 'O');
5424               half = 0;
5425             }
5426           else
5427             {
5428               Dynarr_add (dst, ISO_CODE_SS3);
5429               half = 1;
5430             }
5431           break;
5432         default:
5433           abort ();
5434         }
5435
5436       charmask = (half == 0 ? 0x00 : 0x80);
5437
5438       switch (XCHARSET_DIMENSION (charset))
5439         {
5440         case 1:
5441           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5442           break;
5443         case 2:
5444           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5445           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5446           break;
5447         case 3:
5448           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5449           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5450           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5451           break;
5452         case 4:
5453           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5454           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5455           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5456           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5457           break;
5458         default:
5459           abort ();
5460         }
5461     }
5462   str->iso2022.current_charset = charset;
5463   str->iso2022.current_half = half;
5464 }
5465
5466 void
5467 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5468                      unsigned int *flags)
5469 {
5470   Lisp_Coding_System* codesys = str->codesys;
5471   int i;
5472
5473   restore_left_to_right_direction (codesys, dst, flags, 0);
5474   ensure_normal_shift (str, dst);
5475   for (i = 0; i < 4; i++)
5476     {
5477       Lisp_Object initial_charset
5478         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5479       iso2022_designate (initial_charset, i, str, dst);
5480     }
5481 }
5482 #endif /* MULE */
5483 \f
5484 /************************************************************************/
5485 /*                     No-conversion methods                            */
5486 /************************************************************************/
5487
5488 /* This is used when reading in "binary" files -- i.e. files that may
5489    contain all 256 possible byte values and that are not to be
5490    interpreted as being in any particular decoding. */
5491 static void
5492 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5493                              unsigned_char_dynarr *dst, size_t n)
5494 {
5495   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5496   unsigned int flags  = str->flags;
5497   unsigned int cpos   = str->cpos;
5498   eol_type_t eol_type = str->eol_type;
5499
5500   while (n--)
5501     {
5502       unsigned char c = *(unsigned char *)src++;
5503
5504       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5505       DECODE_ADD_BINARY_CHAR (c, dst);
5506     label_continue_loop:;
5507     }
5508
5509   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5510
5511   str->flags = flags;
5512   str->cpos  = cpos;
5513 }
5514
5515 static void
5516 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5517                              unsigned_char_dynarr *dst, size_t n)
5518 {
5519   unsigned char c;
5520   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5521   unsigned int flags  = str->flags;
5522   unsigned int ch     = str->ch;
5523   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5524 #ifdef UTF2000
5525   unsigned char char_boundary = str->iso2022.current_char_boundary;
5526 #endif
5527
5528   while (n--)
5529     {
5530       c = *src++;
5531 #ifdef UTF2000
5532       if (char_boundary == 0)
5533         if ( c >= 0xfc )
5534           {
5535             ch = c & 0x01;
5536             char_boundary = 5;
5537           }
5538         else if ( c >= 0xf8 )
5539           {
5540             ch = c & 0x03;
5541             char_boundary = 4;
5542           }
5543         else if ( c >= 0xf0 )
5544           {
5545             ch = c & 0x07;
5546             char_boundary = 3;
5547           }
5548         else if ( c >= 0xe0 )
5549           {
5550             ch = c & 0x0f;
5551             char_boundary = 2;
5552           }
5553         else if ( c >= 0xc0 )
5554           {
5555             ch = c & 0x1f;
5556             char_boundary = 1;
5557           }
5558         else
5559           {
5560             ch = 0;
5561             if (c == '\n')
5562               {
5563                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5564                   Dynarr_add (dst, '\r');
5565                 if (eol_type != EOL_CR)
5566                   Dynarr_add (dst, c);
5567               }
5568             else
5569               Dynarr_add (dst, c);
5570             char_boundary = 0;
5571           }
5572       else if (char_boundary == 1)
5573         {
5574           ch = ( ch << 6 ) | ( c & 0x3f );
5575           Dynarr_add (dst, ch & 0xff);
5576           char_boundary = 0;
5577         }
5578       else
5579         {
5580           ch = ( ch << 6 ) | ( c & 0x3f );
5581           char_boundary--;
5582         }
5583 #else /* not UTF2000 */
5584       if (c == '\n')
5585         {
5586           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5587             Dynarr_add (dst, '\r');
5588           if (eol_type != EOL_CR)
5589             Dynarr_add (dst, '\n');
5590           ch = 0;
5591         }
5592       else if (BYTE_ASCII_P (c))
5593         {
5594           assert (ch == 0);
5595           Dynarr_add (dst, c);
5596         }
5597       else if (BUFBYTE_LEADING_BYTE_P (c))
5598         {
5599           assert (ch == 0);
5600           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5601               c == LEADING_BYTE_CONTROL_1)
5602             ch = c;
5603           else
5604             Dynarr_add (dst, '~'); /* untranslatable character */
5605         }
5606       else
5607         {
5608           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5609             Dynarr_add (dst, c);
5610           else if (ch == LEADING_BYTE_CONTROL_1)
5611             {
5612               assert (c < 0xC0);
5613               Dynarr_add (dst, c - 0x20);
5614             }
5615           /* else it should be the second or third byte of an
5616              untranslatable character, so ignore it */
5617           ch = 0;
5618         }
5619 #endif /* not UTF2000 */
5620     }
5621
5622   str->flags = flags;
5623   str->ch    = ch;
5624 #ifdef UTF2000
5625   str->iso2022.current_char_boundary = char_boundary;
5626 #endif
5627 }
5628
5629 \f
5630
5631 /************************************************************************/
5632 /*                             Initialization                           */
5633 /************************************************************************/
5634
5635 void
5636 syms_of_file_coding (void)
5637 {
5638   INIT_LRECORD_IMPLEMENTATION (coding_system);
5639
5640   deferror (&Qcoding_system_error, "coding-system-error",
5641             "Coding-system error", Qio_error);
5642
5643   DEFSUBR (Fcoding_system_p);
5644   DEFSUBR (Ffind_coding_system);
5645   DEFSUBR (Fget_coding_system);
5646   DEFSUBR (Fcoding_system_list);
5647   DEFSUBR (Fcoding_system_name);
5648   DEFSUBR (Fmake_coding_system);
5649   DEFSUBR (Fcopy_coding_system);
5650   DEFSUBR (Fcoding_system_canonical_name_p);
5651   DEFSUBR (Fcoding_system_alias_p);
5652   DEFSUBR (Fcoding_system_aliasee);
5653   DEFSUBR (Fdefine_coding_system_alias);
5654   DEFSUBR (Fsubsidiary_coding_system);
5655
5656   DEFSUBR (Fcoding_system_type);
5657   DEFSUBR (Fcoding_system_doc_string);
5658 #ifdef MULE
5659   DEFSUBR (Fcoding_system_charset);
5660 #endif
5661   DEFSUBR (Fcoding_system_property);
5662
5663   DEFSUBR (Fcoding_category_list);
5664   DEFSUBR (Fset_coding_priority_list);
5665   DEFSUBR (Fcoding_priority_list);
5666   DEFSUBR (Fset_coding_category_system);
5667   DEFSUBR (Fcoding_category_system);
5668
5669   DEFSUBR (Fdetect_coding_region);
5670   DEFSUBR (Fdecode_coding_region);
5671   DEFSUBR (Fencode_coding_region);
5672 #ifdef MULE
5673   DEFSUBR (Fdecode_shift_jis_char);
5674   DEFSUBR (Fencode_shift_jis_char);
5675   DEFSUBR (Fdecode_big5_char);
5676   DEFSUBR (Fencode_big5_char);
5677 #endif /* MULE */
5678   defsymbol (&Qcoding_systemp, "coding-system-p");
5679   defsymbol (&Qno_conversion, "no-conversion");
5680   defsymbol (&Qraw_text, "raw-text");
5681 #ifdef MULE
5682   defsymbol (&Qbig5, "big5");
5683   defsymbol (&Qshift_jis, "shift-jis");
5684   defsymbol (&Qucs4, "ucs-4");
5685   defsymbol (&Qutf8, "utf-8");
5686   defsymbol (&Qccl, "ccl");
5687   defsymbol (&Qiso2022, "iso2022");
5688 #endif /* MULE */
5689   defsymbol (&Qmnemonic, "mnemonic");
5690   defsymbol (&Qeol_type, "eol-type");
5691   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5692   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5693
5694   defsymbol (&Qcr, "cr");
5695   defsymbol (&Qlf, "lf");
5696   defsymbol (&Qcrlf, "crlf");
5697   defsymbol (&Qeol_cr, "eol-cr");
5698   defsymbol (&Qeol_lf, "eol-lf");
5699   defsymbol (&Qeol_crlf, "eol-crlf");
5700 #ifdef MULE
5701   defsymbol (&Qcharset_g0, "charset-g0");
5702   defsymbol (&Qcharset_g1, "charset-g1");
5703   defsymbol (&Qcharset_g2, "charset-g2");
5704   defsymbol (&Qcharset_g3, "charset-g3");
5705   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5706   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5707   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5708   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5709   defsymbol (&Qno_iso6429, "no-iso6429");
5710   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5711   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5712
5713   defsymbol (&Qshort, "short");
5714   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5715   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5716   defsymbol (&Qseven, "seven");
5717   defsymbol (&Qlock_shift, "lock-shift");
5718   defsymbol (&Qescape_quoted, "escape-quoted");
5719 #endif /* MULE */
5720 #ifdef UTF2000
5721   defsymbol (&Qdisable_composition, "disable-composition");
5722 #endif
5723   defsymbol (&Qencode, "encode");
5724   defsymbol (&Qdecode, "decode");
5725
5726 #ifdef MULE
5727   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5728              "shift-jis");
5729   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5730              "big5");
5731   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5732              "ucs-4");
5733   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5734              "utf-8");
5735   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5736              "iso-7");
5737   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5738              "iso-8-designate");
5739   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5740              "iso-8-1");
5741   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5742              "iso-8-2");
5743   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5744              "iso-lock-shift");
5745 #endif /* MULE */
5746   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5747              "no-conversion");
5748 }
5749
5750 void
5751 lstream_type_create_file_coding (void)
5752 {
5753   LSTREAM_HAS_METHOD (decoding, reader);
5754   LSTREAM_HAS_METHOD (decoding, writer);
5755   LSTREAM_HAS_METHOD (decoding, rewinder);
5756   LSTREAM_HAS_METHOD (decoding, seekable_p);
5757   LSTREAM_HAS_METHOD (decoding, flusher);
5758   LSTREAM_HAS_METHOD (decoding, closer);
5759   LSTREAM_HAS_METHOD (decoding, marker);
5760
5761   LSTREAM_HAS_METHOD (encoding, reader);
5762   LSTREAM_HAS_METHOD (encoding, writer);
5763   LSTREAM_HAS_METHOD (encoding, rewinder);
5764   LSTREAM_HAS_METHOD (encoding, seekable_p);
5765   LSTREAM_HAS_METHOD (encoding, flusher);
5766   LSTREAM_HAS_METHOD (encoding, closer);
5767   LSTREAM_HAS_METHOD (encoding, marker);
5768 }
5769
5770 void
5771 vars_of_file_coding (void)
5772 {
5773   int i;
5774
5775   fcd = xnew (struct file_coding_dump);
5776   dumpstruct (&fcd, &fcd_description);
5777
5778   /* Initialize to something reasonable ... */
5779   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5780     {
5781       fcd->coding_category_system[i] = Qnil;
5782       fcd->coding_category_by_priority[i] = i;
5783     }
5784
5785   Fprovide (intern ("file-coding"));
5786
5787   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5788 Coding system used for TTY keyboard input.
5789 Not used under a windowing system.
5790 */ );
5791   Vkeyboard_coding_system = Qnil;
5792
5793   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5794 Coding system used for TTY display output.
5795 Not used under a windowing system.
5796 */ );
5797   Vterminal_coding_system = Qnil;
5798
5799   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5800 Overriding coding system used when reading from a file or process.
5801 You should bind this variable with `let', but do not set it globally.
5802 If this is non-nil, it specifies the coding system that will be used
5803 to decode input on read operations, such as from a file or process.
5804 It overrides `buffer-file-coding-system-for-read',
5805 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5806 this one for permanent changes to the environment.  */ );
5807   Vcoding_system_for_read = Qnil;
5808
5809   DEFVAR_LISP ("coding-system-for-write",
5810                &Vcoding_system_for_write /*
5811 Overriding coding system used when writing to a file or process.
5812 You should bind this variable with `let', but do not set it globally.
5813 If this is non-nil, it specifies the coding system that will be used
5814 to encode output for write operations, such as to a file or process.
5815 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5816 Use those variables instead of this one for permanent changes to the
5817 environment.  */ );
5818   Vcoding_system_for_write = Qnil;
5819
5820   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5821 Coding system used to convert pathnames when accessing files.
5822 */ );
5823   Vfile_name_coding_system = Qnil;
5824
5825   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5826 Non-nil means the buffer contents are regarded as multi-byte form
5827 of characters, not a binary code.  This affects the display, file I/O,
5828 and behaviors of various editing commands.
5829
5830 Setting this to nil does not do anything.
5831 */ );
5832   enable_multibyte_characters = 1;
5833 }
5834
5835 void
5836 complex_vars_of_file_coding (void)
5837 {
5838   staticpro (&Vcoding_system_hash_table);
5839   Vcoding_system_hash_table =
5840     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5841
5842   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5843   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5844
5845 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5846 {                                               \
5847   struct codesys_prop csp;                      \
5848   csp.sym = (Sym);                              \
5849   csp.prop_type = (Prop_Type);                  \
5850   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5851 } while (0)
5852
5853   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5854   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5855   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5856   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5857   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5858   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5859   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5860 #ifdef MULE
5861   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5862   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5863   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5864   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5865   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5866   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5867   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5868   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5869   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5870   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5871   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5872   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5873   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5874   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5875   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5876   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5877   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5878
5879   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5880   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5881 #endif /* MULE */
5882   /* Need to create this here or we're really screwed. */
5883   Fmake_coding_system
5884     (Qraw_text, Qno_conversion,
5885      build_string ("Raw text, which means it converts only line-break-codes."),
5886      list2 (Qmnemonic, build_string ("Raw")));
5887
5888   Fmake_coding_system
5889     (Qbinary, Qno_conversion,
5890      build_string ("Binary, which means it does not convert anything."),
5891      list4 (Qeol_type, Qlf,
5892             Qmnemonic, build_string ("Binary")));
5893
5894 #ifdef UTF2000
5895   Fmake_coding_system
5896     (Qutf8, Qutf8,
5897      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5898      list2 (Qmnemonic, build_string ("UTF8")));
5899 #endif
5900
5901   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5902
5903   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5904
5905   Fdefine_coding_system_alias (Qterminal, Qbinary);
5906   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5907
5908   /* Need this for bootstrapping */
5909   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5910     Fget_coding_system (Qraw_text);
5911
5912 #ifdef UTF2000
5913   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5914    = Fget_coding_system (Qutf8);
5915 #endif
5916
5917 #if defined(MULE) && !defined(UTF2000)
5918   {
5919     unsigned int i;
5920
5921     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5922       fcd->ucs_to_mule_table[i] = Qnil;
5923   }
5924   staticpro (&mule_to_ucs_table);
5925   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5926 #endif /* defined(MULE) && !defined(UTF2000) */
5927 }