git.chise.org Git - chise/xemacs-chise.git.1/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000,2001,2002,2003,2004 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
  27
  28 #include <config.h>
  29 #include "lisp.h"
  30
  31 #include "buffer.h"
  32 #include "elhash.h"
  33 #include "insdel.h"
  34 #include "lstream.h"
  35 #include "opaque.h"
  36 #ifdef MULE
  37 #include "mule-ccl.h"
  38 #include "chartab.h"
  39 #endif
  40 #include "file-coding.h"
  41
  42 Lisp_Object Qcoding_system_error;
  43
  44 Lisp_Object Vkeyboard_coding_system;
  45 Lisp_Object Vterminal_coding_system;
  46 Lisp_Object Vcoding_system_for_read;
  47 Lisp_Object Vcoding_system_for_write;
  48 Lisp_Object Vfile_name_coding_system;
  49
  50 Lisp_Object Vcoded_charset_entity_reference_alist;
  51
  52 /* Table of symbols identifying each coding category. */
  53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  54
  55
  56
  57 struct file_coding_dump {
  58   /* Coding system currently associated with each coding category. */
  59   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  60
  61   /* Table of all coding categories in decreasing order of priority.
  62      This describes a permutation of the possible coding categories. */
  63   int coding_category_by_priority[CODING_CATEGORY_LAST];
  64
  65 #if defined(MULE) && !defined(UTF2000)
  66   Lisp_Object ucs_to_mule_table[65536];
  67 #endif
  68 } *fcd;
  69
  70 static const struct lrecord_description fcd_description_1[] = {
  71   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  72 #if defined(MULE) && !defined(UTF2000)
  73   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  74 #endif
  75   { XD_END }
  76 };
  77
  78 static const struct struct_description fcd_description = {
  79   sizeof (struct file_coding_dump),
  80   fcd_description_1
  81 };
  82
  83 Lisp_Object mule_to_ucs_table;
  84
  85 Lisp_Object Qcoding_systemp;
  86
  87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  88 /* Qinternal in general.c */
  89
  90 Lisp_Object Qmnemonic, Qeol_type;
  91 Lisp_Object Qcr, Qcrlf, Qlf;
  92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  93 Lisp_Object Qpost_read_conversion;
  94 Lisp_Object Qpre_write_conversion;
  95
  96 #ifdef MULE
  97 Lisp_Object Qucs4, Qutf16, Qutf8;
  98 Lisp_Object Qbig5, Qshift_jis;
  99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
 100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
 101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
 102 Lisp_Object Qno_iso6429;
 103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 104 Lisp_Object Qescape_quoted;
 105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 106 #endif
 107 #ifdef UTF2000
 108 Lisp_Object Qutf_8_mcs;
 109 Lisp_Object Qdisable_composition;
 110 Lisp_Object Quse_entity_reference;
 111 Lisp_Object Qd, Qx, QX;
 112 #endif
 113 Lisp_Object Qencode, Qdecode;
 114
 115 Lisp_Object Vcoding_system_hash_table;
 116
 117 int enable_multibyte_characters;
 118
 119 #ifdef MULE
 120 /* Additional information used by the ISO2022 decoder and detector. */
 121 struct iso2022_decoder
 122 {
 123   /* CHARSET holds the character sets currently assigned to the G0
 124      through G3 variables.  It is initialized from the array
 125      INITIAL_CHARSET in CODESYS. */
 126   Lisp_Object charset[4];
 127
 128   /* Which registers are currently invoked into the left (GL) and
 129      right (GR) halves of the 8-bit encoding space? */
 130   int register_left, register_right;
 131
 132   /* ISO_ESC holds a value indicating part of an escape sequence
 133      that has already been seen. */
 134   enum iso_esc_flag esc;
 135
 136   /* This records the bytes we've seen so far in an escape sequence,
 137      in case the sequence is invalid (we spit out the bytes unchanged). */
 138   unsigned char esc_bytes[8];
 139
 140   /* Index for next byte to store in ISO escape sequence. */
 141   int esc_bytes_index;
 142
 143 #ifdef ENABLE_COMPOSITE_CHARS
 144   /* Stuff seen so far when composing a string. */
 145   unsigned_char_dynarr *composite_chars;
 146 #endif
 147
 148   /* If we saw an invalid designation sequence for a particular
 149      register, we flag it here and switch to ASCII.  The next time we
 150      see a valid designation for this register, we turn off the flag
 151      and do the designation normally, but pretend the sequence was
 152      invalid.  The effect of all this is that (most of the time) the
 153      escape sequences for both the switch to the unknown charset, and
 154      the switch back to the known charset, get inserted literally into
 155      the buffer and saved out as such.  The hope is that we can
 156      preserve the escape sequences so that the resulting written out
 157      file makes sense.  If we don't do any of this, the designation
 158      to the invalid charset will be preserved but that switch back
 159      to the known charset will probably get eaten because it was
 160      the same charset that was already present in the register. */
 161   unsigned char invalid_designated[4];
 162
 163   /* We try to do similar things as above for direction-switching
 164      sequences.  If we encountered a direction switch while an
 165      invalid designation was present, or an invalid designation
 166      just after a direction switch (i.e. no valid designation
 167      encountered yet), we insert the direction-switch escape
 168      sequence literally into the output stream, and later on
 169      insert the corresponding direction-restoring escape sequence
 170      literally also. */
 171   unsigned int switched_dir_and_no_valid_charset_yet :1;
 172   unsigned int invalid_switch_dir :1;
 173
 174   /* Tells the decoder to output the escape sequence literally
 175      even though it was valid.  Used in the games we play to
 176      avoid lossage when we encounter invalid designations. */
 177   unsigned int output_literally :1;
 178   /* We encountered a direction switch followed by an invalid
 179      designation.  We didn't output the direction switch
 180      literally because we didn't know about the invalid designation;
 181      but we have to do so now. */
 182   unsigned int output_direction_sequence :1;
 183 };
 184 #endif /* MULE */
 185 EXFUN (Fcopy_coding_system, 2);
 186 #ifdef MULE
 187 struct detection_state;
 188
 189 static void
 190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 191                      unsigned_char_dynarr *dst, Lstream_data_count n);
 192
 193 static int detect_coding_sjis (struct detection_state *st,
 194                                const Extbyte *src, Lstream_data_count n);
 195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 196                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 198                             unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_shift_jis (struct encoding_stream *str,
 200                             unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int detect_coding_big5 (struct detection_state *st,
 203                                const Extbyte *src, Lstream_data_count n);
 204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 205                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
 207                        unsigned_char_dynarr *dst, unsigned int *flags);
 208 void char_finish_big5 (struct encoding_stream *str,
 209                        unsigned_char_dynarr *dst, unsigned int *flags);
 210
 211 static int detect_coding_ucs4 (struct detection_state *st,
 212                                const Extbyte *src, Lstream_data_count n);
 213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 214                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 216                        unsigned_char_dynarr *dst, unsigned int *flags);
 217 void char_finish_ucs4 (struct encoding_stream *str,
 218                        unsigned_char_dynarr *dst, unsigned int *flags);
 219
 220 static int detect_coding_utf16 (struct detection_state *st,
 221                                const Extbyte *src, Lstream_data_count n);
 222 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
 223                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 224 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
 225                        unsigned_char_dynarr *dst, unsigned int *flags);
 226 void char_finish_utf16 (struct encoding_stream *str,
 227                        unsigned_char_dynarr *dst, unsigned int *flags);
 228
 229 static int detect_coding_utf8 (struct detection_state *st,
 230                                const Extbyte *src, Lstream_data_count n);
 231 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 232                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 233 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 234                        unsigned_char_dynarr *dst, unsigned int *flags);
 235 void char_finish_utf8 (struct encoding_stream *str,
 236                        unsigned_char_dynarr *dst, unsigned int *flags);
 237
 238 static int postprocess_iso2022_mask (int mask);
 239 static void reset_iso2022 (Lisp_Object coding_system,
 240                            struct iso2022_decoder *iso);
 241 static int detect_coding_iso2022 (struct detection_state *st,
 242                                   const Extbyte *src, Lstream_data_count n);
 243 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 244                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 245 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 246                           unsigned_char_dynarr *dst, unsigned int *flags);
 247 void char_finish_iso2022 (struct encoding_stream *str,
 248                           unsigned_char_dynarr *dst, unsigned int *flags);
 249 #endif /* MULE */
 250 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 251                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 252 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 253                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 254 static void mule_decode (Lstream *decoding, const Extbyte *src,
 255                          unsigned_char_dynarr *dst, Lstream_data_count n);
 256 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 257                          unsigned_char_dynarr *dst, Lstream_data_count n);
 258
 259 typedef struct codesys_prop codesys_prop;
 260 struct codesys_prop
 261 {
 262   Lisp_Object sym;
 263   int prop_type;
 264 };
 265
 266 typedef struct
 267 {
 268   Dynarr_declare (codesys_prop);
 269 } codesys_prop_dynarr;
 270
 271 static const struct lrecord_description codesys_prop_description_1[] = {
 272   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 273   { XD_END }
 274 };
 275
 276 static const struct struct_description codesys_prop_description = {
 277   sizeof (codesys_prop),
 278   codesys_prop_description_1
 279 };
 280
 281 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 282   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 283   { XD_END }
 284 };
 285
 286 static const struct struct_description codesys_prop_dynarr_description = {
 287   sizeof (codesys_prop_dynarr),
 288   codesys_prop_dynarr_description_1
 289 };
 290
 291 codesys_prop_dynarr *the_codesys_prop_dynarr;
 292
 293 enum codesys_prop_enum
 294 {
 295   CODESYS_PROP_ALL_OK,
 296   CODESYS_PROP_ISO2022,
 297   CODESYS_PROP_CCL
 298 };
 299
 300 \f
 301 /************************************************************************/
 302 /*                       Coding system functions                        */
 303 /************************************************************************/
 304
 305 static Lisp_Object mark_coding_system (Lisp_Object);
 306 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 307 static void finalize_coding_system (void *header, int for_disksave);
 308
 309 #ifdef MULE
 310 static const struct lrecord_description ccs_description_1[] = {
 311   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 312   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 313   { XD_END }
 314 };
 315
 316 static const struct struct_description ccs_description = {
 317   sizeof (charset_conversion_spec),
 318   ccs_description_1
 319 };
 320
 321 static const struct lrecord_description ccsd_description_1[] = {
 322   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 323   { XD_END }
 324 };
 325
 326 static const struct struct_description ccsd_description = {
 327   sizeof (charset_conversion_spec_dynarr),
 328   ccsd_description_1
 329 };
 330 #endif
 331
 332 static const struct lrecord_description coding_system_description[] = {
 333   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 334   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 335   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 338   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 339   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 340   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 341 #ifdef MULE
 342   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 343   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 344   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 345   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 346   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 347 #ifdef UTF2000
 348   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
 349 #endif
 350 #endif
 351   { XD_END }
 352 };
 353
 354 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 355                                mark_coding_system, print_coding_system,
 356                                finalize_coding_system,
 357                                0, 0, coding_system_description,
 358                                Lisp_Coding_System);
 359
 360 static Lisp_Object
 361 mark_coding_system (Lisp_Object obj)
 362 {
 363   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 364
 365   mark_object (CODING_SYSTEM_NAME (codesys));
 366   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 367   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 368   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 369   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 370   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 371
 372   switch (CODING_SYSTEM_TYPE (codesys))
 373     {
 374 #ifdef MULE
 375       int i;
 376     case CODESYS_ISO2022:
 377       for (i = 0; i < 4; i++)
 378         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 379       if (codesys->iso2022.input_conv)
 380         {
 381           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 382             {
 383               struct charset_conversion_spec *ccs =
 384                 Dynarr_atp (codesys->iso2022.input_conv, i);
 385               mark_object (ccs->from_charset);
 386               mark_object (ccs->to_charset);
 387             }
 388         }
 389       if (codesys->iso2022.output_conv)
 390         {
 391           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 392             {
 393               struct charset_conversion_spec *ccs =
 394                 Dynarr_atp (codesys->iso2022.output_conv, i);
 395               mark_object (ccs->from_charset);
 396               mark_object (ccs->to_charset);
 397             }
 398         }
 399       break;
 400 #ifdef UTF2000
 401
 402     case CODESYS_BIG5:
 403       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
 404       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
 405       break;
 406 #endif
 407
 408     case CODESYS_CCL:
 409       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 410       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 411       break;
 412 #endif /* MULE */
 413     default:
 414       break;
 415     }
 416
 417   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 418 #ifdef UTF2000
 419   mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
 420 #endif
 421   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 422 }
 423
 424 static void
 425 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 426                      int escapeflag)
 427 {
 428   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 429   if (print_readably)
 430     error ("printing unreadable object #<coding_system 0x%x>",
 431            c->header.uid);
 432
 433   write_c_string ("#<coding_system ", printcharfun);
 434   print_internal (c->name, printcharfun, 1);
 435   write_c_string (">", printcharfun);
 436 }
 437
 438 static void
 439 finalize_coding_system (void *header, int for_disksave)
 440 {
 441   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 442   /* Since coding systems never go away, this function is not
 443      necessary.  But it would be necessary if we changed things
 444      so that coding systems could go away. */
 445   if (!for_disksave) /* see comment in lstream.c */
 446     {
 447       switch (CODING_SYSTEM_TYPE (c))
 448         {
 449 #ifdef MULE
 450         case CODESYS_ISO2022:
 451           if (c->iso2022.input_conv)
 452             {
 453               Dynarr_free (c->iso2022.input_conv);
 454               c->iso2022.input_conv = 0;
 455             }
 456           if (c->iso2022.output_conv)
 457             {
 458               Dynarr_free (c->iso2022.output_conv);
 459               c->iso2022.output_conv = 0;
 460             }
 461           break;
 462 #endif /* MULE */
 463         default:
 464           break;
 465         }
 466     }
 467 }
 468
 469 static eol_type_t
 470 symbol_to_eol_type (Lisp_Object symbol)
 471 {
 472   CHECK_SYMBOL (symbol);
 473   if (NILP (symbol))      return EOL_AUTODETECT;
 474   if (EQ (symbol, Qlf))   return EOL_LF;
 475   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 476   if (EQ (symbol, Qcr))   return EOL_CR;
 477
 478   signal_simple_error ("Unrecognized eol type", symbol);
 479   return EOL_AUTODETECT; /* not reached */
 480 }
 481
 482 static Lisp_Object
 483 eol_type_to_symbol (eol_type_t type)
 484 {
 485   switch (type)
 486     {
 487     default: abort ();
 488     case EOL_LF:         return Qlf;
 489     case EOL_CRLF:       return Qcrlf;
 490     case EOL_CR:         return Qcr;
 491     case EOL_AUTODETECT: return Qnil;
 492     }
 493 }
 494
 495 static void
 496 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 497 {
 498   Lisp_Object codesys_obj;
 499   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 500   char *codesys_name = (char *) alloca (len + 7);
 501   int mlen = -1;
 502   char *codesys_mnemonic=0;
 503
 504   Lisp_Object codesys_name_sym, sub_codesys_obj;
 505
 506   /* kludge */
 507
 508   XSETCODING_SYSTEM (codesys_obj, codesys);
 509
 510   memcpy (codesys_name,
 511           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 512
 513   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 514     {
 515       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 516       codesys_mnemonic = (char *) alloca (mlen + 7);
 517       memcpy (codesys_mnemonic,
 518               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 519     }
 520
 521 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 522   strcpy (codesys_name + len, "-" op_sys);                                      \
 523   if (mlen != -1)                                                               \
 524     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 525   codesys_name_sym = intern (codesys_name);                                     \
 526   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 527   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 528   if (mlen != -1)                                                               \
 529     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 530       build_string (codesys_mnemonic);                                          \
 531   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 532 } while (0)
 533
 534   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 535   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 536   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 537 }
 538
 539 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 540 Return t if OBJECT is a coding system.
 541 A coding system is an object that defines how text containing multiple
 542 character sets is encoded into a stream of (typically 8-bit) bytes.
 543 The coding system is used to decode the stream into a series of
 544 characters (which may be from multiple charsets) when the text is read
 545 from a file or process, and is used to encode the text back into the
 546 same format when it is written out to a file or process.
 547
 548 For example, many ISO2022-compliant coding systems (such as Compound
 549 Text, which is used for inter-client data under the X Window System)
 550 use escape sequences to switch between different charsets -- Japanese
 551 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 552 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 553 `make-coding-system' for more information.
 554
 555 Coding systems are normally identified using a symbol, and the
 556 symbol is accepted in place of the actual coding system object whenever
 557 a coding system is called for. (This is similar to how faces work.)
 558 */
 559        (object))
 560 {
 561   return CODING_SYSTEMP (object) ? Qt : Qnil;
 562 }
 563
 564 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 565 Retrieve the coding system of the given name.
 566
 567 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 568 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 569 If there is no such coding system, nil is returned.  Otherwise the
 570 associated coding system object is returned.
 571 */
 572        (coding_system_or_name))
 573 {
 574   if (NILP (coding_system_or_name))
 575     coding_system_or_name = Qbinary;
 576   else if (CODING_SYSTEMP (coding_system_or_name))
 577     return coding_system_or_name;
 578   else
 579     CHECK_SYMBOL (coding_system_or_name);
 580
 581   while (1)
 582     {
 583       coding_system_or_name =
 584         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 585
 586       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 587         return coding_system_or_name;
 588     }
 589 }
 590
 591 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 592 Retrieve the coding system of the given name.
 593 Same as `find-coding-system' except that if there is no such
 594 coding system, an error is signaled instead of returning nil.
 595 */
 596        (name))
 597 {
 598   Lisp_Object coding_system = Ffind_coding_system (name);
 599
 600   if (NILP (coding_system))
 601     signal_simple_error ("No such coding system", name);
 602   return coding_system;
 603 }
 604
 605 /* We store the coding systems in hash tables with the names as the key and the
 606    actual coding system object as the value.  Occasionally we need to use them
 607    in a list format.  These routines provide us with that. */
 608 struct coding_system_list_closure
 609 {
 610   Lisp_Object *coding_system_list;
 611 };
 612
 613 static int
 614 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 615                                   void *coding_system_list_closure)
 616 {
 617   /* This function can GC */
 618   struct coding_system_list_closure *cscl =
 619     (struct coding_system_list_closure *) coding_system_list_closure;
 620   Lisp_Object *coding_system_list = cscl->coding_system_list;
 621
 622   *coding_system_list = Fcons (key, *coding_system_list);
 623   return 0;
 624 }
 625
 626 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 627 Return a list of the names of all defined coding systems.
 628 */
 629        ())
 630 {
 631   Lisp_Object coding_system_list = Qnil;
 632   struct gcpro gcpro1;
 633   struct coding_system_list_closure coding_system_list_closure;
 634
 635   GCPRO1 (coding_system_list);
 636   coding_system_list_closure.coding_system_list = &coding_system_list;
 637   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 638                  &coding_system_list_closure);
 639   UNGCPRO;
 640
 641   return coding_system_list;
 642 }
 643
 644 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 645 Return the name of the given coding system.
 646 */
 647        (coding_system))
 648 {
 649   coding_system = Fget_coding_system (coding_system);
 650   return XCODING_SYSTEM_NAME (coding_system);
 651 }
 652
 653 static Lisp_Coding_System *
 654 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 655 {
 656   Lisp_Coding_System *codesys =
 657     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 658
 659   zero_lcrecord (codesys);
 660   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 661   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 662   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 663   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 664   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 665   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 666   CODING_SYSTEM_TYPE     (codesys) = type;
 667   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 668 #ifdef MULE
 669 #ifdef UTF2000
 670   CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
 671 #endif
 672   if (type == CODESYS_ISO2022)
 673     {
 674       int i;
 675       for (i = 0; i < 4; i++)
 676         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 677     }
 678 #ifdef UTF2000
 679   if (type == CODESYS_UTF8)
 680     {
 681       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 682         = Vcharset_ucs;
 683       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 684         = Qnil;
 685       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 686         = Qnil;
 687       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 688         = Qnil;
 689     }
 690   else if (type == CODESYS_BIG5)
 691     {
 692       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 693         = Vcharset_ascii;
 694       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 695         = Vcharset_chinese_big5;
 696       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 697         = Qnil;
 698       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 699         = Qnil;
 700     }
 701 #endif
 702   else if (type == CODESYS_CCL)
 703     {
 704       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 705       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 706     }
 707 #endif /* MULE */
 708   CODING_SYSTEM_NAME (codesys) = name;
 709
 710   return codesys;
 711 }
 712
 713 #ifdef MULE
 714 /* Given a list of charset conversion specs as specified in a Lisp
 715    program, parse it into STORE_HERE. */
 716
 717 static void
 718 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 719                                 Lisp_Object spec_list)
 720 {
 721   Lisp_Object rest;
 722
 723   EXTERNAL_LIST_LOOP (rest, spec_list)
 724     {
 725       Lisp_Object car = XCAR (rest);
 726       Lisp_Object from, to;
 727       struct charset_conversion_spec spec;
 728
 729       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 730         signal_simple_error ("Invalid charset conversion spec", car);
 731       from = Fget_charset (XCAR (car));
 732       to = Fget_charset (XCAR (XCDR (car)));
 733       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 734            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 735         signal_simple_error_2
 736           ("Attempted conversion between different charset types",
 737            from, to);
 738       spec.from_charset = from;
 739       spec.to_charset = to;
 740
 741       Dynarr_add (store_here, spec);
 742     }
 743 }
 744
 745 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 746    specs, return the equivalent as the Lisp programmer would see it.
 747
 748    If LOAD_HERE is 0, return Qnil. */
 749
 750 static Lisp_Object
 751 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 752 {
 753   int i;
 754   Lisp_Object result;
 755
 756   if (!load_here)
 757     return Qnil;
 758   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 759     {
 760       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 761       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 762     }
 763
 764   return Fnreverse (result);
 765 }
 766
 767 #endif /* MULE */
 768
 769 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 770 Register symbol NAME as a coding system.
 771
 772 TYPE describes the conversion method used and should be one of
 773
 774 nil or 'undecided
 775      Automatic conversion.  XEmacs attempts to detect the coding system
 776      used in the file.
 777 'no-conversion
 778      No conversion.  Use this for binary files and such.  On output,
 779      graphic characters that are not in ASCII or Latin-1 will be
 780      replaced by a ?. (For a no-conversion-encoded buffer, these
 781      characters will only be present if you explicitly insert them.)
 782 'shift-jis
 783      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 784 'ucs-4
 785      ISO 10646 UCS-4 encoding.
 786 'utf-8
 787      ISO 10646 UTF-8 encoding.
 788 'iso2022
 789      Any ISO2022-compliant encoding.  Among other things, this includes
 790      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 791      standard Unix encoding for Japanese and other languages), and
 792      Compound Text (the encoding used in X11).  You can specify more
 793      specific information about the conversion with the PROPS argument.
 794 'big5
 795      Big5 (the encoding commonly used for Taiwanese).
 796 'ccl
 797      The conversion is performed using a user-written pseudo-code
 798      program.  CCL (Code Conversion Language) is the name of this
 799      pseudo-code.
 800 'internal
 801      Write out or read in the raw contents of the memory representing
 802      the buffer's text.  This is primarily useful for debugging
 803      purposes, and is only enabled when XEmacs has been compiled with
 804      DEBUG_XEMACS defined (via the --debug configure option).
 805      WARNING: Reading in a file using 'internal conversion can result
 806      in an internal inconsistency in the memory representing a
 807      buffer's text, which will produce unpredictable results and may
 808      cause XEmacs to crash.  Under normal circumstances you should
 809      never use 'internal conversion.
 810
 811 DOC-STRING is a string describing the coding system.
 812
 813 PROPS is a property list, describing the specific nature of the
 814 character set.  Recognized properties are:
 815
 816 'mnemonic
 817      String to be displayed in the modeline when this coding system is
 818      active.
 819
 820 'eol-type
 821      End-of-line conversion to be used.  It should be one of
 822
 823         nil
 824                 Automatically detect the end-of-line type (LF, CRLF,
 825                 or CR).  Also generate subsidiary coding systems named
 826                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 827                 identical to this coding system but have an EOL-TYPE
 828                 value of 'lf, 'crlf, and 'cr, respectively.
 829         'lf
 830                 The end of a line is marked externally using ASCII LF.
 831                 Since this is also the way that XEmacs represents an
 832                 end-of-line internally, specifying this option results
 833                 in no end-of-line conversion.  This is the standard
 834                 format for Unix text files.
 835         'crlf
 836                 The end of a line is marked externally using ASCII
 837                 CRLF.  This is the standard format for MS-DOS text
 838                 files.
 839         'cr
 840                 The end of a line is marked externally using ASCII CR.
 841                 This is the standard format for Macintosh text files.
 842         t
 843                 Automatically detect the end-of-line type but do not
 844                 generate subsidiary coding systems.  (This value is
 845                 converted to nil when stored internally, and
 846                 `coding-system-property' will return nil.)
 847
 848 'disable-composition
 849      If non-nil, composition/decomposition for combining characters
 850      are disabled.
 851
 852 'use-entity-reference
 853      If non-nil, SGML style entity-reference is used for non-system-characters.
 854
 855 'post-read-conversion
 856      Function called after a file has been read in, to perform the
 857      decoding.  Called with two arguments, START and END, denoting
 858      a region of the current buffer to be decoded.
 859
 860 'pre-write-conversion
 861      Function called before a file is written out, to perform the
 862      encoding.  Called with two arguments, START and END, denoting
 863      a region of the current buffer to be encoded.
 864
 865
 866 The following additional properties are recognized if TYPE is 'iso2022:
 867
 868 'charset-g0
 869 'charset-g1
 870 'charset-g2
 871 'charset-g3
 872      The character set initially designated to the G0 - G3 registers.
 873      The value should be one of
 874
 875           -- A charset object (designate that character set)
 876           -- nil (do not ever use this register)
 877           -- t (no character set is initially designated to
 878                 the register, but may be later on; this automatically
 879                 sets the corresponding `force-g*-on-output' property)
 880
 881 'force-g0-on-output
 882 'force-g1-on-output
 883 'force-g2-on-output
 884 'force-g2-on-output
 885      If non-nil, send an explicit designation sequence on output before
 886      using the specified register.
 887
 888 'short
 889      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 890      "ESC $ B" on output in place of the full designation sequences
 891      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 892
 893 'no-ascii-eol
 894      If non-nil, don't designate ASCII to G0 at each end of line on output.
 895      Setting this to non-nil also suppresses other state-resetting that
 896      normally happens at the end of a line.
 897
 898 'no-ascii-cntl
 899      If non-nil, don't designate ASCII to G0 before control chars on output.
 900
 901 'seven
 902      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 903      environment.
 904
 905 'lock-shift
 906      If non-nil, use locking-shift (SO/SI) instead of single-shift
 907      or designation by escape sequence.
 908
 909 'no-iso6429
 910      If non-nil, don't use ISO6429's direction specification.
 911
 912 'escape-quoted
 913      If non-nil, literal control characters that are the same as
 914      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 915      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 916      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 917      so that they can be properly distinguished from an escape sequence.
 918      (Note that doing this results in a non-portable encoding.) This
 919      encoding flag is used for byte-compiled files.  Note that ESC
 920      is a good choice for a quoting character because there are no
 921      escape sequences whose second byte is a character from the Control-0
 922      or Control-1 character sets; this is explicitly disallowed by the
 923      ISO2022 standard.
 924
 925 'input-charset-conversion
 926      A list of conversion specifications, specifying conversion of
 927      characters in one charset to another when decoding is performed.
 928      Each specification is a list of two elements: the source charset,
 929      and the destination charset.
 930
 931 'output-charset-conversion
 932      A list of conversion specifications, specifying conversion of
 933      characters in one charset to another when encoding is performed.
 934      The form of each specification is the same as for
 935      'input-charset-conversion.
 936
 937
 938 The following additional properties are recognized (and required)
 939 if TYPE is 'ccl:
 940
 941 'decode
 942      CCL program used for decoding (converting to internal format).
 943
 944 'encode
 945      CCL program used for encoding (converting to external format).
 946 */
 947        (name, type, doc_string, props))
 948 {
 949   Lisp_Coding_System *codesys;
 950   enum coding_system_type ty;
 951   int need_to_setup_eol_systems = 1;
 952
 953   /* Convert type to constant */
 954   if (NILP (type) || EQ (type, Qundecided))
 955                                       { ty = CODESYS_AUTODETECT; }
 956 #ifdef MULE
 957   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 958   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 959   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 960   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 961   else if (EQ (type, Qutf16))         { ty = CODESYS_UTF16; }
 962   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 963   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 964 #endif
 965   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 966 #ifdef DEBUG_XEMACS
 967   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 968 #endif
 969   else
 970     signal_simple_error ("Invalid coding system type", type);
 971
 972   CHECK_SYMBOL (name);
 973
 974   codesys = allocate_coding_system (ty, name);
 975
 976   if (NILP (doc_string))
 977     doc_string = build_string ("");
 978   else
 979     CHECK_STRING (doc_string);
 980   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 981
 982   {
 983     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 984       {
 985         if (EQ (key, Qmnemonic))
 986           {
 987             if (!NILP (value))
 988               CHECK_STRING (value);
 989             CODING_SYSTEM_MNEMONIC (codesys) = value;
 990           }
 991
 992         else if (EQ (key, Qeol_type))
 993           {
 994             need_to_setup_eol_systems = NILP (value);
 995             if (EQ (value, Qt))
 996               value = Qnil;
 997             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 998           }
 999
1000         else if (EQ (key, Qpost_read_conversion))
1001           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1002         else if (EQ (key, Qpre_write_conversion))
1003           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1004 #ifdef UTF2000
1005         else if (EQ (key, Qdisable_composition))
1006           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1007         else if (EQ (key, Quse_entity_reference))
1008           CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1009 #endif
1010 #ifdef MULE
1011         else if (ty == CODESYS_ISO2022)
1012           {
1013 #define FROB_INITIAL_CHARSET(charset_num) \
1014   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1015     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1016
1017             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1018             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1019             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1020             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1021
1022 #define FROB_FORCE_CHARSET(charset_num) \
1023   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1024
1025             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1026             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1027             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1028             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1029
1030 #define FROB_BOOLEAN_PROPERTY(prop) \
1031   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1032
1033             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
1034             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1035             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1036             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
1037             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1038             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1039             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1040
1041             else if (EQ (key, Qinput_charset_conversion))
1042               {
1043                 codesys->iso2022.input_conv =
1044                   Dynarr_new (charset_conversion_spec);
1045                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1046                                                 value);
1047               }
1048             else if (EQ (key, Qoutput_charset_conversion))
1049               {
1050                 codesys->iso2022.output_conv =
1051                   Dynarr_new (charset_conversion_spec);
1052                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1053                                                 value);
1054               }
1055             else
1056               signal_simple_error ("Unrecognized property", key);
1057           }
1058 #ifdef UTF2000
1059         else if (ty == CODESYS_UTF8)
1060           {
1061             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062             else if (EQ (key, Qcharset_g1))
1063               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1064             else if (EQ (key, Qcharset_g2))
1065               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1066             else
1067               signal_simple_error ("Unrecognized property", key);
1068           }
1069         else if (ty == CODESYS_BIG5)
1070           {
1071             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1072             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1073             else
1074               signal_simple_error ("Unrecognized property", key);
1075           }
1076 #endif
1077         else if (EQ (type, Qccl))
1078           {
1079             Lisp_Object sym;
1080             struct ccl_program test_ccl;
1081             Extbyte *suffix;
1082
1083             /* Check key first.  */
1084             if (EQ (key, Qdecode))
1085               suffix = "-ccl-decode";
1086             else if (EQ (key, Qencode))
1087               suffix = "-ccl-encode";
1088             else
1089               signal_simple_error ("Unrecognized property", key);
1090
1091             /* If value is vector, register it as a ccl program
1092                associated with an newly created symbol for
1093                backward compatibility.  */
1094             if (VECTORP (value))
1095               {
1096                 sym = Fintern (concat2 (Fsymbol_name (name),
1097                                         build_string (suffix)),
1098                                Qnil);
1099                 Fregister_ccl_program (sym, value);
1100               }
1101             else
1102               {
1103                 CHECK_SYMBOL (value);
1104                 sym = value;
1105               }
1106             /* check if the given ccl programs are valid.  */
1107             if (setup_ccl_program (&test_ccl, sym) < 0)
1108               signal_simple_error ("Invalid CCL program", value);
1109
1110             if (EQ (key, Qdecode))
1111               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1112             else if (EQ (key, Qencode))
1113               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1114
1115           }
1116 #endif /* MULE */
1117         else
1118           signal_simple_error ("Unrecognized property", key);
1119       }
1120   }
1121
1122   if (need_to_setup_eol_systems)
1123     setup_eol_coding_systems (codesys);
1124
1125   {
1126     Lisp_Object codesys_obj;
1127     XSETCODING_SYSTEM (codesys_obj, codesys);
1128     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1129     return codesys_obj;
1130   }
1131 }
1132
1133 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1134 Copy OLD-CODING-SYSTEM to NEW-NAME.
1135 If NEW-NAME does not name an existing coding system, a new one will
1136 be created.
1137 */
1138        (old_coding_system, new_name))
1139 {
1140   Lisp_Object new_coding_system;
1141   old_coding_system = Fget_coding_system (old_coding_system);
1142   new_coding_system = Ffind_coding_system (new_name);
1143   if (NILP (new_coding_system))
1144     {
1145       XSETCODING_SYSTEM (new_coding_system,
1146                          allocate_coding_system
1147                          (XCODING_SYSTEM_TYPE (old_coding_system),
1148                           new_name));
1149       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1150     }
1151
1152   {
1153     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1154     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1155     memcpy (((char *) to  ) + sizeof (to->header),
1156             ((char *) from) + sizeof (from->header),
1157             sizeof (*from) - sizeof (from->header));
1158     to->name = new_name;
1159   }
1160   return new_coding_system;
1161 }
1162
1163 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1164 Return t if OBJECT names a coding system, and is not a coding system alias.
1165 */
1166        (object))
1167 {
1168   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1169     ? Qt : Qnil;
1170 }
1171
1172 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1173 Return t if OBJECT is a coding system alias.
1174 All coding system aliases are created by `define-coding-system-alias'.
1175 */
1176        (object))
1177 {
1178   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1179     ? Qt : Qnil;
1180 }
1181
1182 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1183 Return the coding-system symbol for which symbol ALIAS is an alias.
1184 */
1185        (alias))
1186 {
1187   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1188   if (SYMBOLP (aliasee))
1189     return aliasee;
1190   else
1191     signal_simple_error ("Symbol is not a coding system alias", alias);
1192   return Qnil;          /* To keep the compiler happy */
1193 }
1194
1195 static Lisp_Object
1196 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1197 {
1198   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1199                   Qnil);
1200 }
1201
1202 /* A maphash function, for removing dangling coding system aliases. */
1203 static int
1204 dangling_coding_system_alias_p (Lisp_Object alias,
1205                                 Lisp_Object aliasee,
1206                                 void *dangling_aliases)
1207 {
1208   if (SYMBOLP (aliasee)
1209       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1210     {
1211       (*(int *) dangling_aliases)++;
1212       return 1;
1213     }
1214   else
1215     return 0;
1216 }
1217
1218 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1219 Define symbol ALIAS as an alias for coding system ALIASEE.
1220
1221 You can use this function to redefine an alias that has already been defined,
1222 but you cannot redefine a name which is the canonical name for a coding system.
1223 \(a canonical name of a coding system is what is returned when you call
1224 `coding-system-name' on a coding system).
1225
1226 ALIASEE itself can be an alias, which allows you to define nested aliases.
1227
1228 You are forbidden, however, from creating alias loops or `dangling' aliases.
1229 These will be detected, and an error will be signaled if you attempt to do so.
1230
1231 If ALIASEE is nil, then ALIAS will simply be undefined.
1232
1233 See also `coding-system-alias-p', `coding-system-aliasee',
1234 and `coding-system-canonical-name-p'.
1235 */
1236        (alias, aliasee))
1237 {
1238   Lisp_Object real_coding_system, probe;
1239
1240   CHECK_SYMBOL (alias);
1241
1242   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1243     signal_simple_error
1244       ("Symbol is the canonical name of a coding system and cannot be redefined",
1245        alias);
1246
1247   if (NILP (aliasee))
1248     {
1249       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1250       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1251       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1252
1253       Fremhash (alias, Vcoding_system_hash_table);
1254
1255       /* Undefine subsidiary aliases,
1256          presumably created by a previous call to this function */
1257       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1258           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1259           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1260         {
1261           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1262           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1263           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1264         }
1265
1266       /* Undefine dangling coding system aliases. */
1267       {
1268         int dangling_aliases;
1269
1270         do {
1271           dangling_aliases = 0;
1272           elisp_map_remhash (dangling_coding_system_alias_p,
1273                              Vcoding_system_hash_table,
1274                              &dangling_aliases);
1275         } while (dangling_aliases > 0);
1276       }
1277
1278       return Qnil;
1279     }
1280
1281   if (CODING_SYSTEMP (aliasee))
1282     aliasee = XCODING_SYSTEM_NAME (aliasee);
1283
1284   /* Checks that aliasee names a coding-system */
1285   real_coding_system = Fget_coding_system (aliasee);
1286
1287   /* Check for coding system alias loops */
1288   if (EQ (alias, aliasee))
1289     alias_loop: signal_simple_error_2
1290       ("Attempt to create a coding system alias loop", alias, aliasee);
1291
1292   for (probe = aliasee;
1293        SYMBOLP (probe);
1294        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1295     {
1296       if (EQ (probe, alias))
1297         goto alias_loop;
1298     }
1299
1300   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1301
1302   /* Set up aliases for subsidiaries.
1303      #### There must be a better way to handle subsidiary coding systems. */
1304   {
1305     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1306     int i;
1307     for (i = 0; i < countof (suffixes); i++)
1308       {
1309         Lisp_Object alias_subsidiary =
1310           append_suffix_to_symbol (alias, suffixes[i]);
1311         Lisp_Object aliasee_subsidiary =
1312           append_suffix_to_symbol (aliasee, suffixes[i]);
1313
1314         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1315           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1316       }
1317   }
1318   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1319      but it doesn't look intentional, so I'd rather return something
1320      meaningful or nothing at all. */
1321   return Qnil;
1322 }
1323
1324 static Lisp_Object
1325 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1326 {
1327   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1328   Lisp_Object new_coding_system;
1329
1330   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1331     return coding_system;
1332
1333   switch (type)
1334     {
1335     case EOL_AUTODETECT: return coding_system;
1336     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1337     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1338     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1339     default:       abort (); return Qnil;
1340     }
1341
1342   return NILP (new_coding_system) ? coding_system : new_coding_system;
1343 }
1344
1345 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1346 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1347 */
1348        (coding_system, eol_type))
1349 {
1350   coding_system = Fget_coding_system (coding_system);
1351
1352   return subsidiary_coding_system (coding_system,
1353                                    symbol_to_eol_type (eol_type));
1354 }
1355
1356 \f
1357 /************************************************************************/
1358 /*                         Coding system accessors                      */
1359 /************************************************************************/
1360
1361 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1362 Return the doc string for CODING-SYSTEM.
1363 */
1364        (coding_system))
1365 {
1366   coding_system = Fget_coding_system (coding_system);
1367   return XCODING_SYSTEM_DOC_STRING (coding_system);
1368 }
1369
1370 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1371 Return the type of CODING-SYSTEM.
1372 */
1373        (coding_system))
1374 {
1375   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1376     {
1377     default: abort ();
1378     case CODESYS_AUTODETECT:    return Qundecided;
1379 #ifdef MULE
1380     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1381     case CODESYS_ISO2022:       return Qiso2022;
1382     case CODESYS_BIG5:          return Qbig5;
1383     case CODESYS_UCS4:          return Qucs4;
1384     case CODESYS_UTF16:         return Qutf16;
1385     case CODESYS_UTF8:          return Qutf8;
1386     case CODESYS_CCL:           return Qccl;
1387 #endif
1388     case CODESYS_NO_CONVERSION: return Qno_conversion;
1389 #ifdef DEBUG_XEMACS
1390     case CODESYS_INTERNAL:      return Qinternal;
1391 #endif
1392     }
1393 }
1394
1395 #ifdef MULE
1396 static
1397 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1398 {
1399   Lisp_Object cs
1400     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1401
1402   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1403 }
1404
1405 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1406 Return initial charset of CODING-SYSTEM designated to GNUM.
1407 GNUM allows 0 .. 3.
1408 */
1409        (coding_system, gnum))
1410 {
1411   coding_system = Fget_coding_system (coding_system);
1412   CHECK_INT (gnum);
1413
1414   return coding_system_charset (coding_system, XINT (gnum));
1415 }
1416 #endif /* MULE */
1417
1418 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1419 Return the PROP property of CODING-SYSTEM.
1420 */
1421        (coding_system, prop))
1422 {
1423   int i, ok = 0;
1424   enum coding_system_type type;
1425
1426   coding_system = Fget_coding_system (coding_system);
1427   CHECK_SYMBOL (prop);
1428   type = XCODING_SYSTEM_TYPE (coding_system);
1429
1430   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1431     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1432       {
1433         ok = 1;
1434         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1435           {
1436           case CODESYS_PROP_ALL_OK:
1437             break;
1438 #ifdef MULE
1439           case CODESYS_PROP_ISO2022:
1440             if (type != CODESYS_ISO2022)
1441               signal_simple_error
1442                 ("Property only valid in ISO2022 coding systems",
1443                  prop);
1444             break;
1445
1446           case CODESYS_PROP_CCL:
1447             if (type != CODESYS_CCL)
1448               signal_simple_error
1449                 ("Property only valid in CCL coding systems",
1450                  prop);
1451             break;
1452 #endif /* MULE */
1453           default:
1454             abort ();
1455           }
1456       }
1457
1458   if (!ok)
1459     signal_simple_error ("Unrecognized property", prop);
1460
1461   if (EQ (prop, Qname))
1462     return XCODING_SYSTEM_NAME (coding_system);
1463   else if (EQ (prop, Qtype))
1464     return Fcoding_system_type (coding_system);
1465   else if (EQ (prop, Qdoc_string))
1466     return XCODING_SYSTEM_DOC_STRING (coding_system);
1467   else if (EQ (prop, Qmnemonic))
1468     return XCODING_SYSTEM_MNEMONIC (coding_system);
1469   else if (EQ (prop, Qeol_type))
1470     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1471   else if (EQ (prop, Qeol_lf))
1472     return XCODING_SYSTEM_EOL_LF (coding_system);
1473   else if (EQ (prop, Qeol_crlf))
1474     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1475   else if (EQ (prop, Qeol_cr))
1476     return XCODING_SYSTEM_EOL_CR (coding_system);
1477   else if (EQ (prop, Qpost_read_conversion))
1478     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1479   else if (EQ (prop, Qpre_write_conversion))
1480     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1481 #ifdef MULE
1482 #ifdef UTF2000
1483   else if (EQ (prop, Qdisable_composition))
1484     return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1485   else if (EQ (prop, Quse_entity_reference))
1486     return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1487 #endif
1488   else if (type == CODESYS_ISO2022)
1489     {
1490       if (EQ (prop, Qcharset_g0))
1491         return coding_system_charset (coding_system, 0);
1492       else if (EQ (prop, Qcharset_g1))
1493         return coding_system_charset (coding_system, 1);
1494       else if (EQ (prop, Qcharset_g2))
1495         return coding_system_charset (coding_system, 2);
1496       else if (EQ (prop, Qcharset_g3))
1497         return coding_system_charset (coding_system, 3);
1498
1499 #define FORCE_CHARSET(charset_num) \
1500   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1501    (coding_system, charset_num) ? Qt : Qnil)
1502
1503       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1504       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1505       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1506       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1507
1508 #define LISP_BOOLEAN(prop) \
1509   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1510
1511       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1512       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1513       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1514       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1515       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1516       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1517       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1518
1519       else if (EQ (prop, Qinput_charset_conversion))
1520         return
1521           unparse_charset_conversion_specs
1522             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1523       else if (EQ (prop, Qoutput_charset_conversion))
1524         return
1525           unparse_charset_conversion_specs
1526             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1527       else
1528         abort ();
1529     }
1530   else if (type == CODESYS_CCL)
1531     {
1532       if (EQ (prop, Qdecode))
1533         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1534       else if (EQ (prop, Qencode))
1535         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1536       else
1537         abort ();
1538     }
1539 #endif /* MULE */
1540   else
1541     abort ();
1542
1543   return Qnil; /* not reached */
1544 }
1545
1546 \f
1547 /************************************************************************/
1548 /*                       Coding category functions                      */
1549 /************************************************************************/
1550
1551 static int
1552 decode_coding_category (Lisp_Object symbol)
1553 {
1554   int i;
1555
1556   CHECK_SYMBOL (symbol);
1557   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1558     if (EQ (coding_category_symbol[i], symbol))
1559       return i;
1560
1561   signal_simple_error ("Unrecognized coding category", symbol);
1562   return 0; /* not reached */
1563 }
1564
1565 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1566 Return a list of all recognized coding categories.
1567 */
1568        ())
1569 {
1570   int i;
1571   Lisp_Object list = Qnil;
1572
1573   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1574     list = Fcons (coding_category_symbol[i], list);
1575   return list;
1576 }
1577
1578 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1579 Change the priority order of the coding categories.
1580 LIST should be list of coding categories, in descending order of
1581 priority.  Unspecified coding categories will be lower in priority
1582 than all specified ones, in the same relative order they were in
1583 previously.
1584 */
1585        (list))
1586 {
1587   int category_to_priority[CODING_CATEGORY_LAST];
1588   int i, j;
1589   Lisp_Object rest;
1590
1591   /* First generate a list that maps coding categories to priorities. */
1592
1593   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1594     category_to_priority[i] = -1;
1595
1596   /* Highest priority comes from the specified list. */
1597   i = 0;
1598   EXTERNAL_LIST_LOOP (rest, list)
1599     {
1600       int cat = decode_coding_category (XCAR (rest));
1601
1602       if (category_to_priority[cat] >= 0)
1603         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1604       category_to_priority[cat] = i++;
1605     }
1606
1607   /* Now go through the existing categories by priority to retrieve
1608      the categories not yet specified and preserve their priority
1609      order. */
1610   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1611     {
1612       int cat = fcd->coding_category_by_priority[j];
1613       if (category_to_priority[cat] < 0)
1614         category_to_priority[cat] = i++;
1615     }
1616
1617   /* Now we need to construct the inverse of the mapping we just
1618      constructed. */
1619
1620   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1621     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1622
1623   /* Phew!  That was confusing. */
1624   return Qnil;
1625 }
1626
1627 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1628 Return a list of coding categories in descending order of priority.
1629 */
1630        ())
1631 {
1632   int i;
1633   Lisp_Object list = Qnil;
1634
1635   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1636     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1637                   list);
1638   return list;
1639 }
1640
1641 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1642 Change the coding system associated with a coding category.
1643 */
1644        (coding_category, coding_system))
1645 {
1646   int cat = decode_coding_category (coding_category);
1647
1648   coding_system = Fget_coding_system (coding_system);
1649   fcd->coding_category_system[cat] = coding_system;
1650   return Qnil;
1651 }
1652
1653 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1654 Return the coding system associated with a coding category.
1655 */
1656        (coding_category))
1657 {
1658   int cat = decode_coding_category (coding_category);
1659   Lisp_Object sys = fcd->coding_category_system[cat];
1660
1661   if (!NILP (sys))
1662     return XCODING_SYSTEM_NAME (sys);
1663   return Qnil;
1664 }
1665
1666 \f
1667 /************************************************************************/
1668 /*                     Detecting the encoding of data                   */
1669 /************************************************************************/
1670
1671 struct detection_state
1672 {
1673   eol_type_t eol_type;
1674   int seen_non_ascii;
1675   int mask;
1676 #ifdef MULE
1677   struct
1678     {
1679       int mask;
1680       int in_second_byte;
1681     }
1682   big5;
1683
1684   struct
1685     {
1686       int mask;
1687       int in_second_byte;
1688     }
1689   shift_jis;
1690
1691   struct
1692     {
1693       int mask;
1694       int in_byte;
1695   }
1696   ucs4;
1697
1698   struct
1699     {
1700       int mask;
1701       int in_byte;
1702     }
1703   utf16;
1704
1705   struct
1706     {
1707       int mask;
1708       int in_byte;
1709     }
1710   utf8;
1711
1712   struct
1713     {
1714       int mask;
1715       int initted;
1716       struct iso2022_decoder iso;
1717       unsigned int flags;
1718       int high_byte_count;
1719       unsigned int saw_single_shift:1;
1720     }
1721   iso2022;
1722 #endif
1723   struct
1724     {
1725       int seen_anything;
1726       int just_saw_cr;
1727     }
1728   eol;
1729 };
1730
1731 static int
1732 acceptable_control_char_p (int c)
1733 {
1734   switch (c)
1735     {
1736       /* Allow and ignore control characters that you might
1737          reasonably see in a text file */
1738     case '\r':
1739     case '\n':
1740     case '\t':
1741     case  7: /* bell */
1742     case  8: /* backspace */
1743     case 11: /* vertical tab */
1744     case 12: /* form feed */
1745     case 26: /* MS-DOS C-z junk */
1746     case 31: /* '^_' -- for info */
1747       return 1;
1748     default:
1749       return 0;
1750     }
1751 }
1752
1753 static int
1754 mask_has_at_most_one_bit_p (int mask)
1755 {
1756   /* Perhaps the only thing useful you learn from intensive Microsoft
1757      technical interviews */
1758   return (mask & (mask - 1)) == 0;
1759 }
1760
1761 static eol_type_t
1762 detect_eol_type (struct detection_state *st, const Extbyte *src,
1763                  Lstream_data_count n)
1764 {
1765   while (n--)
1766     {
1767       unsigned char c = *(unsigned char *)src++;
1768       if (c == '\n')
1769         {
1770           if (st->eol.just_saw_cr)
1771             return EOL_CRLF;
1772           else if (st->eol.seen_anything)
1773             return EOL_LF;
1774         }
1775       else if (st->eol.just_saw_cr)
1776         return EOL_CR;
1777       else if (c == '\r')
1778         st->eol.just_saw_cr = 1;
1779       else
1780         st->eol.just_saw_cr = 0;
1781       st->eol.seen_anything = 1;
1782     }
1783
1784   return EOL_AUTODETECT;
1785 }
1786
1787 /* Attempt to determine the encoding and EOL type of the given text.
1788    Before calling this function for the first type, you must initialize
1789    st->eol_type as appropriate and initialize st->mask to ~0.
1790
1791    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1792    not yet known.
1793
1794    st->mask holds the determined coding category mask, or ~0 if only
1795    ASCII has been seen so far.
1796
1797    Returns:
1798
1799    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1800         is present in st->mask
1801    1 == definitive answers are here for both st->eol_type and st->mask
1802 */
1803
1804 static int
1805 detect_coding_type (struct detection_state *st, const Extbyte *src,
1806                     Lstream_data_count n, int just_do_eol)
1807 {
1808   if (st->eol_type == EOL_AUTODETECT)
1809     st->eol_type = detect_eol_type (st, src, n);
1810
1811   if (just_do_eol)
1812     return st->eol_type != EOL_AUTODETECT;
1813
1814   if (!st->seen_non_ascii)
1815     {
1816       for (; n; n--, src++)
1817         {
1818           unsigned char c = *(unsigned char *) src;
1819           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1820             {
1821               st->seen_non_ascii = 1;
1822 #ifdef MULE
1823               st->shift_jis.mask = ~0;
1824               st->big5.mask = ~0;
1825               st->ucs4.mask = ~0;
1826               st->utf16.mask = ~0;
1827               st->utf8.mask = ~0;
1828               st->iso2022.mask = ~0;
1829 #endif
1830               break;
1831             }
1832         }
1833     }
1834
1835   if (!n)
1836     return 0;
1837 #ifdef MULE
1838   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1839     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1840   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1841     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1842   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1843     st->big5.mask = detect_coding_big5 (st, src, n);
1844   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1845     st->utf8.mask = detect_coding_utf8 (st, src, n);
1846   if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1847     st->utf16.mask = detect_coding_utf16 (st, src, n);
1848   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1849     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1850
1851   st->mask
1852     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1853     | st->utf8.mask | st->ucs4.mask;
1854 #endif
1855   {
1856     int retval = mask_has_at_most_one_bit_p (st->mask);
1857     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1858     return retval && st->eol_type != EOL_AUTODETECT;
1859   }
1860 }
1861
1862 static Lisp_Object
1863 coding_system_from_mask (int mask)
1864 {
1865   if (mask == ~0)
1866     {
1867       /* If the file was entirely or basically ASCII, use the
1868          default value of `buffer-file-coding-system'. */
1869       Lisp_Object retval =
1870         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1871       if (!NILP (retval))
1872         {
1873           retval = Ffind_coding_system (retval);
1874           if (NILP (retval))
1875             {
1876               warn_when_safe
1877                 (Qbad_variable, Qwarning,
1878                  "Invalid `default-buffer-file-coding-system', set to nil");
1879               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1880             }
1881         }
1882       if (NILP (retval))
1883         retval = Fget_coding_system (Qraw_text);
1884       return retval;
1885     }
1886   else
1887     {
1888       int i;
1889       int cat = -1;
1890 #ifdef MULE
1891       mask = postprocess_iso2022_mask (mask);
1892 #endif
1893       /* Look through the coding categories by priority and find
1894          the first one that is allowed. */
1895       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1896         {
1897           cat = fcd->coding_category_by_priority[i];
1898           if ((mask & (1 << cat)) &&
1899               !NILP (fcd->coding_category_system[cat]))
1900             break;
1901         }
1902       if (cat >= 0)
1903         return fcd->coding_category_system[cat];
1904       else
1905         return Fget_coding_system (Qraw_text);
1906     }
1907 }
1908
1909 /* Given a seekable read stream and potential coding system and EOL type
1910    as specified, do any autodetection that is called for.  If the
1911    coding system and/or EOL type are not `autodetect', they will be left
1912    alone; but this function will never return an autodetect coding system
1913    or EOL type.
1914
1915    This function does not automatically fetch subsidiary coding systems;
1916    that should be unnecessary with the explicit eol-type argument. */
1917
1918 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1919 /* number of leading lines to check for a coding cookie */
1920 #define LINES_TO_CHECK 2
1921
1922 void
1923 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1924                               eol_type_t *eol_type_in_out)
1925 {
1926   struct detection_state decst;
1927
1928   if (*eol_type_in_out == EOL_AUTODETECT)
1929     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1930
1931   xzero (decst);
1932   decst.eol_type = *eol_type_in_out;
1933   decst.mask = ~0;
1934
1935   /* If autodetection is called for, do it now. */
1936   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1937       || *eol_type_in_out == EOL_AUTODETECT)
1938     {
1939       Extbyte buf[4096];
1940       Lisp_Object coding_system = Qnil;
1941       Extbyte *p;
1942       Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1943       Extbyte *scan_end;
1944       int lines_checked = 0;
1945
1946       /* Look for initial "-*-"; mode line prefix */
1947       for (p = buf,
1948              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1949            p <= scan_end
1950              && lines_checked < LINES_TO_CHECK;
1951            p++)
1952         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1953           {
1954             Extbyte *local_vars_beg = p + 3;
1955             /* Look for final "-*-"; mode line suffix */
1956             for (p = local_vars_beg,
1957                    scan_end = buf + nread - LENGTH ("-*-");
1958                  p <= scan_end
1959                    && lines_checked < LINES_TO_CHECK;
1960                  p++)
1961               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1962                 {
1963                   Extbyte *suffix = p;
1964                   /* Look for "coding:" */
1965                   for (p = local_vars_beg,
1966                          scan_end = suffix - LENGTH ("coding:?");
1967                        p <= scan_end;
1968                        p++)
1969                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1970                         && (p == local_vars_beg
1971                             || (*(p-1) == ' '  ||
1972                                 *(p-1) == '\t' ||
1973                                 *(p-1) == ';')))
1974                       {
1975                         Extbyte save;
1976                         int n;
1977                         p += LENGTH ("coding:");
1978                         while (*p == ' ' || *p == '\t') p++;
1979
1980                         /* Get coding system name */
1981                         save = *suffix; *suffix = '\0';
1982                         /* Characters valid in a MIME charset name (rfc 1521),
1983                            and in a Lisp symbol name. */
1984                         n = strspn ( (char *) p,
1985                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1986                                     "abcdefghijklmnopqrstuvwxyz"
1987                                     "0123456789"
1988                                     "!$%&*+-.^_{|}~");
1989                         *suffix = save;
1990                         if (n > 0)
1991                           {
1992                             save = p[n]; p[n] = '\0';
1993                             coding_system =
1994                               Ffind_coding_system (intern ((char *) p));
1995                             p[n] = save;
1996                           }
1997                         break;
1998                       }
1999                   break;
2000                 }
2001               /* #### file must use standard EOLs or we miss 2d line */
2002               /* #### not to mention this is broken for UTF-16 DOS files */
2003               else if (*p == '\n' || *p == '\r')
2004                 {
2005                   lines_checked++;
2006                   /* skip past multibyte (DOS) newline */
2007                   if (*p == '\r' && *(p+1) == '\n') p++;
2008                 }
2009             break;
2010           }
2011         /* #### file must use standard EOLs or we miss 2d line */
2012         /* #### not to mention this is broken for UTF-16 DOS files */
2013         else if (*p == '\n' || *p == '\r')
2014           {
2015             lines_checked++;
2016             /* skip past multibyte (DOS) newline */
2017             if (*p == '\r' && *(p+1) == '\n') p++;
2018           }
2019
2020       if (NILP (coding_system))
2021         do
2022           {
2023             if (detect_coding_type (&decst, buf, nread,
2024                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
2025                                     != CODESYS_AUTODETECT))
2026               break;
2027             nread = Lstream_read (stream, buf, sizeof (buf));
2028             if (nread == 0)
2029               break;
2030           }
2031         while (1);
2032
2033       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2034                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2035         do
2036           {
2037             if (detect_coding_type (&decst, buf, nread, 1))
2038               break;
2039             nread = Lstream_read (stream, buf, sizeof (buf));
2040             if (!nread)
2041               break;
2042           }
2043         while (1);
2044
2045       *eol_type_in_out = decst.eol_type;
2046       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2047         {
2048           if (NILP (coding_system))
2049             *codesys_in_out = coding_system_from_mask (decst.mask);
2050           else
2051             *codesys_in_out = coding_system;
2052         }
2053     }
2054
2055   /* If we absolutely can't determine the EOL type, just assume LF. */
2056   if (*eol_type_in_out == EOL_AUTODETECT)
2057     *eol_type_in_out = EOL_LF;
2058
2059   Lstream_rewind (stream);
2060 }
2061
2062 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2063 Detect coding system of the text in the region between START and END.
2064 Return a list of possible coding systems ordered by priority.
2065 If only ASCII characters are found, return 'undecided or one of
2066 its subsidiary coding systems according to a detected end-of-line
2067 type.  Optional arg BUFFER defaults to the current buffer.
2068 */
2069        (start, end, buffer))
2070 {
2071   Lisp_Object val = Qnil;
2072   struct buffer *buf = decode_buffer (buffer, 0);
2073   Bufpos b, e;
2074   Lisp_Object instream, lb_instream;
2075   Lstream *istr, *lb_istr;
2076   struct detection_state decst;
2077   struct gcpro gcpro1, gcpro2;
2078
2079   get_buffer_range_char (buf, start, end, &b, &e, 0);
2080   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2081   lb_istr = XLSTREAM (lb_instream);
2082   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2083   istr = XLSTREAM (instream);
2084   GCPRO2 (instream, lb_instream);
2085   xzero (decst);
2086   decst.eol_type = EOL_AUTODETECT;
2087   decst.mask = ~0;
2088   while (1)
2089     {
2090       Extbyte random_buffer[4096];
2091       Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2092
2093       if (!nread)
2094         break;
2095       if (detect_coding_type (&decst, random_buffer, nread, 0))
2096         break;
2097     }
2098
2099   if (decst.mask == ~0)
2100     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2101                                     decst.eol_type);
2102   else
2103     {
2104       int i;
2105
2106       val = Qnil;
2107 #ifdef MULE
2108       decst.mask = postprocess_iso2022_mask (decst.mask);
2109 #endif
2110       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2111         {
2112           int sys = fcd->coding_category_by_priority[i];
2113           if (decst.mask & (1 << sys))
2114             {
2115               Lisp_Object codesys = fcd->coding_category_system[sys];
2116               if (!NILP (codesys))
2117                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2118               val = Fcons (codesys, val);
2119             }
2120         }
2121     }
2122   Lstream_close (istr);
2123   UNGCPRO;
2124   Lstream_delete (istr);
2125   Lstream_delete (lb_istr);
2126   return val;
2127 }
2128
2129 \f
2130 /************************************************************************/
2131 /*           Converting to internal Mule format ("decoding")            */
2132 /************************************************************************/
2133
2134 /* A decoding stream is a stream used for decoding text (i.e.
2135    converting from some external format to internal format).
2136    The decoding-stream object keeps track of the actual coding
2137    stream, the stream that is at the other end, and data that
2138    needs to be persistent across the lifetime of the stream. */
2139
2140 /* Handle the EOL stuff related to just-read-in character C.
2141    EOL_TYPE is the EOL type of the coding stream.
2142    FLAGS is the current value of FLAGS in the coding stream, and may
2143    be modified by this macro.  (The macro only looks at the
2144    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2145    bytes are to be written.  You need to also define a local goto
2146    label "label_continue_loop" that is at the end of the main
2147    character-reading loop.
2148
2149    If C is a CR character, then this macro handles it entirely and
2150    jumps to label_continue_loop.  Otherwise, this macro does not add
2151    anything to DST, and continues normally.  You should continue
2152    processing C normally after this macro. */
2153
2154 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2155 do {                                                            \
2156   if (c == '\r')                                                \
2157     {                                                           \
2158       if (eol_type == EOL_CR)                                   \
2159         Dynarr_add (dst, '\n');                                 \
2160       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2161         Dynarr_add (dst, c);                                    \
2162       else                                                      \
2163         flags |= CODING_STATE_CR;                               \
2164       goto label_continue_loop;                                 \
2165     }                                                           \
2166   else if (flags & CODING_STATE_CR)                             \
2167     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2168       if (c != '\n')                                            \
2169         Dynarr_add (dst, '\r');                                 \
2170       flags &= ~CODING_STATE_CR;                                \
2171     }                                                           \
2172 } while (0)
2173
2174 /* C should be a binary character in the range 0 - 255; convert
2175    to internal format and add to Dynarr DST. */
2176
2177 #ifdef UTF2000
2178 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2179 do {                                            \
2180   if (BYTE_ASCII_P (c))                         \
2181     Dynarr_add (dst, c);                        \
2182   else                                          \
2183     {                                           \
2184       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2185       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2186     }                                           \
2187 } while (0)
2188
2189 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2190 INLINE_HEADER void
2191 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2192 {
2193   if ( c <= 0x7f )
2194     {
2195       Dynarr_add (dst, c);
2196     }
2197   else if ( c <= 0x7ff )
2198     {
2199       Dynarr_add (dst, (c >> 6) | 0xc0);
2200       Dynarr_add (dst, (c & 0x3f) | 0x80);
2201     }
2202   else if ( c <= 0xffff )
2203     {
2204       Dynarr_add (dst,  (c >> 12) | 0xe0);
2205       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2206       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2207     }
2208   else if ( c <= 0x1fffff )
2209     {
2210       Dynarr_add (dst,  (c >> 18) | 0xf0);
2211       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2212       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2213       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2214     }
2215   else if ( c <= 0x3ffffff )
2216     {
2217       Dynarr_add (dst,  (c >> 24) | 0xf8);
2218       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2219       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2220       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2221       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2222     }
2223   else
2224     {
2225       Dynarr_add (dst,  (c >> 30) | 0xfc);
2226       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2227       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2228       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2229       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2230       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2231     }
2232 }
2233 #else
2234 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2235 do {                                            \
2236   if (BYTE_ASCII_P (c))                         \
2237     Dynarr_add (dst, c);                        \
2238   else if (BYTE_C1_P (c))                       \
2239     {                                           \
2240       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2241       Dynarr_add (dst, c + 0x20);               \
2242     }                                           \
2243   else                                          \
2244     {                                           \
2245       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2246       Dynarr_add (dst, c);                      \
2247     }                                           \
2248 } while (0)
2249 #endif
2250
2251 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2252 do {                                    \
2253   if (ch)                               \
2254     {                                   \
2255       DECODE_ADD_BINARY_CHAR (ch, dst); \
2256       ch = 0;                           \
2257     }                                   \
2258 } while (0)
2259
2260 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2261 do {                                    \
2262   if (flags & CODING_STATE_END)         \
2263     {                                   \
2264       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2265       if (flags & CODING_STATE_CR)      \
2266         Dynarr_add (dst, '\r');         \
2267     }                                   \
2268 } while (0)
2269
2270 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2271
2272 #define ER_BUF_SIZE 24
2273
2274 struct decoding_stream
2275 {
2276   /* Coding system that governs the conversion. */
2277   Lisp_Coding_System *codesys;
2278
2279   /* Stream that we read the encoded data from or
2280      write the decoded data to. */
2281   Lstream *other_end;
2282
2283   /* If we are reading, then we can return only a fixed amount of
2284      data, so if the conversion resulted in too much data, we store it
2285      here for retrieval the next time around. */
2286   unsigned_char_dynarr *runoff;
2287
2288   /* FLAGS holds flags indicating the current state of the decoding.
2289      Some of these flags are dependent on the coding system. */
2290   unsigned int flags;
2291
2292   /* CPOS holds a partially built-up code-point of character. */
2293   unsigned int cpos;
2294
2295   /* EOL_TYPE specifies the type of end-of-line conversion that
2296      currently applies.  We need to keep this separate from the
2297      EOL type stored in CODESYS because the latter might indicate
2298      automatic EOL-type detection while the former will always
2299      indicate a particular EOL type. */
2300   eol_type_t eol_type;
2301 #ifdef MULE
2302   /* Additional ISO2022 information.  We define the structure above
2303      because it's also needed by the detection routines. */
2304   struct iso2022_decoder iso2022;
2305
2306   /* Additional information (the state of the running CCL program)
2307      used by the CCL decoder. */
2308   struct ccl_program ccl;
2309
2310   /* counter for UTF-8 or UCS-4 */
2311   unsigned char counter;
2312 #endif
2313 #ifdef UTF2000
2314   unsigned char er_counter;
2315   unsigned char er_buf[ER_BUF_SIZE];
2316
2317   unsigned combined_char_count;
2318   Emchar combined_chars[16];
2319   Lisp_Object combining_table;
2320 #endif
2321   struct detection_state decst;
2322 };
2323
2324 static Lstream_data_count decoding_reader (Lstream *stream,
2325                                 unsigned char *data, Lstream_data_count size);
2326 static Lstream_data_count decoding_writer (Lstream *stream,
2327                                 const unsigned char *data, Lstream_data_count size);
2328 static int decoding_rewinder   (Lstream *stream);
2329 static int decoding_seekable_p (Lstream *stream);
2330 static int decoding_flusher    (Lstream *stream);
2331 static int decoding_closer     (Lstream *stream);
2332
2333 static Lisp_Object decoding_marker (Lisp_Object stream);
2334
2335 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2336                                sizeof (struct decoding_stream));
2337
2338 static Lisp_Object
2339 decoding_marker (Lisp_Object stream)
2340 {
2341   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2342   Lisp_Object str_obj;
2343
2344   /* We do not need to mark the coding systems or charsets stored
2345      within the stream because they are stored in a global list
2346      and automatically marked. */
2347
2348   XSETLSTREAM (str_obj, str);
2349   mark_object (str_obj);
2350   if (str->imp->marker)
2351     return (str->imp->marker) (str_obj);
2352   else
2353     return Qnil;
2354 }
2355
2356 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2357    so we read data from the other end, decode it, and store it into DATA. */
2358
2359 static Lstream_data_count
2360 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2361 {
2362   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2363   unsigned char *orig_data = data;
2364   Lstream_data_count read_size;
2365   int error_occurred = 0;
2366
2367   /* We need to interface to mule_decode(), which expects to take some
2368      amount of data and store the result into a Dynarr.  We have
2369      mule_decode() store into str->runoff, and take data from there
2370      as necessary. */
2371
2372   /* We loop until we have enough data, reading chunks from the other
2373      end and decoding it. */
2374   while (1)
2375     {
2376       /* Take data from the runoff if we can.  Make sure to take at
2377          most SIZE bytes, and delete the data from the runoff. */
2378       if (Dynarr_length (str->runoff) > 0)
2379         {
2380           Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2381           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2382           Dynarr_delete_many (str->runoff, 0, chunk);
2383           data += chunk;
2384           size -= chunk;
2385         }
2386
2387       if (size == 0)
2388         break; /* No more room for data */
2389
2390       if (str->flags & CODING_STATE_END)
2391         /* This means that on the previous iteration, we hit the EOF on
2392            the other end.  We loop once more so that mule_decode() can
2393            output any final stuff it may be holding, or any "go back
2394            to a sane state" escape sequences. (This latter makes sense
2395            during encoding.) */
2396         break;
2397
2398       /* Exhausted the runoff, so get some more.  DATA has at least
2399          SIZE bytes left of storage in it, so it's OK to read directly
2400          into it.  (We'll be overwriting above, after we've decoded it
2401          into the runoff.) */
2402       read_size = Lstream_read (str->other_end, data, size);
2403       if (read_size < 0)
2404         {
2405           error_occurred = 1;
2406           break;
2407         }
2408       if (read_size == 0)
2409         /* There might be some more end data produced in the translation.
2410            See the comment above. */
2411         str->flags |= CODING_STATE_END;
2412       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2413     }
2414
2415   if (data - orig_data == 0)
2416     return error_occurred ? -1 : 0;
2417   else
2418     return data - orig_data;
2419 }
2420
2421 static Lstream_data_count
2422 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2423 {
2424   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2425   Lstream_data_count retval;
2426
2427   /* Decode all our data into the runoff, and then attempt to write
2428      it all out to the other end.  Remove whatever chunk we succeeded
2429      in writing. */
2430   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2431   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2432                           Dynarr_length (str->runoff));
2433   if (retval > 0)
2434     Dynarr_delete_many (str->runoff, 0, retval);
2435   /* Do NOT return retval.  The return value indicates how much
2436      of the incoming data was written, not how many bytes were
2437      written. */
2438   return size;
2439 }
2440
2441 static void
2442 reset_decoding_stream (struct decoding_stream *str)
2443 {
2444 #ifdef MULE
2445   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2446     {
2447       Lisp_Object coding_system;
2448       XSETCODING_SYSTEM (coding_system, str->codesys);
2449       reset_iso2022 (coding_system, &str->iso2022);
2450     }
2451   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2452     {
2453       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2454     }
2455   str->counter = 0;
2456 #endif /* MULE */
2457 #ifdef UTF2000
2458   str->er_counter = 0;
2459   str->combined_char_count = 0;
2460   str->combining_table = Qnil;
2461 #endif
2462   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2463       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2464     {
2465       xzero (str->decst);
2466       str->decst.eol_type = EOL_AUTODETECT;
2467       str->decst.mask = ~0;
2468     }
2469   str->flags = str->cpos = 0;
2470 }
2471
2472 static int
2473 decoding_rewinder (Lstream *stream)
2474 {
2475   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2476   reset_decoding_stream (str);
2477   Dynarr_reset (str->runoff);
2478   return Lstream_rewind (str->other_end);
2479 }
2480
2481 static int
2482 decoding_seekable_p (Lstream *stream)
2483 {
2484   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2485   return Lstream_seekable_p (str->other_end);
2486 }
2487
2488 static int
2489 decoding_flusher (Lstream *stream)
2490 {
2491   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2492   return Lstream_flush (str->other_end);
2493 }
2494
2495 static int
2496 decoding_closer (Lstream *stream)
2497 {
2498   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2499   if (stream->flags & LSTREAM_FL_WRITE)
2500     {
2501       str->flags |= CODING_STATE_END;
2502       decoding_writer (stream, 0, 0);
2503     }
2504   Dynarr_free (str->runoff);
2505 #ifdef MULE
2506 #ifdef ENABLE_COMPOSITE_CHARS
2507   if (str->iso2022.composite_chars)
2508     Dynarr_free (str->iso2022.composite_chars);
2509 #endif
2510 #endif
2511   return Lstream_close (str->other_end);
2512 }
2513
2514 Lisp_Object
2515 decoding_stream_coding_system (Lstream *stream)
2516 {
2517   Lisp_Object coding_system;
2518   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2519
2520   XSETCODING_SYSTEM (coding_system, str->codesys);
2521   return subsidiary_coding_system (coding_system, str->eol_type);
2522 }
2523
2524 void
2525 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2526 {
2527   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2528   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2529   str->codesys = cs;
2530   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2531     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2532   reset_decoding_stream (str);
2533 }
2534
2535 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2536    stream for writing, no automatic code detection will be performed.
2537    The reason for this is that automatic code detection requires a
2538    seekable input.  Things will also fail if you open a decoding
2539    stream for reading using a non-fully-specified coding system and
2540    a non-seekable input stream. */
2541
2542 static Lisp_Object
2543 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2544                         const char *mode)
2545 {
2546   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2547   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2548   Lisp_Object obj;
2549
2550   xzero (*str);
2551   str->other_end = stream;
2552   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2553   str->eol_type = EOL_AUTODETECT;
2554   if (!strcmp (mode, "r")
2555       && Lstream_seekable_p (stream))
2556     /* We can determine the coding system now. */
2557     determine_real_coding_system (stream, &codesys, &str->eol_type);
2558   set_decoding_stream_coding_system (lstr, codesys);
2559   str->decst.eol_type = str->eol_type;
2560   str->decst.mask = ~0;
2561   XSETLSTREAM (obj, lstr);
2562   return obj;
2563 }
2564
2565 Lisp_Object
2566 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2567 {
2568   return make_decoding_stream_1 (stream, codesys, "r");
2569 }
2570
2571 Lisp_Object
2572 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2573 {
2574   return make_decoding_stream_1 (stream, codesys, "w");
2575 }
2576
2577 /* Note: the decode_coding_* functions all take the same
2578    arguments as mule_decode(), which is to say some SRC data of
2579    size N, which is to be stored into dynamic array DST.
2580    DECODING is the stream within which the decoding is
2581    taking place, but no data is actually read from or
2582    written to that stream; that is handled in decoding_reader()
2583    or decoding_writer().  This allows the same functions to
2584    be used for both reading and writing. */
2585
2586 static void
2587 mule_decode (Lstream *decoding, const Extbyte *src,
2588              unsigned_char_dynarr *dst, Lstream_data_count n)
2589 {
2590   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2591
2592   /* If necessary, do encoding-detection now.  We do this when
2593      we're a writing stream or a non-seekable reading stream,
2594      meaning that we can't just process the whole input,
2595      rewind, and start over. */
2596
2597   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2598       str->eol_type == EOL_AUTODETECT)
2599     {
2600       Lisp_Object codesys;
2601
2602       XSETCODING_SYSTEM (codesys, str->codesys);
2603       detect_coding_type (&str->decst, src, n,
2604                           CODING_SYSTEM_TYPE (str->codesys) !=
2605                           CODESYS_AUTODETECT);
2606       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2607           str->decst.mask != ~0)
2608         /* #### This is cheesy.  What we really ought to do is
2609            buffer up a certain amount of data so as to get a
2610            less random result. */
2611         codesys = coding_system_from_mask (str->decst.mask);
2612       str->eol_type = str->decst.eol_type;
2613       if (XCODING_SYSTEM (codesys) != str->codesys)
2614         {
2615           /* Preserve the CODING_STATE_END flag in case it was set.
2616              If we erase it, bad things might happen. */
2617           int was_end = str->flags & CODING_STATE_END;
2618           set_decoding_stream_coding_system (decoding, codesys);
2619           if (was_end)
2620             str->flags |= CODING_STATE_END;
2621         }
2622     }
2623
2624   switch (CODING_SYSTEM_TYPE (str->codesys))
2625     {
2626 #ifdef DEBUG_XEMACS
2627     case CODESYS_INTERNAL:
2628       Dynarr_add_many (dst, src, n);
2629       break;
2630 #endif
2631     case CODESYS_AUTODETECT:
2632       /* If we got this far and still haven't decided on the coding
2633          system, then do no conversion. */
2634     case CODESYS_NO_CONVERSION:
2635       decode_coding_no_conversion (decoding, src, dst, n);
2636       break;
2637 #ifdef MULE
2638     case CODESYS_SHIFT_JIS:
2639       decode_coding_sjis (decoding, src, dst, n);
2640       break;
2641     case CODESYS_BIG5:
2642       decode_coding_big5 (decoding, src, dst, n);
2643       break;
2644     case CODESYS_UCS4:
2645       decode_coding_ucs4 (decoding, src, dst, n);
2646       break;
2647     case CODESYS_UTF16:
2648       decode_coding_utf16 (decoding, src, dst, n);
2649       break;
2650     case CODESYS_UTF8:
2651       decode_coding_utf8 (decoding, src, dst, n);
2652       break;
2653     case CODESYS_CCL:
2654       str->ccl.last_block = str->flags & CODING_STATE_END;
2655       /* When applying ccl program to stream, MUST NOT set NULL
2656          pointer to src.  */
2657       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2658                   dst, n, 0, CCL_MODE_DECODING);
2659       break;
2660     case CODESYS_ISO2022:
2661       decode_coding_iso2022 (decoding, src, dst, n);
2662       break;
2663 #endif /* MULE */
2664     default:
2665       abort ();
2666     }
2667 }
2668
2669 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2670 Decode the text between START and END which is encoded in CODING-SYSTEM.
2671 This is useful if you've read in encoded text from a file without decoding
2672 it (e.g. you read in a JIS-formatted file but used the `binary' or
2673 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2674 Return length of decoded text.
2675 BUFFER defaults to the current buffer if unspecified.
2676 */
2677        (start, end, coding_system, buffer))
2678 {
2679   Bufpos b, e;
2680   struct buffer *buf = decode_buffer (buffer, 0);
2681   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2682   Lstream *istr, *ostr;
2683   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2684
2685   get_buffer_range_char (buf, start, end, &b, &e, 0);
2686
2687   barf_if_buffer_read_only (buf, b, e);
2688
2689   coding_system = Fget_coding_system (coding_system);
2690   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2691   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2692   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2693                                               coding_system);
2694   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2695                                            Fget_coding_system (Qbinary));
2696   istr = XLSTREAM (instream);
2697   ostr = XLSTREAM (outstream);
2698   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2699
2700   /* The chain of streams looks like this:
2701
2702      [BUFFER] <----- send through
2703                      ------> [ENCODE AS BINARY]
2704                              ------> [DECODE AS SPECIFIED]
2705                                      ------> [BUFFER]
2706    */
2707
2708   while (1)
2709     {
2710       char tempbuf[1024]; /* some random amount */
2711       Bufpos newpos, even_newer_pos;
2712       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2713       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2714
2715       if (!size_in_bytes)
2716         break;
2717       newpos = lisp_buffer_stream_startpos (istr);
2718       Lstream_write (ostr, tempbuf, size_in_bytes);
2719       even_newer_pos = lisp_buffer_stream_startpos (istr);
2720       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2721                            even_newer_pos, 0);
2722     }
2723   Lstream_close (istr);
2724   Lstream_close (ostr);
2725   UNGCPRO;
2726   Lstream_delete (istr);
2727   Lstream_delete (ostr);
2728   Lstream_delete (XLSTREAM (de_outstream));
2729   Lstream_delete (XLSTREAM (lb_outstream));
2730   return Qnil;
2731 }
2732
2733 \f
2734 /************************************************************************/
2735 /*           Converting to an external encoding ("encoding")            */
2736 /************************************************************************/
2737
2738 /* An encoding stream is an output stream.  When you create the
2739    stream, you specify the coding system that governs the encoding
2740    and another stream that the resulting encoded data is to be
2741    sent to, and then start sending data to it. */
2742
2743 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2744
2745 struct encoding_stream
2746 {
2747   /* Coding system that governs the conversion. */
2748   Lisp_Coding_System *codesys;
2749
2750   /* Stream that we read the encoded data from or
2751      write the decoded data to. */
2752   Lstream *other_end;
2753
2754   /* If we are reading, then we can return only a fixed amount of
2755      data, so if the conversion resulted in too much data, we store it
2756      here for retrieval the next time around. */
2757   unsigned_char_dynarr *runoff;
2758
2759   /* FLAGS holds flags indicating the current state of the encoding.
2760      Some of these flags are dependent on the coding system. */
2761   unsigned int flags;
2762
2763   /* CH holds a partially built-up character.  Since we only deal
2764      with one- and two-byte characters at the moment, we only use
2765      this to store the first byte of a two-byte character. */
2766   unsigned int ch;
2767 #ifdef MULE
2768   /* Additional information used by the ISO2022 encoder. */
2769   struct
2770     {
2771       /* CHARSET holds the character sets currently assigned to the G0
2772          through G3 registers.  It is initialized from the array
2773          INITIAL_CHARSET in CODESYS. */
2774       Lisp_Object charset[4];
2775
2776       /* Which registers are currently invoked into the left (GL) and
2777          right (GR) halves of the 8-bit encoding space? */
2778       int register_left, register_right;
2779
2780       /* Whether we need to explicitly designate the charset in the
2781          G? register before using it.  It is initialized from the
2782          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2783       unsigned char force_charset_on_output[4];
2784
2785       /* Other state variables that need to be preserved across
2786          invocations. */
2787       Lisp_Object current_charset;
2788       int current_half;
2789       int current_char_boundary;
2790     } iso2022;
2791
2792   void (*encode_char) (struct encoding_stream *str, Emchar c,
2793                        unsigned_char_dynarr *dst, unsigned int *flags);
2794   void (*finish) (struct encoding_stream *str,
2795                   unsigned_char_dynarr *dst, unsigned int *flags);
2796
2797   /* Additional information (the state of the running CCL program)
2798      used by the CCL encoder. */
2799   struct ccl_program ccl;
2800 #endif /* MULE */
2801 };
2802
2803 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2804 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2805                                 Lstream_data_count size);
2806 static int encoding_rewinder   (Lstream *stream);
2807 static int encoding_seekable_p (Lstream *stream);
2808 static int encoding_flusher    (Lstream *stream);
2809 static int encoding_closer     (Lstream *stream);
2810
2811 static Lisp_Object encoding_marker (Lisp_Object stream);
2812
2813 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2814                                sizeof (struct encoding_stream));
2815
2816 static Lisp_Object
2817 encoding_marker (Lisp_Object stream)
2818 {
2819   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2820   Lisp_Object str_obj;
2821
2822   /* We do not need to mark the coding systems or charsets stored
2823      within the stream because they are stored in a global list
2824      and automatically marked. */
2825
2826   XSETLSTREAM (str_obj, str);
2827   mark_object (str_obj);
2828   if (str->imp->marker)
2829     return (str->imp->marker) (str_obj);
2830   else
2831     return Qnil;
2832 }
2833
2834 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2835    so we read data from the other end, encode it, and store it into DATA. */
2836
2837 static Lstream_data_count
2838 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2839 {
2840   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2841   unsigned char *orig_data = data;
2842   Lstream_data_count read_size;
2843   int error_occurred = 0;
2844
2845   /* We need to interface to mule_encode(), which expects to take some
2846      amount of data and store the result into a Dynarr.  We have
2847      mule_encode() store into str->runoff, and take data from there
2848      as necessary. */
2849
2850   /* We loop until we have enough data, reading chunks from the other
2851      end and encoding it. */
2852   while (1)
2853     {
2854       /* Take data from the runoff if we can.  Make sure to take at
2855          most SIZE bytes, and delete the data from the runoff. */
2856       if (Dynarr_length (str->runoff) > 0)
2857         {
2858           int chunk = min ((int) size, Dynarr_length (str->runoff));
2859           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2860           Dynarr_delete_many (str->runoff, 0, chunk);
2861           data += chunk;
2862           size -= chunk;
2863         }
2864
2865       if (size == 0)
2866         break; /* No more room for data */
2867
2868       if (str->flags & CODING_STATE_END)
2869         /* This means that on the previous iteration, we hit the EOF on
2870            the other end.  We loop once more so that mule_encode() can
2871            output any final stuff it may be holding, or any "go back
2872            to a sane state" escape sequences. (This latter makes sense
2873            during encoding.) */
2874         break;
2875
2876       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2877          left of storage in it, so it's OK to read directly into it.
2878          (We'll be overwriting above, after we've encoded it into the
2879          runoff.) */
2880       read_size = Lstream_read (str->other_end, data, size);
2881       if (read_size < 0)
2882         {
2883           error_occurred = 1;
2884           break;
2885         }
2886       if (read_size == 0)
2887         /* There might be some more end data produced in the translation.
2888            See the comment above. */
2889         str->flags |= CODING_STATE_END;
2890       mule_encode (stream, data, str->runoff, read_size);
2891     }
2892
2893   if (data == orig_data)
2894     return error_occurred ? -1 : 0;
2895   else
2896     return data - orig_data;
2897 }
2898
2899 static Lstream_data_count
2900 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2901 {
2902   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2903   Lstream_data_count retval;
2904
2905   /* Encode all our data into the runoff, and then attempt to write
2906      it all out to the other end.  Remove whatever chunk we succeeded
2907      in writing. */
2908   mule_encode (stream, data, str->runoff, size);
2909   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2910                           Dynarr_length (str->runoff));
2911   if (retval > 0)
2912     Dynarr_delete_many (str->runoff, 0, retval);
2913   /* Do NOT return retval.  The return value indicates how much
2914      of the incoming data was written, not how many bytes were
2915      written. */
2916   return size;
2917 }
2918
2919 static void
2920 reset_encoding_stream (struct encoding_stream *str)
2921 {
2922 #ifdef MULE
2923   switch (CODING_SYSTEM_TYPE (str->codesys))
2924     {
2925     case CODESYS_ISO2022:
2926       {
2927         int i;
2928
2929         str->encode_char = &char_encode_iso2022;
2930         str->finish = &char_finish_iso2022;
2931         for (i = 0; i < 4; i++)
2932           {
2933             str->iso2022.charset[i] =
2934               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2935             str->iso2022.force_charset_on_output[i] =
2936               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2937           }
2938         str->iso2022.register_left = 0;
2939         str->iso2022.register_right = 1;
2940         str->iso2022.current_charset = Qnil;
2941         str->iso2022.current_half = 0;
2942         break;
2943       }
2944     case CODESYS_CCL:
2945       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2946       break;
2947     case CODESYS_UTF8:
2948       str->encode_char = &char_encode_utf8;
2949       str->finish = &char_finish_utf8;
2950       break;
2951     case CODESYS_UTF16:
2952       str->encode_char = &char_encode_utf16;
2953       str->finish = &char_finish_utf16;
2954       break;
2955     case CODESYS_UCS4:
2956       str->encode_char = &char_encode_ucs4;
2957       str->finish = &char_finish_ucs4;
2958       break;
2959     case CODESYS_SHIFT_JIS:
2960       str->encode_char = &char_encode_shift_jis;
2961       str->finish = &char_finish_shift_jis;
2962       break;
2963     case CODESYS_BIG5:
2964       str->encode_char = &char_encode_big5;
2965       str->finish = &char_finish_big5;
2966       break;
2967     default:
2968       break;
2969     }
2970 #endif /* MULE */
2971   str->iso2022.current_char_boundary = 0;
2972   str->flags = str->ch = 0;
2973 }
2974
2975 static int
2976 encoding_rewinder (Lstream *stream)
2977 {
2978   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2979   reset_encoding_stream (str);
2980   Dynarr_reset (str->runoff);
2981   return Lstream_rewind (str->other_end);
2982 }
2983
2984 static int
2985 encoding_seekable_p (Lstream *stream)
2986 {
2987   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2988   return Lstream_seekable_p (str->other_end);
2989 }
2990
2991 static int
2992 encoding_flusher (Lstream *stream)
2993 {
2994   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2995   return Lstream_flush (str->other_end);
2996 }
2997
2998 static int
2999 encoding_closer (Lstream *stream)
3000 {
3001   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3002   if (stream->flags & LSTREAM_FL_WRITE)
3003     {
3004       str->flags |= CODING_STATE_END;
3005       encoding_writer (stream, 0, 0);
3006     }
3007   Dynarr_free (str->runoff);
3008   return Lstream_close (str->other_end);
3009 }
3010
3011 Lisp_Object
3012 encoding_stream_coding_system (Lstream *stream)
3013 {
3014   Lisp_Object coding_system;
3015   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3016
3017   XSETCODING_SYSTEM (coding_system, str->codesys);
3018   return coding_system;
3019 }
3020
3021 void
3022 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3023 {
3024   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3025   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3026   str->codesys = cs;
3027   reset_encoding_stream (str);
3028 }
3029
3030 static Lisp_Object
3031 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3032                         const char *mode)
3033 {
3034   Lstream *lstr = Lstream_new (lstream_encoding, mode);
3035   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3036   Lisp_Object obj;
3037
3038   xzero (*str);
3039   str->runoff = Dynarr_new (unsigned_char);
3040   str->other_end = stream;
3041   set_encoding_stream_coding_system (lstr, codesys);
3042   XSETLSTREAM (obj, lstr);
3043   return obj;
3044 }
3045
3046 Lisp_Object
3047 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3048 {
3049   return make_encoding_stream_1 (stream, codesys, "r");
3050 }
3051
3052 Lisp_Object
3053 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3054 {
3055   return make_encoding_stream_1 (stream, codesys, "w");
3056 }
3057
3058 /* Convert N bytes of internally-formatted data stored in SRC to an
3059    external format, according to the encoding stream ENCODING.
3060    Store the encoded data into DST. */
3061
3062 static void
3063 mule_encode (Lstream *encoding, const Bufbyte *src,
3064              unsigned_char_dynarr *dst, Lstream_data_count n)
3065 {
3066   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3067
3068   switch (CODING_SYSTEM_TYPE (str->codesys))
3069     {
3070 #ifdef DEBUG_XEMACS
3071     case CODESYS_INTERNAL:
3072       Dynarr_add_many (dst, src, n);
3073       break;
3074 #endif
3075     case CODESYS_AUTODETECT:
3076       /* If we got this far and still haven't decided on the coding
3077          system, then do no conversion. */
3078     case CODESYS_NO_CONVERSION:
3079       encode_coding_no_conversion (encoding, src, dst, n);
3080       break;
3081 #ifdef MULE
3082     case CODESYS_CCL:
3083       str->ccl.last_block = str->flags & CODING_STATE_END;
3084       /* When applying ccl program to stream, MUST NOT set NULL
3085          pointer to src.  */
3086       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3087                   dst, n, 0, CCL_MODE_ENCODING);
3088       break;
3089 #endif /* MULE */
3090     default:
3091       text_encode_generic (encoding, src, dst, n);
3092     }
3093 }
3094
3095 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3096 Encode the text between START and END using CODING-SYSTEM.
3097 This will, for example, convert Japanese characters into stuff such as
3098 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3099 text.  BUFFER defaults to the current buffer if unspecified.
3100 */
3101        (start, end, coding_system, buffer))
3102 {
3103   Bufpos b, e;
3104   struct buffer *buf = decode_buffer (buffer, 0);
3105   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3106   Lstream *istr, *ostr;
3107   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3108
3109   get_buffer_range_char (buf, start, end, &b, &e, 0);
3110
3111   barf_if_buffer_read_only (buf, b, e);
3112
3113   coding_system = Fget_coding_system (coding_system);
3114   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3115   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3116   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3117                                               Fget_coding_system (Qbinary));
3118   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3119                                            coding_system);
3120   istr = XLSTREAM (instream);
3121   ostr = XLSTREAM (outstream);
3122   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3123   /* The chain of streams looks like this:
3124
3125      [BUFFER] <----- send through
3126                      ------> [ENCODE AS SPECIFIED]
3127                              ------> [DECODE AS BINARY]
3128                                      ------> [BUFFER]
3129    */
3130   while (1)
3131     {
3132       char tempbuf[1024]; /* some random amount */
3133       Bufpos newpos, even_newer_pos;
3134       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3135       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3136
3137       if (!size_in_bytes)
3138         break;
3139       newpos = lisp_buffer_stream_startpos (istr);
3140       Lstream_write (ostr, tempbuf, size_in_bytes);
3141       even_newer_pos = lisp_buffer_stream_startpos (istr);
3142       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3143                            even_newer_pos, 0);
3144     }
3145
3146   {
3147     Charcount retlen =
3148       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3149     Lstream_close (istr);
3150     Lstream_close (ostr);
3151     UNGCPRO;
3152     Lstream_delete (istr);
3153     Lstream_delete (ostr);
3154     Lstream_delete (XLSTREAM (de_outstream));
3155     Lstream_delete (XLSTREAM (lb_outstream));
3156     return make_int (retlen);
3157   }
3158 }
3159
3160 #ifdef MULE
3161 \f
3162 static void
3163 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3164                      unsigned_char_dynarr *dst, Lstream_data_count n)
3165 {
3166   unsigned char c;
3167   unsigned char char_boundary;
3168   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3169   unsigned int flags          = str->flags;
3170   Emchar ch                   = str->ch;
3171
3172   char_boundary = str->iso2022.current_char_boundary;
3173
3174   while (n--)
3175     {
3176       c = *src++;
3177
3178       if (char_boundary == 0)
3179         {
3180           if (c >= 0xfc)
3181             {
3182               ch = c & 0x01;
3183               char_boundary = 5;
3184             }
3185           else if (c >= 0xf8)
3186             {
3187               ch = c & 0x03;
3188               char_boundary = 4;
3189             }
3190           else if (c >= 0xf0)
3191             {
3192               ch = c & 0x07;
3193               char_boundary = 3;
3194             }
3195           else if (c >= 0xe0)
3196             {
3197               ch = c & 0x0f;
3198               char_boundary = 2;
3199             }
3200           else if (c >= 0xc0)
3201             {
3202               ch = c & 0x1f;
3203               char_boundary = 1;
3204             }
3205           else
3206             (*str->encode_char) (str, c, dst, &flags);
3207         }
3208       else if (char_boundary == 1)
3209         {
3210           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3211           ch =0;
3212           char_boundary = 0;
3213         }
3214       else
3215         {
3216           ch = (ch << 6) | (c & 0x3f);
3217           char_boundary--;
3218         }
3219     }
3220
3221   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3222     {
3223       (*str->finish) (str, dst, &flags);
3224     }
3225
3226   str->flags = flags;
3227   str->ch    = ch;
3228   str->iso2022.current_char_boundary = char_boundary;
3229 }
3230
3231 \f
3232 #ifdef UTF2000
3233 /************************************************************************/
3234 /*                          entity reference                            */
3235 /************************************************************************/
3236
3237 INLINE_HEADER void
3238 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3239 INLINE_HEADER void
3240 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3241 {
3242   if ( str->er_counter > 0)
3243     {
3244       Dynarr_add_many (dst, str->er_buf, str->er_counter);
3245       str->er_counter = 0;
3246     }
3247 }
3248
3249 EXFUN (Fregexp_quote, 1);
3250
3251 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3252                          unsigned_char_dynarr* dst);
3253 void
3254 decode_add_er_char (struct decoding_stream *str, Emchar c,
3255                     unsigned_char_dynarr* dst)
3256 {
3257   if (str->er_counter == 0)
3258     {
3259       if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3260           && (c == '&') )
3261         {
3262           str->er_buf[0] = '&';
3263           str->er_counter++;
3264         }
3265       else
3266         DECODE_ADD_UCS_CHAR (c, dst);
3267     }
3268   else if (c == ';')
3269     {
3270       Lisp_Object string = make_string (str->er_buf,
3271                                         str->er_counter);
3272       Lisp_Object rest;
3273       Lisp_Object cell;
3274       Lisp_Object ret;
3275       Lisp_Object pat;
3276       Lisp_Object ccs;
3277       Lisp_Object char_type;
3278       int base;
3279
3280       for ( rest = Vcoded_charset_entity_reference_alist;
3281             !NILP (rest); rest = Fcdr (rest) )
3282         {
3283           cell = Fcar (rest);
3284           ccs = Fcar (cell);
3285           if (CONSP (ccs))
3286             {
3287               char_type = XCDR (ccs);
3288               ccs = XCAR (ccs);
3289             }
3290           else
3291             char_type = Qnil;
3292           if (NILP (ccs = Ffind_charset (ccs)))
3293             continue;
3294
3295           cell = Fcdr (cell);
3296           ret = Fcar (cell);
3297           if (STRINGP (ret))
3298             pat = ret;
3299           else
3300             continue;
3301           pat = Fregexp_quote (pat);
3302
3303           cell = Fcdr (cell);
3304           cell = Fcdr (cell);
3305           ret = Fcar (cell);
3306           if (EQ (ret, Qd))
3307             {
3308               pat = concat3 (build_string ("^&"),
3309                              pat, build_string ("\\([0-9]+\\)$"));
3310               base = 10;
3311             }
3312           else if (EQ (ret, Qx))
3313             {
3314               pat = concat3 (build_string ("^&"),
3315                              pat, build_string ("\\([0-9a-f]+\\)$"));
3316               base = 16;
3317             }
3318           else if (EQ (ret, QX))
3319             {
3320               pat = concat3 (build_string ("^&"),
3321                              pat, build_string ("\\([0-9A-F]+\\)$"));
3322               base = 16;
3323             }
3324           else
3325             continue;
3326
3327           if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3328             {
3329               int code
3330                 = XINT (Fstring_to_number
3331                         (Fsubstring (string,
3332                                      Fmatch_beginning (make_int (1)),
3333                                      Fmatch_end (make_int (1))),
3334                          make_int (base)));
3335               Emchar chr
3336                 = NILP (char_type)
3337                 ? DECODE_CHAR (ccs, code, 0)
3338                 : decode_builtin_char (ccs, code);
3339
3340               DECODE_ADD_UCS_CHAR (chr, dst);
3341               goto decoded;
3342             }
3343         }
3344       if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3345                                 string, Qnil, Qnil)))
3346         {
3347           int code
3348             = XUINT (Fstring_to_number
3349                      (Fsubstring (string,
3350                                   Fmatch_beginning (make_int (1)),
3351                                   Fmatch_end (make_int (1))),
3352                       make_int (16)));
3353
3354           DECODE_ADD_UCS_CHAR (code, dst);
3355         }
3356       else
3357         {
3358           Dynarr_add_many (dst, str->er_buf, str->er_counter);
3359           Dynarr_add (dst, ';');
3360         }
3361     decoded:
3362       str->er_counter = 0;
3363     }
3364   else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3365     {
3366       Dynarr_add_many (dst, str->er_buf, str->er_counter);
3367       str->er_counter = 0;
3368       DECODE_ADD_UCS_CHAR (c, dst);
3369     }
3370   else
3371     str->er_buf[str->er_counter++] = c;
3372 }
3373
3374 void char_encode_as_entity_reference (Emchar ch, char* buf);
3375 void
3376 char_encode_as_entity_reference (Emchar ch, char* buf)
3377 {
3378   Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3379   Lisp_Object cell;
3380   Lisp_Object ccs;
3381   Lisp_Object char_type;
3382   int format_columns, idx;
3383   char format[ER_BUF_SIZE];
3384
3385   while (!NILP (rest))
3386     {
3387       cell = Fcar (rest);
3388       ccs = Fcar (cell);
3389       if (CONSP (ccs))
3390         {
3391           char_type = XCDR (ccs);
3392           ccs = XCAR (ccs);
3393         }
3394       else
3395         char_type = Qnil;
3396       if (!NILP (ccs = Ffind_charset (ccs)))
3397         {
3398           int code_point = charset_code_point (ccs, ch, 0);
3399
3400           if ( (code_point >= 0)
3401                && (NILP (char_type)
3402                    || DECODE_CHAR (ccs, code_point, 0) != ch) )
3403             {
3404               Lisp_Object ret;
3405
3406               cell = Fcdr (cell);
3407               ret = Fcar (cell);
3408               if ( STRINGP (ret) &&
3409                    ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3410                 {
3411                   format[0] = '&';
3412                   strncpy (&format[1], XSTRING_DATA (ret), idx);
3413                   idx++;
3414                 }
3415               else
3416                 goto try_next;
3417
3418               cell = Fcdr (cell);
3419               ret = Fcar (cell);
3420               if (INTP (ret))
3421                 {
3422                   format[idx++] = '%';
3423                   format_columns = XINT (ret);
3424                   if ( (2 <= format_columns) && (format_columns <= 8)
3425                        && (idx + format_columns <= ER_BUF_SIZE - 1) )
3426                     {
3427                       format [idx++] = '0';
3428                       format [idx++] = '0' + format_columns;
3429                     }
3430                 }
3431               else
3432                 goto try_next;
3433
3434               cell = Fcdr (cell);
3435               ret = Fcar (cell);
3436               if (EQ (ret, Qd))
3437                 format [idx++] = 'd';
3438               else if (EQ (ret, Qx))
3439                 format [idx++] = 'x';
3440               else if (EQ (ret, QX))
3441                 format [idx++] = 'X';
3442               else
3443                 goto try_next;
3444               format [idx++] = ';';
3445               format [idx++] = 0;
3446
3447               sprintf (buf, format, code_point);
3448               return;
3449             }
3450         }
3451     try_next:
3452       rest = Fcdr (rest);
3453     }
3454   sprintf (buf, "&MCS-%08X;", ch);
3455 }
3456
3457 \f
3458 /************************************************************************/
3459 /*                          character composition                       */
3460 /************************************************************************/
3461 extern Lisp_Object Qcomposition;
3462
3463 INLINE_HEADER void
3464 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3465 INLINE_HEADER void
3466 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3467 {
3468   unsigned i;
3469
3470   for (i = 0; i < str->combined_char_count; i++)
3471     decode_add_er_char (str, str->combined_chars[i], dst);
3472   str->combined_char_count = 0;
3473   str->combining_table = Qnil;
3474 }
3475
3476 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3477                        unsigned_char_dynarr* dst);
3478 void
3479 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3480                   Emchar character, unsigned_char_dynarr* dst)
3481 {
3482   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3483     decode_add_er_char (str, character, dst);
3484   else if (!CONSP (str->combining_table))
3485     {
3486       Lisp_Object ret
3487         = Fchar_feature (make_char (character), Qcomposition, Qnil,
3488                          Qnil, Qnil);
3489
3490       if (NILP (ret))
3491         decode_add_er_char (str, character, dst);
3492       else
3493         {
3494           str->combined_chars[0] = character;
3495           str->combined_char_count = 1;
3496           str->combining_table = ret;
3497         }
3498     }
3499   else
3500     {
3501       Lisp_Object ret
3502         = Fcdr (Fassq (make_char (character), str->combining_table));
3503
3504       if (CHARP (ret))
3505         {
3506           Emchar char2 = XCHARVAL (ret);
3507           Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3508                                             Qnil, Qnil);
3509
3510           if (NILP (ret2))
3511             {
3512               decode_add_er_char (str, char2, dst);
3513               str->combined_char_count = 0;
3514               str->combining_table = Qnil;
3515             }
3516           else
3517             {
3518               str->combined_chars[0] = char2;
3519               str->combined_char_count = 1;
3520               str->combining_table = ret2;
3521             }
3522         }
3523       else
3524         {
3525           ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3526                                Qnil, Qnil);
3527
3528           COMPOSE_FLUSH_CHARS (str, dst);
3529           if (NILP (ret))
3530             decode_add_er_char (str, character, dst);
3531           else
3532             {
3533               str->combined_chars[0] = character;
3534               str->combined_char_count = 1;
3535               str->combining_table = ret;
3536             }
3537         }
3538     }
3539 }
3540 #else /* not UTF2000 */
3541 #define COMPOSE_FLUSH_CHARS(str, dst)
3542 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3543 #endif /* UTF2000 */
3544
3545 \f
3546 /************************************************************************/
3547 /*                          Shift-JIS methods                           */
3548 /************************************************************************/
3549
3550 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3551    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3552    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3553    encoded by "position-code + 0x80".  A character of JISX0208
3554    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3555    position-codes are divided and shifted so that it fit in the range
3556    below.
3557
3558    --- CODE RANGE of Shift-JIS ---
3559    (character set)      (range)
3560    ASCII                0x00 .. 0x7F
3561    JISX0201-Kana        0xA0 .. 0xDF
3562    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3563             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3564    -------------------------------
3565
3566 */
3567
3568 /* Is this the first byte of a Shift-JIS two-byte char? */
3569
3570 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3571   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3572
3573 /* Is this the second byte of a Shift-JIS two-byte char? */
3574
3575 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3576   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3577
3578 #define BYTE_SJIS_KATAKANA_P(c) \
3579   ((c) >= 0xA1 && (c) <= 0xDF)
3580
3581 static int
3582 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3583 {
3584   while (n--)
3585     {
3586       unsigned char c = *(unsigned char *)src++;
3587       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3588         return 0;
3589       if (st->shift_jis.in_second_byte)
3590         {
3591           st->shift_jis.in_second_byte = 0;
3592           if (c < 0x40)
3593             return 0;
3594         }
3595       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3596         st->shift_jis.in_second_byte = 1;
3597     }
3598   return CODING_CATEGORY_SHIFT_JIS_MASK;
3599 }
3600
3601 /* Convert Shift-JIS data to internal format. */
3602
3603 static void
3604 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3605                     unsigned_char_dynarr *dst, Lstream_data_count n)
3606 {
3607   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3608   unsigned int flags  = str->flags;
3609   unsigned int cpos   = str->cpos;
3610   eol_type_t eol_type = str->eol_type;
3611
3612   while (n--)
3613     {
3614       unsigned char c = *(unsigned char *)src++;
3615
3616       if (cpos)
3617         {
3618           /* Previous character was first byte of Shift-JIS Kanji char. */
3619           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3620             {
3621               unsigned char e1, e2;
3622
3623               DECODE_SJIS (cpos, c, e1, e2);
3624 #ifdef UTF2000
3625               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3626                                             e1 & 0x7F,
3627                                             e2 & 0x7F), dst);
3628 #else
3629               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3630               Dynarr_add (dst, e1);
3631               Dynarr_add (dst, e2);
3632 #endif
3633             }
3634           else
3635             {
3636               DECODE_ADD_BINARY_CHAR (cpos, dst);
3637               DECODE_ADD_BINARY_CHAR (c, dst);
3638             }
3639           cpos = 0;
3640         }
3641       else
3642         {
3643           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3644           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3645             cpos = c;
3646           else if (BYTE_SJIS_KATAKANA_P (c))
3647             {
3648 #ifdef UTF2000
3649               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3650                                             c & 0x7F, 0), dst);
3651 #else
3652               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3653               Dynarr_add (dst, c);
3654 #endif
3655             }
3656 #ifdef UTF2000
3657           else if (c > 32)
3658             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3659                                           c, 0), dst);
3660 #endif
3661           else
3662             DECODE_ADD_BINARY_CHAR (c, dst);
3663         }
3664     label_continue_loop:;
3665     }
3666
3667   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3668
3669   str->flags = flags;
3670   str->cpos  = cpos;
3671 }
3672
3673 /* Convert internal character representation to Shift_JIS. */
3674
3675 void
3676 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3677                        unsigned_char_dynarr *dst, unsigned int *flags)
3678 {
3679   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3680
3681   if (ch == '\n')
3682     {
3683       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3684         Dynarr_add (dst, '\r');
3685       if (eol_type != EOL_CR)
3686         Dynarr_add (dst, ch);
3687     }
3688   else
3689     {
3690       unsigned int s1, s2;
3691 #ifdef UTF2000
3692       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3693
3694       if (code_point >= 0)
3695         Dynarr_add (dst, code_point);
3696       else if ((code_point
3697                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3698                >= 0)
3699         {
3700           ENCODE_SJIS ((code_point >> 8) | 0x80,
3701                        (code_point & 0xFF) | 0x80, s1, s2);
3702           Dynarr_add (dst, s1);
3703           Dynarr_add (dst, s2);
3704         }
3705       else if ((code_point
3706                 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3707                >= 0)
3708         Dynarr_add (dst, code_point | 0x80);
3709       else if ((code_point
3710                 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3711                >= 0)
3712         {
3713           ENCODE_SJIS ((code_point >> 8) | 0x80,
3714                        (code_point & 0xFF) | 0x80, s1, s2);
3715           Dynarr_add (dst, s1);
3716           Dynarr_add (dst, s2);
3717         }
3718       else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3719                >= 0)
3720         Dynarr_add (dst, code_point);
3721       else
3722         Dynarr_add (dst, '?');
3723 #else
3724       Lisp_Object charset;
3725       unsigned int c1, c2;
3726
3727       BREAKUP_CHAR (ch, charset, c1, c2);
3728
3729       if (EQ(charset, Vcharset_katakana_jisx0201))
3730         {
3731           Dynarr_add (dst, c1 | 0x80);
3732         }
3733       else if (c2 == 0)
3734         {
3735           Dynarr_add (dst, c1);
3736         }
3737       else if (EQ(charset, Vcharset_japanese_jisx0208))
3738         {
3739           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3740           Dynarr_add (dst, s1);
3741           Dynarr_add (dst, s2);
3742         }
3743       else
3744         Dynarr_add (dst, '?');
3745 #endif
3746     }
3747 }
3748
3749 void
3750 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3751                        unsigned int *flags)
3752 {
3753 }
3754
3755 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3756 Decode a JISX0208 character of Shift-JIS coding-system.
3757 CODE is the character code in Shift-JIS as a cons of type bytes.
3758 Return the corresponding character.
3759 */
3760        (code))
3761 {
3762   unsigned char c1, c2, s1, s2;
3763
3764   CHECK_CONS (code);
3765   CHECK_INT (XCAR (code));
3766   CHECK_INT (XCDR (code));
3767   s1 = XINT (XCAR (code));
3768   s2 = XINT (XCDR (code));
3769   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3770       BYTE_SJIS_TWO_BYTE_2_P (s2))
3771     {
3772       DECODE_SJIS (s1, s2, c1, c2);
3773       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3774                                    c1 & 0x7F, c2 & 0x7F));
3775     }
3776   else
3777     return Qnil;
3778 }
3779
3780 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3781 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3782 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3783 */
3784        (character))
3785 {
3786   Lisp_Object charset;
3787   int c1, c2, s1, s2;
3788
3789   CHECK_CHAR_COERCE_INT (character);
3790   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3791   if (EQ (charset, Vcharset_japanese_jisx0208))
3792     {
3793       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3794       return Fcons (make_int (s1), make_int (s2));
3795     }
3796   else
3797     return Qnil;
3798 }
3799
3800 \f
3801 /************************************************************************/
3802 /*                            Big5 methods                              */
3803 /************************************************************************/
3804
3805 /* BIG5 is a coding system encoding two character sets: ASCII and
3806    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3807    character set and is encoded in two-byte.
3808
3809    --- CODE RANGE of BIG5 ---
3810    (character set)      (range)
3811    ASCII                0x00 .. 0x7F
3812    Big5 (1st byte)      0xA1 .. 0xFE
3813         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3814    --------------------------
3815
3816    Since the number of characters in Big5 is larger than maximum
3817    characters in Emacs' charset (96x96), it can't be handled as one
3818    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3819    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3820    contains frequently used characters and the latter contains less
3821    frequently used characters.  */
3822
3823 #ifdef UTF2000
3824 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3825   ((c) >= 0x81 && (c) <= 0xFE)
3826 #else
3827 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3828   ((c) >= 0xA1 && (c) <= 0xFE)
3829 #endif
3830
3831 /* Is this the second byte of a Shift-JIS two-byte char? */
3832
3833 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3834   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3835
3836 /* Number of Big5 characters which have the same code in 1st byte.  */
3837
3838 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3839
3840 /* Code conversion macros.  These are macros because they are used in
3841    inner loops during code conversion.
3842
3843    Note that temporary variables in macros introduce the classic
3844    dynamic-scoping problems with variable names.  We use capital-
3845    lettered variables in the assumption that XEmacs does not use
3846    capital letters in variables except in a very formalized way
3847    (e.g. Qstring). */
3848
3849 /* Convert Big5 code (b1, b2) into its internal string representation
3850    (lb, c1, c2). */
3851
3852 /* There is a much simpler way to split the Big5 charset into two.
3853    For the moment I'm going to leave the algorithm as-is because it
3854    claims to separate out the most-used characters into a single
3855    charset, which perhaps will lead to optimizations in various
3856    places.
3857
3858    The way the algorithm works is something like this:
3859
3860    Big5 can be viewed as a 94x157 charset, where the row is
3861    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3862    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3863    the split between low and high column numbers is apparently
3864    meaningless; ascending rows produce less and less frequent chars.
3865    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3866    the first charset, and the upper half (0xC9 .. 0xFE) to the
3867    second.  To do the conversion, we convert the character into
3868    a single number where 0 .. 156 is the first row, 157 .. 313
3869    is the second, etc.  That way, the characters are ordered by
3870    decreasing frequency.  Then we just chop the space in two
3871    and coerce the result into a 94x94 space.
3872    */
3873
3874 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3875 {                                                                       \
3876   int B1 = b1, B2 = b2;                                                 \
3877   unsigned int I                                                        \
3878     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3879                                                                         \
3880   if (B1 < 0xC9)                                                        \
3881     {                                                                   \
3882       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3883     }                                                                   \
3884   else                                                                  \
3885     {                                                                   \
3886       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3887       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3888     }                                                                   \
3889   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3890   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3891 } while (0)
3892
3893 /* Convert the internal string representation of a Big5 character
3894    (lb, c1, c2) into Big5 code (b1, b2). */
3895
3896 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3897 {                                                                       \
3898   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3899                                                                         \
3900   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3901     {                                                                   \
3902       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3903     }                                                                   \
3904   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3905   b2 = I % BIG5_SAME_ROW;                                               \
3906   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3907 } while (0)
3908
3909 static int
3910 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3911 {
3912   while (n--)
3913     {
3914       unsigned char c = *(unsigned char *)src++;
3915       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3916 #ifndef UTF2000
3917           || (c >= 0x80 && c <= 0xA0)
3918 #endif
3919           )
3920         return 0;
3921       if (st->big5.in_second_byte)
3922         {
3923           st->big5.in_second_byte = 0;
3924           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3925             return 0;
3926         }
3927       else if (
3928 #ifdef UTF2000
3929                c >= 0x81
3930 #else
3931                c >= 0xA1
3932 #endif
3933                )
3934         st->big5.in_second_byte = 1;
3935     }
3936   return CODING_CATEGORY_BIG5_MASK;
3937 }
3938
3939 /* Convert Big5 data to internal format. */
3940
3941 static void
3942 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3943                     unsigned_char_dynarr *dst, Lstream_data_count n)
3944 {
3945   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3946   unsigned int flags  = str->flags;
3947   unsigned int cpos   = str->cpos;
3948   eol_type_t eol_type = str->eol_type;
3949 #ifdef UTF2000
3950   Lisp_Object ccs
3951     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3952                                              (decoding)->codesys, 1);
3953 #endif
3954
3955   while (n--)
3956     {
3957       unsigned char c = *(unsigned char *)src++;
3958       if (cpos)
3959         {
3960           /* Previous character was first byte of Big5 char. */
3961           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3962             {
3963 #ifdef UTF2000
3964               int code_point = (cpos << 8) | c;
3965               Emchar char_id = decode_defined_char (ccs, code_point, 0);
3966
3967               if (char_id < 0)
3968                 char_id
3969                   = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3970               DECODE_ADD_UCS_CHAR (char_id, dst);
3971 #else
3972               unsigned char b1, b2, b3;
3973               DECODE_BIG5 (cpos, c, b1, b2, b3);
3974               Dynarr_add (dst, b1);
3975               Dynarr_add (dst, b2);
3976               Dynarr_add (dst, b3);
3977 #endif
3978             }
3979           else
3980             {
3981               DECODE_ADD_BINARY_CHAR (cpos, dst);
3982               DECODE_ADD_BINARY_CHAR (c, dst);
3983             }
3984           cpos = 0;
3985         }
3986       else
3987         {
3988           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3989           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3990             {
3991               decode_flush_er_chars (str, dst);
3992               cpos = c;
3993             }
3994           else if ( c < ' ' )
3995             {
3996               decode_flush_er_chars (str, dst);
3997               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3998               DECODE_ADD_BINARY_CHAR (c, dst);
3999             }
4000           else
4001             {
4002               /* DECODE_ADD_BINARY_CHAR (c, dst); */
4003               decode_add_er_char (str, c, dst);
4004             }
4005         }
4006     label_continue_loop:;
4007     }
4008
4009   /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4010   if (flags & CODING_STATE_END)
4011     {
4012       decode_flush_er_chars (str, dst);
4013       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4014       if (flags & CODING_STATE_CR)
4015         Dynarr_add (dst, '\r');
4016     }
4017
4018   str->flags = flags;
4019   str->cpos  = cpos;
4020 }
4021
4022 /* Convert internally-formatted data to Big5. */
4023
4024 void
4025 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4026                   unsigned_char_dynarr *dst, unsigned int *flags)
4027 {
4028   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4029
4030   if (ch == '\n')
4031     {
4032       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4033         Dynarr_add (dst, '\r');
4034       if (eol_type != EOL_CR)
4035         Dynarr_add (dst, ch);
4036     }
4037   else
4038     {
4039 #ifdef UTF2000
4040       int code_point;
4041       Lisp_Object ccs
4042         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4043
4044       if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4045         Dynarr_add (dst, code_point);
4046       else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4047         {
4048           Dynarr_add (dst, code_point >> 8);
4049           Dynarr_add (dst, code_point & 0xFF);
4050         }
4051       else if ((code_point
4052                 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4053         {
4054           Dynarr_add (dst, code_point >> 8);
4055           Dynarr_add (dst, code_point & 0xFF);
4056         }
4057       else if ((code_point
4058                 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4059         {
4060           unsigned int I
4061             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4062             + ((code_point & 0xFF) - 33);
4063           unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4064           unsigned char b2 = I % BIG5_SAME_ROW;
4065
4066           b2 += b2 < 0x3F ? 0x40 : 0x62;
4067           Dynarr_add (dst, b1);
4068           Dynarr_add (dst, b2);
4069         }
4070       else if ((code_point
4071                 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4072         {
4073           unsigned int I
4074             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4075             + ((code_point & 0xFF) - 33);
4076           unsigned char b1, b2;
4077
4078           I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4079           b1 = I / BIG5_SAME_ROW + 0xA1;
4080           b2 = I % BIG5_SAME_ROW;
4081           b2 += b2 < 0x3F ? 0x40 : 0x62;
4082           Dynarr_add (dst, b1);
4083           Dynarr_add (dst, b2);
4084         }
4085       else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4086         {
4087           char buf[18];
4088
4089           char_encode_as_entity_reference (ch, buf);
4090           Dynarr_add_many (dst, buf, strlen (buf));
4091         }
4092       else
4093         Dynarr_add (dst, '?');
4094 #else
4095 #endif
4096     }
4097 }
4098
4099 void
4100 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4101                   unsigned int *flags)
4102 {
4103 }
4104
4105
4106 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4107 Decode a Big5 character CODE of BIG5 coding-system.
4108 CODE is the character code in BIG5, a cons of two integers.
4109 Return the corresponding character.
4110 */
4111        (code))
4112 {
4113   unsigned char c1, c2, b1, b2;
4114
4115   CHECK_CONS (code);
4116   CHECK_INT (XCAR (code));
4117   CHECK_INT (XCDR (code));
4118   b1 = XINT (XCAR (code));
4119   b2 = XINT (XCDR (code));
4120   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4121       BYTE_BIG5_TWO_BYTE_2_P (b2))
4122     {
4123       Charset_ID leading_byte;
4124       Lisp_Object charset;
4125       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4126       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4127       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4128     }
4129   else
4130     return Qnil;
4131 }
4132
4133 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4134 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4135 Return the corresponding character code in Big5.
4136 */
4137        (character))
4138 {
4139   Lisp_Object charset;
4140   int c1, c2, b1, b2;
4141
4142   CHECK_CHAR_COERCE_INT (character);
4143   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4144   if (EQ (charset, Vcharset_chinese_big5_1) ||
4145       EQ (charset, Vcharset_chinese_big5_2))
4146     {
4147       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4148                    b1, b2);
4149       return Fcons (make_int (b1), make_int (b2));
4150     }
4151   else
4152     return Qnil;
4153 }
4154
4155 \f
4156 /************************************************************************/
4157 /*                           UCS-4 methods                              */
4158 /************************************************************************/
4159
4160 static int
4161 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4162 {
4163   while (n--)
4164     {
4165       unsigned char c = *(unsigned char *)src++;
4166       switch (st->ucs4.in_byte)
4167         {
4168         case 0:
4169           if (c >= 128)
4170             return 0;
4171           else
4172             st->ucs4.in_byte++;
4173           break;
4174         case 3:
4175           st->ucs4.in_byte = 0;
4176           break;
4177         default:
4178           st->ucs4.in_byte++;
4179         }
4180     }
4181   return CODING_CATEGORY_UCS4_MASK;
4182 }
4183
4184 static void
4185 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4186                     unsigned_char_dynarr *dst, Lstream_data_count n)
4187 {
4188   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4189   unsigned int flags = str->flags;
4190   unsigned int cpos  = str->cpos;
4191   unsigned char counter = str->counter;
4192
4193   while (n--)
4194     {
4195       unsigned char c = *(unsigned char *)src++;
4196       switch (counter)
4197         {
4198         case 0:
4199           cpos = c;
4200           counter = 3;
4201           break;
4202         case 1:
4203           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4204           cpos = 0;
4205           counter = 0;
4206           break;
4207         default:
4208           cpos = ( cpos << 8 ) | c;
4209           counter--;
4210         }
4211     }
4212   if (counter & CODING_STATE_END)
4213     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4214
4215   str->flags    = flags;
4216   str->cpos     = cpos;
4217   str->counter  = counter;
4218 }
4219
4220 void
4221 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4222                   unsigned_char_dynarr *dst, unsigned int *flags)
4223 {
4224   Dynarr_add (dst, ch >> 24);
4225   Dynarr_add (dst, ch >> 16);
4226   Dynarr_add (dst, ch >>  8);
4227   Dynarr_add (dst, ch      );
4228 }
4229
4230 void
4231 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4232                   unsigned int *flags)
4233 {
4234 }
4235
4236 \f
4237 /************************************************************************/
4238 /*                           UTF-16 methods                             */
4239 /************************************************************************/
4240
4241 static int
4242 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4243 {
4244   return CODING_CATEGORY_UTF16_MASK;
4245 }
4246
4247 static void
4248 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4249                     unsigned_char_dynarr *dst, Lstream_data_count n)
4250 {
4251   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4252   unsigned int flags = str->flags;
4253   unsigned int cpos  = str->cpos;
4254   unsigned char counter = str->counter & 3;
4255   unsigned char byte_order = str->counter >> 2;
4256   eol_type_t eol_type = str->eol_type;
4257
4258   while (n--)
4259     {
4260       unsigned char c = *(unsigned char *)src++;
4261       if (counter == 0)
4262         {
4263           cpos = c;
4264           counter = 1;
4265         }
4266       else if (counter == 1)
4267         {
4268           int code;
4269
4270           if (byte_order == 0)
4271             code = (c << 8) | cpos;
4272           else
4273             code = (cpos << 8) | c;
4274           if (code == 0xFFFE)
4275             {
4276               code = ((code & 0xFF) << 8) | (code >> 8);
4277               if ( byte_order == 0 )
4278                 byte_order = 1;
4279               else
4280                 byte_order = 0;
4281             }
4282           if ( (0xD800 <= code) && (code <= 0xDBFF) )
4283             {
4284               counter = 2;
4285               cpos = code;
4286             }
4287           else
4288             {
4289               counter = 0;
4290               cpos = 0;
4291               if (code != 0xFEFF)
4292                 {
4293                   DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4294                   DECODE_ADD_UCS_CHAR (code, dst);
4295                 }
4296             }
4297         }
4298       else if (counter == 2)
4299         {
4300           cpos = (cpos << 8) | c;
4301           counter++;
4302         }
4303       else
4304         {
4305           int x = cpos >> 8;
4306           int y
4307             = (byte_order == 0)
4308             ? (c << 8) | (cpos & 0xFF)
4309             : ((cpos & 0xFF) << 8) | c;
4310
4311           DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4312                                + 0x10000, dst);
4313           counter = 0;
4314           cpos = 0;
4315         }
4316     label_continue_loop:;
4317     }
4318   if (counter & CODING_STATE_END)
4319     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4320
4321   str->flags    = flags;
4322   str->cpos     = cpos;
4323   str->counter  = (byte_order << 2) | counter;
4324 }
4325
4326 void
4327 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4328                   unsigned_char_dynarr *dst, unsigned int *flags)
4329 {
4330   if (ch <= 0xFFFF)
4331     {
4332       Dynarr_add (dst, ch);
4333       Dynarr_add (dst, ch >> 8);
4334     }
4335   else
4336     {
4337       int y = ((ch - 0x10000) / 0x400) + 0xD800;
4338       int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4339
4340       Dynarr_add (dst, y);
4341       Dynarr_add (dst, y >> 8);
4342       Dynarr_add (dst, z);
4343       Dynarr_add (dst, z >> 8);
4344     }
4345 }
4346
4347 void
4348 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4349                   unsigned int *flags)
4350 {
4351 }
4352
4353 \f
4354 /************************************************************************/
4355 /*                           UTF-8 methods                              */
4356 /************************************************************************/
4357
4358 static int
4359 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4360 {
4361   while (n--)
4362     {
4363       unsigned char c = *(unsigned char *)src++;
4364       switch (st->utf8.in_byte)
4365         {
4366         case 0:
4367           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4368             return 0;
4369           else if (c >= 0xfc)
4370             st->utf8.in_byte = 5;
4371           else if (c >= 0xf8)
4372             st->utf8.in_byte = 4;
4373           else if (c >= 0xf0)
4374             st->utf8.in_byte = 3;
4375           else if (c >= 0xe0)
4376             st->utf8.in_byte = 2;
4377           else if (c >= 0xc0)
4378             st->utf8.in_byte = 1;
4379           else if (c >= 0x80)
4380             return 0;
4381           break;
4382         default:
4383           if ((c & 0xc0) != 0x80)
4384             return 0;
4385           else
4386             st->utf8.in_byte--;
4387         }
4388     }
4389   return CODING_CATEGORY_UTF8_MASK;
4390 }
4391
4392 static void
4393 decode_output_utf8_partial_char (unsigned char counter,
4394                                  unsigned int cpos,
4395                                  unsigned_char_dynarr *dst)
4396 {
4397   if (counter == 5)
4398     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4399   else if (counter == 4)
4400     {
4401       if (cpos < (1 << 6))
4402         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4403       else
4404         {
4405           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4406           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4407         }
4408     }
4409   else if (counter == 3)
4410     {
4411       if (cpos < (1 << 6))
4412         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4413       else if (cpos < (1 << 12))
4414         {
4415           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4416           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4417         }
4418       else
4419         {
4420           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4421           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4422           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4423         }
4424     }
4425   else if (counter == 2)
4426     {
4427       if (cpos < (1 << 6))
4428         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4429       else if (cpos < (1 << 12))
4430         {
4431           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4432           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4433         }
4434       else if (cpos < (1 << 18))
4435         {
4436           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4437           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4438           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
4439         }
4440       else
4441         {
4442           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4443           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4444           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4445           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4446         }
4447     }
4448   else
4449     {
4450       if (cpos < (1 << 6))
4451         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4452       else if (cpos < (1 << 12))
4453         {
4454           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4455           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4456         }
4457       else if (cpos < (1 << 18))
4458         {
4459           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4460           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4461           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4462         }
4463       else if (cpos < (1 << 24))
4464         {
4465           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4466           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4467           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4468           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4469         }
4470       else
4471         {
4472           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4473           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4474           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4475           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4476           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4477         }
4478     }
4479 }
4480
4481 static void
4482 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4483                     unsigned_char_dynarr *dst, Lstream_data_count n)
4484 {
4485   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4486   unsigned int flags    = str->flags;
4487   unsigned int cpos     = str->cpos;
4488   eol_type_t eol_type   = str->eol_type;
4489   unsigned char counter = str->counter;
4490 #ifdef UTF2000
4491   Lisp_Object ccs
4492     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4493                                              (decoding)->codesys, 0);
4494 #endif
4495
4496   while (n--)
4497     {
4498       unsigned char c = *(unsigned char *)src++;
4499       if (counter == 0)
4500         {
4501           if ( c < ' ' )
4502             {
4503               COMPOSE_FLUSH_CHARS (str, dst);
4504               decode_flush_er_chars (str, dst);
4505               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4506               DECODE_ADD_UCS_CHAR (c, dst);
4507             }
4508           else if ( c < 0xC0 )
4509             /* decode_add_er_char (str, c, dst); */
4510             COMPOSE_ADD_CHAR (str, c, dst);
4511           else
4512             {
4513               /* decode_flush_er_chars (str, dst); */
4514               if ( c < 0xE0 )
4515                 {
4516                   cpos = c & 0x1f;
4517                   counter = 1;
4518                 }
4519               else if ( c < 0xF0 )
4520                 {
4521                   cpos = c & 0x0f;
4522                   counter = 2;
4523                 }
4524               else if ( c < 0xF8 )
4525                 {
4526                   cpos = c & 0x07;
4527                   counter = 3;
4528                 }
4529               else if ( c < 0xFC )
4530                 {
4531                   cpos = c & 0x03;
4532                   counter = 4;
4533                 }
4534               else
4535                 {
4536                   cpos = c & 0x01;
4537                   counter = 5;
4538                 }
4539             }
4540         }
4541       else if ( (c & 0xC0) == 0x80 )
4542         {
4543           cpos = ( cpos << 6 ) | ( c & 0x3f );
4544           if (counter == 1)
4545             {
4546               Emchar char_id;
4547
4548               if (!NILP (ccs))
4549                 {
4550                   char_id = decode_defined_char (ccs, cpos, 0);
4551
4552                   if (char_id < 0)
4553                     char_id = cpos;
4554                 }
4555               else
4556                 char_id = cpos;
4557               COMPOSE_ADD_CHAR (str, char_id, dst);
4558               cpos = 0;
4559               counter = 0;
4560             }
4561           else
4562             counter--;
4563         }
4564       else
4565         {
4566           COMPOSE_FLUSH_CHARS (str, dst);
4567           decode_flush_er_chars (str, dst);
4568           decode_output_utf8_partial_char (counter, cpos, dst);
4569           DECODE_ADD_BINARY_CHAR (c, dst);
4570           cpos = 0;
4571           counter = 0;
4572         }
4573     label_continue_loop:;
4574     }
4575
4576   if (flags & CODING_STATE_END)
4577     {
4578       COMPOSE_FLUSH_CHARS (str, dst);
4579       decode_flush_er_chars (str, dst);
4580       if (counter > 0)
4581         {
4582           decode_output_utf8_partial_char (counter, cpos, dst);
4583           cpos = 0;
4584           counter = 0;
4585         }
4586     }
4587   str->flags    = flags;
4588   str->cpos     = cpos;
4589   str->counter  = counter;
4590 }
4591
4592 void
4593 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4594                   unsigned_char_dynarr *dst, unsigned int *flags)
4595 {
4596   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4597
4598   if (ch == '\n')
4599     {
4600       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4601         Dynarr_add (dst, '\r');
4602       if (eol_type != EOL_CR)
4603         Dynarr_add (dst, ch);
4604     }
4605   else if (ch <= 0x7f)
4606     {
4607       Dynarr_add (dst, ch);
4608     }
4609   else
4610     {
4611       Lisp_Object ucs_ccs
4612         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4613       int code_point = charset_code_point (ucs_ccs, ch, 0);
4614
4615       if ( (code_point < 0) || (code_point > 0xEFFFF) )
4616         {
4617           Lisp_Object map
4618             = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4619           Lisp_Object ret;
4620
4621           if ( !NILP (map)
4622                && INTP (ret = Fchar_feature (make_char (ch),
4623                                              map, Qnil,
4624                                              Qnil, Qnil)) )
4625             code_point = XINT (ret);
4626           else if ( !NILP (map =
4627                            CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4628                            (str->codesys, 2))
4629                     && INTP (ret = Fchar_feature (make_char (ch),
4630                                                   map, Qnil,
4631                                                   Qnil, Qnil)) )
4632             code_point = XINT (ret);
4633           else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4634             {
4635               char buf[18];
4636
4637               char_encode_as_entity_reference (ch, buf);
4638               Dynarr_add_many (dst, buf, strlen (buf));
4639               return;
4640             }
4641           else
4642             code_point = ch;
4643         }
4644       if (code_point <= 0x7ff)
4645         {
4646           Dynarr_add (dst, (code_point >> 6) | 0xc0);
4647           Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4648         }
4649       else if (code_point <= 0xffff)
4650         {
4651           Dynarr_add (dst,  (code_point >> 12) | 0xe0);
4652           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4653           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4654         }
4655       else if (code_point <= 0x1fffff)
4656         {
4657           Dynarr_add (dst,  (code_point >> 18) | 0xf0);
4658           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4659           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4660           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4661         }
4662       else if (code_point <= 0x3ffffff)
4663         {
4664           Dynarr_add (dst,  (code_point >> 24) | 0xf8);
4665           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4666           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4667           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4668           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4669         }
4670       else
4671         {
4672           Dynarr_add (dst,  (code_point >> 30) | 0xfc);
4673           Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4674           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4675           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4676           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4677           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4678         }
4679     }
4680 }
4681
4682 void
4683 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4684                   unsigned int *flags)
4685 {
4686 }
4687
4688 \f
4689 /************************************************************************/
4690 /*                           ISO2022 methods                            */
4691 /************************************************************************/
4692
4693 /* The following note describes the coding system ISO2022 briefly.
4694    Since the intention of this note is to help understand the
4695    functions in this file, some parts are NOT ACCURATE or OVERLY
4696    SIMPLIFIED.  For thorough understanding, please refer to the
4697    original document of ISO2022.
4698
4699    ISO2022 provides many mechanisms to encode several character sets
4700    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4701    is encoded using bytes less than 128.  This may make the encoded
4702    text a little bit longer, but the text passes more easily through
4703    several gateways, some of which strip off MSB (Most Signigant Bit).
4704
4705    There are two kinds of character sets: control character set and
4706    graphic character set.  The former contains control characters such
4707    as `newline' and `escape' to provide control functions (control
4708    functions are also provided by escape sequences).  The latter
4709    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4710    two control character sets and many graphic character sets.
4711
4712    Graphic character sets are classified into one of the following
4713    four classes, according to the number of bytes (DIMENSION) and
4714    number of characters in one dimension (CHARS) of the set:
4715    - DIMENSION1_CHARS94
4716    - DIMENSION1_CHARS96
4717    - DIMENSION2_CHARS94
4718    - DIMENSION2_CHARS96
4719
4720    In addition, each character set is assigned an identification tag,
4721    unique for each set, called "final character" (denoted as <F>
4722    hereafter).  The <F> of each character set is decided by ECMA(*)
4723    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4724    (0x30..0x3F are for private use only).
4725
4726    Note (*): ECMA = European Computer Manufacturers Association
4727
4728    Here are examples of graphic character set [NAME(<F>)]:
4729         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4730         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4731         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4732         o DIMENSION2_CHARS96 -- none for the moment
4733
4734    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4735         C0 [0x00..0x1F] -- control character plane 0
4736         GL [0x20..0x7F] -- graphic character plane 0
4737         C1 [0x80..0x9F] -- control character plane 1
4738         GR [0xA0..0xFF] -- graphic character plane 1
4739
4740    A control character set is directly designated and invoked to C0 or
4741    C1 by an escape sequence.  The most common case is that:
4742    - ISO646's  control character set is designated/invoked to C0, and
4743    - ISO6429's control character set is designated/invoked to C1,
4744    and usually these designations/invocations are omitted in encoded
4745    text.  In a 7-bit environment, only C0 can be used, and a control
4746    character for C1 is encoded by an appropriate escape sequence to
4747    fit into the environment.  All control characters for C1 are
4748    defined to have corresponding escape sequences.
4749
4750    A graphic character set is at first designated to one of four
4751    graphic registers (G0 through G3), then these graphic registers are
4752    invoked to GL or GR.  These designations and invocations can be
4753    done independently.  The most common case is that G0 is invoked to
4754    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4755    these invocations and designations are omitted in encoded text.
4756    In a 7-bit environment, only GL can be used.
4757
4758    When a graphic character set of CHARS94 is invoked to GL, codes
4759    0x20 and 0x7F of the GL area work as control characters SPACE and
4760    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4761    be used.
4762
4763    There are two ways of invocation: locking-shift and single-shift.
4764    With locking-shift, the invocation lasts until the next different
4765    invocation, whereas with single-shift, the invocation affects the
4766    following character only and doesn't affect the locking-shift
4767    state.  Invocations are done by the following control characters or
4768    escape sequences:
4769
4770    ----------------------------------------------------------------------
4771    abbrev  function                  cntrl escape seq   description
4772    ----------------------------------------------------------------------
4773    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4774    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4775    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4776    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4777    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4778    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4779    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4780    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4781    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4782    ----------------------------------------------------------------------
4783    (*) These are not used by any known coding system.
4784
4785    Control characters for these functions are defined by macros
4786    ISO_CODE_XXX in `coding.h'.
4787
4788    Designations are done by the following escape sequences:
4789    ----------------------------------------------------------------------
4790    escape sequence      description
4791    ----------------------------------------------------------------------
4792    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4793    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4794    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4795    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4796    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4797    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4798    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4799    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4800    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4801    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4802    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4803    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4804    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4805    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4806    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4807    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4808    ----------------------------------------------------------------------
4809
4810    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4811    of dimension 1, chars 94, and final character <F>, etc...
4812
4813    Note (*): Although these designations are not allowed in ISO2022,
4814    Emacs accepts them on decoding, and produces them on encoding
4815    CHARS96 character sets in a coding system which is characterized as
4816    7-bit environment, non-locking-shift, and non-single-shift.
4817
4818    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4819    '(' can be omitted.  We refer to this as "short-form" hereafter.
4820
4821    Now you may notice that there are a lot of ways for encoding the
4822    same multilingual text in ISO2022.  Actually, there exist many
4823    coding systems such as Compound Text (used in X11's inter client
4824    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4825    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4826    localized platforms), and all of these are variants of ISO2022.
4827
4828    In addition to the above, Emacs handles two more kinds of escape
4829    sequences: ISO6429's direction specification and Emacs' private
4830    sequence for specifying character composition.
4831
4832    ISO6429's direction specification takes the following form:
4833         o CSI ']'      -- end of the current direction
4834         o CSI '0' ']'  -- end of the current direction
4835         o CSI '1' ']'  -- start of left-to-right text
4836         o CSI '2' ']'  -- start of right-to-left text
4837    The control character CSI (0x9B: control sequence introducer) is
4838    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4839
4840    Character composition specification takes the following form:
4841         o ESC '0' -- start character composition
4842         o ESC '1' -- end character composition
4843    Since these are not standard escape sequences of any ISO standard,
4844    their use with these meanings is restricted to Emacs only.  */
4845
4846 static void
4847 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4848 {
4849   int i;
4850
4851   for (i = 0; i < 4; i++)
4852     {
4853       if (!NILP (coding_system))
4854         iso->charset[i] =
4855           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4856       else
4857         iso->charset[i] = Qt;
4858       iso->invalid_designated[i] = 0;
4859     }
4860   iso->esc = ISO_ESC_NOTHING;
4861   iso->esc_bytes_index = 0;
4862   iso->register_left = 0;
4863   iso->register_right = 1;
4864   iso->switched_dir_and_no_valid_charset_yet = 0;
4865   iso->invalid_switch_dir = 0;
4866   iso->output_direction_sequence = 0;
4867   iso->output_literally = 0;
4868 #ifdef ENABLE_COMPOSITE_CHARS
4869   if (iso->composite_chars)
4870     Dynarr_reset (iso->composite_chars);
4871 #endif
4872 }
4873
4874 static int
4875 fit_to_be_escape_quoted (unsigned char c)
4876 {
4877   switch (c)
4878     {
4879     case ISO_CODE_ESC:
4880     case ISO_CODE_CSI:
4881     case ISO_CODE_SS2:
4882     case ISO_CODE_SS3:
4883     case ISO_CODE_SO:
4884     case ISO_CODE_SI:
4885       return 1;
4886
4887     default:
4888       return 0;
4889     }
4890 }
4891
4892 /* Parse one byte of an ISO2022 escape sequence.
4893    If the result is an invalid escape sequence, return 0 and
4894    do not change anything in STR.  Otherwise, if the result is
4895    an incomplete escape sequence, update ISO2022.ESC and
4896    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4897    all the state variables (but not ISO2022.ESC_BYTES) and
4898    return 1.
4899
4900    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4901    or invocation of an invalid character set and treat that as
4902    an unrecognized escape sequence. */
4903
4904 static int
4905 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4906                    unsigned char c, unsigned int *flags,
4907                    int check_invalid_charsets)
4908 {
4909   /* (1) If we're at the end of a designation sequence, CS is the
4910      charset being designated and REG is the register to designate
4911      it to.
4912
4913      (2) If we're at the end of a locking-shift sequence, REG is
4914      the register to invoke and HALF (0 == left, 1 == right) is
4915      the half to invoke it into.
4916
4917      (3) If we're at the end of a single-shift sequence, REG is
4918      the register to invoke. */
4919   Lisp_Object cs = Qnil;
4920   int reg, half;
4921
4922   /* NOTE: This code does goto's all over the fucking place.
4923      The reason for this is that we're basically implementing
4924      a state machine here, and hierarchical languages like C
4925      don't really provide a clean way of doing this. */
4926
4927   if (! (*flags & CODING_STATE_ESCAPE))
4928     /* At beginning of escape sequence; we need to reset our
4929        escape-state variables. */
4930     iso->esc = ISO_ESC_NOTHING;
4931
4932   iso->output_literally = 0;
4933   iso->output_direction_sequence = 0;
4934
4935   switch (iso->esc)
4936     {
4937     case ISO_ESC_NOTHING:
4938       iso->esc_bytes_index = 0;
4939       switch (c)
4940         {
4941         case ISO_CODE_ESC:      /* Start escape sequence */
4942           *flags |= CODING_STATE_ESCAPE;
4943           iso->esc = ISO_ESC;
4944           goto not_done;
4945
4946         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4947           *flags |= CODING_STATE_ESCAPE;
4948           iso->esc = ISO_ESC_5_11;
4949           goto not_done;
4950
4951         case ISO_CODE_SO:       /* locking shift 1 */
4952           reg = 1; half = 0;
4953           goto locking_shift;
4954         case ISO_CODE_SI:       /* locking shift 0 */
4955           reg = 0; half = 0;
4956           goto locking_shift;
4957
4958         case ISO_CODE_SS2:      /* single shift */
4959           reg = 2;
4960           goto single_shift;
4961         case ISO_CODE_SS3:      /* single shift */
4962           reg = 3;
4963           goto single_shift;
4964
4965         default:                        /* Other control characters */
4966           return 0;
4967         }
4968
4969     case ISO_ESC:
4970       switch (c)
4971         {
4972           /**** single shift ****/
4973
4974         case 'N':       /* single shift 2 */
4975           reg = 2;
4976           goto single_shift;
4977         case 'O':       /* single shift 3 */
4978           reg = 3;
4979           goto single_shift;
4980
4981           /**** locking shift ****/
4982
4983         case '~':       /* locking shift 1 right */
4984           reg = 1; half = 1;
4985           goto locking_shift;
4986         case 'n':       /* locking shift 2 */
4987           reg = 2; half = 0;
4988           goto locking_shift;
4989         case '}':       /* locking shift 2 right */
4990           reg = 2; half = 1;
4991           goto locking_shift;
4992         case 'o':       /* locking shift 3 */
4993           reg = 3; half = 0;
4994           goto locking_shift;
4995         case '|':       /* locking shift 3 right */
4996           reg = 3; half = 1;
4997           goto locking_shift;
4998
4999 #ifdef ENABLE_COMPOSITE_CHARS
5000           /**** composite ****/
5001
5002         case '0':
5003           iso->esc = ISO_ESC_START_COMPOSITE;
5004           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5005             CODING_STATE_COMPOSITE;
5006           return 1;
5007
5008         case '1':
5009           iso->esc = ISO_ESC_END_COMPOSITE;
5010           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5011             ~CODING_STATE_COMPOSITE;
5012           return 1;
5013 #endif /* ENABLE_COMPOSITE_CHARS */
5014
5015           /**** directionality ****/
5016
5017         case '[':
5018           iso->esc = ISO_ESC_5_11;
5019           goto not_done;
5020
5021           /**** designation ****/
5022
5023         case '$':       /* multibyte charset prefix */
5024           iso->esc = ISO_ESC_2_4;
5025           goto not_done;
5026
5027         default:
5028           if (0x28 <= c && c <= 0x2F)
5029             {
5030               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5031               goto not_done;
5032             }
5033
5034           /* This function is called with CODESYS equal to nil when
5035              doing coding-system detection. */
5036           if (!NILP (codesys)
5037               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5038               && fit_to_be_escape_quoted (c))
5039             {
5040               iso->esc = ISO_ESC_LITERAL;
5041               *flags &= CODING_STATE_ISO2022_LOCK;
5042               return 1;
5043             }
5044
5045           /* bzzzt! */
5046           return 0;
5047         }
5048
5049
5050
5051       /**** directionality ****/
5052
5053     case ISO_ESC_5_11:          /* ISO6429 direction control */
5054       if (c == ']')
5055         {
5056           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5057           goto directionality;
5058         }
5059       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
5060       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5061       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5062       else               return 0;
5063       goto not_done;
5064
5065     case ISO_ESC_5_11_0:
5066       if (c == ']')
5067         {
5068           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5069           goto directionality;
5070         }
5071       return 0;
5072
5073     case ISO_ESC_5_11_1:
5074       if (c == ']')
5075         {
5076           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5077           goto directionality;
5078         }
5079       return 0;
5080
5081     case ISO_ESC_5_11_2:
5082       if (c == ']')
5083         {
5084           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5085           goto directionality;
5086         }
5087       return 0;
5088
5089     directionality:
5090       iso->esc = ISO_ESC_DIRECTIONALITY;
5091       /* Various junk here to attempt to preserve the direction sequences
5092          literally in the text if they would otherwise be swallowed due
5093          to invalid designations that don't show up as actual charset
5094          changes in the text. */
5095       if (iso->invalid_switch_dir)
5096         {
5097           /* We already inserted a direction switch literally into the
5098              text.  We assume (#### this may not be right) that the
5099              next direction switch is the one going the other way,
5100              and we need to output that literally as well. */
5101           iso->output_literally = 1;
5102           iso->invalid_switch_dir = 0;
5103         }
5104       else
5105         {
5106           int jj;
5107
5108           /* If we are in the thrall of an invalid designation,
5109            then stick the directionality sequence literally into the
5110            output stream so it ends up in the original text again. */
5111           for (jj = 0; jj < 4; jj++)
5112             if (iso->invalid_designated[jj])
5113               break;
5114           if (jj < 4)
5115             {
5116               iso->output_literally = 1;
5117               iso->invalid_switch_dir = 1;
5118             }
5119           else
5120             /* Indicate that we haven't yet seen a valid designation,
5121                so that if a switch-dir is directly followed by an
5122                invalid designation, both get inserted literally. */
5123             iso->switched_dir_and_no_valid_charset_yet = 1;
5124         }
5125       return 1;
5126
5127
5128       /**** designation ****/
5129
5130     case ISO_ESC_2_4:
5131       if (0x28 <= c && c <= 0x2F)
5132         {
5133           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5134           goto not_done;
5135         }
5136       if (0x40 <= c && c <= 0x42)
5137         {
5138           /* 94^n-set */
5139           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5140                                       *flags & CODING_STATE_R2L ?
5141                                       CHARSET_RIGHT_TO_LEFT :
5142                                       CHARSET_LEFT_TO_RIGHT);
5143           reg = 0;
5144           goto designated;
5145         }
5146       return 0;
5147
5148     default:
5149       {
5150         int chars = 0;
5151         int single = 0;
5152
5153         if (c < '0' || c > '~')
5154           return 0; /* bad final byte */
5155
5156         if (iso->esc >= ISO_ESC_2_8 &&
5157             iso->esc <= ISO_ESC_2_15)
5158           {
5159             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5160             single = 1; /* single-byte */
5161             reg = (iso->esc - ISO_ESC_2_8) & 3;
5162           }
5163         else if (iso->esc >= ISO_ESC_2_4_8 &&
5164                  iso->esc <= ISO_ESC_2_4_15)
5165           {
5166             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5167             single = -1; /* multi-byte */
5168             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5169           }
5170         else
5171           {
5172             /* Can this ever be reached? -slb */
5173             abort();
5174           }
5175
5176         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5177                                     *flags & CODING_STATE_R2L ?
5178                                     CHARSET_RIGHT_TO_LEFT :
5179                                     CHARSET_LEFT_TO_RIGHT);
5180         goto designated;
5181       }
5182     }
5183
5184  not_done:
5185   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5186   return -1;
5187
5188  single_shift:
5189   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5190     /* can't invoke something that ain't there. */
5191     return 0;
5192   iso->esc = ISO_ESC_SINGLE_SHIFT;
5193   *flags &= CODING_STATE_ISO2022_LOCK;
5194   if (reg == 2)
5195     *flags |= CODING_STATE_SS2;
5196   else
5197     *flags |= CODING_STATE_SS3;
5198   return 1;
5199
5200  locking_shift:
5201   if (check_invalid_charsets &&
5202       !CHARSETP (iso->charset[reg]))
5203     /* can't invoke something that ain't there. */
5204     return 0;
5205   if (half)
5206     iso->register_right = reg;
5207   else
5208     iso->register_left = reg;
5209   *flags &= CODING_STATE_ISO2022_LOCK;
5210   iso->esc = ISO_ESC_LOCKING_SHIFT;
5211   return 1;
5212
5213  designated:
5214   if (NILP (cs) && check_invalid_charsets)
5215     {
5216       iso->invalid_designated[reg] = 1;
5217       iso->charset[reg] = Vcharset_ascii;
5218       iso->esc = ISO_ESC_DESIGNATE;
5219       *flags &= CODING_STATE_ISO2022_LOCK;
5220       iso->output_literally = 1;
5221       if (iso->switched_dir_and_no_valid_charset_yet)
5222         {
5223           /* We encountered a switch-direction followed by an
5224              invalid designation.  Ensure that the switch-direction
5225              gets outputted; otherwise it will probably get eaten
5226              when the text is written out again. */
5227           iso->switched_dir_and_no_valid_charset_yet = 0;
5228           iso->output_direction_sequence = 1;
5229           /* And make sure that the switch-dir going the other
5230              way gets outputted, as well. */
5231           iso->invalid_switch_dir = 1;
5232         }
5233       return 1;
5234     }
5235   /* This function is called with CODESYS equal to nil when
5236      doing coding-system detection. */
5237   if (!NILP (codesys))
5238     {
5239       charset_conversion_spec_dynarr *dyn =
5240         XCODING_SYSTEM (codesys)->iso2022.input_conv;
5241
5242       if (dyn)
5243         {
5244           int i;
5245
5246           for (i = 0; i < Dynarr_length (dyn); i++)
5247             {
5248               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5249               if (EQ (cs, spec->from_charset))
5250                 cs = spec->to_charset;
5251             }
5252         }
5253     }
5254
5255   iso->charset[reg] = cs;
5256   iso->esc = ISO_ESC_DESIGNATE;
5257   *flags &= CODING_STATE_ISO2022_LOCK;
5258   if (iso->invalid_designated[reg])
5259     {
5260       iso->invalid_designated[reg] = 0;
5261       iso->output_literally = 1;
5262     }
5263   if (iso->switched_dir_and_no_valid_charset_yet)
5264     iso->switched_dir_and_no_valid_charset_yet = 0;
5265   return 1;
5266 }
5267
5268 static int
5269 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5270 {
5271   int mask;
5272
5273   /* #### There are serious deficiencies in the recognition mechanism
5274      here.  This needs to be much smarter if it's going to cut it.
5275      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5276      it should be detected as Latin-1.
5277      All the ISO2022 stuff in this file should be synced up with the
5278      code from FSF Emacs-20.4, in which Mule should be more or less stable.
5279      Perhaps we should wait till R2L works in FSF Emacs? */
5280
5281   if (!st->iso2022.initted)
5282     {
5283       reset_iso2022 (Qnil, &st->iso2022.iso);
5284       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5285                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5286                           CODING_CATEGORY_ISO_8_1_MASK |
5287                           CODING_CATEGORY_ISO_8_2_MASK |
5288                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5289       st->iso2022.flags = 0;
5290       st->iso2022.high_byte_count = 0;
5291       st->iso2022.saw_single_shift = 0;
5292       st->iso2022.initted = 1;
5293     }
5294
5295   mask = st->iso2022.mask;
5296
5297   while (n--)
5298     {
5299       unsigned char c = *(unsigned char *)src++;
5300       if (c >= 0xA0)
5301         {
5302           mask &= ~CODING_CATEGORY_ISO_7_MASK;
5303           st->iso2022.high_byte_count++;
5304         }
5305       else
5306         {
5307           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5308             {
5309               if (st->iso2022.high_byte_count & 1)
5310                 /* odd number of high bytes; assume not iso-8-2 */
5311                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5312             }
5313           st->iso2022.high_byte_count = 0;
5314           st->iso2022.saw_single_shift = 0;
5315           if (c > 0x80)
5316             mask &= ~CODING_CATEGORY_ISO_7_MASK;
5317         }
5318       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5319           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5320         { /* control chars */
5321           switch (c)
5322             {
5323               /* Allow and ignore control characters that you might
5324                  reasonably see in a text file */
5325             case '\r':
5326             case '\n':
5327             case '\t':
5328             case  7: /* bell */
5329             case  8: /* backspace */
5330             case 11: /* vertical tab */
5331             case 12: /* form feed */
5332             case 26: /* MS-DOS C-z junk */
5333             case 31: /* '^_' -- for info */
5334               goto label_continue_loop;
5335
5336             default:
5337               break;
5338             }
5339         }
5340
5341       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5342           || BYTE_C1_P (c))
5343         {
5344           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5345                                  &st->iso2022.flags, 0))
5346             {
5347               switch (st->iso2022.iso.esc)
5348                 {
5349                 case ISO_ESC_DESIGNATE:
5350                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5351                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5352                   break;
5353                 case ISO_ESC_LOCKING_SHIFT:
5354                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5355                   goto ran_out_of_chars;
5356                 case ISO_ESC_SINGLE_SHIFT:
5357                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5358                   st->iso2022.saw_single_shift = 1;
5359                   break;
5360                 default:
5361                   break;
5362                 }
5363             }
5364           else
5365             {
5366               mask = 0;
5367               goto ran_out_of_chars;
5368             }
5369         }
5370     label_continue_loop:;
5371     }
5372
5373  ran_out_of_chars:
5374
5375   return mask;
5376 }
5377
5378 static int
5379 postprocess_iso2022_mask (int mask)
5380 {
5381   /* #### kind of cheesy */
5382   /* If seven-bit ISO is allowed, then assume that the encoding is
5383      entirely seven-bit and turn off the eight-bit ones. */
5384   if (mask & CODING_CATEGORY_ISO_7_MASK)
5385     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5386                CODING_CATEGORY_ISO_8_1_MASK |
5387                CODING_CATEGORY_ISO_8_2_MASK);
5388   return mask;
5389 }
5390
5391 /* If FLAGS is a null pointer or specifies right-to-left motion,
5392    output a switch-dir-to-left-to-right sequence to DST.
5393    Also update FLAGS if it is not a null pointer.
5394    If INTERNAL_P is set, we are outputting in internal format and
5395    need to handle the CSI differently. */
5396
5397 static void
5398 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5399                                  unsigned_char_dynarr *dst,
5400                                  unsigned int *flags,
5401                                  int internal_p)
5402 {
5403   if (!flags || (*flags & CODING_STATE_R2L))
5404     {
5405       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5406         {
5407           Dynarr_add (dst, ISO_CODE_ESC);
5408           Dynarr_add (dst, '[');
5409         }
5410       else if (internal_p)
5411         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5412       else
5413         Dynarr_add (dst, ISO_CODE_CSI);
5414       Dynarr_add (dst, '0');
5415       Dynarr_add (dst, ']');
5416       if (flags)
5417         *flags &= ~CODING_STATE_R2L;
5418     }
5419 }
5420
5421 /* If FLAGS is a null pointer or specifies a direction different from
5422    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5423    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5424    sequence to DST.  Also update FLAGS if it is not a null pointer.
5425    If INTERNAL_P is set, we are outputting in internal format and
5426    need to handle the CSI differently. */
5427
5428 static void
5429 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5430                           unsigned_char_dynarr *dst, unsigned int *flags,
5431                           int internal_p)
5432 {
5433   if ((!flags || (*flags & CODING_STATE_R2L)) &&
5434       direction == CHARSET_LEFT_TO_RIGHT)
5435     restore_left_to_right_direction (codesys, dst, flags, internal_p);
5436   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5437            && (!flags || !(*flags & CODING_STATE_R2L)) &&
5438            direction == CHARSET_RIGHT_TO_LEFT)
5439     {
5440       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5441         {
5442           Dynarr_add (dst, ISO_CODE_ESC);
5443           Dynarr_add (dst, '[');
5444         }
5445       else if (internal_p)
5446         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5447       else
5448         Dynarr_add (dst, ISO_CODE_CSI);
5449       Dynarr_add (dst, '2');
5450       Dynarr_add (dst, ']');
5451       if (flags)
5452         *flags |= CODING_STATE_R2L;
5453     }
5454 }
5455
5456 /* Convert ISO2022-format data to internal format. */
5457
5458 static void
5459 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5460                        unsigned_char_dynarr *dst, Lstream_data_count n)
5461 {
5462   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5463   unsigned int flags    = str->flags;
5464   unsigned int cpos     = str->cpos;
5465   unsigned char counter = str->counter;
5466   eol_type_t eol_type   = str->eol_type;
5467 #ifdef ENABLE_COMPOSITE_CHARS
5468   unsigned_char_dynarr *real_dst = dst;
5469 #endif
5470   Lisp_Object coding_system;
5471
5472   XSETCODING_SYSTEM (coding_system, str->codesys);
5473
5474 #ifdef ENABLE_COMPOSITE_CHARS
5475   if (flags & CODING_STATE_COMPOSITE)
5476     dst = str->iso2022.composite_chars;
5477 #endif /* ENABLE_COMPOSITE_CHARS */
5478
5479   while (n--)
5480     {
5481       unsigned char c = *(unsigned char *)src++;
5482       if (flags & CODING_STATE_ESCAPE)
5483         {       /* Within ESC sequence */
5484           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5485                                           c, &flags, 1);
5486
5487           if (retval)
5488             {
5489               switch (str->iso2022.esc)
5490                 {
5491 #ifdef ENABLE_COMPOSITE_CHARS
5492                 case ISO_ESC_START_COMPOSITE:
5493                   if (str->iso2022.composite_chars)
5494                     Dynarr_reset (str->iso2022.composite_chars);
5495                   else
5496                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5497                   dst = str->iso2022.composite_chars;
5498                   break;
5499                 case ISO_ESC_END_COMPOSITE:
5500                   {
5501                     Bufbyte comstr[MAX_EMCHAR_LEN];
5502                     Bytecount len;
5503                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5504                                                          Dynarr_length (dst));
5505                     dst = real_dst;
5506                     len = set_charptr_emchar (comstr, emch);
5507                     Dynarr_add_many (dst, comstr, len);
5508                     break;
5509                   }
5510 #endif /* ENABLE_COMPOSITE_CHARS */
5511
5512                 case ISO_ESC_LITERAL:
5513                   COMPOSE_FLUSH_CHARS (str, dst);
5514                   decode_flush_er_chars (str, dst);
5515                   DECODE_ADD_BINARY_CHAR (c, dst);
5516                   break;
5517
5518                 default:
5519                   /* Everything else handled already */
5520                   break;
5521                 }
5522             }
5523
5524           /* Attempted error recovery. */
5525           if (str->iso2022.output_direction_sequence)
5526             ensure_correct_direction (flags & CODING_STATE_R2L ?
5527                                       CHARSET_RIGHT_TO_LEFT :
5528                                       CHARSET_LEFT_TO_RIGHT,
5529                                       str->codesys, dst, 0, 1);
5530           /* More error recovery. */
5531           if (!retval || str->iso2022.output_literally)
5532             {
5533               /* Output the (possibly invalid) sequence */
5534               int i;
5535               COMPOSE_FLUSH_CHARS (str, dst);
5536               decode_flush_er_chars (str, dst);
5537               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5538                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5539               flags &= CODING_STATE_ISO2022_LOCK;
5540               if (!retval)
5541                 n++, src--;/* Repeat the loop with the same character. */
5542               else
5543                 {
5544                   /* No sense in reprocessing the final byte of the
5545                      escape sequence; it could mess things up anyway.
5546                      Just add it now. */
5547                   COMPOSE_FLUSH_CHARS (str, dst);
5548                   decode_flush_er_chars (str, dst);
5549                   DECODE_ADD_BINARY_CHAR (c, dst);
5550                 }
5551             }
5552           cpos = 0;
5553           counter = 0;
5554         }
5555       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5556         { /* Control characters */
5557
5558           /***** Error-handling *****/
5559
5560           /* If we were in the middle of a character, dump out the
5561              partial character. */
5562           if (counter)
5563             {
5564               COMPOSE_FLUSH_CHARS (str, dst);
5565               decode_flush_er_chars (str, dst);
5566               while (counter > 0)
5567                 {
5568                   counter--;
5569                   DECODE_ADD_BINARY_CHAR
5570                     ((unsigned char)(cpos >> (counter * 8)), dst);
5571                 }
5572               cpos = 0;
5573             }
5574
5575           /* If we just saw a single-shift character, dump it out.
5576              This may dump out the wrong sort of single-shift character,
5577              but least it will give an indication that something went
5578              wrong. */
5579           if (flags & CODING_STATE_SS2)
5580             {
5581               COMPOSE_FLUSH_CHARS (str, dst);
5582               decode_flush_er_chars (str, dst);
5583               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5584               flags &= ~CODING_STATE_SS2;
5585             }
5586           if (flags & CODING_STATE_SS3)
5587             {
5588               COMPOSE_FLUSH_CHARS (str, dst);
5589               decode_flush_er_chars (str, dst);
5590               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5591               flags &= ~CODING_STATE_SS3;
5592             }
5593
5594           /***** Now handle the control characters. *****/
5595
5596           /* Handle CR/LF */
5597 #ifdef UTF2000
5598           if (c == '\r')
5599             {
5600               COMPOSE_FLUSH_CHARS (str, dst);
5601               decode_flush_er_chars (str, dst);
5602               if (eol_type == EOL_CR)
5603                 Dynarr_add (dst, '\n');
5604               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5605                 Dynarr_add (dst, c);
5606               else
5607                 flags |= CODING_STATE_CR;
5608               goto label_continue_loop;
5609             }
5610           else if (flags & CODING_STATE_CR)
5611             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5612               if (c != '\n')
5613                 Dynarr_add (dst, '\r');
5614               flags &= ~CODING_STATE_CR;
5615             }
5616 #else
5617           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5618 #endif
5619
5620           flags &= CODING_STATE_ISO2022_LOCK;
5621
5622           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5623             {
5624               COMPOSE_FLUSH_CHARS (str, dst);
5625               decode_flush_er_chars (str, dst);
5626               DECODE_ADD_BINARY_CHAR (c, dst);
5627             }
5628         }
5629       else
5630         {                       /* Graphic characters */
5631           Lisp_Object charset;
5632 #ifndef UTF2000
5633           Charset_ID lb;
5634 #endif
5635           int reg;
5636
5637 #ifdef UTF2000
5638           if (c == '\r')
5639             {
5640               COMPOSE_FLUSH_CHARS (str, dst);
5641               decode_flush_er_chars (str, dst);
5642               if (eol_type == EOL_CR)
5643                 Dynarr_add (dst, '\n');
5644               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5645                 Dynarr_add (dst, c);
5646               else
5647                 flags |= CODING_STATE_CR;
5648               goto label_continue_loop;
5649             }
5650           else if (flags & CODING_STATE_CR)
5651             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5652               if (c != '\n')
5653                 Dynarr_add (dst, '\r');
5654               flags &= ~CODING_STATE_CR;
5655             }
5656 #else
5657           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5658 #endif
5659
5660           /* Now determine the charset. */
5661           reg = ((flags & CODING_STATE_SS2) ? 2
5662                  : (flags & CODING_STATE_SS3) ? 3
5663                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5664                  : str->iso2022.register_left);
5665           charset = str->iso2022.charset[reg];
5666
5667           /* Error checking: */
5668           if (! CHARSETP (charset)
5669               || str->iso2022.invalid_designated[reg]
5670               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5671                   && XCHARSET_CHARS (charset) == 94))
5672             /* Mrmph.  We are trying to invoke a register that has no
5673                or an invalid charset in it, or trying to add a character
5674                outside the range of the charset.  Insert that char literally
5675                to preserve it for the output. */
5676             {
5677               COMPOSE_FLUSH_CHARS (str, dst);
5678               decode_flush_er_chars (str, dst);
5679               while (counter > 0)
5680                 {
5681                   counter--;
5682                   DECODE_ADD_BINARY_CHAR
5683                     ((unsigned char)(cpos >> (counter * 8)), dst);
5684                 }
5685               cpos = 0;
5686               DECODE_ADD_BINARY_CHAR (c, dst);
5687             }
5688
5689           else
5690             {
5691               /* Things are probably hunky-dorey. */
5692
5693               /* Fetch reverse charset, maybe. */
5694               if (((flags & CODING_STATE_R2L) &&
5695                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5696                   ||
5697                   (!(flags & CODING_STATE_R2L) &&
5698                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5699                 {
5700                   Lisp_Object new_charset =
5701                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5702                   if (!NILP (new_charset))
5703                     charset = new_charset;
5704                 }
5705
5706 #ifdef UTF2000
5707               counter++;
5708               if (XCHARSET_DIMENSION (charset) == counter)
5709                 {
5710                   COMPOSE_ADD_CHAR (str,
5711                                     DECODE_CHAR (charset,
5712                                                  ((cpos & 0x7F7F7F) << 8)
5713                                                  | (c & 0x7F), 0),
5714                                     dst);
5715                   cpos = 0;
5716                   counter = 0;
5717                 }
5718               else
5719                 cpos = (cpos << 8) | c;
5720 #else
5721               lb = XCHARSET_LEADING_BYTE (charset);
5722               switch (XCHARSET_REP_BYTES (charset))
5723                 {
5724                 case 1: /* ASCII */
5725                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5726                   Dynarr_add (dst, c & 0x7F);
5727                   break;
5728
5729                 case 2: /* one-byte official */
5730                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5731                   Dynarr_add (dst, lb);
5732                   Dynarr_add (dst, c | 0x80);
5733                   break;
5734
5735                 case 3: /* one-byte private or two-byte official */
5736                   if (XCHARSET_PRIVATE_P (charset))
5737                     {
5738                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5739                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5740                       Dynarr_add (dst, lb);
5741                       Dynarr_add (dst, c | 0x80);
5742                     }
5743                   else
5744                     {
5745                       if (ch)
5746                         {
5747                           Dynarr_add (dst, lb);
5748                           Dynarr_add (dst, ch | 0x80);
5749                           Dynarr_add (dst, c | 0x80);
5750                           ch = 0;
5751                         }
5752                       else
5753                         ch = c;
5754                     }
5755                   break;
5756
5757                 default:        /* two-byte private */
5758                   if (ch)
5759                     {
5760                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5761                       Dynarr_add (dst, lb);
5762                       Dynarr_add (dst, ch | 0x80);
5763                       Dynarr_add (dst, c | 0x80);
5764                       ch = 0;
5765                     }
5766                   else
5767                     ch = c;
5768                 }
5769 #endif
5770             }
5771
5772           if (!cpos)
5773             flags &= CODING_STATE_ISO2022_LOCK;
5774         }
5775
5776     label_continue_loop:;
5777     }
5778
5779   if (flags & CODING_STATE_END)
5780     {
5781       COMPOSE_FLUSH_CHARS (str, dst);
5782       decode_flush_er_chars (str, dst);
5783       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5784     }
5785   str->flags   = flags;
5786   str->cpos    = cpos;
5787   str->counter = counter;
5788 }
5789
5790
5791 /***** ISO2022 encoder *****/
5792
5793 /* Designate CHARSET into register REG. */
5794
5795 static void
5796 iso2022_designate (Lisp_Object charset, unsigned char reg,
5797                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5798 {
5799   static const char inter94[] = "()*+";
5800   static const char inter96[] = ",-./";
5801   unsigned short chars;
5802   unsigned char dimension;
5803   unsigned char final;
5804   Lisp_Object old_charset = str->iso2022.charset[reg];
5805
5806   str->iso2022.charset[reg] = charset;
5807   if (!CHARSETP (charset))
5808     /* charset might be an initial nil or t. */
5809     return;
5810   chars = XCHARSET_CHARS (charset);
5811   dimension = XCHARSET_DIMENSION (charset);
5812   final = XCHARSET_FINAL (charset);
5813   if (!str->iso2022.force_charset_on_output[reg] &&
5814       CHARSETP (old_charset) &&
5815       XCHARSET_CHARS (old_charset) == chars &&
5816       XCHARSET_DIMENSION (old_charset) == dimension &&
5817       XCHARSET_FINAL (old_charset) == final)
5818     return;
5819
5820   str->iso2022.force_charset_on_output[reg] = 0;
5821
5822   {
5823     charset_conversion_spec_dynarr *dyn =
5824       str->codesys->iso2022.output_conv;
5825
5826     if (dyn)
5827       {
5828         int i;
5829
5830         for (i = 0; i < Dynarr_length (dyn); i++)
5831           {
5832             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5833             if (EQ (charset, spec->from_charset))
5834                 charset = spec->to_charset;
5835           }
5836       }
5837   }
5838
5839   Dynarr_add (dst, ISO_CODE_ESC);
5840   switch (chars)
5841     {
5842     case 94:
5843       if (dimension == 1)
5844         Dynarr_add (dst, inter94[reg]);
5845       else
5846         {
5847           Dynarr_add (dst, '$');
5848           if (reg != 0
5849               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5850               || final < '@'
5851               || final > 'B')
5852             Dynarr_add (dst, inter94[reg]);
5853         }
5854       break;
5855     case 96:
5856       if (dimension == 1)
5857         Dynarr_add (dst, inter96[reg]);
5858       else
5859         {
5860           Dynarr_add (dst, '$');
5861           Dynarr_add (dst, inter96[reg]);
5862         }
5863       break;
5864     }
5865   Dynarr_add (dst, final);
5866 }
5867
5868 static void
5869 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5870 {
5871   if (str->iso2022.register_left != 0)
5872     {
5873       Dynarr_add (dst, ISO_CODE_SI);
5874       str->iso2022.register_left = 0;
5875     }
5876 }
5877
5878 static void
5879 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5880 {
5881   if (str->iso2022.register_left != 1)
5882     {
5883       Dynarr_add (dst, ISO_CODE_SO);
5884       str->iso2022.register_left = 1;
5885     }
5886 }
5887
5888 void
5889 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5890                      unsigned_char_dynarr *dst, unsigned int *flags)
5891 {
5892   unsigned char charmask;
5893   Lisp_Coding_System* codesys = str->codesys;
5894   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5895   int i;
5896   Lisp_Object charset = str->iso2022.current_charset;
5897   int half = str->iso2022.current_half;
5898   int code_point = -1;
5899
5900   if (ch <= 0x7F)
5901     {
5902       restore_left_to_right_direction (codesys, dst, flags, 0);
5903
5904       /* Make sure G0 contains ASCII */
5905       if ((ch > ' ' && ch < ISO_CODE_DEL)
5906           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5907         {
5908           ensure_normal_shift (str, dst);
5909           iso2022_designate (Vcharset_ascii, 0, str, dst);
5910         }
5911
5912       /* If necessary, restore everything to the default state
5913          at end-of-line */
5914       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5915         {
5916           restore_left_to_right_direction (codesys, dst, flags, 0);
5917
5918           ensure_normal_shift (str, dst);
5919
5920           for (i = 0; i < 4; i++)
5921             {
5922               Lisp_Object initial_charset =
5923                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5924               iso2022_designate (initial_charset, i, str, dst);
5925             }
5926         }
5927       if (ch == '\n')
5928         {
5929           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5930             Dynarr_add (dst, '\r');
5931           if (eol_type != EOL_CR)
5932             Dynarr_add (dst, ch);
5933         }
5934       else
5935         {
5936           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5937               && fit_to_be_escape_quoted (ch))
5938             Dynarr_add (dst, ISO_CODE_ESC);
5939           Dynarr_add (dst, ch);
5940         }
5941     }
5942   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5943     {
5944       charmask = (half == 0 ? 0x00 : 0x80);
5945
5946       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5947           && fit_to_be_escape_quoted (ch))
5948         Dynarr_add (dst, ISO_CODE_ESC);
5949       /* you asked for it ... */
5950       Dynarr_add (dst, ch);
5951     }
5952   else
5953     {
5954       int reg;
5955
5956       /* Now determine which register to use. */
5957       reg = -1;
5958       for (i = 0; i < 4; i++)
5959         {
5960           if ((CHARSETP (charset = str->iso2022.charset[i])
5961                && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
5962               ||
5963               (CHARSETP
5964                (charset
5965                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5966                && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
5967             {
5968               reg = i;
5969               break;
5970             }
5971         }
5972       if (reg == -1)
5973         {
5974           Lisp_Object original_default_coded_charset_priority_list
5975             = Vdefault_coded_charset_priority_list;
5976
5977           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5978             {
5979               code_point = ENCODE_CHAR (ch, charset);
5980               if (XCHARSET_FINAL (charset))
5981                 goto found;
5982               Vdefault_coded_charset_priority_list
5983                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5984                                Vdefault_coded_charset_priority_list));
5985             }
5986           code_point = ENCODE_CHAR (ch, charset);
5987           if (!XCHARSET_FINAL (charset))
5988             {
5989               charset = Vcharset_ascii;
5990               code_point = '~';
5991             }
5992         found:
5993           Vdefault_coded_charset_priority_list
5994             = original_default_coded_charset_priority_list;
5995         }
5996       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5997                                 codesys, dst, flags, 0);
5998
5999       if (reg == -1)
6000         {
6001           if (XCHARSET_GRAPHIC (charset) != 0)
6002             {
6003               if (!NILP (str->iso2022.charset[1]) &&
6004                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6005                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6006                 reg = 1;
6007               else if (!NILP (str->iso2022.charset[2]))
6008                 reg = 2;
6009               else if (!NILP (str->iso2022.charset[3]))
6010                 reg = 3;
6011               else
6012                 reg = 0;
6013             }
6014           else
6015             reg = 0;
6016         }
6017
6018       iso2022_designate (charset, reg, str, dst);
6019
6020       /* Now invoke that register. */
6021       switch (reg)
6022         {
6023         case 0:
6024           ensure_normal_shift (str, dst);
6025           half = 0;
6026           break;
6027         case 1:
6028           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6029             {
6030               ensure_shift_out (str, dst);
6031               half = 0;
6032             }
6033           else
6034             half = 1;
6035           break;
6036         case 2:
6037           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6038             {
6039               Dynarr_add (dst, ISO_CODE_ESC);
6040               Dynarr_add (dst, 'N');
6041               half = 0;
6042             }
6043           else
6044             {
6045               Dynarr_add (dst, ISO_CODE_SS2);
6046               half = 1;
6047             }
6048           break;
6049         case 3:
6050           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6051             {
6052               Dynarr_add (dst, ISO_CODE_ESC);
6053               Dynarr_add (dst, 'O');
6054               half = 0;
6055             }
6056           else
6057             {
6058               Dynarr_add (dst, ISO_CODE_SS3);
6059               half = 1;
6060             }
6061           break;
6062         default:
6063           abort ();
6064         }
6065
6066       charmask = (half == 0 ? 0x00 : 0x80);
6067
6068       switch (XCHARSET_DIMENSION (charset))
6069         {
6070         case 1:
6071           Dynarr_add (dst, (code_point & 0xFF) | charmask);
6072           break;
6073         case 2:
6074           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6075           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
6076           break;
6077         case 3:
6078           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6079           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
6080           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
6081           break;
6082         case 4:
6083           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6084           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6085           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
6086           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
6087           break;
6088         default:
6089           abort ();
6090         }
6091     }
6092   str->iso2022.current_charset = charset;
6093   str->iso2022.current_half = half;
6094 }
6095
6096 void
6097 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6098                      unsigned int *flags)
6099 {
6100   Lisp_Coding_System* codesys = str->codesys;
6101   int i;
6102
6103   restore_left_to_right_direction (codesys, dst, flags, 0);
6104   ensure_normal_shift (str, dst);
6105   for (i = 0; i < 4; i++)
6106     {
6107       Lisp_Object initial_charset
6108         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6109       iso2022_designate (initial_charset, i, str, dst);
6110     }
6111 }
6112 #endif /* MULE */
6113 \f
6114 /************************************************************************/
6115 /*                     No-conversion methods                            */
6116 /************************************************************************/
6117
6118 /* This is used when reading in "binary" files -- i.e. files that may
6119    contain all 256 possible byte values and that are not to be
6120    interpreted as being in any particular decoding. */
6121 static void
6122 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6123                              unsigned_char_dynarr *dst, Lstream_data_count n)
6124 {
6125   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6126   unsigned int flags  = str->flags;
6127   unsigned int cpos   = str->cpos;
6128   eol_type_t eol_type = str->eol_type;
6129
6130   while (n--)
6131     {
6132       unsigned char c = *(unsigned char *)src++;
6133
6134       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6135       DECODE_ADD_BINARY_CHAR (c, dst);
6136     label_continue_loop:;
6137     }
6138
6139   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6140
6141   str->flags = flags;
6142   str->cpos  = cpos;
6143 }
6144
6145 static void
6146 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6147                              unsigned_char_dynarr *dst, Lstream_data_count n)
6148 {
6149   unsigned char c;
6150   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6151   unsigned int flags  = str->flags;
6152   unsigned int ch     = str->ch;
6153   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6154 #ifdef UTF2000
6155   unsigned char char_boundary = str->iso2022.current_char_boundary;
6156 #endif
6157
6158   while (n--)
6159     {
6160       c = *src++;
6161 #ifdef UTF2000
6162       if (char_boundary == 0)
6163         if ( c >= 0xfc )
6164           {
6165             ch = c & 0x01;
6166             char_boundary = 5;
6167           }
6168         else if ( c >= 0xf8 )
6169           {
6170             ch = c & 0x03;
6171             char_boundary = 4;
6172           }
6173         else if ( c >= 0xf0 )
6174           {
6175             ch = c & 0x07;
6176             char_boundary = 3;
6177           }
6178         else if ( c >= 0xe0 )
6179           {
6180             ch = c & 0x0f;
6181             char_boundary = 2;
6182           }
6183         else if ( c >= 0xc0 )
6184           {
6185             ch = c & 0x1f;
6186             char_boundary = 1;
6187           }
6188         else
6189           {
6190             ch = 0;
6191             if (c == '\n')
6192               {
6193                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6194                   Dynarr_add (dst, '\r');
6195                 if (eol_type != EOL_CR)
6196                   Dynarr_add (dst, c);
6197               }
6198             else
6199               Dynarr_add (dst, c);
6200             char_boundary = 0;
6201           }
6202       else if (char_boundary == 1)
6203         {
6204           ch = ( ch << 6 ) | ( c & 0x3f );
6205           Dynarr_add (dst, ch & 0xff);
6206           char_boundary = 0;
6207         }
6208       else
6209         {
6210           ch = ( ch << 6 ) | ( c & 0x3f );
6211           char_boundary--;
6212         }
6213 #else /* not UTF2000 */
6214       if (c == '\n')
6215         {
6216           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6217             Dynarr_add (dst, '\r');
6218           if (eol_type != EOL_CR)
6219             Dynarr_add (dst, '\n');
6220           ch = 0;
6221         }
6222       else if (BYTE_ASCII_P (c))
6223         {
6224           assert (ch == 0);
6225           Dynarr_add (dst, c);
6226         }
6227       else if (BUFBYTE_LEADING_BYTE_P (c))
6228         {
6229           assert (ch == 0);
6230           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6231               c == LEADING_BYTE_CONTROL_1)
6232             ch = c;
6233           else
6234             Dynarr_add (dst, '~'); /* untranslatable character */
6235         }
6236       else
6237         {
6238           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6239             Dynarr_add (dst, c);
6240           else if (ch == LEADING_BYTE_CONTROL_1)
6241             {
6242               assert (c < 0xC0);
6243               Dynarr_add (dst, c - 0x20);
6244             }
6245           /* else it should be the second or third byte of an
6246              untranslatable character, so ignore it */
6247           ch = 0;
6248         }
6249 #endif /* not UTF2000 */
6250     }
6251
6252   str->flags = flags;
6253   str->ch    = ch;
6254 #ifdef UTF2000
6255   str->iso2022.current_char_boundary = char_boundary;
6256 #endif
6257 }
6258
6259 \f
6260
6261 /************************************************************************/
6262 /*                             Initialization                           */
6263 /************************************************************************/
6264
6265 void
6266 syms_of_file_coding (void)
6267 {
6268   INIT_LRECORD_IMPLEMENTATION (coding_system);
6269
6270   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6271
6272   DEFSUBR (Fcoding_system_p);
6273   DEFSUBR (Ffind_coding_system);
6274   DEFSUBR (Fget_coding_system);
6275   DEFSUBR (Fcoding_system_list);
6276   DEFSUBR (Fcoding_system_name);
6277   DEFSUBR (Fmake_coding_system);
6278   DEFSUBR (Fcopy_coding_system);
6279   DEFSUBR (Fcoding_system_canonical_name_p);
6280   DEFSUBR (Fcoding_system_alias_p);
6281   DEFSUBR (Fcoding_system_aliasee);
6282   DEFSUBR (Fdefine_coding_system_alias);
6283   DEFSUBR (Fsubsidiary_coding_system);
6284
6285   DEFSUBR (Fcoding_system_type);
6286   DEFSUBR (Fcoding_system_doc_string);
6287 #ifdef MULE
6288   DEFSUBR (Fcoding_system_charset);
6289 #endif
6290   DEFSUBR (Fcoding_system_property);
6291
6292   DEFSUBR (Fcoding_category_list);
6293   DEFSUBR (Fset_coding_priority_list);
6294   DEFSUBR (Fcoding_priority_list);
6295   DEFSUBR (Fset_coding_category_system);
6296   DEFSUBR (Fcoding_category_system);
6297
6298   DEFSUBR (Fdetect_coding_region);
6299   DEFSUBR (Fdecode_coding_region);
6300   DEFSUBR (Fencode_coding_region);
6301 #ifdef MULE
6302   DEFSUBR (Fdecode_shift_jis_char);
6303   DEFSUBR (Fencode_shift_jis_char);
6304   DEFSUBR (Fdecode_big5_char);
6305   DEFSUBR (Fencode_big5_char);
6306 #endif /* MULE */
6307   defsymbol (&Qcoding_systemp, "coding-system-p");
6308   defsymbol (&Qno_conversion, "no-conversion");
6309   defsymbol (&Qraw_text, "raw-text");
6310 #ifdef MULE
6311   defsymbol (&Qbig5, "big5");
6312   defsymbol (&Qshift_jis, "shift-jis");
6313   defsymbol (&Qucs4, "ucs-4");
6314   defsymbol (&Qutf8, "utf-8");
6315   defsymbol (&Qutf16, "utf-16");
6316   defsymbol (&Qccl, "ccl");
6317   defsymbol (&Qiso2022, "iso2022");
6318 #endif /* MULE */
6319   defsymbol (&Qmnemonic, "mnemonic");
6320   defsymbol (&Qeol_type, "eol-type");
6321   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6322   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6323
6324   defsymbol (&Qcr, "cr");
6325   defsymbol (&Qlf, "lf");
6326   defsymbol (&Qcrlf, "crlf");
6327   defsymbol (&Qeol_cr, "eol-cr");
6328   defsymbol (&Qeol_lf, "eol-lf");
6329   defsymbol (&Qeol_crlf, "eol-crlf");
6330 #ifdef MULE
6331   defsymbol (&Qcharset_g0, "charset-g0");
6332   defsymbol (&Qcharset_g1, "charset-g1");
6333   defsymbol (&Qcharset_g2, "charset-g2");
6334   defsymbol (&Qcharset_g3, "charset-g3");
6335   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6336   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6337   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6338   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6339   defsymbol (&Qno_iso6429, "no-iso6429");
6340   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6341   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6342
6343   defsymbol (&Qshort, "short");
6344   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6345   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6346   defsymbol (&Qseven, "seven");
6347   defsymbol (&Qlock_shift, "lock-shift");
6348   defsymbol (&Qescape_quoted, "escape-quoted");
6349 #endif /* MULE */
6350 #ifdef UTF2000
6351   defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6352   defsymbol (&Qdisable_composition, "disable-composition");
6353   defsymbol (&Quse_entity_reference, "use-entity-reference");
6354   defsymbol (&Qd, "d");
6355   defsymbol (&Qx, "x");
6356   defsymbol (&QX, "X");
6357 #endif
6358   defsymbol (&Qencode, "encode");
6359   defsymbol (&Qdecode, "decode");
6360
6361 #ifdef MULE
6362   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6363              "shift-jis");
6364   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6365              "big5");
6366   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6367              "ucs-4");
6368   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6369              "utf-16");
6370   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6371              "utf-8");
6372   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6373              "iso-7");
6374   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6375              "iso-8-designate");
6376   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6377              "iso-8-1");
6378   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6379              "iso-8-2");
6380   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6381              "iso-lock-shift");
6382 #endif /* MULE */
6383   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6384              "no-conversion");
6385 }
6386
6387 void
6388 lstream_type_create_file_coding (void)
6389 {
6390   LSTREAM_HAS_METHOD (decoding, reader);
6391   LSTREAM_HAS_METHOD (decoding, writer);
6392   LSTREAM_HAS_METHOD (decoding, rewinder);
6393   LSTREAM_HAS_METHOD (decoding, seekable_p);
6394   LSTREAM_HAS_METHOD (decoding, flusher);
6395   LSTREAM_HAS_METHOD (decoding, closer);
6396   LSTREAM_HAS_METHOD (decoding, marker);
6397
6398   LSTREAM_HAS_METHOD (encoding, reader);
6399   LSTREAM_HAS_METHOD (encoding, writer);
6400   LSTREAM_HAS_METHOD (encoding, rewinder);
6401   LSTREAM_HAS_METHOD (encoding, seekable_p);
6402   LSTREAM_HAS_METHOD (encoding, flusher);
6403   LSTREAM_HAS_METHOD (encoding, closer);
6404   LSTREAM_HAS_METHOD (encoding, marker);
6405 }
6406
6407 void
6408 vars_of_file_coding (void)
6409 {
6410   int i;
6411
6412   fcd = xnew (struct file_coding_dump);
6413   dump_add_root_struct_ptr (&fcd, &fcd_description);
6414
6415   /* Initialize to something reasonable ... */
6416   for (i = 0; i < CODING_CATEGORY_LAST; i++)
6417     {
6418       fcd->coding_category_system[i] = Qnil;
6419       fcd->coding_category_by_priority[i] = i;
6420     }
6421
6422   Fprovide (intern ("file-coding"));
6423
6424   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6425 Coding system used for TTY keyboard input.
6426 Not used under a windowing system.
6427 */ );
6428   Vkeyboard_coding_system = Qnil;
6429
6430   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6431 Coding system used for TTY display output.
6432 Not used under a windowing system.
6433 */ );
6434   Vterminal_coding_system = Qnil;
6435
6436   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6437 Overriding coding system used when reading from a file or process.
6438 You should bind this variable with `let', but do not set it globally.
6439 If this is non-nil, it specifies the coding system that will be used
6440 to decode input on read operations, such as from a file or process.
6441 It overrides `buffer-file-coding-system-for-read',
6442 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6443 this one for permanent changes to the environment.  */ );
6444   Vcoding_system_for_read = Qnil;
6445
6446   DEFVAR_LISP ("coding-system-for-write",
6447                &Vcoding_system_for_write /*
6448 Overriding coding system used when writing to a file or process.
6449 You should bind this variable with `let', but do not set it globally.
6450 If this is non-nil, it specifies the coding system that will be used
6451 to encode output for write operations, such as to a file or process.
6452 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6453 Use those variables instead of this one for permanent changes to the
6454 environment.  */ );
6455   Vcoding_system_for_write = Qnil;
6456
6457   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6458 Coding system used to convert pathnames when accessing files.
6459 */ );
6460   Vfile_name_coding_system = Qnil;
6461
6462   DEFVAR_LISP ("coded-charset-entity-reference-alist",
6463                &Vcoded_charset_entity_reference_alist /*
6464 Alist of coded-charset vs corresponding entity-reference.
6465 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6466 CCS is coded-charset.
6467 CODE-COLUMNS is columns of code-point of entity-reference.
6468 CODE-TYPE is format type of code-point of entity-reference.
6469 `d' means decimal value and `x' means hexadecimal value.
6470 */ );
6471   Vcoded_charset_entity_reference_alist = Qnil;
6472
6473   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6474 Non-nil means the buffer contents are regarded as multi-byte form
6475 of characters, not a binary code.  This affects the display, file I/O,
6476 and behaviors of various editing commands.
6477
6478 Setting this to nil does not do anything.
6479 */ );
6480   enable_multibyte_characters = 1;
6481 }
6482
6483 void
6484 complex_vars_of_file_coding (void)
6485 {
6486   staticpro (&Vcoding_system_hash_table);
6487   Vcoding_system_hash_table =
6488     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6489
6490   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6491   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6492
6493 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6494 {                                               \
6495   struct codesys_prop csp;                      \
6496   csp.sym = (Sym);                              \
6497   csp.prop_type = (Prop_Type);                  \
6498   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6499 } while (0)
6500
6501   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6502   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6503   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6504   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6505   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6506   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6507   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6508 #ifdef MULE
6509   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6510   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6511   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6512   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6513   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6514   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6515   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6516   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6517   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6518   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6519   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6520   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6521   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6522   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6523   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6524   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6525   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6526
6527   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6528   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6529 #ifdef UTF2000
6530   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qdisable_composition);
6531   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Quse_entity_reference);
6532 #endif
6533 #endif /* MULE */
6534   /* Need to create this here or we're really screwed. */
6535   Fmake_coding_system
6536     (Qraw_text, Qno_conversion,
6537      build_string ("Raw text, which means it converts only line-break-codes."),
6538      list2 (Qmnemonic, build_string ("Raw")));
6539
6540   Fmake_coding_system
6541     (Qbinary, Qno_conversion,
6542      build_string ("Binary, which means it does not convert anything."),
6543      list4 (Qeol_type, Qlf,
6544             Qmnemonic, build_string ("Binary")));
6545
6546 #ifdef UTF2000
6547   Fmake_coding_system
6548     (Qutf_8_mcs, Qutf8,
6549      build_string
6550      ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6551      list2 (Qmnemonic, build_string ("MTF8")));
6552 #endif
6553
6554   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6555
6556   Fdefine_coding_system_alias (Qfile_name, Qbinary);
6557
6558   Fdefine_coding_system_alias (Qterminal, Qbinary);
6559   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6560
6561   /* Need this for bootstrapping */
6562   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6563     Fget_coding_system (Qraw_text);
6564
6565 #ifdef UTF2000
6566   fcd->coding_category_system[CODING_CATEGORY_UTF8]
6567    = Fget_coding_system (Qutf_8_mcs);
6568 #endif
6569
6570 #if defined(MULE) && !defined(UTF2000)
6571   {
6572     size_t i;
6573
6574     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6575       fcd->ucs_to_mule_table[i] = Qnil;
6576   }
6577   staticpro (&mule_to_ucs_table);
6578   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6579 #endif /* defined(MULE) && !defined(UTF2000) */
6580 }