git.chise.org Git - chise/xemacs-chise.git.1/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2011
   5      MORIOKA Tomohiko
   6
   7 This file is part of XEmacs.
   8
   9 XEmacs is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 2, or (at your option) any
  12 later version.
  13
  14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with XEmacs; see the file COPYING.  If not, write to
  21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  22 Boston, MA 02111-1307, USA.  */
  23
  24 /* Synched up with: Mule 2.3.   Not in FSF. */
  25
  26 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  27 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs CHISE. */
  28
  29 #include <config.h>
  30 #include "lisp.h"
  31
  32 #include "buffer.h"
  33 #include "elhash.h"
  34 #include "insdel.h"
  35 #include "lstream.h"
  36 #include "opaque.h"
  37 #ifdef MULE
  38 #include "mule-ccl.h"
  39 #include "chartab.h"
  40 #endif
  41 #include "file-coding.h"
  42
  43 Lisp_Object Qcoding_system_error;
  44
  45 Lisp_Object Vkeyboard_coding_system;
  46 Lisp_Object Vterminal_coding_system;
  47 Lisp_Object Vcoding_system_for_read;
  48 Lisp_Object Vcoding_system_for_write;
  49 Lisp_Object Vfile_name_coding_system;
  50
  51 Lisp_Object Vcoded_charset_entity_reference_alist;
  52
  53 /* Table of symbols identifying each coding category. */
  54 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  55
  56
  57
  58 struct file_coding_dump {
  59   /* Coding system currently associated with each coding category. */
  60   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  61
  62   /* Table of all coding categories in decreasing order of priority.
  63      This describes a permutation of the possible coding categories. */
  64   int coding_category_by_priority[CODING_CATEGORY_LAST];
  65
  66 #if defined(MULE) && !defined(UTF2000)
  67   Lisp_Object ucs_to_mule_table[65536];
  68 #endif
  69 } *fcd;
  70
  71 static const struct lrecord_description fcd_description_1[] = {
  72   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  73 #if defined(MULE) && !defined(UTF2000)
  74   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  75 #endif
  76   { XD_END }
  77 };
  78
  79 static const struct struct_description fcd_description = {
  80   sizeof (struct file_coding_dump),
  81   fcd_description_1
  82 };
  83
  84 Lisp_Object mule_to_ucs_table;
  85
  86 Lisp_Object Qcoding_systemp;
  87
  88 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  89 /* Qinternal in general.c */
  90
  91 Lisp_Object Qmnemonic, Qeol_type;
  92 Lisp_Object Qcr, Qcrlf, Qlf;
  93 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  94 Lisp_Object Qpost_read_conversion;
  95 Lisp_Object Qpre_write_conversion;
  96
  97 #ifdef MULE
  98 Lisp_Object Qucs4, Qutf16, Qutf8;
  99 Lisp_Object Qbig5, Qshift_jis;
 100 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
 101 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
 102 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
 103 Lisp_Object Qno_iso6429;
 104 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 105 Lisp_Object Qescape_quoted;
 106 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 107 #endif
 108 #ifdef UTF2000
 109 Lisp_Object Qutf_8_mcs;
 110 Lisp_Object Qdisable_composition;
 111 Lisp_Object Qccs_priority_list;
 112 Lisp_Object Quse_entity_reference;
 113 Lisp_Object Qd, Qx, QX;
 114 #endif
 115 Lisp_Object Qencode, Qdecode;
 116
 117 Lisp_Object Vcoding_system_hash_table;
 118
 119 int enable_multibyte_characters;
 120
 121 #ifdef MULE
 122 /* Additional information used by the ISO2022 decoder and detector. */
 123 struct iso2022_decoder
 124 {
 125   /* CHARSET holds the character sets currently assigned to the G0
 126      through G3 variables.  It is initialized from the array
 127      INITIAL_CHARSET in CODESYS. */
 128   Lisp_Object charset[4];
 129
 130   /* Which registers are currently invoked into the left (GL) and
 131      right (GR) halves of the 8-bit encoding space? */
 132   int register_left, register_right;
 133
 134   /* ISO_ESC holds a value indicating part of an escape sequence
 135      that has already been seen. */
 136   enum iso_esc_flag esc;
 137
 138   /* This records the bytes we've seen so far in an escape sequence,
 139      in case the sequence is invalid (we spit out the bytes unchanged). */
 140   unsigned char esc_bytes[8];
 141
 142   /* Index for next byte to store in ISO escape sequence. */
 143   int esc_bytes_index;
 144
 145 #ifdef ENABLE_COMPOSITE_CHARS
 146   /* Stuff seen so far when composing a string. */
 147   unsigned_char_dynarr *composite_chars;
 148 #endif
 149
 150   /* If we saw an invalid designation sequence for a particular
 151      register, we flag it here and switch to ASCII.  The next time we
 152      see a valid designation for this register, we turn off the flag
 153      and do the designation normally, but pretend the sequence was
 154      invalid.  The effect of all this is that (most of the time) the
 155      escape sequences for both the switch to the unknown charset, and
 156      the switch back to the known charset, get inserted literally into
 157      the buffer and saved out as such.  The hope is that we can
 158      preserve the escape sequences so that the resulting written out
 159      file makes sense.  If we don't do any of this, the designation
 160      to the invalid charset will be preserved but that switch back
 161      to the known charset will probably get eaten because it was
 162      the same charset that was already present in the register. */
 163   unsigned char invalid_designated[4];
 164
 165   /* We try to do similar things as above for direction-switching
 166      sequences.  If we encountered a direction switch while an
 167      invalid designation was present, or an invalid designation
 168      just after a direction switch (i.e. no valid designation
 169      encountered yet), we insert the direction-switch escape
 170      sequence literally into the output stream, and later on
 171      insert the corresponding direction-restoring escape sequence
 172      literally also. */
 173   unsigned int switched_dir_and_no_valid_charset_yet :1;
 174   unsigned int invalid_switch_dir :1;
 175
 176   /* Tells the decoder to output the escape sequence literally
 177      even though it was valid.  Used in the games we play to
 178      avoid lossage when we encounter invalid designations. */
 179   unsigned int output_literally :1;
 180   /* We encountered a direction switch followed by an invalid
 181      designation.  We didn't output the direction switch
 182      literally because we didn't know about the invalid designation;
 183      but we have to do so now. */
 184   unsigned int output_direction_sequence :1;
 185 };
 186 #endif /* MULE */
 187 EXFUN (Fcopy_coding_system, 2);
 188 #ifdef MULE
 189 struct detection_state;
 190
 191 static void
 192 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 193                      unsigned_char_dynarr *dst, Lstream_data_count n);
 194
 195 static int detect_coding_sjis (struct detection_state *st,
 196                                const Extbyte *src, Lstream_data_count n);
 197 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 198                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 199 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 200                             unsigned_char_dynarr *dst, unsigned int *flags);
 201 void char_finish_shift_jis (struct encoding_stream *str,
 202                             unsigned_char_dynarr *dst, unsigned int *flags);
 203
 204 static int detect_coding_big5 (struct detection_state *st,
 205                                const Extbyte *src, Lstream_data_count n);
 206 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 207                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 208 void char_encode_big5 (struct encoding_stream *str, Emchar c,
 209                        unsigned_char_dynarr *dst, unsigned int *flags);
 210 void char_finish_big5 (struct encoding_stream *str,
 211                        unsigned_char_dynarr *dst, unsigned int *flags);
 212
 213 static int detect_coding_ucs4 (struct detection_state *st,
 214                                const Extbyte *src, Lstream_data_count n);
 215 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 216                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 217 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 218                        unsigned_char_dynarr *dst, unsigned int *flags);
 219 void char_finish_ucs4 (struct encoding_stream *str,
 220                        unsigned_char_dynarr *dst, unsigned int *flags);
 221
 222 static int detect_coding_utf16 (struct detection_state *st,
 223                                const Extbyte *src, Lstream_data_count n);
 224 static void decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
 225                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 226 void char_encode_utf16 (struct encoding_stream *str, Emchar c,
 227                        unsigned_char_dynarr *dst, unsigned int *flags);
 228 void char_finish_utf16 (struct encoding_stream *str,
 229                        unsigned_char_dynarr *dst, unsigned int *flags);
 230
 231 static int detect_coding_utf8 (struct detection_state *st,
 232                                const Extbyte *src, Lstream_data_count n);
 233 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 234                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 235 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 236                        unsigned_char_dynarr *dst, unsigned int *flags);
 237 void char_finish_utf8 (struct encoding_stream *str,
 238                        unsigned_char_dynarr *dst, unsigned int *flags);
 239
 240 static int postprocess_iso2022_mask (int mask);
 241 static void reset_iso2022 (Lisp_Object coding_system,
 242                            struct iso2022_decoder *iso);
 243 static int detect_coding_iso2022 (struct detection_state *st,
 244                                   const Extbyte *src, Lstream_data_count n);
 245 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 246                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 247 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 248                           unsigned_char_dynarr *dst, unsigned int *flags);
 249 void char_finish_iso2022 (struct encoding_stream *str,
 250                           unsigned_char_dynarr *dst, unsigned int *flags);
 251 #endif /* MULE */
 252 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 253                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 254 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 255                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 256 static void mule_decode (Lstream *decoding, const Extbyte *src,
 257                          unsigned_char_dynarr *dst, Lstream_data_count n);
 258 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 259                          unsigned_char_dynarr *dst, Lstream_data_count n);
 260
 261 typedef struct codesys_prop codesys_prop;
 262 struct codesys_prop
 263 {
 264   Lisp_Object sym;
 265   int prop_type;
 266 };
 267
 268 typedef struct
 269 {
 270   Dynarr_declare (codesys_prop);
 271 } codesys_prop_dynarr;
 272
 273 static const struct lrecord_description codesys_prop_description_1[] = {
 274   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 275   { XD_END }
 276 };
 277
 278 static const struct struct_description codesys_prop_description = {
 279   sizeof (codesys_prop),
 280   codesys_prop_description_1
 281 };
 282
 283 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 284   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 285   { XD_END }
 286 };
 287
 288 static const struct struct_description codesys_prop_dynarr_description = {
 289   sizeof (codesys_prop_dynarr),
 290   codesys_prop_dynarr_description_1
 291 };
 292
 293 codesys_prop_dynarr *the_codesys_prop_dynarr;
 294
 295 enum codesys_prop_enum
 296 {
 297   CODESYS_PROP_ALL_OK,
 298   CODESYS_PROP_ISO2022,
 299   CODESYS_PROP_CCL
 300 };
 301
 302 \f
 303 /************************************************************************/
 304 /*                       Coding system functions                        */
 305 /************************************************************************/
 306
 307 static Lisp_Object mark_coding_system (Lisp_Object);
 308 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 309 static void finalize_coding_system (void *header, int for_disksave);
 310
 311 #ifdef MULE
 312 static const struct lrecord_description ccs_description_1[] = {
 313   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 314   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 315   { XD_END }
 316 };
 317
 318 static const struct struct_description ccs_description = {
 319   sizeof (charset_conversion_spec),
 320   ccs_description_1
 321 };
 322
 323 static const struct lrecord_description ccsd_description_1[] = {
 324   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 325   { XD_END }
 326 };
 327
 328 static const struct struct_description ccsd_description = {
 329   sizeof (charset_conversion_spec_dynarr),
 330   ccsd_description_1
 331 };
 332 #endif
 333
 334 static const struct lrecord_description coding_system_description[] = {
 335   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 338   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 339   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 340   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 341   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 342   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 343 #ifdef MULE
 344   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 345   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 346   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 347   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 348   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 349 #ifdef UTF2000
 350   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
 351 #endif
 352 #endif
 353   { XD_END }
 354 };
 355
 356 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 357                                mark_coding_system, print_coding_system,
 358                                finalize_coding_system,
 359                                0, 0, coding_system_description,
 360                                Lisp_Coding_System);
 361
 362 static Lisp_Object
 363 mark_coding_system (Lisp_Object obj)
 364 {
 365   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 366
 367   mark_object (CODING_SYSTEM_NAME (codesys));
 368   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 369   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 370   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 371   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 372   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 373
 374   switch (CODING_SYSTEM_TYPE (codesys))
 375     {
 376 #ifdef MULE
 377       int i;
 378     case CODESYS_ISO2022:
 379       for (i = 0; i < 4; i++)
 380         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 381       if (codesys->iso2022.input_conv)
 382         {
 383           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 384             {
 385               struct charset_conversion_spec *ccs =
 386                 Dynarr_atp (codesys->iso2022.input_conv, i);
 387               mark_object (ccs->from_charset);
 388               mark_object (ccs->to_charset);
 389             }
 390         }
 391       if (codesys->iso2022.output_conv)
 392         {
 393           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 394             {
 395               struct charset_conversion_spec *ccs =
 396                 Dynarr_atp (codesys->iso2022.output_conv, i);
 397               mark_object (ccs->from_charset);
 398               mark_object (ccs->to_charset);
 399             }
 400         }
 401       break;
 402 #ifdef UTF2000
 403
 404     case CODESYS_BIG5:
 405       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
 406       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
 407       break;
 408 #endif
 409
 410     case CODESYS_CCL:
 411       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 412       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 413       break;
 414 #endif /* MULE */
 415     default:
 416       break;
 417     }
 418
 419   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 420 #ifdef UTF2000
 421   mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
 422 #endif
 423   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 424 }
 425
 426 static void
 427 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 428                      int escapeflag)
 429 {
 430   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 431   if (print_readably)
 432     error ("printing unreadable object #<coding_system 0x%x>",
 433            c->header.uid);
 434
 435   write_c_string ("#<coding_system ", printcharfun);
 436   print_internal (c->name, printcharfun, 1);
 437   write_c_string (">", printcharfun);
 438 }
 439
 440 static void
 441 finalize_coding_system (void *header, int for_disksave)
 442 {
 443   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 444   /* Since coding systems never go away, this function is not
 445      necessary.  But it would be necessary if we changed things
 446      so that coding systems could go away. */
 447   if (!for_disksave) /* see comment in lstream.c */
 448     {
 449       switch (CODING_SYSTEM_TYPE (c))
 450         {
 451 #ifdef MULE
 452         case CODESYS_ISO2022:
 453           if (c->iso2022.input_conv)
 454             {
 455               Dynarr_free (c->iso2022.input_conv);
 456               c->iso2022.input_conv = 0;
 457             }
 458           if (c->iso2022.output_conv)
 459             {
 460               Dynarr_free (c->iso2022.output_conv);
 461               c->iso2022.output_conv = 0;
 462             }
 463           break;
 464 #endif /* MULE */
 465         default:
 466           break;
 467         }
 468     }
 469 }
 470
 471 static eol_type_t
 472 symbol_to_eol_type (Lisp_Object symbol)
 473 {
 474   CHECK_SYMBOL (symbol);
 475   if (NILP (symbol))      return EOL_AUTODETECT;
 476   if (EQ (symbol, Qlf))   return EOL_LF;
 477   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 478   if (EQ (symbol, Qcr))   return EOL_CR;
 479
 480   signal_simple_error ("Unrecognized eol type", symbol);
 481   return EOL_AUTODETECT; /* not reached */
 482 }
 483
 484 static Lisp_Object
 485 eol_type_to_symbol (eol_type_t type)
 486 {
 487   switch (type)
 488     {
 489     default: abort ();
 490     case EOL_LF:         return Qlf;
 491     case EOL_CRLF:       return Qcrlf;
 492     case EOL_CR:         return Qcr;
 493     case EOL_AUTODETECT: return Qnil;
 494     }
 495 }
 496
 497 static void
 498 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 499 {
 500   Lisp_Object codesys_obj;
 501   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 502   char *codesys_name = (char *) alloca (len + 7);
 503   int mlen = -1;
 504   char *codesys_mnemonic=0;
 505
 506   Lisp_Object codesys_name_sym, sub_codesys_obj;
 507
 508   /* kludge */
 509
 510   XSETCODING_SYSTEM (codesys_obj, codesys);
 511
 512   memcpy (codesys_name,
 513           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 514
 515   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 516     {
 517       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 518       codesys_mnemonic = (char *) alloca (mlen + 7);
 519       memcpy (codesys_mnemonic,
 520               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 521     }
 522
 523 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 524   strcpy (codesys_name + len, "-" op_sys);                                      \
 525   if (mlen != -1)                                                               \
 526     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 527   codesys_name_sym = intern (codesys_name);                                     \
 528   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 529   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 530   if (mlen != -1)                                                               \
 531     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 532       build_string (codesys_mnemonic);                                          \
 533   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 534 } while (0)
 535
 536   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 537   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 538   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 539 }
 540
 541 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 542 Return t if OBJECT is a coding system.
 543 A coding system is an object that defines how text containing multiple
 544 character sets is encoded into a stream of (typically 8-bit) bytes.
 545 The coding system is used to decode the stream into a series of
 546 characters (which may be from multiple charsets) when the text is read
 547 from a file or process, and is used to encode the text back into the
 548 same format when it is written out to a file or process.
 549
 550 For example, many ISO2022-compliant coding systems (such as Compound
 551 Text, which is used for inter-client data under the X Window System)
 552 use escape sequences to switch between different charsets -- Japanese
 553 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 554 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 555 `make-coding-system' for more information.
 556
 557 Coding systems are normally identified using a symbol, and the
 558 symbol is accepted in place of the actual coding system object whenever
 559 a coding system is called for. (This is similar to how faces work.)
 560 */
 561        (object))
 562 {
 563   return CODING_SYSTEMP (object) ? Qt : Qnil;
 564 }
 565
 566 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 567 Retrieve the coding system of the given name.
 568
 569 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 570 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 571 If there is no such coding system, nil is returned.  Otherwise the
 572 associated coding system object is returned.
 573 */
 574        (coding_system_or_name))
 575 {
 576   if (NILP (coding_system_or_name))
 577     coding_system_or_name = Qbinary;
 578   else if (CODING_SYSTEMP (coding_system_or_name))
 579     return coding_system_or_name;
 580   else
 581     CHECK_SYMBOL (coding_system_or_name);
 582
 583   while (1)
 584     {
 585       coding_system_or_name =
 586         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 587
 588       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 589         return coding_system_or_name;
 590     }
 591 }
 592
 593 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 594 Retrieve the coding system of the given name.
 595 Same as `find-coding-system' except that if there is no such
 596 coding system, an error is signaled instead of returning nil.
 597 */
 598        (name))
 599 {
 600   Lisp_Object coding_system = Ffind_coding_system (name);
 601
 602   if (NILP (coding_system))
 603     signal_simple_error ("No such coding system", name);
 604   return coding_system;
 605 }
 606
 607 /* We store the coding systems in hash tables with the names as the key and the
 608    actual coding system object as the value.  Occasionally we need to use them
 609    in a list format.  These routines provide us with that. */
 610 struct coding_system_list_closure
 611 {
 612   Lisp_Object *coding_system_list;
 613 };
 614
 615 static int
 616 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 617                                   void *coding_system_list_closure)
 618 {
 619   /* This function can GC */
 620   struct coding_system_list_closure *cscl =
 621     (struct coding_system_list_closure *) coding_system_list_closure;
 622   Lisp_Object *coding_system_list = cscl->coding_system_list;
 623
 624   *coding_system_list = Fcons (key, *coding_system_list);
 625   return 0;
 626 }
 627
 628 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 629 Return a list of the names of all defined coding systems.
 630 */
 631        ())
 632 {
 633   Lisp_Object coding_system_list = Qnil;
 634   struct gcpro gcpro1;
 635   struct coding_system_list_closure coding_system_list_closure;
 636
 637   GCPRO1 (coding_system_list);
 638   coding_system_list_closure.coding_system_list = &coding_system_list;
 639   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 640                  &coding_system_list_closure);
 641   UNGCPRO;
 642
 643   return coding_system_list;
 644 }
 645
 646 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 647 Return the name of the given coding system.
 648 */
 649        (coding_system))
 650 {
 651   coding_system = Fget_coding_system (coding_system);
 652   return XCODING_SYSTEM_NAME (coding_system);
 653 }
 654
 655 static Lisp_Coding_System *
 656 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 657 {
 658   Lisp_Coding_System *codesys =
 659     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 660
 661   zero_lcrecord (codesys);
 662   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 663   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 664   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 665   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 666   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 667   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 668   CODING_SYSTEM_TYPE     (codesys) = type;
 669   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 670 #ifdef MULE
 671 #ifdef UTF2000
 672   CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
 673 #endif
 674   if (type == CODESYS_ISO2022)
 675     {
 676       int i;
 677       for (i = 0; i < 4; i++)
 678         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 679     }
 680 #ifdef UTF2000
 681   if (type == CODESYS_UTF8)
 682     {
 683       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 684         = Vcharset_ucs;
 685       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 686         = Qnil;
 687       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 688         = Qnil;
 689       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 690         = Qnil;
 691     }
 692   else if (type == CODESYS_BIG5)
 693     {
 694       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 695         = Vcharset_ascii;
 696       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 697         = Vcharset_chinese_big5;
 698       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 699         = Qnil;
 700       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 701         = Qnil;
 702     }
 703 #endif
 704   else if (type == CODESYS_CCL)
 705     {
 706       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 707       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 708     }
 709 #endif /* MULE */
 710   CODING_SYSTEM_NAME (codesys) = name;
 711
 712   return codesys;
 713 }
 714
 715 #ifdef MULE
 716 /* Given a list of charset conversion specs as specified in a Lisp
 717    program, parse it into STORE_HERE. */
 718
 719 static void
 720 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 721                                 Lisp_Object spec_list)
 722 {
 723   Lisp_Object rest;
 724
 725   EXTERNAL_LIST_LOOP (rest, spec_list)
 726     {
 727       Lisp_Object car = XCAR (rest);
 728       Lisp_Object from, to;
 729       struct charset_conversion_spec spec;
 730
 731       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 732         signal_simple_error ("Invalid charset conversion spec", car);
 733       from = Fget_charset (XCAR (car));
 734       to = Fget_charset (XCAR (XCDR (car)));
 735       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 736            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 737         signal_simple_error_2
 738           ("Attempted conversion between different charset types",
 739            from, to);
 740       spec.from_charset = from;
 741       spec.to_charset = to;
 742
 743       Dynarr_add (store_here, spec);
 744     }
 745 }
 746
 747 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 748    specs, return the equivalent as the Lisp programmer would see it.
 749
 750    If LOAD_HERE is 0, return Qnil. */
 751
 752 static Lisp_Object
 753 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 754 {
 755   int i;
 756   Lisp_Object result;
 757
 758   if (!load_here)
 759     return Qnil;
 760   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 761     {
 762       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 763       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 764     }
 765
 766   return Fnreverse (result);
 767 }
 768
 769 #endif /* MULE */
 770
 771 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 772 Register symbol NAME as a coding system.
 773
 774 TYPE describes the conversion method used and should be one of
 775
 776 nil or 'undecided
 777      Automatic conversion.  XEmacs attempts to detect the coding system
 778      used in the file.
 779 'no-conversion
 780      No conversion.  Use this for binary files and such.  On output,
 781      graphic characters that are not in ASCII or Latin-1 will be
 782      replaced by a ?. (For a no-conversion-encoded buffer, these
 783      characters will only be present if you explicitly insert them.)
 784 'shift-jis
 785      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 786 'ucs-4
 787      ISO 10646 UCS-4 encoding.
 788 'utf-8
 789      ISO 10646 UTF-8 encoding.
 790 'iso2022
 791      Any ISO2022-compliant encoding.  Among other things, this includes
 792      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 793      standard Unix encoding for Japanese and other languages), and
 794      Compound Text (the encoding used in X11).  You can specify more
 795      specific information about the conversion with the PROPS argument.
 796 'big5
 797      Big5 (the encoding commonly used for Taiwanese).
 798 'ccl
 799      The conversion is performed using a user-written pseudo-code
 800      program.  CCL (Code Conversion Language) is the name of this
 801      pseudo-code.
 802 'internal
 803      Write out or read in the raw contents of the memory representing
 804      the buffer's text.  This is primarily useful for debugging
 805      purposes, and is only enabled when XEmacs has been compiled with
 806      DEBUG_XEMACS defined (via the --debug configure option).
 807      WARNING: Reading in a file using 'internal conversion can result
 808      in an internal inconsistency in the memory representing a
 809      buffer's text, which will produce unpredictable results and may
 810      cause XEmacs to crash.  Under normal circumstances you should
 811      never use 'internal conversion.
 812
 813 DOC-STRING is a string describing the coding system.
 814
 815 PROPS is a property list, describing the specific nature of the
 816 character set.  Recognized properties are:
 817
 818 'mnemonic
 819      String to be displayed in the modeline when this coding system is
 820      active.
 821
 822 'eol-type
 823      End-of-line conversion to be used.  It should be one of
 824
 825         nil
 826                 Automatically detect the end-of-line type (LF, CRLF,
 827                 or CR).  Also generate subsidiary coding systems named
 828                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 829                 identical to this coding system but have an EOL-TYPE
 830                 value of 'lf, 'crlf, and 'cr, respectively.
 831         'lf
 832                 The end of a line is marked externally using ASCII LF.
 833                 Since this is also the way that XEmacs represents an
 834                 end-of-line internally, specifying this option results
 835                 in no end-of-line conversion.  This is the standard
 836                 format for Unix text files.
 837         'crlf
 838                 The end of a line is marked externally using ASCII
 839                 CRLF.  This is the standard format for MS-DOS text
 840                 files.
 841         'cr
 842                 The end of a line is marked externally using ASCII CR.
 843                 This is the standard format for Macintosh text files.
 844         t
 845                 Automatically detect the end-of-line type but do not
 846                 generate subsidiary coding systems.  (This value is
 847                 converted to nil when stored internally, and
 848                 `coding-system-property' will return nil.)
 849
 850 'disable-composition
 851      If non-nil, composition/decomposition for combining characters
 852      are disabled.
 853
 854 'use-entity-reference
 855      If non-nil, SGML style entity-reference is used for non-system-characters.
 856
 857 'post-read-conversion
 858      Function called after a file has been read in, to perform the
 859      decoding.  Called with two arguments, START and END, denoting
 860      a region of the current buffer to be decoded.
 861
 862 'pre-write-conversion
 863      Function called before a file is written out, to perform the
 864      encoding.  Called with two arguments, START and END, denoting
 865      a region of the current buffer to be encoded.
 866
 867
 868 The following additional properties are recognized if TYPE is 'iso2022:
 869
 870 'charset-g0
 871 'charset-g1
 872 'charset-g2
 873 'charset-g3
 874      The character set initially designated to the G0 - G3 registers.
 875      The value should be one of
 876
 877           -- A charset object (designate that character set)
 878           -- nil (do not ever use this register)
 879           -- t (no character set is initially designated to
 880                 the register, but may be later on; this automatically
 881                 sets the corresponding `force-g*-on-output' property)
 882
 883 'force-g0-on-output
 884 'force-g1-on-output
 885 'force-g2-on-output
 886 'force-g2-on-output
 887      If non-nil, send an explicit designation sequence on output before
 888      using the specified register.
 889
 890 'short
 891      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 892      "ESC $ B" on output in place of the full designation sequences
 893      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 894
 895 'no-ascii-eol
 896      If non-nil, don't designate ASCII to G0 at each end of line on output.
 897      Setting this to non-nil also suppresses other state-resetting that
 898      normally happens at the end of a line.
 899
 900 'no-ascii-cntl
 901      If non-nil, don't designate ASCII to G0 before control chars on output.
 902
 903 'seven
 904      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 905      environment.
 906
 907 'lock-shift
 908      If non-nil, use locking-shift (SO/SI) instead of single-shift
 909      or designation by escape sequence.
 910
 911 'no-iso6429
 912      If non-nil, don't use ISO6429's direction specification.
 913
 914 'escape-quoted
 915      If non-nil, literal control characters that are the same as
 916      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 917      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 918      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 919      so that they can be properly distinguished from an escape sequence.
 920      (Note that doing this results in a non-portable encoding.) This
 921      encoding flag is used for byte-compiled files.  Note that ESC
 922      is a good choice for a quoting character because there are no
 923      escape sequences whose second byte is a character from the Control-0
 924      or Control-1 character sets; this is explicitly disallowed by the
 925      ISO2022 standard.
 926
 927 'input-charset-conversion
 928      A list of conversion specifications, specifying conversion of
 929      characters in one charset to another when decoding is performed.
 930      Each specification is a list of two elements: the source charset,
 931      and the destination charset.
 932
 933 'output-charset-conversion
 934      A list of conversion specifications, specifying conversion of
 935      characters in one charset to another when encoding is performed.
 936      The form of each specification is the same as for
 937      'input-charset-conversion.
 938
 939
 940 The following additional properties are recognized (and required)
 941 if TYPE is 'ccl:
 942
 943 'decode
 944      CCL program used for decoding (converting to internal format).
 945
 946 'encode
 947      CCL program used for encoding (converting to external format).
 948 */
 949        (name, type, doc_string, props))
 950 {
 951   Lisp_Coding_System *codesys;
 952   enum coding_system_type ty;
 953   int need_to_setup_eol_systems = 1;
 954
 955   /* Convert type to constant */
 956   if (NILP (type) || EQ (type, Qundecided))
 957                                       { ty = CODESYS_AUTODETECT; }
 958 #ifdef MULE
 959   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 960   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 961   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 962   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 963   else if (EQ (type, Qutf16))         { ty = CODESYS_UTF16; }
 964   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 965   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 966 #endif
 967   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 968 #ifdef DEBUG_XEMACS
 969   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 970 #endif
 971   else
 972     signal_simple_error ("Invalid coding system type", type);
 973
 974   CHECK_SYMBOL (name);
 975
 976   codesys = allocate_coding_system (ty, name);
 977
 978   if (NILP (doc_string))
 979     doc_string = build_string ("");
 980   else
 981     CHECK_STRING (doc_string);
 982   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 983
 984   {
 985     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 986       {
 987         if (EQ (key, Qmnemonic))
 988           {
 989             if (!NILP (value))
 990               CHECK_STRING (value);
 991             CODING_SYSTEM_MNEMONIC (codesys) = value;
 992           }
 993
 994         else if (EQ (key, Qeol_type))
 995           {
 996             need_to_setup_eol_systems = NILP (value);
 997             if (EQ (value, Qt))
 998               value = Qnil;
 999             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
1000           }
1001
1002         else if (EQ (key, Qpost_read_conversion))
1003           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
1004         else if (EQ (key, Qpre_write_conversion))
1005           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
1006 #ifdef UTF2000
1007         else if (EQ (key, Qdisable_composition))
1008           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
1009         else if (EQ (key, Quse_entity_reference))
1010           CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
1011 #endif
1012 #ifdef MULE
1013         else if (ty == CODESYS_ISO2022)
1014           {
1015 #define FROB_INITIAL_CHARSET(charset_num) \
1016   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1017     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1018
1019             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1020             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1021             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1022             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1023
1024 #define FROB_FORCE_CHARSET(charset_num) \
1025   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1026
1027             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1028             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1029             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1030             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1031
1032 #define FROB_BOOLEAN_PROPERTY(prop) \
1033   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1034
1035             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
1036             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1037             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1038             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
1039             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1040             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1041             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1042
1043             else if (EQ (key, Qinput_charset_conversion))
1044               {
1045                 codesys->iso2022.input_conv =
1046                   Dynarr_new (charset_conversion_spec);
1047                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1048                                                 value);
1049               }
1050             else if (EQ (key, Qoutput_charset_conversion))
1051               {
1052                 codesys->iso2022.output_conv =
1053                   Dynarr_new (charset_conversion_spec);
1054                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1055                                                 value);
1056               }
1057 #ifdef UTF2000
1058             else if (EQ (key, Qccs_priority_list))
1059               {
1060                 codesys->ccs_priority_list = value;
1061               }
1062 #endif
1063             else
1064               signal_simple_error ("Unrecognized property", key);
1065           }
1066 #ifdef UTF2000
1067         else if (ty == CODESYS_UTF8)
1068           {
1069             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1070             else if (EQ (key, Qcharset_g1))
1071               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1072             else if (EQ (key, Qcharset_g2))
1073               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1074             else
1075               signal_simple_error ("Unrecognized property", key);
1076           }
1077         else if (ty == CODESYS_BIG5)
1078           {
1079             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1080             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1081             else
1082               signal_simple_error ("Unrecognized property", key);
1083           }
1084 #endif
1085         else if (EQ (type, Qccl))
1086           {
1087             Lisp_Object sym;
1088             struct ccl_program test_ccl;
1089             Extbyte *suffix;
1090
1091             /* Check key first.  */
1092             if (EQ (key, Qdecode))
1093               suffix = "-ccl-decode";
1094             else if (EQ (key, Qencode))
1095               suffix = "-ccl-encode";
1096             else
1097               signal_simple_error ("Unrecognized property", key);
1098
1099             /* If value is vector, register it as a ccl program
1100                associated with an newly created symbol for
1101                backward compatibility.  */
1102             if (VECTORP (value))
1103               {
1104                 sym = Fintern (concat2 (Fsymbol_name (name),
1105                                         build_string (suffix)),
1106                                Qnil);
1107                 Fregister_ccl_program (sym, value);
1108               }
1109             else
1110               {
1111                 CHECK_SYMBOL (value);
1112                 sym = value;
1113               }
1114             /* check if the given ccl programs are valid.  */
1115             if (setup_ccl_program (&test_ccl, sym) < 0)
1116               signal_simple_error ("Invalid CCL program", value);
1117
1118             if (EQ (key, Qdecode))
1119               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1120             else if (EQ (key, Qencode))
1121               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1122
1123           }
1124 #endif /* MULE */
1125         else
1126           signal_simple_error ("Unrecognized property", key);
1127       }
1128   }
1129
1130   if (need_to_setup_eol_systems)
1131     setup_eol_coding_systems (codesys);
1132
1133   {
1134     Lisp_Object codesys_obj;
1135     XSETCODING_SYSTEM (codesys_obj, codesys);
1136     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1137     return codesys_obj;
1138   }
1139 }
1140
1141 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1142 Copy OLD-CODING-SYSTEM to NEW-NAME.
1143 If NEW-NAME does not name an existing coding system, a new one will
1144 be created.
1145 */
1146        (old_coding_system, new_name))
1147 {
1148   Lisp_Object new_coding_system;
1149   old_coding_system = Fget_coding_system (old_coding_system);
1150   new_coding_system = Ffind_coding_system (new_name);
1151   if (NILP (new_coding_system))
1152     {
1153       XSETCODING_SYSTEM (new_coding_system,
1154                          allocate_coding_system
1155                          (XCODING_SYSTEM_TYPE (old_coding_system),
1156                           new_name));
1157       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1158     }
1159
1160   {
1161     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1162     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1163     memcpy (((char *) to  ) + sizeof (to->header),
1164             ((char *) from) + sizeof (from->header),
1165             sizeof (*from) - sizeof (from->header));
1166     to->name = new_name;
1167   }
1168   return new_coding_system;
1169 }
1170
1171 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1172 Return t if OBJECT names a coding system, and is not a coding system alias.
1173 */
1174        (object))
1175 {
1176   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1177     ? Qt : Qnil;
1178 }
1179
1180 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1181 Return t if OBJECT is a coding system alias.
1182 All coding system aliases are created by `define-coding-system-alias'.
1183 */
1184        (object))
1185 {
1186   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1187     ? Qt : Qnil;
1188 }
1189
1190 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1191 Return the coding-system symbol for which symbol ALIAS is an alias.
1192 */
1193        (alias))
1194 {
1195   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1196   if (SYMBOLP (aliasee))
1197     return aliasee;
1198   else
1199     signal_simple_error ("Symbol is not a coding system alias", alias);
1200   return Qnil;          /* To keep the compiler happy */
1201 }
1202
1203 static Lisp_Object
1204 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1205 {
1206   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1207                   Qnil);
1208 }
1209
1210 /* A maphash function, for removing dangling coding system aliases. */
1211 static int
1212 dangling_coding_system_alias_p (Lisp_Object alias,
1213                                 Lisp_Object aliasee,
1214                                 void *dangling_aliases)
1215 {
1216   if (SYMBOLP (aliasee)
1217       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1218     {
1219       (*(int *) dangling_aliases)++;
1220       return 1;
1221     }
1222   else
1223     return 0;
1224 }
1225
1226 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1227 Define symbol ALIAS as an alias for coding system ALIASEE.
1228
1229 You can use this function to redefine an alias that has already been defined,
1230 but you cannot redefine a name which is the canonical name for a coding system.
1231 \(a canonical name of a coding system is what is returned when you call
1232 `coding-system-name' on a coding system).
1233
1234 ALIASEE itself can be an alias, which allows you to define nested aliases.
1235
1236 You are forbidden, however, from creating alias loops or `dangling' aliases.
1237 These will be detected, and an error will be signaled if you attempt to do so.
1238
1239 If ALIASEE is nil, then ALIAS will simply be undefined.
1240
1241 See also `coding-system-alias-p', `coding-system-aliasee',
1242 and `coding-system-canonical-name-p'.
1243 */
1244        (alias, aliasee))
1245 {
1246   Lisp_Object real_coding_system, probe;
1247
1248   CHECK_SYMBOL (alias);
1249
1250   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1251     signal_simple_error
1252       ("Symbol is the canonical name of a coding system and cannot be redefined",
1253        alias);
1254
1255   if (NILP (aliasee))
1256     {
1257       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1258       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1259       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1260
1261       Fremhash (alias, Vcoding_system_hash_table);
1262
1263       /* Undefine subsidiary aliases,
1264          presumably created by a previous call to this function */
1265       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1266           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1267           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1268         {
1269           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1270           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1271           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1272         }
1273
1274       /* Undefine dangling coding system aliases. */
1275       {
1276         int dangling_aliases;
1277
1278         do {
1279           dangling_aliases = 0;
1280           elisp_map_remhash (dangling_coding_system_alias_p,
1281                              Vcoding_system_hash_table,
1282                              &dangling_aliases);
1283         } while (dangling_aliases > 0);
1284       }
1285
1286       return Qnil;
1287     }
1288
1289   if (CODING_SYSTEMP (aliasee))
1290     aliasee = XCODING_SYSTEM_NAME (aliasee);
1291
1292   /* Checks that aliasee names a coding-system */
1293   real_coding_system = Fget_coding_system (aliasee);
1294
1295   /* Check for coding system alias loops */
1296   if (EQ (alias, aliasee))
1297     alias_loop: signal_simple_error_2
1298       ("Attempt to create a coding system alias loop", alias, aliasee);
1299
1300   for (probe = aliasee;
1301        SYMBOLP (probe);
1302        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1303     {
1304       if (EQ (probe, alias))
1305         goto alias_loop;
1306     }
1307
1308   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1309
1310   /* Set up aliases for subsidiaries.
1311      #### There must be a better way to handle subsidiary coding systems. */
1312   {
1313     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1314     int i;
1315     for (i = 0; i < countof (suffixes); i++)
1316       {
1317         Lisp_Object alias_subsidiary =
1318           append_suffix_to_symbol (alias, suffixes[i]);
1319         Lisp_Object aliasee_subsidiary =
1320           append_suffix_to_symbol (aliasee, suffixes[i]);
1321
1322         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1323           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1324       }
1325   }
1326   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1327      but it doesn't look intentional, so I'd rather return something
1328      meaningful or nothing at all. */
1329   return Qnil;
1330 }
1331
1332 static Lisp_Object
1333 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1334 {
1335   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1336   Lisp_Object new_coding_system;
1337
1338   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1339     return coding_system;
1340
1341   switch (type)
1342     {
1343     case EOL_AUTODETECT: return coding_system;
1344     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1345     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1346     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1347     default:       abort (); return Qnil;
1348     }
1349
1350   return NILP (new_coding_system) ? coding_system : new_coding_system;
1351 }
1352
1353 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1354 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1355 */
1356        (coding_system, eol_type))
1357 {
1358   coding_system = Fget_coding_system (coding_system);
1359
1360   return subsidiary_coding_system (coding_system,
1361                                    symbol_to_eol_type (eol_type));
1362 }
1363
1364 \f
1365 /************************************************************************/
1366 /*                         Coding system accessors                      */
1367 /************************************************************************/
1368
1369 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1370 Return the doc string for CODING-SYSTEM.
1371 */
1372        (coding_system))
1373 {
1374   coding_system = Fget_coding_system (coding_system);
1375   return XCODING_SYSTEM_DOC_STRING (coding_system);
1376 }
1377
1378 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1379 Return the type of CODING-SYSTEM.
1380 */
1381        (coding_system))
1382 {
1383   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1384     {
1385     default: abort ();
1386     case CODESYS_AUTODETECT:    return Qundecided;
1387 #ifdef MULE
1388     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1389     case CODESYS_ISO2022:       return Qiso2022;
1390     case CODESYS_BIG5:          return Qbig5;
1391     case CODESYS_UCS4:          return Qucs4;
1392     case CODESYS_UTF16:         return Qutf16;
1393     case CODESYS_UTF8:          return Qutf8;
1394     case CODESYS_CCL:           return Qccl;
1395 #endif
1396     case CODESYS_NO_CONVERSION: return Qno_conversion;
1397 #ifdef DEBUG_XEMACS
1398     case CODESYS_INTERNAL:      return Qinternal;
1399 #endif
1400     }
1401 }
1402
1403 #ifdef MULE
1404 static
1405 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1406 {
1407   Lisp_Object cs
1408     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1409
1410   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1411 }
1412
1413 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1414 Return initial charset of CODING-SYSTEM designated to GNUM.
1415 GNUM allows 0 .. 3.
1416 */
1417        (coding_system, gnum))
1418 {
1419   coding_system = Fget_coding_system (coding_system);
1420   CHECK_INT (gnum);
1421
1422   return coding_system_charset (coding_system, XINT (gnum));
1423 }
1424 #endif /* MULE */
1425
1426 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1427 Return the PROP property of CODING-SYSTEM.
1428 */
1429        (coding_system, prop))
1430 {
1431   int i, ok = 0;
1432   enum coding_system_type type;
1433
1434   coding_system = Fget_coding_system (coding_system);
1435   CHECK_SYMBOL (prop);
1436   type = XCODING_SYSTEM_TYPE (coding_system);
1437
1438   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1439     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1440       {
1441         ok = 1;
1442         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1443           {
1444           case CODESYS_PROP_ALL_OK:
1445             break;
1446 #ifdef MULE
1447           case CODESYS_PROP_ISO2022:
1448             if (type != CODESYS_ISO2022)
1449               signal_simple_error
1450                 ("Property only valid in ISO2022 coding systems",
1451                  prop);
1452             break;
1453
1454           case CODESYS_PROP_CCL:
1455             if (type != CODESYS_CCL)
1456               signal_simple_error
1457                 ("Property only valid in CCL coding systems",
1458                  prop);
1459             break;
1460 #endif /* MULE */
1461           default:
1462             abort ();
1463           }
1464       }
1465
1466   if (!ok)
1467     signal_simple_error ("Unrecognized property", prop);
1468
1469   if (EQ (prop, Qname))
1470     return XCODING_SYSTEM_NAME (coding_system);
1471   else if (EQ (prop, Qtype))
1472     return Fcoding_system_type (coding_system);
1473   else if (EQ (prop, Qdoc_string))
1474     return XCODING_SYSTEM_DOC_STRING (coding_system);
1475   else if (EQ (prop, Qmnemonic))
1476     return XCODING_SYSTEM_MNEMONIC (coding_system);
1477   else if (EQ (prop, Qeol_type))
1478     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1479   else if (EQ (prop, Qeol_lf))
1480     return XCODING_SYSTEM_EOL_LF (coding_system);
1481   else if (EQ (prop, Qeol_crlf))
1482     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1483   else if (EQ (prop, Qeol_cr))
1484     return XCODING_SYSTEM_EOL_CR (coding_system);
1485   else if (EQ (prop, Qpost_read_conversion))
1486     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1487   else if (EQ (prop, Qpre_write_conversion))
1488     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1489 #ifdef MULE
1490 #ifdef UTF2000
1491   else if (EQ (prop, Qdisable_composition))
1492     return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1493   else if (EQ (prop, Quse_entity_reference))
1494     return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1495   else if (EQ (prop, Qccs_priority_list))
1496     return XCODING_SYSTEM_CCS_PRIORITY_LIST (coding_system);
1497 #endif
1498   else if (type == CODESYS_ISO2022)
1499     {
1500       if (EQ (prop, Qcharset_g0))
1501         return coding_system_charset (coding_system, 0);
1502       else if (EQ (prop, Qcharset_g1))
1503         return coding_system_charset (coding_system, 1);
1504       else if (EQ (prop, Qcharset_g2))
1505         return coding_system_charset (coding_system, 2);
1506       else if (EQ (prop, Qcharset_g3))
1507         return coding_system_charset (coding_system, 3);
1508
1509 #define FORCE_CHARSET(charset_num) \
1510   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1511    (coding_system, charset_num) ? Qt : Qnil)
1512
1513       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1514       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1515       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1516       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1517
1518 #define LISP_BOOLEAN(prop) \
1519   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1520
1521       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1522       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1523       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1524       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1525       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1526       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1527       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1528
1529       else if (EQ (prop, Qinput_charset_conversion))
1530         return
1531           unparse_charset_conversion_specs
1532             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1533       else if (EQ (prop, Qoutput_charset_conversion))
1534         return
1535           unparse_charset_conversion_specs
1536             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1537       else
1538         abort ();
1539     }
1540   else if (type == CODESYS_CCL)
1541     {
1542       if (EQ (prop, Qdecode))
1543         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1544       else if (EQ (prop, Qencode))
1545         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1546       else
1547         abort ();
1548     }
1549 #endif /* MULE */
1550   else
1551     abort ();
1552
1553   return Qnil; /* not reached */
1554 }
1555
1556 \f
1557 /************************************************************************/
1558 /*                       Coding category functions                      */
1559 /************************************************************************/
1560
1561 static int
1562 decode_coding_category (Lisp_Object symbol)
1563 {
1564   int i;
1565
1566   CHECK_SYMBOL (symbol);
1567   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1568     if (EQ (coding_category_symbol[i], symbol))
1569       return i;
1570
1571   signal_simple_error ("Unrecognized coding category", symbol);
1572   return 0; /* not reached */
1573 }
1574
1575 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1576 Return a list of all recognized coding categories.
1577 */
1578        ())
1579 {
1580   int i;
1581   Lisp_Object list = Qnil;
1582
1583   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1584     list = Fcons (coding_category_symbol[i], list);
1585   return list;
1586 }
1587
1588 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1589 Change the priority order of the coding categories.
1590 LIST should be list of coding categories, in descending order of
1591 priority.  Unspecified coding categories will be lower in priority
1592 than all specified ones, in the same relative order they were in
1593 previously.
1594 */
1595        (list))
1596 {
1597   int category_to_priority[CODING_CATEGORY_LAST];
1598   int i, j;
1599   Lisp_Object rest;
1600
1601   /* First generate a list that maps coding categories to priorities. */
1602
1603   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1604     category_to_priority[i] = -1;
1605
1606   /* Highest priority comes from the specified list. */
1607   i = 0;
1608   EXTERNAL_LIST_LOOP (rest, list)
1609     {
1610       int cat = decode_coding_category (XCAR (rest));
1611
1612       if (category_to_priority[cat] >= 0)
1613         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1614       category_to_priority[cat] = i++;
1615     }
1616
1617   /* Now go through the existing categories by priority to retrieve
1618      the categories not yet specified and preserve their priority
1619      order. */
1620   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1621     {
1622       int cat = fcd->coding_category_by_priority[j];
1623       if (category_to_priority[cat] < 0)
1624         category_to_priority[cat] = i++;
1625     }
1626
1627   /* Now we need to construct the inverse of the mapping we just
1628      constructed. */
1629
1630   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1631     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1632
1633   /* Phew!  That was confusing. */
1634   return Qnil;
1635 }
1636
1637 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1638 Return a list of coding categories in descending order of priority.
1639 */
1640        ())
1641 {
1642   int i;
1643   Lisp_Object list = Qnil;
1644
1645   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1646     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1647                   list);
1648   return list;
1649 }
1650
1651 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1652 Change the coding system associated with a coding category.
1653 */
1654        (coding_category, coding_system))
1655 {
1656   int cat = decode_coding_category (coding_category);
1657
1658   coding_system = Fget_coding_system (coding_system);
1659   fcd->coding_category_system[cat] = coding_system;
1660   return Qnil;
1661 }
1662
1663 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1664 Return the coding system associated with a coding category.
1665 */
1666        (coding_category))
1667 {
1668   int cat = decode_coding_category (coding_category);
1669   Lisp_Object sys = fcd->coding_category_system[cat];
1670
1671   if (!NILP (sys))
1672     return XCODING_SYSTEM_NAME (sys);
1673   return Qnil;
1674 }
1675
1676 \f
1677 /************************************************************************/
1678 /*                     Detecting the encoding of data                   */
1679 /************************************************************************/
1680
1681 struct detection_state
1682 {
1683   eol_type_t eol_type;
1684   int seen_non_ascii;
1685   int mask;
1686 #ifdef MULE
1687   struct
1688     {
1689       int mask;
1690       int in_second_byte;
1691     }
1692   big5;
1693
1694   struct
1695     {
1696       int mask;
1697       int in_second_byte;
1698     }
1699   shift_jis;
1700
1701   struct
1702     {
1703       int mask;
1704       int in_byte;
1705   }
1706   ucs4;
1707
1708   struct
1709     {
1710       int mask;
1711       int in_byte;
1712     }
1713   utf16;
1714
1715   struct
1716     {
1717       int mask;
1718       int in_byte;
1719     }
1720   utf8;
1721
1722   struct
1723     {
1724       int mask;
1725       int initted;
1726       struct iso2022_decoder iso;
1727       unsigned int flags;
1728       int high_byte_count;
1729       unsigned int saw_single_shift:1;
1730     }
1731   iso2022;
1732 #endif
1733   struct
1734     {
1735       int seen_anything;
1736       int just_saw_cr;
1737     }
1738   eol;
1739 };
1740
1741 static int
1742 acceptable_control_char_p (int c)
1743 {
1744   switch (c)
1745     {
1746       /* Allow and ignore control characters that you might
1747          reasonably see in a text file */
1748     case '\r':
1749     case '\n':
1750     case '\t':
1751     case  7: /* bell */
1752     case  8: /* backspace */
1753     case 11: /* vertical tab */
1754     case 12: /* form feed */
1755     case 26: /* MS-DOS C-z junk */
1756     case 31: /* '^_' -- for info */
1757       return 1;
1758     default:
1759       return 0;
1760     }
1761 }
1762
1763 static int
1764 mask_has_at_most_one_bit_p (int mask)
1765 {
1766   /* Perhaps the only thing useful you learn from intensive Microsoft
1767      technical interviews */
1768   return (mask & (mask - 1)) == 0;
1769 }
1770
1771 static eol_type_t
1772 detect_eol_type (struct detection_state *st, const Extbyte *src,
1773                  Lstream_data_count n)
1774 {
1775   while (n--)
1776     {
1777       unsigned char c = *(unsigned char *)src++;
1778       if (c == '\n')
1779         {
1780           if (st->eol.just_saw_cr)
1781             return EOL_CRLF;
1782           else if (st->eol.seen_anything)
1783             return EOL_LF;
1784         }
1785       else if (st->eol.just_saw_cr)
1786         return EOL_CR;
1787       else if (c == '\r')
1788         st->eol.just_saw_cr = 1;
1789       else
1790         st->eol.just_saw_cr = 0;
1791       st->eol.seen_anything = 1;
1792     }
1793
1794   return EOL_AUTODETECT;
1795 }
1796
1797 /* Attempt to determine the encoding and EOL type of the given text.
1798    Before calling this function for the first type, you must initialize
1799    st->eol_type as appropriate and initialize st->mask to ~0.
1800
1801    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1802    not yet known.
1803
1804    st->mask holds the determined coding category mask, or ~0 if only
1805    ASCII has been seen so far.
1806
1807    Returns:
1808
1809    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1810         is present in st->mask
1811    1 == definitive answers are here for both st->eol_type and st->mask
1812 */
1813
1814 static int
1815 detect_coding_type (struct detection_state *st, const Extbyte *src,
1816                     Lstream_data_count n, int just_do_eol)
1817 {
1818   if (st->eol_type == EOL_AUTODETECT)
1819     st->eol_type = detect_eol_type (st, src, n);
1820
1821   if (just_do_eol)
1822     return st->eol_type != EOL_AUTODETECT;
1823
1824   if (!st->seen_non_ascii)
1825     {
1826       for (; n; n--, src++)
1827         {
1828           unsigned char c = *(unsigned char *) src;
1829           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1830             {
1831               st->seen_non_ascii = 1;
1832 #ifdef MULE
1833               st->shift_jis.mask = ~0;
1834               st->big5.mask = ~0;
1835               st->ucs4.mask = ~0;
1836               st->utf16.mask = ~0;
1837               st->utf8.mask = ~0;
1838               st->iso2022.mask = ~0;
1839 #endif
1840               break;
1841             }
1842         }
1843     }
1844
1845   if (!n)
1846     return 0;
1847 #ifdef MULE
1848   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1849     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1850   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1851     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1852   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1853     st->big5.mask = detect_coding_big5 (st, src, n);
1854   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1855     st->utf8.mask = detect_coding_utf8 (st, src, n);
1856   if (!mask_has_at_most_one_bit_p (st->utf16.mask))
1857     st->utf16.mask = detect_coding_utf16 (st, src, n);
1858   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1859     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1860
1861   st->mask
1862     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1863     | st->utf8.mask | st->ucs4.mask;
1864 #endif
1865   {
1866     int retval = mask_has_at_most_one_bit_p (st->mask);
1867     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1868     return retval && st->eol_type != EOL_AUTODETECT;
1869   }
1870 }
1871
1872 static Lisp_Object
1873 coding_system_from_mask (int mask)
1874 {
1875   if (mask == ~0)
1876     {
1877       /* If the file was entirely or basically ASCII, use the
1878          default value of `buffer-file-coding-system'. */
1879       Lisp_Object retval =
1880         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1881       if (!NILP (retval))
1882         {
1883           retval = Ffind_coding_system (retval);
1884           if (NILP (retval))
1885             {
1886               warn_when_safe
1887                 (Qbad_variable, Qwarning,
1888                  "Invalid `default-buffer-file-coding-system', set to nil");
1889               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1890             }
1891         }
1892       if (NILP (retval))
1893         retval = Fget_coding_system (Qraw_text);
1894       return retval;
1895     }
1896   else
1897     {
1898       int i;
1899       int cat = -1;
1900 #ifdef MULE
1901       mask = postprocess_iso2022_mask (mask);
1902 #endif
1903       /* Look through the coding categories by priority and find
1904          the first one that is allowed. */
1905       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1906         {
1907           cat = fcd->coding_category_by_priority[i];
1908           if ((mask & (1 << cat)) &&
1909               !NILP (fcd->coding_category_system[cat]))
1910             break;
1911         }
1912       if (cat >= 0)
1913         return fcd->coding_category_system[cat];
1914       else
1915         return Fget_coding_system (Qraw_text);
1916     }
1917 }
1918
1919 /* Given a seekable read stream and potential coding system and EOL type
1920    as specified, do any autodetection that is called for.  If the
1921    coding system and/or EOL type are not `autodetect', they will be left
1922    alone; but this function will never return an autodetect coding system
1923    or EOL type.
1924
1925    This function does not automatically fetch subsidiary coding systems;
1926    that should be unnecessary with the explicit eol-type argument. */
1927
1928 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1929 /* number of leading lines to check for a coding cookie */
1930 #define LINES_TO_CHECK 2
1931
1932 void
1933 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1934                               eol_type_t *eol_type_in_out)
1935 {
1936   struct detection_state decst;
1937
1938   if (*eol_type_in_out == EOL_AUTODETECT)
1939     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1940
1941   xzero (decst);
1942   decst.eol_type = *eol_type_in_out;
1943   decst.mask = ~0;
1944
1945   /* If autodetection is called for, do it now. */
1946   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1947       || *eol_type_in_out == EOL_AUTODETECT)
1948     {
1949       Extbyte buf[4096];
1950       Lisp_Object coding_system = Qnil;
1951       Extbyte *p;
1952       Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1953       Extbyte *scan_end;
1954       int lines_checked = 0;
1955
1956       /* Look for initial "-*-"; mode line prefix */
1957       for (p = buf,
1958              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1959            p <= scan_end
1960              && lines_checked < LINES_TO_CHECK;
1961            p++)
1962         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1963           {
1964             Extbyte *local_vars_beg = p + 3;
1965             /* Look for final "-*-"; mode line suffix */
1966             for (p = local_vars_beg,
1967                    scan_end = buf + nread - LENGTH ("-*-");
1968                  p <= scan_end
1969                    && lines_checked < LINES_TO_CHECK;
1970                  p++)
1971               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1972                 {
1973                   Extbyte *suffix = p;
1974                   /* Look for "coding:" */
1975                   for (p = local_vars_beg,
1976                          scan_end = suffix - LENGTH ("coding:?");
1977                        p <= scan_end;
1978                        p++)
1979                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1980                         && (p == local_vars_beg
1981                             || (*(p-1) == ' '  ||
1982                                 *(p-1) == '\t' ||
1983                                 *(p-1) == ';')))
1984                       {
1985                         Extbyte save;
1986                         int n;
1987                         p += LENGTH ("coding:");
1988                         while (*p == ' ' || *p == '\t') p++;
1989
1990                         /* Get coding system name */
1991                         save = *suffix; *suffix = '\0';
1992                         /* Characters valid in a MIME charset name (rfc 1521),
1993                            and in a Lisp symbol name. */
1994                         n = strspn ( (char *) p,
1995                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1996                                     "abcdefghijklmnopqrstuvwxyz"
1997                                     "0123456789"
1998                                     "!$%&*+-.^_{|}~");
1999                         *suffix = save;
2000                         if (n > 0)
2001                           {
2002                             save = p[n]; p[n] = '\0';
2003                             coding_system =
2004                               Ffind_coding_system (intern ((char *) p));
2005                             p[n] = save;
2006                           }
2007                         break;
2008                       }
2009                   break;
2010                 }
2011               /* #### file must use standard EOLs or we miss 2d line */
2012               /* #### not to mention this is broken for UTF-16 DOS files */
2013               else if (*p == '\n' || *p == '\r')
2014                 {
2015                   lines_checked++;
2016                   /* skip past multibyte (DOS) newline */
2017                   if (*p == '\r' && *(p+1) == '\n') p++;
2018                 }
2019             break;
2020           }
2021         /* #### file must use standard EOLs or we miss 2d line */
2022         /* #### not to mention this is broken for UTF-16 DOS files */
2023         else if (*p == '\n' || *p == '\r')
2024           {
2025             lines_checked++;
2026             /* skip past multibyte (DOS) newline */
2027             if (*p == '\r' && *(p+1) == '\n') p++;
2028           }
2029
2030       if (NILP (coding_system))
2031         do
2032           {
2033             if (detect_coding_type (&decst, buf, nread,
2034                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
2035                                     != CODESYS_AUTODETECT))
2036               break;
2037             nread = Lstream_read (stream, buf, sizeof (buf));
2038             if (nread == 0)
2039               break;
2040           }
2041         while (1);
2042
2043       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
2044                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
2045         do
2046           {
2047             if (detect_coding_type (&decst, buf, nread, 1))
2048               break;
2049             nread = Lstream_read (stream, buf, sizeof (buf));
2050             if (!nread)
2051               break;
2052           }
2053         while (1);
2054
2055       *eol_type_in_out = decst.eol_type;
2056       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2057         {
2058           if (NILP (coding_system))
2059             *codesys_in_out = coding_system_from_mask (decst.mask);
2060           else
2061             *codesys_in_out = coding_system;
2062         }
2063     }
2064
2065   /* If we absolutely can't determine the EOL type, just assume LF. */
2066   if (*eol_type_in_out == EOL_AUTODETECT)
2067     *eol_type_in_out = EOL_LF;
2068
2069   Lstream_rewind (stream);
2070 }
2071
2072 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2073 Detect coding system of the text in the region between START and END.
2074 Return a list of possible coding systems ordered by priority.
2075 If only ASCII characters are found, return 'undecided or one of
2076 its subsidiary coding systems according to a detected end-of-line
2077 type.  Optional arg BUFFER defaults to the current buffer.
2078 */
2079        (start, end, buffer))
2080 {
2081   Lisp_Object val = Qnil;
2082   struct buffer *buf = decode_buffer (buffer, 0);
2083   Bufpos b, e;
2084   Lisp_Object instream, lb_instream;
2085   Lstream *istr, *lb_istr;
2086   struct detection_state decst;
2087   struct gcpro gcpro1, gcpro2;
2088
2089   get_buffer_range_char (buf, start, end, &b, &e, 0);
2090   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2091   lb_istr = XLSTREAM (lb_instream);
2092   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2093   istr = XLSTREAM (instream);
2094   GCPRO2 (instream, lb_instream);
2095   xzero (decst);
2096   decst.eol_type = EOL_AUTODETECT;
2097   decst.mask = ~0;
2098   while (1)
2099     {
2100       Extbyte random_buffer[4096];
2101       Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2102
2103       if (!nread)
2104         break;
2105       if (detect_coding_type (&decst, random_buffer, nread, 0))
2106         break;
2107     }
2108
2109   if (decst.mask == ~0)
2110     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2111                                     decst.eol_type);
2112   else
2113     {
2114       int i;
2115
2116       val = Qnil;
2117 #ifdef MULE
2118       decst.mask = postprocess_iso2022_mask (decst.mask);
2119 #endif
2120       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2121         {
2122           int sys = fcd->coding_category_by_priority[i];
2123           if (decst.mask & (1 << sys))
2124             {
2125               Lisp_Object codesys = fcd->coding_category_system[sys];
2126               if (!NILP (codesys))
2127                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2128               val = Fcons (codesys, val);
2129             }
2130         }
2131     }
2132   Lstream_close (istr);
2133   UNGCPRO;
2134   Lstream_delete (istr);
2135   Lstream_delete (lb_istr);
2136   return val;
2137 }
2138
2139 \f
2140 /************************************************************************/
2141 /*           Converting to internal Mule format ("decoding")            */
2142 /************************************************************************/
2143
2144 /* A decoding stream is a stream used for decoding text (i.e.
2145    converting from some external format to internal format).
2146    The decoding-stream object keeps track of the actual coding
2147    stream, the stream that is at the other end, and data that
2148    needs to be persistent across the lifetime of the stream. */
2149
2150 /* Handle the EOL stuff related to just-read-in character C.
2151    EOL_TYPE is the EOL type of the coding stream.
2152    FLAGS is the current value of FLAGS in the coding stream, and may
2153    be modified by this macro.  (The macro only looks at the
2154    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2155    bytes are to be written.  You need to also define a local goto
2156    label "label_continue_loop" that is at the end of the main
2157    character-reading loop.
2158
2159    If C is a CR character, then this macro handles it entirely and
2160    jumps to label_continue_loop.  Otherwise, this macro does not add
2161    anything to DST, and continues normally.  You should continue
2162    processing C normally after this macro. */
2163
2164 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2165 do {                                                            \
2166   if (c == '\r')                                                \
2167     {                                                           \
2168       if (eol_type == EOL_CR)                                   \
2169         Dynarr_add (dst, '\n');                                 \
2170       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2171         Dynarr_add (dst, c);                                    \
2172       else                                                      \
2173         flags |= CODING_STATE_CR;                               \
2174       goto label_continue_loop;                                 \
2175     }                                                           \
2176   else if (flags & CODING_STATE_CR)                             \
2177     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2178       if (c != '\n')                                            \
2179         Dynarr_add (dst, '\r');                                 \
2180       flags &= ~CODING_STATE_CR;                                \
2181     }                                                           \
2182 } while (0)
2183
2184 /* C should be a binary character in the range 0 - 255; convert
2185    to internal format and add to Dynarr DST. */
2186
2187 #ifdef UTF2000
2188 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2189 do {                                            \
2190   if (BYTE_ASCII_P (c))                         \
2191     Dynarr_add (dst, c);                        \
2192   else                                          \
2193     {                                           \
2194       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2195       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2196     }                                           \
2197 } while (0)
2198
2199 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2200 INLINE_HEADER void
2201 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2202 {
2203   if ( c <= 0x7f )
2204     {
2205       Dynarr_add (dst, c);
2206     }
2207   else if ( c <= 0x7ff )
2208     {
2209       Dynarr_add (dst, (c >> 6) | 0xc0);
2210       Dynarr_add (dst, (c & 0x3f) | 0x80);
2211     }
2212   else if ( c <= 0xffff )
2213     {
2214       Dynarr_add (dst,  (c >> 12) | 0xe0);
2215       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2216       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2217     }
2218   else if ( c <= 0x1fffff )
2219     {
2220       Dynarr_add (dst,  (c >> 18) | 0xf0);
2221       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2222       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2223       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2224     }
2225   else if ( c <= 0x3ffffff )
2226     {
2227       Dynarr_add (dst,  (c >> 24) | 0xf8);
2228       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2229       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2230       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2231       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2232     }
2233   else
2234     {
2235       Dynarr_add (dst,  (c >> 30) | 0xfc);
2236       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2237       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2238       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2239       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2240       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2241     }
2242 }
2243 #else
2244 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2245 do {                                            \
2246   if (BYTE_ASCII_P (c))                         \
2247     Dynarr_add (dst, c);                        \
2248   else if (BYTE_C1_P (c))                       \
2249     {                                           \
2250       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2251       Dynarr_add (dst, c + 0x20);               \
2252     }                                           \
2253   else                                          \
2254     {                                           \
2255       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2256       Dynarr_add (dst, c);                      \
2257     }                                           \
2258 } while (0)
2259 #endif
2260
2261 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2262 do {                                    \
2263   if (ch)                               \
2264     {                                   \
2265       DECODE_ADD_BINARY_CHAR (ch, dst); \
2266       ch = 0;                           \
2267     }                                   \
2268 } while (0)
2269
2270 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2271 do {                                    \
2272   if (flags & CODING_STATE_END)         \
2273     {                                   \
2274       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2275       if (flags & CODING_STATE_CR)      \
2276         Dynarr_add (dst, '\r');         \
2277     }                                   \
2278 } while (0)
2279
2280 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2281
2282 #define ER_BUF_SIZE 24
2283
2284 struct decoding_stream
2285 {
2286   /* Coding system that governs the conversion. */
2287   Lisp_Coding_System *codesys;
2288
2289   /* Stream that we read the encoded data from or
2290      write the decoded data to. */
2291   Lstream *other_end;
2292
2293   /* If we are reading, then we can return only a fixed amount of
2294      data, so if the conversion resulted in too much data, we store it
2295      here for retrieval the next time around. */
2296   unsigned_char_dynarr *runoff;
2297
2298   /* FLAGS holds flags indicating the current state of the decoding.
2299      Some of these flags are dependent on the coding system. */
2300   unsigned int flags;
2301
2302   /* CPOS holds a partially built-up code-point of character. */
2303   unsigned int cpos;
2304
2305   /* EOL_TYPE specifies the type of end-of-line conversion that
2306      currently applies.  We need to keep this separate from the
2307      EOL type stored in CODESYS because the latter might indicate
2308      automatic EOL-type detection while the former will always
2309      indicate a particular EOL type. */
2310   eol_type_t eol_type;
2311 #ifdef MULE
2312   /* Additional ISO2022 information.  We define the structure above
2313      because it's also needed by the detection routines. */
2314   struct iso2022_decoder iso2022;
2315
2316   /* Additional information (the state of the running CCL program)
2317      used by the CCL decoder. */
2318   struct ccl_program ccl;
2319
2320   /* counter for UTF-8 or UCS-4 */
2321   unsigned char counter;
2322 #endif
2323 #ifdef UTF2000
2324   char bom_flag;
2325   unsigned char er_counter;
2326   unsigned char er_buf[ER_BUF_SIZE];
2327
2328   unsigned combined_char_count;
2329   Emchar combined_chars[16];
2330   Lisp_Object combining_table;
2331 #endif
2332   struct detection_state decst;
2333 };
2334
2335 static Lstream_data_count decoding_reader (Lstream *stream,
2336                                 unsigned char *data, Lstream_data_count size);
2337 static Lstream_data_count decoding_writer (Lstream *stream,
2338                                 const unsigned char *data, Lstream_data_count size);
2339 static int decoding_rewinder   (Lstream *stream);
2340 static int decoding_seekable_p (Lstream *stream);
2341 static int decoding_flusher    (Lstream *stream);
2342 static int decoding_closer     (Lstream *stream);
2343
2344 static Lisp_Object decoding_marker (Lisp_Object stream);
2345
2346 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2347                                sizeof (struct decoding_stream));
2348
2349 static Lisp_Object
2350 decoding_marker (Lisp_Object stream)
2351 {
2352   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2353   Lisp_Object str_obj;
2354
2355   /* We do not need to mark the coding systems or charsets stored
2356      within the stream because they are stored in a global list
2357      and automatically marked. */
2358
2359   XSETLSTREAM (str_obj, str);
2360   mark_object (str_obj);
2361   if (str->imp->marker)
2362     return (str->imp->marker) (str_obj);
2363   else
2364     return Qnil;
2365 }
2366
2367 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2368    so we read data from the other end, decode it, and store it into DATA. */
2369
2370 static Lstream_data_count
2371 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2372 {
2373   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2374   unsigned char *orig_data = data;
2375   Lstream_data_count read_size;
2376   int error_occurred = 0;
2377
2378   /* We need to interface to mule_decode(), which expects to take some
2379      amount of data and store the result into a Dynarr.  We have
2380      mule_decode() store into str->runoff, and take data from there
2381      as necessary. */
2382
2383   /* We loop until we have enough data, reading chunks from the other
2384      end and decoding it. */
2385   while (1)
2386     {
2387       /* Take data from the runoff if we can.  Make sure to take at
2388          most SIZE bytes, and delete the data from the runoff. */
2389       if (Dynarr_length (str->runoff) > 0)
2390         {
2391           Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2392           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2393           Dynarr_delete_many (str->runoff, 0, chunk);
2394           data += chunk;
2395           size -= chunk;
2396         }
2397
2398       if (size == 0)
2399         break; /* No more room for data */
2400
2401       if (str->flags & CODING_STATE_END)
2402         /* This means that on the previous iteration, we hit the EOF on
2403            the other end.  We loop once more so that mule_decode() can
2404            output any final stuff it may be holding, or any "go back
2405            to a sane state" escape sequences. (This latter makes sense
2406            during encoding.) */
2407         break;
2408
2409       /* Exhausted the runoff, so get some more.  DATA has at least
2410          SIZE bytes left of storage in it, so it's OK to read directly
2411          into it.  (We'll be overwriting above, after we've decoded it
2412          into the runoff.) */
2413       read_size = Lstream_read (str->other_end, data, size);
2414       if (read_size < 0)
2415         {
2416           error_occurred = 1;
2417           break;
2418         }
2419       if (read_size == 0)
2420         /* There might be some more end data produced in the translation.
2421            See the comment above. */
2422         str->flags |= CODING_STATE_END;
2423       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2424     }
2425
2426   if (data - orig_data == 0)
2427     return error_occurred ? -1 : 0;
2428   else
2429     return data - orig_data;
2430 }
2431
2432 static Lstream_data_count
2433 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2434 {
2435   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2436   Lstream_data_count retval;
2437
2438   /* Decode all our data into the runoff, and then attempt to write
2439      it all out to the other end.  Remove whatever chunk we succeeded
2440      in writing. */
2441   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2442   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2443                           Dynarr_length (str->runoff));
2444   if (retval > 0)
2445     Dynarr_delete_many (str->runoff, 0, retval);
2446   /* Do NOT return retval.  The return value indicates how much
2447      of the incoming data was written, not how many bytes were
2448      written. */
2449   return size;
2450 }
2451
2452 static void
2453 reset_decoding_stream (struct decoding_stream *str)
2454 {
2455 #ifdef MULE
2456   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2457     {
2458       Lisp_Object coding_system;
2459       XSETCODING_SYSTEM (coding_system, str->codesys);
2460       reset_iso2022 (coding_system, &str->iso2022);
2461     }
2462   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2463     {
2464       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2465     }
2466   str->counter = 0;
2467 #endif /* MULE */
2468 #ifdef UTF2000
2469   str->bom_flag = 0;
2470   str->er_counter = 0;
2471   str->combined_char_count = 0;
2472   str->combining_table = Qnil;
2473 #endif
2474   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2475       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2476     {
2477       xzero (str->decst);
2478       str->decst.eol_type = EOL_AUTODETECT;
2479       str->decst.mask = ~0;
2480     }
2481   str->flags = str->cpos = 0;
2482 }
2483
2484 static int
2485 decoding_rewinder (Lstream *stream)
2486 {
2487   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2488   reset_decoding_stream (str);
2489   Dynarr_reset (str->runoff);
2490   return Lstream_rewind (str->other_end);
2491 }
2492
2493 static int
2494 decoding_seekable_p (Lstream *stream)
2495 {
2496   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2497   return Lstream_seekable_p (str->other_end);
2498 }
2499
2500 static int
2501 decoding_flusher (Lstream *stream)
2502 {
2503   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2504   return Lstream_flush (str->other_end);
2505 }
2506
2507 static int
2508 decoding_closer (Lstream *stream)
2509 {
2510   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2511   if (stream->flags & LSTREAM_FL_WRITE)
2512     {
2513       str->flags |= CODING_STATE_END;
2514       decoding_writer (stream, 0, 0);
2515     }
2516   Dynarr_free (str->runoff);
2517 #ifdef MULE
2518 #ifdef ENABLE_COMPOSITE_CHARS
2519   if (str->iso2022.composite_chars)
2520     Dynarr_free (str->iso2022.composite_chars);
2521 #endif
2522 #endif
2523   return Lstream_close (str->other_end);
2524 }
2525
2526 Lisp_Object
2527 decoding_stream_coding_system (Lstream *stream)
2528 {
2529   Lisp_Object coding_system;
2530   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2531
2532   XSETCODING_SYSTEM (coding_system, str->codesys);
2533   return subsidiary_coding_system (coding_system, str->eol_type);
2534 }
2535
2536 void
2537 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2538 {
2539   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2540   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2541   str->codesys = cs;
2542   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2543     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2544   reset_decoding_stream (str);
2545 }
2546
2547 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2548    stream for writing, no automatic code detection will be performed.
2549    The reason for this is that automatic code detection requires a
2550    seekable input.  Things will also fail if you open a decoding
2551    stream for reading using a non-fully-specified coding system and
2552    a non-seekable input stream. */
2553
2554 static Lisp_Object
2555 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2556                         const char *mode)
2557 {
2558   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2559   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2560   Lisp_Object obj;
2561
2562   xzero (*str);
2563   str->other_end = stream;
2564   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2565   str->eol_type = EOL_AUTODETECT;
2566   if (!strcmp (mode, "r")
2567       && Lstream_seekable_p (stream))
2568     /* We can determine the coding system now. */
2569     determine_real_coding_system (stream, &codesys, &str->eol_type);
2570   set_decoding_stream_coding_system (lstr, codesys);
2571   str->decst.eol_type = str->eol_type;
2572   str->decst.mask = ~0;
2573   XSETLSTREAM (obj, lstr);
2574   return obj;
2575 }
2576
2577 Lisp_Object
2578 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2579 {
2580   return make_decoding_stream_1 (stream, codesys, "r");
2581 }
2582
2583 Lisp_Object
2584 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2585 {
2586   return make_decoding_stream_1 (stream, codesys, "w");
2587 }
2588
2589 /* Note: the decode_coding_* functions all take the same
2590    arguments as mule_decode(), which is to say some SRC data of
2591    size N, which is to be stored into dynamic array DST.
2592    DECODING is the stream within which the decoding is
2593    taking place, but no data is actually read from or
2594    written to that stream; that is handled in decoding_reader()
2595    or decoding_writer().  This allows the same functions to
2596    be used for both reading and writing. */
2597
2598 static void
2599 mule_decode (Lstream *decoding, const Extbyte *src,
2600              unsigned_char_dynarr *dst, Lstream_data_count n)
2601 {
2602   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2603
2604   /* If necessary, do encoding-detection now.  We do this when
2605      we're a writing stream or a non-seekable reading stream,
2606      meaning that we can't just process the whole input,
2607      rewind, and start over. */
2608
2609   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2610       str->eol_type == EOL_AUTODETECT)
2611     {
2612       Lisp_Object codesys;
2613
2614       XSETCODING_SYSTEM (codesys, str->codesys);
2615       detect_coding_type (&str->decst, src, n,
2616                           CODING_SYSTEM_TYPE (str->codesys) !=
2617                           CODESYS_AUTODETECT);
2618       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2619           str->decst.mask != ~0)
2620         /* #### This is cheesy.  What we really ought to do is
2621            buffer up a certain amount of data so as to get a
2622            less random result. */
2623         codesys = coding_system_from_mask (str->decst.mask);
2624       str->eol_type = str->decst.eol_type;
2625       if (XCODING_SYSTEM (codesys) != str->codesys)
2626         {
2627           /* Preserve the CODING_STATE_END flag in case it was set.
2628              If we erase it, bad things might happen. */
2629           int was_end = str->flags & CODING_STATE_END;
2630           set_decoding_stream_coding_system (decoding, codesys);
2631           if (was_end)
2632             str->flags |= CODING_STATE_END;
2633         }
2634     }
2635
2636   switch (CODING_SYSTEM_TYPE (str->codesys))
2637     {
2638 #ifdef DEBUG_XEMACS
2639     case CODESYS_INTERNAL:
2640       Dynarr_add_many (dst, src, n);
2641       break;
2642 #endif
2643     case CODESYS_AUTODETECT:
2644       /* If we got this far and still haven't decided on the coding
2645          system, then do no conversion. */
2646     case CODESYS_NO_CONVERSION:
2647       decode_coding_no_conversion (decoding, src, dst, n);
2648       break;
2649 #ifdef MULE
2650     case CODESYS_SHIFT_JIS:
2651       decode_coding_sjis (decoding, src, dst, n);
2652       break;
2653     case CODESYS_BIG5:
2654       decode_coding_big5 (decoding, src, dst, n);
2655       break;
2656     case CODESYS_UCS4:
2657       decode_coding_ucs4 (decoding, src, dst, n);
2658       break;
2659     case CODESYS_UTF16:
2660       decode_coding_utf16 (decoding, src, dst, n);
2661       break;
2662     case CODESYS_UTF8:
2663       decode_coding_utf8 (decoding, src, dst, n);
2664       break;
2665     case CODESYS_CCL:
2666       str->ccl.last_block = str->flags & CODING_STATE_END;
2667       /* When applying ccl program to stream, MUST NOT set NULL
2668          pointer to src.  */
2669       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2670                   dst, n, 0, CCL_MODE_DECODING);
2671       break;
2672     case CODESYS_ISO2022:
2673       decode_coding_iso2022 (decoding, src, dst, n);
2674       break;
2675 #endif /* MULE */
2676     default:
2677       abort ();
2678     }
2679 }
2680
2681 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2682 Decode the text between START and END which is encoded in CODING-SYSTEM.
2683 This is useful if you've read in encoded text from a file without decoding
2684 it (e.g. you read in a JIS-formatted file but used the `binary' or
2685 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2686 Return length of decoded text.
2687 BUFFER defaults to the current buffer if unspecified.
2688 */
2689        (start, end, coding_system, buffer))
2690 {
2691   Bufpos b, e;
2692   struct buffer *buf = decode_buffer (buffer, 0);
2693   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2694   Lstream *istr, *ostr;
2695   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2696
2697   get_buffer_range_char (buf, start, end, &b, &e, 0);
2698
2699   barf_if_buffer_read_only (buf, b, e);
2700
2701   coding_system = Fget_coding_system (coding_system);
2702   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2703   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2704   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2705                                               coding_system);
2706   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2707                                            Fget_coding_system (Qbinary));
2708   istr = XLSTREAM (instream);
2709   ostr = XLSTREAM (outstream);
2710   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2711
2712   /* The chain of streams looks like this:
2713
2714      [BUFFER] <----- send through
2715                      ------> [ENCODE AS BINARY]
2716                              ------> [DECODE AS SPECIFIED]
2717                                      ------> [BUFFER]
2718    */
2719
2720   while (1)
2721     {
2722       char tempbuf[1024]; /* some random amount */
2723       Bufpos newpos, even_newer_pos;
2724       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2725       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2726
2727       if (!size_in_bytes)
2728         break;
2729       newpos = lisp_buffer_stream_startpos (istr);
2730       Lstream_write (ostr, tempbuf, size_in_bytes);
2731       even_newer_pos = lisp_buffer_stream_startpos (istr);
2732       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2733                            even_newer_pos, 0);
2734     }
2735   Lstream_close (istr);
2736   Lstream_close (ostr);
2737   UNGCPRO;
2738   Lstream_delete (istr);
2739   Lstream_delete (ostr);
2740   Lstream_delete (XLSTREAM (de_outstream));
2741   Lstream_delete (XLSTREAM (lb_outstream));
2742   return Qnil;
2743 }
2744
2745 \f
2746 /************************************************************************/
2747 /*           Converting to an external encoding ("encoding")            */
2748 /************************************************************************/
2749
2750 /* An encoding stream is an output stream.  When you create the
2751    stream, you specify the coding system that governs the encoding
2752    and another stream that the resulting encoded data is to be
2753    sent to, and then start sending data to it. */
2754
2755 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2756
2757 struct encoding_stream
2758 {
2759   /* Coding system that governs the conversion. */
2760   Lisp_Coding_System *codesys;
2761
2762   /* Stream that we read the encoded data from or
2763      write the decoded data to. */
2764   Lstream *other_end;
2765
2766   /* If we are reading, then we can return only a fixed amount of
2767      data, so if the conversion resulted in too much data, we store it
2768      here for retrieval the next time around. */
2769   unsigned_char_dynarr *runoff;
2770
2771   /* FLAGS holds flags indicating the current state of the encoding.
2772      Some of these flags are dependent on the coding system. */
2773   unsigned int flags;
2774
2775   /* CH holds a partially built-up character.  Since we only deal
2776      with one- and two-byte characters at the moment, we only use
2777      this to store the first byte of a two-byte character. */
2778   unsigned int ch;
2779 #ifdef MULE
2780   /* Additional information used by the ISO2022 encoder. */
2781   struct
2782     {
2783       /* CHARSET holds the character sets currently assigned to the G0
2784          through G3 registers.  It is initialized from the array
2785          INITIAL_CHARSET in CODESYS. */
2786       Lisp_Object charset[4];
2787
2788       /* Which registers are currently invoked into the left (GL) and
2789          right (GR) halves of the 8-bit encoding space? */
2790       int register_left, register_right;
2791
2792       /* Whether we need to explicitly designate the charset in the
2793          G? register before using it.  It is initialized from the
2794          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2795       unsigned char force_charset_on_output[4];
2796
2797       /* Other state variables that need to be preserved across
2798          invocations. */
2799       Lisp_Object current_charset;
2800       int current_half;
2801       int current_char_boundary;
2802     } iso2022;
2803
2804   void (*encode_char) (struct encoding_stream *str, Emchar c,
2805                        unsigned_char_dynarr *dst, unsigned int *flags);
2806   void (*finish) (struct encoding_stream *str,
2807                   unsigned_char_dynarr *dst, unsigned int *flags);
2808
2809   /* Additional information (the state of the running CCL program)
2810      used by the CCL encoder. */
2811   struct ccl_program ccl;
2812 #endif /* MULE */
2813 };
2814
2815 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2816 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2817                                 Lstream_data_count size);
2818 static int encoding_rewinder   (Lstream *stream);
2819 static int encoding_seekable_p (Lstream *stream);
2820 static int encoding_flusher    (Lstream *stream);
2821 static int encoding_closer     (Lstream *stream);
2822
2823 static Lisp_Object encoding_marker (Lisp_Object stream);
2824
2825 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2826                                sizeof (struct encoding_stream));
2827
2828 static Lisp_Object
2829 encoding_marker (Lisp_Object stream)
2830 {
2831   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2832   Lisp_Object str_obj;
2833
2834   /* We do not need to mark the coding systems or charsets stored
2835      within the stream because they are stored in a global list
2836      and automatically marked. */
2837
2838   XSETLSTREAM (str_obj, str);
2839   mark_object (str_obj);
2840   if (str->imp->marker)
2841     return (str->imp->marker) (str_obj);
2842   else
2843     return Qnil;
2844 }
2845
2846 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2847    so we read data from the other end, encode it, and store it into DATA. */
2848
2849 static Lstream_data_count
2850 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2851 {
2852   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2853   unsigned char *orig_data = data;
2854   Lstream_data_count read_size;
2855   int error_occurred = 0;
2856
2857   /* We need to interface to mule_encode(), which expects to take some
2858      amount of data and store the result into a Dynarr.  We have
2859      mule_encode() store into str->runoff, and take data from there
2860      as necessary. */
2861
2862   /* We loop until we have enough data, reading chunks from the other
2863      end and encoding it. */
2864   while (1)
2865     {
2866       /* Take data from the runoff if we can.  Make sure to take at
2867          most SIZE bytes, and delete the data from the runoff. */
2868       if (Dynarr_length (str->runoff) > 0)
2869         {
2870           int chunk = min ((int) size, Dynarr_length (str->runoff));
2871           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2872           Dynarr_delete_many (str->runoff, 0, chunk);
2873           data += chunk;
2874           size -= chunk;
2875         }
2876
2877       if (size == 0)
2878         break; /* No more room for data */
2879
2880       if (str->flags & CODING_STATE_END)
2881         /* This means that on the previous iteration, we hit the EOF on
2882            the other end.  We loop once more so that mule_encode() can
2883            output any final stuff it may be holding, or any "go back
2884            to a sane state" escape sequences. (This latter makes sense
2885            during encoding.) */
2886         break;
2887
2888       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2889          left of storage in it, so it's OK to read directly into it.
2890          (We'll be overwriting above, after we've encoded it into the
2891          runoff.) */
2892       read_size = Lstream_read (str->other_end, data, size);
2893       if (read_size < 0)
2894         {
2895           error_occurred = 1;
2896           break;
2897         }
2898       if (read_size == 0)
2899         /* There might be some more end data produced in the translation.
2900            See the comment above. */
2901         str->flags |= CODING_STATE_END;
2902       mule_encode (stream, data, str->runoff, read_size);
2903     }
2904
2905   if (data == orig_data)
2906     return error_occurred ? -1 : 0;
2907   else
2908     return data - orig_data;
2909 }
2910
2911 static Lstream_data_count
2912 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2913 {
2914   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915   Lstream_data_count retval;
2916
2917   /* Encode all our data into the runoff, and then attempt to write
2918      it all out to the other end.  Remove whatever chunk we succeeded
2919      in writing. */
2920   mule_encode (stream, data, str->runoff, size);
2921   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2922                           Dynarr_length (str->runoff));
2923   if (retval > 0)
2924     Dynarr_delete_many (str->runoff, 0, retval);
2925   /* Do NOT return retval.  The return value indicates how much
2926      of the incoming data was written, not how many bytes were
2927      written. */
2928   return size;
2929 }
2930
2931 static void
2932 reset_encoding_stream (struct encoding_stream *str)
2933 {
2934 #ifdef MULE
2935   switch (CODING_SYSTEM_TYPE (str->codesys))
2936     {
2937     case CODESYS_ISO2022:
2938       {
2939         int i;
2940
2941         str->encode_char = &char_encode_iso2022;
2942         str->finish = &char_finish_iso2022;
2943         for (i = 0; i < 4; i++)
2944           {
2945             str->iso2022.charset[i] =
2946               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2947             str->iso2022.force_charset_on_output[i] =
2948               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2949           }
2950         str->iso2022.register_left = 0;
2951         str->iso2022.register_right = 1;
2952         str->iso2022.current_charset = Qnil;
2953         str->iso2022.current_half = 0;
2954         break;
2955       }
2956     case CODESYS_CCL:
2957       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2958       break;
2959     case CODESYS_UTF8:
2960       str->encode_char = &char_encode_utf8;
2961       str->finish = &char_finish_utf8;
2962       break;
2963     case CODESYS_UTF16:
2964       str->encode_char = &char_encode_utf16;
2965       str->finish = &char_finish_utf16;
2966       break;
2967     case CODESYS_UCS4:
2968       str->encode_char = &char_encode_ucs4;
2969       str->finish = &char_finish_ucs4;
2970       break;
2971     case CODESYS_SHIFT_JIS:
2972       str->encode_char = &char_encode_shift_jis;
2973       str->finish = &char_finish_shift_jis;
2974       break;
2975     case CODESYS_BIG5:
2976       str->encode_char = &char_encode_big5;
2977       str->finish = &char_finish_big5;
2978       break;
2979     default:
2980       break;
2981     }
2982 #endif /* MULE */
2983   str->iso2022.current_char_boundary = 0;
2984   str->flags = str->ch = 0;
2985 }
2986
2987 static int
2988 encoding_rewinder (Lstream *stream)
2989 {
2990   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2991   reset_encoding_stream (str);
2992   Dynarr_reset (str->runoff);
2993   return Lstream_rewind (str->other_end);
2994 }
2995
2996 static int
2997 encoding_seekable_p (Lstream *stream)
2998 {
2999   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3000   return Lstream_seekable_p (str->other_end);
3001 }
3002
3003 static int
3004 encoding_flusher (Lstream *stream)
3005 {
3006   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3007   return Lstream_flush (str->other_end);
3008 }
3009
3010 static int
3011 encoding_closer (Lstream *stream)
3012 {
3013   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3014   if (stream->flags & LSTREAM_FL_WRITE)
3015     {
3016       str->flags |= CODING_STATE_END;
3017       encoding_writer (stream, 0, 0);
3018     }
3019   Dynarr_free (str->runoff);
3020   return Lstream_close (str->other_end);
3021 }
3022
3023 Lisp_Object
3024 encoding_stream_coding_system (Lstream *stream)
3025 {
3026   Lisp_Object coding_system;
3027   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3028
3029   XSETCODING_SYSTEM (coding_system, str->codesys);
3030   return coding_system;
3031 }
3032
3033 void
3034 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3035 {
3036   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3037   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3038   str->codesys = cs;
3039   reset_encoding_stream (str);
3040 }
3041
3042 static Lisp_Object
3043 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3044                         const char *mode)
3045 {
3046   Lstream *lstr = Lstream_new (lstream_encoding, mode);
3047   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3048   Lisp_Object obj;
3049
3050   xzero (*str);
3051   str->runoff = Dynarr_new (unsigned_char);
3052   str->other_end = stream;
3053   set_encoding_stream_coding_system (lstr, codesys);
3054   XSETLSTREAM (obj, lstr);
3055   return obj;
3056 }
3057
3058 Lisp_Object
3059 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3060 {
3061   return make_encoding_stream_1 (stream, codesys, "r");
3062 }
3063
3064 Lisp_Object
3065 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3066 {
3067   return make_encoding_stream_1 (stream, codesys, "w");
3068 }
3069
3070 /* Convert N bytes of internally-formatted data stored in SRC to an
3071    external format, according to the encoding stream ENCODING.
3072    Store the encoded data into DST. */
3073
3074 static void
3075 mule_encode (Lstream *encoding, const Bufbyte *src,
3076              unsigned_char_dynarr *dst, Lstream_data_count n)
3077 {
3078   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3079
3080   switch (CODING_SYSTEM_TYPE (str->codesys))
3081     {
3082 #ifdef DEBUG_XEMACS
3083     case CODESYS_INTERNAL:
3084       Dynarr_add_many (dst, src, n);
3085       break;
3086 #endif
3087     case CODESYS_AUTODETECT:
3088       /* If we got this far and still haven't decided on the coding
3089          system, then do no conversion. */
3090     case CODESYS_NO_CONVERSION:
3091       encode_coding_no_conversion (encoding, src, dst, n);
3092       break;
3093 #ifdef MULE
3094     case CODESYS_CCL:
3095       str->ccl.last_block = str->flags & CODING_STATE_END;
3096       /* When applying ccl program to stream, MUST NOT set NULL
3097          pointer to src.  */
3098       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3099                   dst, n, 0, CCL_MODE_ENCODING);
3100       break;
3101 #endif /* MULE */
3102     default:
3103       text_encode_generic (encoding, src, dst, n);
3104     }
3105 }
3106
3107 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3108 Encode the text between START and END using CODING-SYSTEM.
3109 This will, for example, convert Japanese characters into stuff such as
3110 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3111 text.  BUFFER defaults to the current buffer if unspecified.
3112 */
3113        (start, end, coding_system, buffer))
3114 {
3115   Bufpos b, e;
3116   struct buffer *buf = decode_buffer (buffer, 0);
3117   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3118   Lstream *istr, *ostr;
3119   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3120
3121   get_buffer_range_char (buf, start, end, &b, &e, 0);
3122
3123   barf_if_buffer_read_only (buf, b, e);
3124
3125   coding_system = Fget_coding_system (coding_system);
3126   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3127   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3128   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3129                                               Fget_coding_system (Qbinary));
3130   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3131                                            coding_system);
3132   istr = XLSTREAM (instream);
3133   ostr = XLSTREAM (outstream);
3134   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3135   /* The chain of streams looks like this:
3136
3137      [BUFFER] <----- send through
3138                      ------> [ENCODE AS SPECIFIED]
3139                              ------> [DECODE AS BINARY]
3140                                      ------> [BUFFER]
3141    */
3142   while (1)
3143     {
3144       char tempbuf[1024]; /* some random amount */
3145       Bufpos newpos, even_newer_pos;
3146       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3147       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3148
3149       if (!size_in_bytes)
3150         break;
3151       newpos = lisp_buffer_stream_startpos (istr);
3152       Lstream_write (ostr, tempbuf, size_in_bytes);
3153       even_newer_pos = lisp_buffer_stream_startpos (istr);
3154       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3155                            even_newer_pos, 0);
3156     }
3157
3158   {
3159     Charcount retlen =
3160       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3161     Lstream_close (istr);
3162     Lstream_close (ostr);
3163     UNGCPRO;
3164     Lstream_delete (istr);
3165     Lstream_delete (ostr);
3166     Lstream_delete (XLSTREAM (de_outstream));
3167     Lstream_delete (XLSTREAM (lb_outstream));
3168     return make_int (retlen);
3169   }
3170 }
3171
3172 #ifdef MULE
3173 \f
3174 static void
3175 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3176                      unsigned_char_dynarr *dst, Lstream_data_count n)
3177 {
3178   unsigned char c;
3179   unsigned char char_boundary;
3180   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3181   unsigned int flags          = str->flags;
3182   Emchar ch                   = str->ch;
3183
3184   char_boundary = str->iso2022.current_char_boundary;
3185
3186   while (n--)
3187     {
3188       c = *src++;
3189
3190       if (char_boundary == 0)
3191         {
3192           if (c >= 0xfc)
3193             {
3194               ch = c & 0x01;
3195               char_boundary = 5;
3196             }
3197           else if (c >= 0xf8)
3198             {
3199               ch = c & 0x03;
3200               char_boundary = 4;
3201             }
3202           else if (c >= 0xf0)
3203             {
3204               ch = c & 0x07;
3205               char_boundary = 3;
3206             }
3207           else if (c >= 0xe0)
3208             {
3209               ch = c & 0x0f;
3210               char_boundary = 2;
3211             }
3212           else if (c >= 0xc0)
3213             {
3214               ch = c & 0x1f;
3215               char_boundary = 1;
3216             }
3217           else
3218             (*str->encode_char) (str, c, dst, &flags);
3219         }
3220       else if (char_boundary == 1)
3221         {
3222           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3223           ch =0;
3224           char_boundary = 0;
3225         }
3226       else
3227         {
3228           ch = (ch << 6) | (c & 0x3f);
3229           char_boundary--;
3230         }
3231     }
3232
3233   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3234     {
3235       (*str->finish) (str, dst, &flags);
3236     }
3237
3238   str->flags = flags;
3239   str->ch    = ch;
3240   str->iso2022.current_char_boundary = char_boundary;
3241 }
3242
3243 \f
3244 #ifdef UTF2000
3245 /************************************************************************/
3246 /*                          entity reference                            */
3247 /************************************************************************/
3248
3249 INLINE_HEADER void
3250 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
3251 INLINE_HEADER void
3252 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
3253 {
3254   if ( str->er_counter > 0)
3255     {
3256       Dynarr_add_many (dst, str->er_buf, str->er_counter);
3257       str->er_counter = 0;
3258     }
3259 }
3260
3261 EXFUN (Fregexp_quote, 1);
3262
3263 void decode_add_er_char (struct decoding_stream *str, Emchar character,
3264                          unsigned_char_dynarr* dst);
3265 void
3266 decode_add_er_char (struct decoding_stream *str, Emchar c,
3267                     unsigned_char_dynarr* dst)
3268 {
3269   if (str->er_counter == 0)
3270     {
3271       if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
3272           && (c == '&') )
3273         {
3274           str->er_buf[0] = '&';
3275           str->er_counter++;
3276         }
3277       else
3278         DECODE_ADD_UCS_CHAR (c, dst);
3279     }
3280   else if (c == ';')
3281     {
3282       Lisp_Object string = make_string (str->er_buf,
3283                                         str->er_counter);
3284       Lisp_Object rest;
3285       Lisp_Object cell;
3286       Lisp_Object ret;
3287       Lisp_Object pat;
3288       Lisp_Object ccs;
3289       Lisp_Object char_type;
3290       int base;
3291
3292       for ( rest = Vcoded_charset_entity_reference_alist;
3293             !NILP (rest); rest = Fcdr (rest) )
3294         {
3295           cell = Fcar (rest);
3296           ccs = Fcar (cell);
3297           if (CONSP (ccs))
3298             {
3299               char_type = XCDR (ccs);
3300               ccs = XCAR (ccs);
3301             }
3302           else
3303             char_type = Qnil;
3304           if (NILP (ccs = Ffind_charset (ccs)))
3305             continue;
3306
3307           cell = Fcdr (cell);
3308           ret = Fcar (cell);
3309           if (STRINGP (ret))
3310             pat = ret;
3311           else
3312             continue;
3313           pat = Fregexp_quote (pat);
3314
3315           cell = Fcdr (cell);
3316           cell = Fcdr (cell);
3317           ret = Fcar (cell);
3318           if (EQ (ret, Qd))
3319             {
3320               pat = concat3 (build_string ("^&"),
3321                              pat, build_string ("\\([0-9]+\\)$"));
3322               base = 10;
3323             }
3324           else if (EQ (ret, Qx))
3325             {
3326               pat = concat3 (build_string ("^&"),
3327                              pat, build_string ("\\([0-9a-f]+\\)$"));
3328               base = 16;
3329             }
3330           else if (EQ (ret, QX))
3331             {
3332               pat = concat3 (build_string ("^&"),
3333                              pat, build_string ("\\([0-9A-F]+\\)$"));
3334               base = 16;
3335             }
3336           else
3337             continue;
3338
3339           if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
3340             {
3341               int code
3342                 = XINT (Fstring_to_number
3343                         (Fsubstring (string,
3344                                      Fmatch_beginning (make_int (1)),
3345                                      Fmatch_end (make_int (1))),
3346                          make_int (base)));
3347               Emchar chr
3348                 = NILP (char_type)
3349                 ? DECODE_CHAR (ccs, code, 0)
3350                 : decode_builtin_char (ccs, code);
3351
3352               if ( chr >= 0 )
3353                 DECODE_ADD_UCS_CHAR (chr, dst);
3354               else
3355                 {
3356                   Dynarr_add_many (dst, str->er_buf, str->er_counter);
3357                   Dynarr_add (dst, ';');
3358                 }
3359
3360               goto decoded;
3361             }
3362         }
3363       if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
3364                                 string, Qnil, Qnil)))
3365         {
3366           int code
3367             = XUINT (Fstring_to_number
3368                      (Fsubstring (string,
3369                                   Fmatch_beginning (make_int (1)),
3370                                   Fmatch_end (make_int (1))),
3371                       make_int (16)));
3372
3373           DECODE_ADD_UCS_CHAR (code, dst);
3374         }
3375       else
3376         {
3377           Dynarr_add_many (dst, str->er_buf, str->er_counter);
3378           Dynarr_add (dst, ';');
3379         }
3380     decoded:
3381       str->er_counter = 0;
3382     }
3383   else if ( (str->er_counter >= ER_BUF_SIZE) || (c >= 0x7F) )
3384     {
3385       Dynarr_add_many (dst, str->er_buf, str->er_counter);
3386       str->er_counter = 0;
3387       DECODE_ADD_UCS_CHAR (c, dst);
3388     }
3389   else
3390     str->er_buf[str->er_counter++] = c;
3391 }
3392
3393 void char_encode_as_entity_reference (Emchar ch, char* buf);
3394 void
3395 char_encode_as_entity_reference (Emchar ch, char* buf)
3396 {
3397   Lisp_Object rest = Vcoded_charset_entity_reference_alist;
3398   Lisp_Object cell;
3399   Lisp_Object ccs;
3400   Lisp_Object char_type;
3401   int format_columns, idx;
3402   char format[ER_BUF_SIZE];
3403
3404   while (!NILP (rest))
3405     {
3406       cell = Fcar (rest);
3407       ccs = Fcar (cell);
3408       if (CONSP (ccs))
3409         {
3410           char_type = XCDR (ccs);
3411           ccs = XCAR (ccs);
3412         }
3413       else
3414         char_type = Qnil;
3415       if (!NILP (ccs = Ffind_charset (ccs)))
3416         {
3417           int code_point = charset_code_point (ccs, ch, 0);
3418
3419           if ( (code_point >= 0)
3420                && (NILP (char_type)
3421                    || DECODE_CHAR (ccs, code_point, 0) != ch) )
3422             {
3423               Lisp_Object ret;
3424
3425               cell = Fcdr (cell);
3426               ret = Fcar (cell);
3427               if ( STRINGP (ret) &&
3428                    ( (idx = XSTRING_LENGTH (ret)) <= (ER_BUF_SIZE - 4) ) )
3429                 {
3430                   format[0] = '&';
3431                   strncpy (&format[1], XSTRING_DATA (ret), idx);
3432                   idx++;
3433                 }
3434               else
3435                 goto try_next;
3436
3437               cell = Fcdr (cell);
3438               ret = Fcar (cell);
3439               if (INTP (ret))
3440                 {
3441                   format[idx++] = '%';
3442                   format_columns = XINT (ret);
3443                   if ( (2 <= format_columns) && (format_columns <= 8)
3444                        && (idx + format_columns <= ER_BUF_SIZE - 1) )
3445                     {
3446                       format [idx++] = '0';
3447                       format [idx++] = '0' + format_columns;
3448                     }
3449                 }
3450               else
3451                 goto try_next;
3452
3453               cell = Fcdr (cell);
3454               ret = Fcar (cell);
3455               if (EQ (ret, Qd))
3456                 format [idx++] = 'd';
3457               else if (EQ (ret, Qx))
3458                 format [idx++] = 'x';
3459               else if (EQ (ret, QX))
3460                 format [idx++] = 'X';
3461               else
3462                 goto try_next;
3463               format [idx++] = ';';
3464               format [idx++] = 0;
3465
3466               sprintf (buf, format, code_point);
3467               return;
3468             }
3469         }
3470     try_next:
3471       rest = Fcdr (rest);
3472     }
3473   sprintf (buf, "&MCS-%08X;", ch);
3474 }
3475
3476 \f
3477 /************************************************************************/
3478 /*                          character composition                       */
3479 /************************************************************************/
3480 extern Lisp_Object Qcomposition, Qrep_decomposition;
3481
3482 INLINE_HEADER void
3483 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
3484 INLINE_HEADER void
3485 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
3486 {
3487   unsigned i;
3488
3489   for (i = 0; i < str->combined_char_count; i++)
3490     decode_add_er_char (str, str->combined_chars[i], dst);
3491   str->combined_char_count = 0;
3492   str->combining_table = Qnil;
3493 }
3494
3495 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
3496                        unsigned_char_dynarr* dst);
3497 void
3498 COMPOSE_ADD_CHAR (struct decoding_stream *str,
3499                   Emchar character, unsigned_char_dynarr* dst)
3500 {
3501   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
3502     decode_add_er_char (str, character, dst);
3503   else if (!CONSP (str->combining_table))
3504     {
3505       Lisp_Object ret
3506         = Fchar_feature (make_char (character), Qcomposition, Qnil,
3507                          Qnil, Qnil);
3508
3509       if (NILP (ret))
3510         decode_add_er_char (str, character, dst);
3511       else
3512         {
3513           str->combined_chars[0] = character;
3514           str->combined_char_count = 1;
3515           str->combining_table = ret;
3516         }
3517     }
3518   else
3519     {
3520       Lisp_Object ret
3521         = Fcdr (Fassq (make_char (character), str->combining_table));
3522
3523       if (CHARP (ret))
3524         {
3525           Emchar char2 = XCHARVAL (ret);
3526           Lisp_Object ret2 = Fchar_feature (ret, Qcomposition, Qnil,
3527                                             Qnil, Qnil);
3528
3529           if (NILP (ret2))
3530             {
3531               decode_add_er_char (str, char2, dst);
3532               str->combined_char_count = 0;
3533               str->combining_table = Qnil;
3534             }
3535           else
3536             {
3537               str->combined_chars[0] = char2;
3538               str->combined_char_count = 1;
3539               str->combining_table = ret2;
3540             }
3541         }
3542       else
3543         {
3544           ret = Fchar_feature (make_char (character), Qcomposition, Qnil,
3545                                Qnil, Qnil);
3546
3547           COMPOSE_FLUSH_CHARS (str, dst);
3548           if (NILP (ret))
3549             decode_add_er_char (str, character, dst);
3550           else
3551             {
3552               str->combined_chars[0] = character;
3553               str->combined_char_count = 1;
3554               str->combining_table = ret;
3555             }
3556         }
3557     }
3558 }
3559 #else /* not UTF2000 */
3560 #define COMPOSE_FLUSH_CHARS(str, dst)
3561 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
3562 #endif /* UTF2000 */
3563
3564 \f
3565 /************************************************************************/
3566 /*                          Shift-JIS methods                           */
3567 /************************************************************************/
3568
3569 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3570    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3571    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3572    encoded by "position-code + 0x80".  A character of JISX0208
3573    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3574    position-codes are divided and shifted so that it fit in the range
3575    below.
3576
3577    --- CODE RANGE of Shift-JIS ---
3578    (character set)      (range)
3579    ASCII                0x00 .. 0x7F
3580    JISX0201-Kana        0xA0 .. 0xDF
3581    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3582             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3583    -------------------------------
3584
3585 */
3586
3587 /* Is this the first byte of a Shift-JIS two-byte char? */
3588
3589 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3590   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3591
3592 /* Is this the second byte of a Shift-JIS two-byte char? */
3593
3594 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3595   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3596
3597 #define BYTE_SJIS_KATAKANA_P(c) \
3598   ((c) >= 0xA1 && (c) <= 0xDF)
3599
3600 static int
3601 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3602 {
3603   while (n--)
3604     {
3605       unsigned char c = *(unsigned char *)src++;
3606       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3607         return 0;
3608       if (st->shift_jis.in_second_byte)
3609         {
3610           st->shift_jis.in_second_byte = 0;
3611           if (c < 0x40)
3612             return 0;
3613         }
3614       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3615         st->shift_jis.in_second_byte = 1;
3616     }
3617   return CODING_CATEGORY_SHIFT_JIS_MASK;
3618 }
3619
3620 /* Convert Shift-JIS data to internal format. */
3621
3622 static void
3623 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3624                     unsigned_char_dynarr *dst, Lstream_data_count n)
3625 {
3626   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3627   unsigned int flags  = str->flags;
3628   unsigned int cpos   = str->cpos;
3629   eol_type_t eol_type = str->eol_type;
3630
3631   while (n--)
3632     {
3633       unsigned char c = *(unsigned char *)src++;
3634
3635       if (cpos)
3636         {
3637           /* Previous character was first byte of Shift-JIS Kanji char. */
3638           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3639             {
3640               unsigned char e1, e2;
3641
3642               DECODE_SJIS (cpos, c, e1, e2);
3643 #ifdef UTF2000
3644               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3645                                             e1 & 0x7F,
3646                                             e2 & 0x7F), dst);
3647 #else
3648               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3649               Dynarr_add (dst, e1);
3650               Dynarr_add (dst, e2);
3651 #endif
3652             }
3653           else
3654             {
3655               DECODE_ADD_BINARY_CHAR (cpos, dst);
3656               DECODE_ADD_BINARY_CHAR (c, dst);
3657             }
3658           cpos = 0;
3659         }
3660       else
3661         {
3662           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3663           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3664             cpos = c;
3665           else if (BYTE_SJIS_KATAKANA_P (c))
3666             {
3667 #ifdef UTF2000
3668               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3669                                             c & 0x7F, 0), dst);
3670 #else
3671               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3672               Dynarr_add (dst, c);
3673 #endif
3674             }
3675 #ifdef UTF2000
3676           else if (c > 32)
3677             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3678                                           c, 0), dst);
3679 #endif
3680           else
3681             DECODE_ADD_BINARY_CHAR (c, dst);
3682         }
3683     label_continue_loop:;
3684     }
3685
3686   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3687
3688   str->flags = flags;
3689   str->cpos  = cpos;
3690 }
3691
3692 /* Convert internal character representation to Shift_JIS. */
3693
3694 void
3695 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3696                        unsigned_char_dynarr *dst, unsigned int *flags)
3697 {
3698   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3699
3700   if (ch == '\n')
3701     {
3702       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3703         Dynarr_add (dst, '\r');
3704       if (eol_type != EOL_CR)
3705         Dynarr_add (dst, ch);
3706     }
3707   else
3708     {
3709       unsigned int s1, s2;
3710 #ifdef UTF2000
3711       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch, 0);
3712
3713       if (code_point >= 0)
3714         Dynarr_add (dst, code_point);
3715       else if ((code_point
3716                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch, 0))
3717                >= 0)
3718         {
3719           ENCODE_SJIS ((code_point >> 8) | 0x80,
3720                        (code_point & 0xFF) | 0x80, s1, s2);
3721           Dynarr_add (dst, s1);
3722           Dynarr_add (dst, s2);
3723         }
3724       else if ((code_point
3725                 = charset_code_point (Vcharset_katakana_jisx0201, ch, 0))
3726                >= 0)
3727         Dynarr_add (dst, code_point | 0x80);
3728       else if ((code_point
3729                 = charset_code_point (Vcharset_japanese_jisx0208, ch, 0))
3730                >= 0)
3731         {
3732           ENCODE_SJIS ((code_point >> 8) | 0x80,
3733                        (code_point & 0xFF) | 0x80, s1, s2);
3734           Dynarr_add (dst, s1);
3735           Dynarr_add (dst, s2);
3736         }
3737       else if ((code_point = charset_code_point (Vcharset_ascii, ch, 0))
3738                >= 0)
3739         Dynarr_add (dst, code_point);
3740       else
3741         Dynarr_add (dst, '?');
3742 #else
3743       Lisp_Object charset;
3744       unsigned int c1, c2;
3745
3746       BREAKUP_CHAR (ch, charset, c1, c2);
3747
3748       if (EQ(charset, Vcharset_katakana_jisx0201))
3749         {
3750           Dynarr_add (dst, c1 | 0x80);
3751         }
3752       else if (c2 == 0)
3753         {
3754           Dynarr_add (dst, c1);
3755         }
3756       else if (EQ(charset, Vcharset_japanese_jisx0208))
3757         {
3758           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3759           Dynarr_add (dst, s1);
3760           Dynarr_add (dst, s2);
3761         }
3762       else
3763         Dynarr_add (dst, '?');
3764 #endif
3765     }
3766 }
3767
3768 void
3769 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3770                        unsigned int *flags)
3771 {
3772 }
3773
3774 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3775 Decode a JISX0208 character of Shift-JIS coding-system.
3776 CODE is the character code in Shift-JIS as a cons of type bytes.
3777 Return the corresponding character.
3778 */
3779        (code))
3780 {
3781   unsigned char c1, c2, s1, s2;
3782
3783   CHECK_CONS (code);
3784   CHECK_INT (XCAR (code));
3785   CHECK_INT (XCDR (code));
3786   s1 = XINT (XCAR (code));
3787   s2 = XINT (XCDR (code));
3788   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3789       BYTE_SJIS_TWO_BYTE_2_P (s2))
3790     {
3791       DECODE_SJIS (s1, s2, c1, c2);
3792       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3793                                    c1 & 0x7F, c2 & 0x7F));
3794     }
3795   else
3796     return Qnil;
3797 }
3798
3799 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3800 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3801 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3802 */
3803        (character))
3804 {
3805   Lisp_Object charset;
3806   int c1, c2, s1, s2;
3807
3808   CHECK_CHAR_COERCE_INT (character);
3809   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3810   if (EQ (charset, Vcharset_japanese_jisx0208))
3811     {
3812       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3813       return Fcons (make_int (s1), make_int (s2));
3814     }
3815   else
3816     return Qnil;
3817 }
3818
3819 \f
3820 /************************************************************************/
3821 /*                            Big5 methods                              */
3822 /************************************************************************/
3823
3824 /* BIG5 is a coding system encoding two character sets: ASCII and
3825    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3826    character set and is encoded in two-byte.
3827
3828    --- CODE RANGE of BIG5 ---
3829    (character set)      (range)
3830    ASCII                0x00 .. 0x7F
3831    Big5 (1st byte)      0xA1 .. 0xFE
3832         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3833    --------------------------
3834
3835    Since the number of characters in Big5 is larger than maximum
3836    characters in Emacs' charset (96x96), it can't be handled as one
3837    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3838    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3839    contains frequently used characters and the latter contains less
3840    frequently used characters.  */
3841
3842 #ifdef UTF2000
3843 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3844   ((c) >= 0x81 && (c) <= 0xFE)
3845 #else
3846 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3847   ((c) >= 0xA1 && (c) <= 0xFE)
3848 #endif
3849
3850 /* Is this the second byte of a Shift-JIS two-byte char? */
3851
3852 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3853   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3854
3855 /* Number of Big5 characters which have the same code in 1st byte.  */
3856
3857 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3858
3859 /* Code conversion macros.  These are macros because they are used in
3860    inner loops during code conversion.
3861
3862    Note that temporary variables in macros introduce the classic
3863    dynamic-scoping problems with variable names.  We use capital-
3864    lettered variables in the assumption that XEmacs does not use
3865    capital letters in variables except in a very formalized way
3866    (e.g. Qstring). */
3867
3868 /* Convert Big5 code (b1, b2) into its internal string representation
3869    (lb, c1, c2). */
3870
3871 /* There is a much simpler way to split the Big5 charset into two.
3872    For the moment I'm going to leave the algorithm as-is because it
3873    claims to separate out the most-used characters into a single
3874    charset, which perhaps will lead to optimizations in various
3875    places.
3876
3877    The way the algorithm works is something like this:
3878
3879    Big5 can be viewed as a 94x157 charset, where the row is
3880    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3881    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3882    the split between low and high column numbers is apparently
3883    meaningless; ascending rows produce less and less frequent chars.
3884    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3885    the first charset, and the upper half (0xC9 .. 0xFE) to the
3886    second.  To do the conversion, we convert the character into
3887    a single number where 0 .. 156 is the first row, 157 .. 313
3888    is the second, etc.  That way, the characters are ordered by
3889    decreasing frequency.  Then we just chop the space in two
3890    and coerce the result into a 94x94 space.
3891    */
3892
3893 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3894 {                                                                       \
3895   int B1 = b1, B2 = b2;                                                 \
3896   unsigned int I                                                        \
3897     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3898                                                                         \
3899   if (B1 < 0xC9)                                                        \
3900     {                                                                   \
3901       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3902     }                                                                   \
3903   else                                                                  \
3904     {                                                                   \
3905       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3906       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3907     }                                                                   \
3908   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3909   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3910 } while (0)
3911
3912 /* Convert the internal string representation of a Big5 character
3913    (lb, c1, c2) into Big5 code (b1, b2). */
3914
3915 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3916 {                                                                       \
3917   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3918                                                                         \
3919   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3920     {                                                                   \
3921       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3922     }                                                                   \
3923   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3924   b2 = I % BIG5_SAME_ROW;                                               \
3925   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3926 } while (0)
3927
3928 static int
3929 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3930 {
3931   while (n--)
3932     {
3933       unsigned char c = *(unsigned char *)src++;
3934       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3935 #ifndef UTF2000
3936           || (c >= 0x80 && c <= 0xA0)
3937 #endif
3938           )
3939         return 0;
3940       if (st->big5.in_second_byte)
3941         {
3942           st->big5.in_second_byte = 0;
3943           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3944             return 0;
3945         }
3946       else if (
3947 #ifdef UTF2000
3948                c >= 0x81
3949 #else
3950                c >= 0xA1
3951 #endif
3952                )
3953         st->big5.in_second_byte = 1;
3954     }
3955   return CODING_CATEGORY_BIG5_MASK;
3956 }
3957
3958 /* Convert Big5 data to internal format. */
3959
3960 static void
3961 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3962                     unsigned_char_dynarr *dst, Lstream_data_count n)
3963 {
3964   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3965   unsigned int flags  = str->flags;
3966   unsigned int cpos   = str->cpos;
3967   eol_type_t eol_type = str->eol_type;
3968 #ifdef UTF2000
3969   Lisp_Object ccs
3970     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3971                                              (decoding)->codesys, 1);
3972 #endif
3973
3974   while (n--)
3975     {
3976       unsigned char c = *(unsigned char *)src++;
3977       if (cpos)
3978         {
3979           /* Previous character was first byte of Big5 char. */
3980           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3981             {
3982 #ifdef UTF2000
3983               int code_point = (cpos << 8) | c;
3984               Emchar char_id = decode_defined_char (ccs, code_point, 0);
3985
3986               if (char_id < 0)
3987                 char_id
3988                   = DECODE_CHAR (Vcharset_chinese_big5, code_point, 0);
3989               DECODE_ADD_UCS_CHAR (char_id, dst);
3990 #else
3991               unsigned char b1, b2, b3;
3992               DECODE_BIG5 (cpos, c, b1, b2, b3);
3993               Dynarr_add (dst, b1);
3994               Dynarr_add (dst, b2);
3995               Dynarr_add (dst, b3);
3996 #endif
3997             }
3998           else
3999             {
4000               DECODE_ADD_BINARY_CHAR (cpos, dst);
4001               DECODE_ADD_BINARY_CHAR (c, dst);
4002             }
4003           cpos = 0;
4004         }
4005       else
4006         {
4007           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4008           if (BYTE_BIG5_TWO_BYTE_1_P (c))
4009             {
4010               decode_flush_er_chars (str, dst);
4011               cpos = c;
4012             }
4013           else if ( c < ' ' )
4014             {
4015               decode_flush_er_chars (str, dst);
4016               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4017               DECODE_ADD_BINARY_CHAR (c, dst);
4018             }
4019           else
4020             {
4021               /* DECODE_ADD_BINARY_CHAR (c, dst); */
4022               decode_add_er_char (str, c, dst);
4023             }
4024         }
4025     label_continue_loop:;
4026     }
4027
4028   /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
4029   if (flags & CODING_STATE_END)
4030     {
4031       decode_flush_er_chars (str, dst);
4032       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4033       if (flags & CODING_STATE_CR)
4034         Dynarr_add (dst, '\r');
4035     }
4036
4037   str->flags = flags;
4038   str->cpos  = cpos;
4039 }
4040
4041 /* Convert internally-formatted data to Big5. */
4042
4043 void
4044 char_encode_big5 (struct encoding_stream *str, Emchar ch,
4045                   unsigned_char_dynarr *dst, unsigned int *flags)
4046 {
4047   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4048
4049   if (ch == '\n')
4050     {
4051       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4052         Dynarr_add (dst, '\r');
4053       if (eol_type != EOL_CR)
4054         Dynarr_add (dst, ch);
4055     }
4056   else
4057     {
4058 #ifdef UTF2000
4059       int code_point;
4060       Lisp_Object ccs
4061         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4062
4063       if ((code_point = charset_code_point (Vcharset_ascii, ch, 0)) >= 0)
4064         Dynarr_add (dst, code_point);
4065       else if ((code_point = charset_code_point (ccs, ch, 0)) >= 0)
4066         {
4067           Dynarr_add (dst, code_point >> 8);
4068           Dynarr_add (dst, code_point & 0xFF);
4069         }
4070       else if ((code_point
4071                 = charset_code_point (Vcharset_chinese_big5, ch, 0)) >= 0)
4072         {
4073           Dynarr_add (dst, code_point >> 8);
4074           Dynarr_add (dst, code_point & 0xFF);
4075         }
4076       else if ((code_point
4077                 = charset_code_point (Vcharset_chinese_big5_1, ch, 0)) >= 0)
4078         {
4079           unsigned int I
4080             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4081             + ((code_point & 0xFF) - 33);
4082           unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
4083           unsigned char b2 = I % BIG5_SAME_ROW;
4084
4085           b2 += b2 < 0x3F ? 0x40 : 0x62;
4086           Dynarr_add (dst, b1);
4087           Dynarr_add (dst, b2);
4088         }
4089       else if ((code_point
4090                 = charset_code_point (Vcharset_chinese_big5_2, ch, 0)) >= 0)
4091         {
4092           unsigned int I
4093             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
4094             + ((code_point & 0xFF) - 33);
4095           unsigned char b1, b2;
4096
4097           I += BIG5_SAME_ROW * (0xC9 - 0xA1);
4098           b1 = I / BIG5_SAME_ROW + 0xA1;
4099           b2 = I % BIG5_SAME_ROW;
4100           b2 += b2 < 0x3F ? 0x40 : 0x62;
4101           Dynarr_add (dst, b1);
4102           Dynarr_add (dst, b2);
4103         }
4104       else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4105         {
4106           char buf[18];
4107
4108           char_encode_as_entity_reference (ch, buf);
4109           Dynarr_add_many (dst, buf, strlen (buf));
4110         }
4111       else
4112         Dynarr_add (dst, '?');
4113 #else
4114 #endif
4115     }
4116 }
4117
4118 void
4119 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4120                   unsigned int *flags)
4121 {
4122 }
4123
4124
4125 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
4126 Decode a Big5 character CODE of BIG5 coding-system.
4127 CODE is the character code in BIG5, a cons of two integers.
4128 Return the corresponding character.
4129 */
4130        (code))
4131 {
4132   unsigned char c1, c2, b1, b2;
4133
4134   CHECK_CONS (code);
4135   CHECK_INT (XCAR (code));
4136   CHECK_INT (XCDR (code));
4137   b1 = XINT (XCAR (code));
4138   b2 = XINT (XCDR (code));
4139   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
4140       BYTE_BIG5_TWO_BYTE_2_P (b2))
4141     {
4142       Charset_ID leading_byte;
4143       Lisp_Object charset;
4144       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
4145       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
4146       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
4147     }
4148   else
4149     return Qnil;
4150 }
4151
4152 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
4153 Encode the Big5 character CHARACTER in the BIG5 coding-system.
4154 Return the corresponding character code in Big5.
4155 */
4156        (character))
4157 {
4158   Lisp_Object charset;
4159   int c1, c2, b1, b2;
4160
4161   CHECK_CHAR_COERCE_INT (character);
4162   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
4163   if (EQ (charset, Vcharset_chinese_big5_1) ||
4164       EQ (charset, Vcharset_chinese_big5_2))
4165     {
4166       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
4167                    b1, b2);
4168       return Fcons (make_int (b1), make_int (b2));
4169     }
4170   else
4171     return Qnil;
4172 }
4173
4174 \f
4175 /************************************************************************/
4176 /*                           UCS-4 methods                              */
4177 /************************************************************************/
4178
4179 static int
4180 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4181 {
4182   while (n--)
4183     {
4184       unsigned char c = *(unsigned char *)src++;
4185       switch (st->ucs4.in_byte)
4186         {
4187         case 0:
4188           if (c >= 128)
4189             return 0;
4190           else
4191             st->ucs4.in_byte++;
4192           break;
4193         case 3:
4194           st->ucs4.in_byte = 0;
4195           break;
4196         default:
4197           st->ucs4.in_byte++;
4198         }
4199     }
4200   return CODING_CATEGORY_UCS4_MASK;
4201 }
4202
4203 static void
4204 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4205                     unsigned_char_dynarr *dst, Lstream_data_count n)
4206 {
4207   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4208   unsigned int flags = str->flags;
4209   unsigned int cpos  = str->cpos;
4210   unsigned char counter = str->counter;
4211
4212   while (n--)
4213     {
4214       unsigned char c = *(unsigned char *)src++;
4215       switch (counter)
4216         {
4217         case 0:
4218           cpos = c;
4219           counter = 3;
4220           break;
4221         case 1:
4222           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4223           cpos = 0;
4224           counter = 0;
4225           break;
4226         default:
4227           cpos = ( cpos << 8 ) | c;
4228           counter--;
4229         }
4230     }
4231   if (counter & CODING_STATE_END)
4232     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4233
4234   str->flags    = flags;
4235   str->cpos     = cpos;
4236   str->counter  = counter;
4237 }
4238
4239 void
4240 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4241                   unsigned_char_dynarr *dst, unsigned int *flags)
4242 {
4243   Dynarr_add (dst, ch >> 24);
4244   Dynarr_add (dst, ch >> 16);
4245   Dynarr_add (dst, ch >>  8);
4246   Dynarr_add (dst, ch      );
4247 }
4248
4249 void
4250 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4251                   unsigned int *flags)
4252 {
4253 }
4254
4255 \f
4256 /************************************************************************/
4257 /*                           UTF-16 methods                             */
4258 /************************************************************************/
4259
4260 static int
4261 detect_coding_utf16 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4262 {
4263   return CODING_CATEGORY_UTF16_MASK;
4264 }
4265
4266 static void
4267 decode_coding_utf16 (Lstream *decoding, const Extbyte *src,
4268                     unsigned_char_dynarr *dst, Lstream_data_count n)
4269 {
4270   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4271   unsigned int flags = str->flags;
4272   unsigned int cpos  = str->cpos;
4273   unsigned char counter = str->counter & 3;
4274   unsigned char byte_order = str->counter >> 2;
4275   eol_type_t eol_type = str->eol_type;
4276
4277   while (n--)
4278     {
4279       unsigned char c = *(unsigned char *)src++;
4280       if (counter == 0)
4281         {
4282           cpos = c;
4283           counter = 1;
4284         }
4285       else if (counter == 1)
4286         {
4287           int code;
4288
4289           if (byte_order == 0)
4290             code = (c << 8) | cpos;
4291           else
4292             code = (cpos << 8) | c;
4293           if (code == 0xFFFE)
4294             {
4295               code = ((code & 0xFF) << 8) | (code >> 8);
4296               if ( byte_order == 0 )
4297                 byte_order = 1;
4298               else
4299                 byte_order = 0;
4300             }
4301           if ( (0xD800 <= code) && (code <= 0xDBFF) )
4302             {
4303               counter = 2;
4304               cpos = code;
4305             }
4306           else
4307             {
4308               counter = 0;
4309               cpos = 0;
4310               if (code != 0xFEFF)
4311                 {
4312                   DECODE_HANDLE_EOL_TYPE (eol_type, code, flags, dst);
4313                   DECODE_ADD_UCS_CHAR (code, dst);
4314                 }
4315             }
4316         }
4317       else if (counter == 2)
4318         {
4319           cpos = (cpos << 8) | c;
4320           counter++;
4321         }
4322       else
4323         {
4324           int x = cpos >> 8;
4325           int y
4326             = (byte_order == 0)
4327             ? (c << 8) | (cpos & 0xFF)
4328             : ((cpos & 0xFF) << 8) | c;
4329
4330           DECODE_ADD_UCS_CHAR ((x - 0xD800) * 0x400 + (y - 0xDC00)
4331                                + 0x10000, dst);
4332           counter = 0;
4333           cpos = 0;
4334         }
4335     label_continue_loop:;
4336     }
4337   if (counter & CODING_STATE_END)
4338     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4339
4340   str->flags    = flags;
4341   str->cpos     = cpos;
4342   str->counter  = (byte_order << 2) | counter;
4343 }
4344
4345 void
4346 char_encode_utf16 (struct encoding_stream *str, Emchar ch,
4347                   unsigned_char_dynarr *dst, unsigned int *flags)
4348 {
4349   if (ch <= 0xFFFF)
4350     {
4351       Dynarr_add (dst, ch);
4352       Dynarr_add (dst, ch >> 8);
4353     }
4354   else
4355     {
4356       int y = ((ch - 0x10000) / 0x400) + 0xD800;
4357       int z = ((ch - 0x10000) % 0x400) + 0xDC00;
4358
4359       Dynarr_add (dst, y);
4360       Dynarr_add (dst, y >> 8);
4361       Dynarr_add (dst, z);
4362       Dynarr_add (dst, z >> 8);
4363     }
4364 }
4365
4366 void
4367 char_finish_utf16 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4368                   unsigned int *flags)
4369 {
4370 }
4371
4372 \f
4373 /************************************************************************/
4374 /*                           UTF-8 methods                              */
4375 /************************************************************************/
4376
4377 static int
4378 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4379 {
4380   while (n--)
4381     {
4382       unsigned char c = *(unsigned char *)src++;
4383       switch (st->utf8.in_byte)
4384         {
4385         case 0:
4386           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4387             return 0;
4388           else if (c >= 0xfc)
4389             st->utf8.in_byte = 5;
4390           else if (c >= 0xf8)
4391             st->utf8.in_byte = 4;
4392           else if (c >= 0xf0)
4393             st->utf8.in_byte = 3;
4394           else if (c >= 0xe0)
4395             st->utf8.in_byte = 2;
4396           else if (c >= 0xc0)
4397             st->utf8.in_byte = 1;
4398           else if (c >= 0x80)
4399             return 0;
4400           break;
4401         default:
4402           if ((c & 0xc0) != 0x80)
4403             return 0;
4404           else
4405             st->utf8.in_byte--;
4406         }
4407     }
4408   return CODING_CATEGORY_UTF8_MASK;
4409 }
4410
4411 static void
4412 decode_output_utf8_partial_char (unsigned char counter,
4413                                  unsigned int cpos,
4414                                  unsigned_char_dynarr *dst)
4415 {
4416   if (counter == 5)
4417     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4418   else if (counter == 4)
4419     {
4420       if (cpos < (1 << 6))
4421         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4422       else
4423         {
4424           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4425           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4426         }
4427     }
4428   else if (counter == 3)
4429     {
4430       if (cpos < (1 << 6))
4431         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4432       else if (cpos < (1 << 12))
4433         {
4434           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4435           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4436         }
4437       else
4438         {
4439           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4440           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4441           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4442         }
4443     }
4444   else if (counter == 2)
4445     {
4446       if (cpos < (1 << 6))
4447         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4448       else if (cpos < (1 << 12))
4449         {
4450           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4451           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4452         }
4453       else if (cpos < (1 << 18))
4454         {
4455           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4456           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4457           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
4458         }
4459       else
4460         {
4461           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4462           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4463           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4464           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4465         }
4466     }
4467   else
4468     {
4469       if (cpos < (1 << 6))
4470         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4471       else if (cpos < (1 << 12))
4472         {
4473           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4474           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4475         }
4476       else if (cpos < (1 << 18))
4477         {
4478           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4479           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4480           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4481         }
4482       else if (cpos < (1 << 24))
4483         {
4484           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4485           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4486           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4487           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4488         }
4489       else
4490         {
4491           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4492           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4493           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4494           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4495           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4496         }
4497     }
4498 }
4499
4500 static void
4501 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4502                     unsigned_char_dynarr *dst, Lstream_data_count n)
4503 {
4504   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4505   unsigned int flags    = str->flags;
4506   unsigned int cpos     = str->cpos;
4507   eol_type_t eol_type   = str->eol_type;
4508   unsigned char counter = str->counter;
4509 #ifdef UTF2000
4510   int bom_flag = str->bom_flag;
4511   Lisp_Object ccs
4512     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4513                                              (decoding)->codesys, 0);
4514 #endif
4515
4516   while (n--)
4517     {
4518       unsigned char c = *(unsigned char *)src++;
4519       if (counter == 0)
4520         {
4521           if ( c < ' ' )
4522             {
4523               COMPOSE_FLUSH_CHARS (str, dst);
4524               decode_flush_er_chars (str, dst);
4525               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4526
4527               if ( bom_flag == 0 )
4528                 bom_flag = -1;
4529
4530               DECODE_ADD_UCS_CHAR (c, dst);
4531             }
4532           else if ( c < 0xC0 )
4533             {
4534               if ( bom_flag == 0 )
4535                 bom_flag = -1;
4536
4537               /* decode_add_er_char (str, c, dst); */
4538               COMPOSE_ADD_CHAR (str, c, dst);
4539             }
4540           else
4541             {
4542               /* decode_flush_er_chars (str, dst); */
4543               if ( c < 0xE0 )
4544                 {
4545                   cpos = c & 0x1f;
4546                   counter = 1;
4547                 }
4548               else if ( c < 0xF0 )
4549                 {
4550                   cpos = c & 0x0f;
4551                   counter = 2;
4552                 }
4553               else if ( c < 0xF8 )
4554                 {
4555                   cpos = c & 0x07;
4556                   counter = 3;
4557                 }
4558               else if ( c < 0xFC )
4559                 {
4560                   cpos = c & 0x03;
4561                   counter = 4;
4562                 }
4563               else
4564                 {
4565                   cpos = c & 0x01;
4566                   counter = 5;
4567                 }
4568             }
4569         }
4570       else if ( (c & 0xC0) == 0x80 )
4571         {
4572           cpos = ( cpos << 6 ) | ( c & 0x3f );
4573           if (counter == 1)
4574             {
4575               Emchar char_id;
4576
4577               if ( bom_flag == 0 )
4578                 {
4579                   if ( cpos == 0xFEFF )
4580                     {
4581                       bom_flag = 1;
4582                       goto decoded;
4583                     }
4584                   else
4585                     bom_flag = -1;
4586                 }
4587
4588               if (!NILP (ccs))
4589                 {
4590                   char_id = decode_defined_char (ccs, cpos, 0);
4591
4592                   if (char_id < 0)
4593                     char_id = cpos;
4594                 }
4595               else
4596                 char_id = cpos;
4597               COMPOSE_ADD_CHAR (str, char_id, dst);
4598             decoded:
4599               cpos = 0;
4600               counter = 0;
4601             }
4602           else
4603             counter--;
4604         }
4605       else
4606         {
4607           COMPOSE_FLUSH_CHARS (str, dst);
4608           decode_flush_er_chars (str, dst);
4609           decode_output_utf8_partial_char (counter, cpos, dst);
4610           DECODE_ADD_BINARY_CHAR (c, dst);
4611           cpos = 0;
4612           counter = 0;
4613         }
4614     label_continue_loop:;
4615     }
4616
4617   if (flags & CODING_STATE_END)
4618     {
4619       COMPOSE_FLUSH_CHARS (str, dst);
4620       decode_flush_er_chars (str, dst);
4621       if (counter > 0)
4622         {
4623           decode_output_utf8_partial_char (counter, cpos, dst);
4624           cpos = 0;
4625           counter = 0;
4626         }
4627     }
4628   str->flags    = flags;
4629   str->cpos     = cpos;
4630   str->counter  = counter;
4631 #ifdef UTF2000
4632   str->bom_flag = bom_flag;
4633 #endif
4634 }
4635
4636 void
4637 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4638                   unsigned_char_dynarr *dst, unsigned int *flags)
4639 {
4640   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4641
4642   if (ch == '\n')
4643     {
4644       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4645         Dynarr_add (dst, '\r');
4646       if (eol_type != EOL_CR)
4647         Dynarr_add (dst, ch);
4648     }
4649   else if (ch <= 0x7f)
4650     {
4651       Dynarr_add (dst, ch);
4652     }
4653   else
4654     {
4655       Lisp_Object ucs_ccs
4656         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4657       int code_point = charset_code_point (ucs_ccs, ch, 0);
4658
4659       if ( (code_point < 0) || (code_point > 0xEFFFF) )
4660         {
4661           Lisp_Object seq = Fchar_feature (make_char (ch),
4662                                            Qrep_decomposition, Qnil,
4663                                            Qnil, Qnil);
4664           Lisp_Object map, ret;
4665
4666           if ( CONSP (seq) )
4667             {
4668               Lisp_Object base = Fcar (seq);
4669
4670               seq = Fcdr (seq);
4671               if ( CHARP (base) && CONSP (seq) )
4672                 {
4673                   Lisp_Object comb = Fcar (seq);
4674
4675                   if ( CHARP (comb) )
4676                     {
4677                       char_encode_utf8 (str, XCHAR (base), dst, flags);
4678                       char_encode_utf8 (str, XCHAR (comb), dst, flags);
4679                       return;
4680                     }
4681                 }
4682             }
4683
4684           map = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4685           if ( !NILP (map)
4686                && INTP (ret = Fchar_feature (make_char (ch),
4687                                              map, Qnil,
4688                                              Qnil, Qnil)) )
4689             code_point = XINT (ret);
4690           else if ( !NILP (map =
4691                            CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4692                            (str->codesys, 2))
4693                     && INTP (ret = Fchar_feature (make_char (ch),
4694                                                   map, Qnil,
4695                                                   Qnil, Qnil)) )
4696             code_point = XINT (ret);
4697           else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4698             {
4699               char buf[18];
4700
4701               char_encode_as_entity_reference (ch, buf);
4702               Dynarr_add_many (dst, buf, strlen (buf));
4703               return;
4704             }
4705           else
4706             code_point = ch;
4707         }
4708       if (code_point <= 0x7ff)
4709         {
4710           Dynarr_add (dst, (code_point >> 6) | 0xc0);
4711           Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4712         }
4713       else if (code_point <= 0xffff)
4714         {
4715           Dynarr_add (dst,  (code_point >> 12) | 0xe0);
4716           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4717           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4718         }
4719       else if (code_point <= 0x1fffff)
4720         {
4721           Dynarr_add (dst,  (code_point >> 18) | 0xf0);
4722           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4723           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4724           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4725         }
4726       else if (code_point <= 0x3ffffff)
4727         {
4728           Dynarr_add (dst,  (code_point >> 24) | 0xf8);
4729           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4730           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4731           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4732           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4733         }
4734       else
4735         {
4736           Dynarr_add (dst,  (code_point >> 30) | 0xfc);
4737           Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4738           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4739           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4740           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4741           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4742         }
4743     }
4744 }
4745
4746 void
4747 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4748                   unsigned int *flags)
4749 {
4750 }
4751
4752 \f
4753 /************************************************************************/
4754 /*                           ISO2022 methods                            */
4755 /************************************************************************/
4756
4757 /* The following note describes the coding system ISO2022 briefly.
4758    Since the intention of this note is to help understand the
4759    functions in this file, some parts are NOT ACCURATE or OVERLY
4760    SIMPLIFIED.  For thorough understanding, please refer to the
4761    original document of ISO2022.
4762
4763    ISO2022 provides many mechanisms to encode several character sets
4764    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4765    is encoded using bytes less than 128.  This may make the encoded
4766    text a little bit longer, but the text passes more easily through
4767    several gateways, some of which strip off MSB (Most Signigant Bit).
4768
4769    There are two kinds of character sets: control character set and
4770    graphic character set.  The former contains control characters such
4771    as `newline' and `escape' to provide control functions (control
4772    functions are also provided by escape sequences).  The latter
4773    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4774    two control character sets and many graphic character sets.
4775
4776    Graphic character sets are classified into one of the following
4777    four classes, according to the number of bytes (DIMENSION) and
4778    number of characters in one dimension (CHARS) of the set:
4779    - DIMENSION1_CHARS94
4780    - DIMENSION1_CHARS96
4781    - DIMENSION2_CHARS94
4782    - DIMENSION2_CHARS96
4783
4784    In addition, each character set is assigned an identification tag,
4785    unique for each set, called "final character" (denoted as <F>
4786    hereafter).  The <F> of each character set is decided by ECMA(*)
4787    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4788    (0x30..0x3F are for private use only).
4789
4790    Note (*): ECMA = European Computer Manufacturers Association
4791
4792    Here are examples of graphic character set [NAME(<F>)]:
4793         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4794         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4795         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4796         o DIMENSION2_CHARS96 -- none for the moment
4797
4798    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4799         C0 [0x00..0x1F] -- control character plane 0
4800         GL [0x20..0x7F] -- graphic character plane 0
4801         C1 [0x80..0x9F] -- control character plane 1
4802         GR [0xA0..0xFF] -- graphic character plane 1
4803
4804    A control character set is directly designated and invoked to C0 or
4805    C1 by an escape sequence.  The most common case is that:
4806    - ISO646's  control character set is designated/invoked to C0, and
4807    - ISO6429's control character set is designated/invoked to C1,
4808    and usually these designations/invocations are omitted in encoded
4809    text.  In a 7-bit environment, only C0 can be used, and a control
4810    character for C1 is encoded by an appropriate escape sequence to
4811    fit into the environment.  All control characters for C1 are
4812    defined to have corresponding escape sequences.
4813
4814    A graphic character set is at first designated to one of four
4815    graphic registers (G0 through G3), then these graphic registers are
4816    invoked to GL or GR.  These designations and invocations can be
4817    done independently.  The most common case is that G0 is invoked to
4818    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4819    these invocations and designations are omitted in encoded text.
4820    In a 7-bit environment, only GL can be used.
4821
4822    When a graphic character set of CHARS94 is invoked to GL, codes
4823    0x20 and 0x7F of the GL area work as control characters SPACE and
4824    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4825    be used.
4826
4827    There are two ways of invocation: locking-shift and single-shift.
4828    With locking-shift, the invocation lasts until the next different
4829    invocation, whereas with single-shift, the invocation affects the
4830    following character only and doesn't affect the locking-shift
4831    state.  Invocations are done by the following control characters or
4832    escape sequences:
4833
4834    ----------------------------------------------------------------------
4835    abbrev  function                  cntrl escape seq   description
4836    ----------------------------------------------------------------------
4837    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4838    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4839    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4840    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4841    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4842    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4843    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4844    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4845    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4846    ----------------------------------------------------------------------
4847    (*) These are not used by any known coding system.
4848
4849    Control characters for these functions are defined by macros
4850    ISO_CODE_XXX in `coding.h'.
4851
4852    Designations are done by the following escape sequences:
4853    ----------------------------------------------------------------------
4854    escape sequence      description
4855    ----------------------------------------------------------------------
4856    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4857    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4858    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4859    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4860    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4861    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4862    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4863    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4864    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4865    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4866    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4867    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4868    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4869    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4870    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4871    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4872    ----------------------------------------------------------------------
4873
4874    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4875    of dimension 1, chars 94, and final character <F>, etc...
4876
4877    Note (*): Although these designations are not allowed in ISO2022,
4878    Emacs accepts them on decoding, and produces them on encoding
4879    CHARS96 character sets in a coding system which is characterized as
4880    7-bit environment, non-locking-shift, and non-single-shift.
4881
4882    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4883    '(' can be omitted.  We refer to this as "short-form" hereafter.
4884
4885    Now you may notice that there are a lot of ways for encoding the
4886    same multilingual text in ISO2022.  Actually, there exist many
4887    coding systems such as Compound Text (used in X11's inter client
4888    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4889    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4890    localized platforms), and all of these are variants of ISO2022.
4891
4892    In addition to the above, Emacs handles two more kinds of escape
4893    sequences: ISO6429's direction specification and Emacs' private
4894    sequence for specifying character composition.
4895
4896    ISO6429's direction specification takes the following form:
4897         o CSI ']'      -- end of the current direction
4898         o CSI '0' ']'  -- end of the current direction
4899         o CSI '1' ']'  -- start of left-to-right text
4900         o CSI '2' ']'  -- start of right-to-left text
4901    The control character CSI (0x9B: control sequence introducer) is
4902    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4903
4904    Character composition specification takes the following form:
4905         o ESC '0' -- start character composition
4906         o ESC '1' -- end character composition
4907    Since these are not standard escape sequences of any ISO standard,
4908    their use with these meanings is restricted to Emacs only.  */
4909
4910 static void
4911 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4912 {
4913   int i;
4914
4915   for (i = 0; i < 4; i++)
4916     {
4917       if (!NILP (coding_system))
4918         iso->charset[i] =
4919           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4920       else
4921         iso->charset[i] = Qt;
4922       iso->invalid_designated[i] = 0;
4923     }
4924   iso->esc = ISO_ESC_NOTHING;
4925   iso->esc_bytes_index = 0;
4926   iso->register_left = 0;
4927   iso->register_right = 1;
4928   iso->switched_dir_and_no_valid_charset_yet = 0;
4929   iso->invalid_switch_dir = 0;
4930   iso->output_direction_sequence = 0;
4931   iso->output_literally = 0;
4932 #ifdef ENABLE_COMPOSITE_CHARS
4933   if (iso->composite_chars)
4934     Dynarr_reset (iso->composite_chars);
4935 #endif
4936 }
4937
4938 static int
4939 fit_to_be_escape_quoted (unsigned char c)
4940 {
4941   switch (c)
4942     {
4943     case ISO_CODE_ESC:
4944     case ISO_CODE_CSI:
4945     case ISO_CODE_SS2:
4946     case ISO_CODE_SS3:
4947     case ISO_CODE_SO:
4948     case ISO_CODE_SI:
4949       return 1;
4950
4951     default:
4952       return 0;
4953     }
4954 }
4955
4956 /* Parse one byte of an ISO2022 escape sequence.
4957    If the result is an invalid escape sequence, return 0 and
4958    do not change anything in STR.  Otherwise, if the result is
4959    an incomplete escape sequence, update ISO2022.ESC and
4960    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4961    all the state variables (but not ISO2022.ESC_BYTES) and
4962    return 1.
4963
4964    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4965    or invocation of an invalid character set and treat that as
4966    an unrecognized escape sequence. */
4967
4968 static int
4969 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4970                    unsigned char c, unsigned int *flags,
4971                    int check_invalid_charsets)
4972 {
4973   /* (1) If we're at the end of a designation sequence, CS is the
4974      charset being designated and REG is the register to designate
4975      it to.
4976
4977      (2) If we're at the end of a locking-shift sequence, REG is
4978      the register to invoke and HALF (0 == left, 1 == right) is
4979      the half to invoke it into.
4980
4981      (3) If we're at the end of a single-shift sequence, REG is
4982      the register to invoke. */
4983   Lisp_Object cs = Qnil;
4984   int reg, half;
4985
4986   /* NOTE: This code does goto's all over the fucking place.
4987      The reason for this is that we're basically implementing
4988      a state machine here, and hierarchical languages like C
4989      don't really provide a clean way of doing this. */
4990
4991   if (! (*flags & CODING_STATE_ESCAPE))
4992     /* At beginning of escape sequence; we need to reset our
4993        escape-state variables. */
4994     iso->esc = ISO_ESC_NOTHING;
4995
4996   iso->output_literally = 0;
4997   iso->output_direction_sequence = 0;
4998
4999   switch (iso->esc)
5000     {
5001     case ISO_ESC_NOTHING:
5002       iso->esc_bytes_index = 0;
5003       switch (c)
5004         {
5005         case ISO_CODE_ESC:      /* Start escape sequence */
5006           *flags |= CODING_STATE_ESCAPE;
5007           iso->esc = ISO_ESC;
5008           goto not_done;
5009
5010         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
5011           *flags |= CODING_STATE_ESCAPE;
5012           iso->esc = ISO_ESC_5_11;
5013           goto not_done;
5014
5015         case ISO_CODE_SO:       /* locking shift 1 */
5016           reg = 1; half = 0;
5017           goto locking_shift;
5018         case ISO_CODE_SI:       /* locking shift 0 */
5019           reg = 0; half = 0;
5020           goto locking_shift;
5021
5022         case ISO_CODE_SS2:      /* single shift */
5023           reg = 2;
5024           goto single_shift;
5025         case ISO_CODE_SS3:      /* single shift */
5026           reg = 3;
5027           goto single_shift;
5028
5029         default:                        /* Other control characters */
5030           return 0;
5031         }
5032
5033     case ISO_ESC:
5034       switch (c)
5035         {
5036           /**** single shift ****/
5037
5038         case 'N':       /* single shift 2 */
5039           reg = 2;
5040           goto single_shift;
5041         case 'O':       /* single shift 3 */
5042           reg = 3;
5043           goto single_shift;
5044
5045           /**** locking shift ****/
5046
5047         case '~':       /* locking shift 1 right */
5048           reg = 1; half = 1;
5049           goto locking_shift;
5050         case 'n':       /* locking shift 2 */
5051           reg = 2; half = 0;
5052           goto locking_shift;
5053         case '}':       /* locking shift 2 right */
5054           reg = 2; half = 1;
5055           goto locking_shift;
5056         case 'o':       /* locking shift 3 */
5057           reg = 3; half = 0;
5058           goto locking_shift;
5059         case '|':       /* locking shift 3 right */
5060           reg = 3; half = 1;
5061           goto locking_shift;
5062
5063 #ifdef ENABLE_COMPOSITE_CHARS
5064           /**** composite ****/
5065
5066         case '0':
5067           iso->esc = ISO_ESC_START_COMPOSITE;
5068           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
5069             CODING_STATE_COMPOSITE;
5070           return 1;
5071
5072         case '1':
5073           iso->esc = ISO_ESC_END_COMPOSITE;
5074           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
5075             ~CODING_STATE_COMPOSITE;
5076           return 1;
5077 #endif /* ENABLE_COMPOSITE_CHARS */
5078
5079           /**** directionality ****/
5080
5081         case '[':
5082           iso->esc = ISO_ESC_5_11;
5083           goto not_done;
5084
5085           /**** designation ****/
5086
5087         case '$':       /* multibyte charset prefix */
5088           iso->esc = ISO_ESC_2_4;
5089           goto not_done;
5090
5091         default:
5092           if (0x28 <= c && c <= 0x2F)
5093             {
5094               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
5095               goto not_done;
5096             }
5097
5098           /* This function is called with CODESYS equal to nil when
5099              doing coding-system detection. */
5100           if (!NILP (codesys)
5101               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5102               && fit_to_be_escape_quoted (c))
5103             {
5104               iso->esc = ISO_ESC_LITERAL;
5105               *flags &= CODING_STATE_ISO2022_LOCK;
5106               return 1;
5107             }
5108
5109           /* bzzzt! */
5110           return 0;
5111         }
5112
5113
5114
5115       /**** directionality ****/
5116
5117     case ISO_ESC_5_11:          /* ISO6429 direction control */
5118       if (c == ']')
5119         {
5120           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5121           goto directionality;
5122         }
5123       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
5124       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
5125       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
5126       else               return 0;
5127       goto not_done;
5128
5129     case ISO_ESC_5_11_0:
5130       if (c == ']')
5131         {
5132           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5133           goto directionality;
5134         }
5135       return 0;
5136
5137     case ISO_ESC_5_11_1:
5138       if (c == ']')
5139         {
5140           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
5141           goto directionality;
5142         }
5143       return 0;
5144
5145     case ISO_ESC_5_11_2:
5146       if (c == ']')
5147         {
5148           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
5149           goto directionality;
5150         }
5151       return 0;
5152
5153     directionality:
5154       iso->esc = ISO_ESC_DIRECTIONALITY;
5155       /* Various junk here to attempt to preserve the direction sequences
5156          literally in the text if they would otherwise be swallowed due
5157          to invalid designations that don't show up as actual charset
5158          changes in the text. */
5159       if (iso->invalid_switch_dir)
5160         {
5161           /* We already inserted a direction switch literally into the
5162              text.  We assume (#### this may not be right) that the
5163              next direction switch is the one going the other way,
5164              and we need to output that literally as well. */
5165           iso->output_literally = 1;
5166           iso->invalid_switch_dir = 0;
5167         }
5168       else
5169         {
5170           int jj;
5171
5172           /* If we are in the thrall of an invalid designation,
5173            then stick the directionality sequence literally into the
5174            output stream so it ends up in the original text again. */
5175           for (jj = 0; jj < 4; jj++)
5176             if (iso->invalid_designated[jj])
5177               break;
5178           if (jj < 4)
5179             {
5180               iso->output_literally = 1;
5181               iso->invalid_switch_dir = 1;
5182             }
5183           else
5184             /* Indicate that we haven't yet seen a valid designation,
5185                so that if a switch-dir is directly followed by an
5186                invalid designation, both get inserted literally. */
5187             iso->switched_dir_and_no_valid_charset_yet = 1;
5188         }
5189       return 1;
5190
5191
5192       /**** designation ****/
5193
5194     case ISO_ESC_2_4:
5195       if (0x28 <= c && c <= 0x2F)
5196         {
5197           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
5198           goto not_done;
5199         }
5200       if (0x40 <= c && c <= 0x42)
5201         {
5202           /* 94^n-set */
5203           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
5204                                       *flags & CODING_STATE_R2L ?
5205                                       CHARSET_RIGHT_TO_LEFT :
5206                                       CHARSET_LEFT_TO_RIGHT);
5207           reg = 0;
5208           goto designated;
5209         }
5210       return 0;
5211
5212     default:
5213       {
5214         int chars = 0;
5215         int single = 0;
5216
5217         if (c < '0' || c > '~')
5218           return 0; /* bad final byte */
5219
5220         if (iso->esc >= ISO_ESC_2_8 &&
5221             iso->esc <= ISO_ESC_2_15)
5222           {
5223             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
5224             single = 1; /* single-byte */
5225             reg = (iso->esc - ISO_ESC_2_8) & 3;
5226           }
5227         else if (iso->esc >= ISO_ESC_2_4_8 &&
5228                  iso->esc <= ISO_ESC_2_4_15)
5229           {
5230             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
5231             single = -1; /* multi-byte */
5232             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
5233           }
5234         else
5235           {
5236             /* Can this ever be reached? -slb */
5237             abort();
5238           }
5239
5240         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
5241                                     *flags & CODING_STATE_R2L ?
5242                                     CHARSET_RIGHT_TO_LEFT :
5243                                     CHARSET_LEFT_TO_RIGHT);
5244         goto designated;
5245       }
5246     }
5247
5248  not_done:
5249   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
5250   return -1;
5251
5252  single_shift:
5253   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
5254     /* can't invoke something that ain't there. */
5255     return 0;
5256   iso->esc = ISO_ESC_SINGLE_SHIFT;
5257   *flags &= CODING_STATE_ISO2022_LOCK;
5258   if (reg == 2)
5259     *flags |= CODING_STATE_SS2;
5260   else
5261     *flags |= CODING_STATE_SS3;
5262   return 1;
5263
5264  locking_shift:
5265   if (check_invalid_charsets &&
5266       !CHARSETP (iso->charset[reg]))
5267     /* can't invoke something that ain't there. */
5268     return 0;
5269   if (half)
5270     iso->register_right = reg;
5271   else
5272     iso->register_left = reg;
5273   *flags &= CODING_STATE_ISO2022_LOCK;
5274   iso->esc = ISO_ESC_LOCKING_SHIFT;
5275   return 1;
5276
5277  designated:
5278   if (NILP (cs) && check_invalid_charsets)
5279     {
5280       iso->invalid_designated[reg] = 1;
5281       iso->charset[reg] = Vcharset_ascii;
5282       iso->esc = ISO_ESC_DESIGNATE;
5283       *flags &= CODING_STATE_ISO2022_LOCK;
5284       iso->output_literally = 1;
5285       if (iso->switched_dir_and_no_valid_charset_yet)
5286         {
5287           /* We encountered a switch-direction followed by an
5288              invalid designation.  Ensure that the switch-direction
5289              gets outputted; otherwise it will probably get eaten
5290              when the text is written out again. */
5291           iso->switched_dir_and_no_valid_charset_yet = 0;
5292           iso->output_direction_sequence = 1;
5293           /* And make sure that the switch-dir going the other
5294              way gets outputted, as well. */
5295           iso->invalid_switch_dir = 1;
5296         }
5297       return 1;
5298     }
5299   /* This function is called with CODESYS equal to nil when
5300      doing coding-system detection. */
5301   if (!NILP (codesys))
5302     {
5303       charset_conversion_spec_dynarr *dyn =
5304         XCODING_SYSTEM (codesys)->iso2022.input_conv;
5305
5306       if (dyn)
5307         {
5308           int i;
5309
5310           for (i = 0; i < Dynarr_length (dyn); i++)
5311             {
5312               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5313               if (EQ (cs, spec->from_charset))
5314                 cs = spec->to_charset;
5315             }
5316         }
5317     }
5318
5319   iso->charset[reg] = cs;
5320   iso->esc = ISO_ESC_DESIGNATE;
5321   *flags &= CODING_STATE_ISO2022_LOCK;
5322   if (iso->invalid_designated[reg])
5323     {
5324       iso->invalid_designated[reg] = 0;
5325       iso->output_literally = 1;
5326     }
5327   if (iso->switched_dir_and_no_valid_charset_yet)
5328     iso->switched_dir_and_no_valid_charset_yet = 0;
5329   return 1;
5330 }
5331
5332 static int
5333 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
5334 {
5335   int mask;
5336
5337   /* #### There are serious deficiencies in the recognition mechanism
5338      here.  This needs to be much smarter if it's going to cut it.
5339      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5340      it should be detected as Latin-1.
5341      All the ISO2022 stuff in this file should be synced up with the
5342      code from FSF Emacs-20.4, in which Mule should be more or less stable.
5343      Perhaps we should wait till R2L works in FSF Emacs? */
5344
5345   if (!st->iso2022.initted)
5346     {
5347       reset_iso2022 (Qnil, &st->iso2022.iso);
5348       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5349                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5350                           CODING_CATEGORY_ISO_8_1_MASK |
5351                           CODING_CATEGORY_ISO_8_2_MASK |
5352                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5353       st->iso2022.flags = 0;
5354       st->iso2022.high_byte_count = 0;
5355       st->iso2022.saw_single_shift = 0;
5356       st->iso2022.initted = 1;
5357     }
5358
5359   mask = st->iso2022.mask;
5360
5361   while (n--)
5362     {
5363       unsigned char c = *(unsigned char *)src++;
5364       if (c >= 0xA0)
5365         {
5366           mask &= ~CODING_CATEGORY_ISO_7_MASK;
5367           st->iso2022.high_byte_count++;
5368         }
5369       else
5370         {
5371           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5372             {
5373               if (st->iso2022.high_byte_count & 1)
5374                 /* odd number of high bytes; assume not iso-8-2 */
5375                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5376             }
5377           st->iso2022.high_byte_count = 0;
5378           st->iso2022.saw_single_shift = 0;
5379           if (c > 0x80)
5380             mask &= ~CODING_CATEGORY_ISO_7_MASK;
5381         }
5382       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5383           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5384         { /* control chars */
5385           switch (c)
5386             {
5387               /* Allow and ignore control characters that you might
5388                  reasonably see in a text file */
5389             case '\r':
5390             case '\n':
5391             case '\t':
5392             case  7: /* bell */
5393             case  8: /* backspace */
5394             case 11: /* vertical tab */
5395             case 12: /* form feed */
5396             case 26: /* MS-DOS C-z junk */
5397             case 31: /* '^_' -- for info */
5398               goto label_continue_loop;
5399
5400             default:
5401               break;
5402             }
5403         }
5404
5405       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5406           || BYTE_C1_P (c))
5407         {
5408           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5409                                  &st->iso2022.flags, 0))
5410             {
5411               switch (st->iso2022.iso.esc)
5412                 {
5413                 case ISO_ESC_DESIGNATE:
5414                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5415                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5416                   break;
5417                 case ISO_ESC_LOCKING_SHIFT:
5418                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5419                   goto ran_out_of_chars;
5420                 case ISO_ESC_SINGLE_SHIFT:
5421                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5422                   st->iso2022.saw_single_shift = 1;
5423                   break;
5424                 default:
5425                   break;
5426                 }
5427             }
5428           else
5429             {
5430               mask = 0;
5431               goto ran_out_of_chars;
5432             }
5433         }
5434     label_continue_loop:;
5435     }
5436
5437  ran_out_of_chars:
5438
5439   return mask;
5440 }
5441
5442 static int
5443 postprocess_iso2022_mask (int mask)
5444 {
5445   /* #### kind of cheesy */
5446   /* If seven-bit ISO is allowed, then assume that the encoding is
5447      entirely seven-bit and turn off the eight-bit ones. */
5448   if (mask & CODING_CATEGORY_ISO_7_MASK)
5449     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5450                CODING_CATEGORY_ISO_8_1_MASK |
5451                CODING_CATEGORY_ISO_8_2_MASK);
5452   return mask;
5453 }
5454
5455 /* If FLAGS is a null pointer or specifies right-to-left motion,
5456    output a switch-dir-to-left-to-right sequence to DST.
5457    Also update FLAGS if it is not a null pointer.
5458    If INTERNAL_P is set, we are outputting in internal format and
5459    need to handle the CSI differently. */
5460
5461 static void
5462 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5463                                  unsigned_char_dynarr *dst,
5464                                  unsigned int *flags,
5465                                  int internal_p)
5466 {
5467   if (!flags || (*flags & CODING_STATE_R2L))
5468     {
5469       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5470         {
5471           Dynarr_add (dst, ISO_CODE_ESC);
5472           Dynarr_add (dst, '[');
5473         }
5474       else if (internal_p)
5475         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5476       else
5477         Dynarr_add (dst, ISO_CODE_CSI);
5478       Dynarr_add (dst, '0');
5479       Dynarr_add (dst, ']');
5480       if (flags)
5481         *flags &= ~CODING_STATE_R2L;
5482     }
5483 }
5484
5485 /* If FLAGS is a null pointer or specifies a direction different from
5486    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5487    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5488    sequence to DST.  Also update FLAGS if it is not a null pointer.
5489    If INTERNAL_P is set, we are outputting in internal format and
5490    need to handle the CSI differently. */
5491
5492 static void
5493 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5494                           unsigned_char_dynarr *dst, unsigned int *flags,
5495                           int internal_p)
5496 {
5497   if ((!flags || (*flags & CODING_STATE_R2L)) &&
5498       direction == CHARSET_LEFT_TO_RIGHT)
5499     restore_left_to_right_direction (codesys, dst, flags, internal_p);
5500   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5501            && (!flags || !(*flags & CODING_STATE_R2L)) &&
5502            direction == CHARSET_RIGHT_TO_LEFT)
5503     {
5504       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5505         {
5506           Dynarr_add (dst, ISO_CODE_ESC);
5507           Dynarr_add (dst, '[');
5508         }
5509       else if (internal_p)
5510         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5511       else
5512         Dynarr_add (dst, ISO_CODE_CSI);
5513       Dynarr_add (dst, '2');
5514       Dynarr_add (dst, ']');
5515       if (flags)
5516         *flags |= CODING_STATE_R2L;
5517     }
5518 }
5519
5520 /* Convert ISO2022-format data to internal format. */
5521
5522 static void
5523 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5524                        unsigned_char_dynarr *dst, Lstream_data_count n)
5525 {
5526   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5527   unsigned int flags    = str->flags;
5528   unsigned int cpos     = str->cpos;
5529   unsigned char counter = str->counter;
5530   eol_type_t eol_type   = str->eol_type;
5531 #ifdef ENABLE_COMPOSITE_CHARS
5532   unsigned_char_dynarr *real_dst = dst;
5533 #endif
5534   Lisp_Object coding_system;
5535
5536   XSETCODING_SYSTEM (coding_system, str->codesys);
5537
5538 #ifdef ENABLE_COMPOSITE_CHARS
5539   if (flags & CODING_STATE_COMPOSITE)
5540     dst = str->iso2022.composite_chars;
5541 #endif /* ENABLE_COMPOSITE_CHARS */
5542
5543   while (n--)
5544     {
5545       unsigned char c = *(unsigned char *)src++;
5546       if (flags & CODING_STATE_ESCAPE)
5547         {       /* Within ESC sequence */
5548           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5549                                           c, &flags, 1);
5550
5551           if (retval)
5552             {
5553               switch (str->iso2022.esc)
5554                 {
5555 #ifdef ENABLE_COMPOSITE_CHARS
5556                 case ISO_ESC_START_COMPOSITE:
5557                   if (str->iso2022.composite_chars)
5558                     Dynarr_reset (str->iso2022.composite_chars);
5559                   else
5560                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5561                   dst = str->iso2022.composite_chars;
5562                   break;
5563                 case ISO_ESC_END_COMPOSITE:
5564                   {
5565                     Bufbyte comstr[MAX_EMCHAR_LEN];
5566                     Bytecount len;
5567                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5568                                                          Dynarr_length (dst));
5569                     dst = real_dst;
5570                     len = set_charptr_emchar (comstr, emch);
5571                     Dynarr_add_many (dst, comstr, len);
5572                     break;
5573                   }
5574 #endif /* ENABLE_COMPOSITE_CHARS */
5575
5576                 case ISO_ESC_LITERAL:
5577                   COMPOSE_FLUSH_CHARS (str, dst);
5578                   decode_flush_er_chars (str, dst);
5579                   DECODE_ADD_BINARY_CHAR (c, dst);
5580                   break;
5581
5582                 default:
5583                   /* Everything else handled already */
5584                   break;
5585                 }
5586             }
5587
5588           /* Attempted error recovery. */
5589           if (str->iso2022.output_direction_sequence)
5590             ensure_correct_direction (flags & CODING_STATE_R2L ?
5591                                       CHARSET_RIGHT_TO_LEFT :
5592                                       CHARSET_LEFT_TO_RIGHT,
5593                                       str->codesys, dst, 0, 1);
5594           /* More error recovery. */
5595           if (!retval || str->iso2022.output_literally)
5596             {
5597               /* Output the (possibly invalid) sequence */
5598               int i;
5599               COMPOSE_FLUSH_CHARS (str, dst);
5600               decode_flush_er_chars (str, dst);
5601               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5602                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5603               flags &= CODING_STATE_ISO2022_LOCK;
5604               if (!retval)
5605                 n++, src--;/* Repeat the loop with the same character. */
5606               else
5607                 {
5608                   /* No sense in reprocessing the final byte of the
5609                      escape sequence; it could mess things up anyway.
5610                      Just add it now. */
5611                   COMPOSE_FLUSH_CHARS (str, dst);
5612                   decode_flush_er_chars (str, dst);
5613                   DECODE_ADD_BINARY_CHAR (c, dst);
5614                 }
5615             }
5616           cpos = 0;
5617           counter = 0;
5618         }
5619       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5620         { /* Control characters */
5621
5622           /***** Error-handling *****/
5623
5624           /* If we were in the middle of a character, dump out the
5625              partial character. */
5626           if (counter)
5627             {
5628               COMPOSE_FLUSH_CHARS (str, dst);
5629               decode_flush_er_chars (str, dst);
5630               while (counter > 0)
5631                 {
5632                   counter--;
5633                   DECODE_ADD_BINARY_CHAR
5634                     ((unsigned char)(cpos >> (counter * 8)), dst);
5635                 }
5636               cpos = 0;
5637             }
5638
5639           /* If we just saw a single-shift character, dump it out.
5640              This may dump out the wrong sort of single-shift character,
5641              but least it will give an indication that something went
5642              wrong. */
5643           if (flags & CODING_STATE_SS2)
5644             {
5645               COMPOSE_FLUSH_CHARS (str, dst);
5646               decode_flush_er_chars (str, dst);
5647               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5648               flags &= ~CODING_STATE_SS2;
5649             }
5650           if (flags & CODING_STATE_SS3)
5651             {
5652               COMPOSE_FLUSH_CHARS (str, dst);
5653               decode_flush_er_chars (str, dst);
5654               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5655               flags &= ~CODING_STATE_SS3;
5656             }
5657
5658           /***** Now handle the control characters. *****/
5659
5660           /* Handle CR/LF */
5661 #ifdef UTF2000
5662           if (c == '\r')
5663             {
5664               COMPOSE_FLUSH_CHARS (str, dst);
5665               decode_flush_er_chars (str, dst);
5666               if (eol_type == EOL_CR)
5667                 Dynarr_add (dst, '\n');
5668               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5669                 Dynarr_add (dst, c);
5670               else
5671                 flags |= CODING_STATE_CR;
5672               goto label_continue_loop;
5673             }
5674           else if (flags & CODING_STATE_CR)
5675             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5676               if (c != '\n')
5677                 Dynarr_add (dst, '\r');
5678               flags &= ~CODING_STATE_CR;
5679             }
5680 #else
5681           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5682 #endif
5683
5684           flags &= CODING_STATE_ISO2022_LOCK;
5685
5686           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5687             {
5688               COMPOSE_FLUSH_CHARS (str, dst);
5689               decode_flush_er_chars (str, dst);
5690               DECODE_ADD_BINARY_CHAR (c, dst);
5691             }
5692         }
5693       else
5694         {                       /* Graphic characters */
5695           Lisp_Object charset;
5696 #ifndef UTF2000
5697           Charset_ID lb;
5698 #endif
5699           int reg;
5700
5701 #ifdef UTF2000
5702           if (c == '\r')
5703             {
5704               COMPOSE_FLUSH_CHARS (str, dst);
5705               decode_flush_er_chars (str, dst);
5706               if (eol_type == EOL_CR)
5707                 Dynarr_add (dst, '\n');
5708               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5709                 Dynarr_add (dst, c);
5710               else
5711                 flags |= CODING_STATE_CR;
5712               goto label_continue_loop;
5713             }
5714           else if (flags & CODING_STATE_CR)
5715             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5716               if (c != '\n')
5717                 Dynarr_add (dst, '\r');
5718               flags &= ~CODING_STATE_CR;
5719             }
5720 #else
5721           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5722 #endif
5723
5724           /* Now determine the charset. */
5725           reg = ((flags & CODING_STATE_SS2) ? 2
5726                  : (flags & CODING_STATE_SS3) ? 3
5727                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5728                  : str->iso2022.register_left);
5729           charset = str->iso2022.charset[reg];
5730
5731           /* Error checking: */
5732           if (! CHARSETP (charset)
5733               || str->iso2022.invalid_designated[reg]
5734               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5735                   && XCHARSET_CHARS (charset) == 94))
5736             /* Mrmph.  We are trying to invoke a register that has no
5737                or an invalid charset in it, or trying to add a character
5738                outside the range of the charset.  Insert that char literally
5739                to preserve it for the output. */
5740             {
5741               COMPOSE_FLUSH_CHARS (str, dst);
5742               decode_flush_er_chars (str, dst);
5743               while (counter > 0)
5744                 {
5745                   counter--;
5746                   DECODE_ADD_BINARY_CHAR
5747                     ((unsigned char)(cpos >> (counter * 8)), dst);
5748                 }
5749               cpos = 0;
5750               DECODE_ADD_BINARY_CHAR (c, dst);
5751             }
5752
5753           else
5754             {
5755               /* Things are probably hunky-dorey. */
5756
5757               /* Fetch reverse charset, maybe. */
5758               if (((flags & CODING_STATE_R2L) &&
5759                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5760                   ||
5761                   (!(flags & CODING_STATE_R2L) &&
5762                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5763                 {
5764                   Lisp_Object new_charset =
5765                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5766                   if (!NILP (new_charset))
5767                     charset = new_charset;
5768                 }
5769
5770 #ifdef UTF2000
5771               counter++;
5772               if (XCHARSET_DIMENSION (charset) == counter)
5773                 {
5774                   COMPOSE_ADD_CHAR (str,
5775                                     DECODE_CHAR (charset,
5776                                                  ((cpos & 0x7F7F7F) << 8)
5777                                                  | (c & 0x7F), 0),
5778                                     dst);
5779                   cpos = 0;
5780                   counter = 0;
5781                 }
5782               else
5783                 cpos = (cpos << 8) | c;
5784 #else
5785               lb = XCHARSET_LEADING_BYTE (charset);
5786               switch (XCHARSET_REP_BYTES (charset))
5787                 {
5788                 case 1: /* ASCII */
5789                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5790                   Dynarr_add (dst, c & 0x7F);
5791                   break;
5792
5793                 case 2: /* one-byte official */
5794                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5795                   Dynarr_add (dst, lb);
5796                   Dynarr_add (dst, c | 0x80);
5797                   break;
5798
5799                 case 3: /* one-byte private or two-byte official */
5800                   if (XCHARSET_PRIVATE_P (charset))
5801                     {
5802                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5803                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5804                       Dynarr_add (dst, lb);
5805                       Dynarr_add (dst, c | 0x80);
5806                     }
5807                   else
5808                     {
5809                       if (ch)
5810                         {
5811                           Dynarr_add (dst, lb);
5812                           Dynarr_add (dst, ch | 0x80);
5813                           Dynarr_add (dst, c | 0x80);
5814                           ch = 0;
5815                         }
5816                       else
5817                         ch = c;
5818                     }
5819                   break;
5820
5821                 default:        /* two-byte private */
5822                   if (ch)
5823                     {
5824                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5825                       Dynarr_add (dst, lb);
5826                       Dynarr_add (dst, ch | 0x80);
5827                       Dynarr_add (dst, c | 0x80);
5828                       ch = 0;
5829                     }
5830                   else
5831                     ch = c;
5832                 }
5833 #endif
5834             }
5835
5836           if (!cpos)
5837             flags &= CODING_STATE_ISO2022_LOCK;
5838         }
5839
5840     label_continue_loop:;
5841     }
5842
5843   if (flags & CODING_STATE_END)
5844     {
5845       COMPOSE_FLUSH_CHARS (str, dst);
5846       decode_flush_er_chars (str, dst);
5847       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5848     }
5849   str->flags   = flags;
5850   str->cpos    = cpos;
5851   str->counter = counter;
5852 }
5853
5854
5855 /***** ISO2022 encoder *****/
5856
5857 /* Designate CHARSET into register REG. */
5858
5859 static void
5860 iso2022_designate (Lisp_Object charset, unsigned char reg,
5861                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5862 {
5863   static const char inter94[] = "()*+";
5864   static const char inter96[] = ",-./";
5865   unsigned short chars;
5866   unsigned char dimension;
5867   unsigned char final;
5868   Lisp_Object old_charset = str->iso2022.charset[reg];
5869
5870   str->iso2022.charset[reg] = charset;
5871   if (!CHARSETP (charset))
5872     /* charset might be an initial nil or t. */
5873     return;
5874   chars = XCHARSET_CHARS (charset);
5875   dimension = XCHARSET_DIMENSION (charset);
5876   final = XCHARSET_FINAL (charset);
5877   if (!str->iso2022.force_charset_on_output[reg] &&
5878       CHARSETP (old_charset) &&
5879       XCHARSET_CHARS (old_charset) == chars &&
5880       XCHARSET_DIMENSION (old_charset) == dimension &&
5881       XCHARSET_FINAL (old_charset) == final)
5882     return;
5883
5884   str->iso2022.force_charset_on_output[reg] = 0;
5885
5886   {
5887     charset_conversion_spec_dynarr *dyn =
5888       str->codesys->iso2022.output_conv;
5889
5890     if (dyn)
5891       {
5892         int i;
5893
5894         for (i = 0; i < Dynarr_length (dyn); i++)
5895           {
5896             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5897             if (EQ (charset, spec->from_charset))
5898                 charset = spec->to_charset;
5899           }
5900       }
5901   }
5902
5903   Dynarr_add (dst, ISO_CODE_ESC);
5904   switch (chars)
5905     {
5906     case 94:
5907       if (dimension == 1)
5908         Dynarr_add (dst, inter94[reg]);
5909       else
5910         {
5911           Dynarr_add (dst, '$');
5912           if (reg != 0
5913               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5914               || final < '@'
5915               || final > 'B')
5916             Dynarr_add (dst, inter94[reg]);
5917         }
5918       break;
5919     case 96:
5920       if (dimension == 1)
5921         Dynarr_add (dst, inter96[reg]);
5922       else
5923         {
5924           Dynarr_add (dst, '$');
5925           Dynarr_add (dst, inter96[reg]);
5926         }
5927       break;
5928     }
5929   Dynarr_add (dst, final);
5930 }
5931
5932 static void
5933 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5934 {
5935   if (str->iso2022.register_left != 0)
5936     {
5937       Dynarr_add (dst, ISO_CODE_SI);
5938       str->iso2022.register_left = 0;
5939     }
5940 }
5941
5942 static void
5943 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5944 {
5945   if (str->iso2022.register_left != 1)
5946     {
5947       Dynarr_add (dst, ISO_CODE_SO);
5948       str->iso2022.register_left = 1;
5949     }
5950 }
5951
5952 void
5953 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5954                      unsigned_char_dynarr *dst, unsigned int *flags)
5955 {
5956   unsigned char charmask;
5957   Lisp_Coding_System* codesys = str->codesys;
5958   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5959   int i;
5960   Lisp_Object charset = str->iso2022.current_charset;
5961   int half = str->iso2022.current_half;
5962   int code_point = -1;
5963
5964   if (ch <= 0x7F)
5965     {
5966       restore_left_to_right_direction (codesys, dst, flags, 0);
5967
5968       /* Make sure G0 contains ASCII */
5969       if ((ch > ' ' && ch < ISO_CODE_DEL)
5970           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5971         {
5972           ensure_normal_shift (str, dst);
5973           iso2022_designate (Vcharset_ascii, 0, str, dst);
5974         }
5975
5976       /* If necessary, restore everything to the default state
5977          at end-of-line */
5978       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5979         {
5980           restore_left_to_right_direction (codesys, dst, flags, 0);
5981
5982           ensure_normal_shift (str, dst);
5983
5984           for (i = 0; i < 4; i++)
5985             {
5986               Lisp_Object initial_charset =
5987                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5988               iso2022_designate (initial_charset, i, str, dst);
5989             }
5990         }
5991       if (ch == '\n')
5992         {
5993           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5994             Dynarr_add (dst, '\r');
5995           if (eol_type != EOL_CR)
5996             Dynarr_add (dst, ch);
5997         }
5998       else
5999         {
6000           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6001               && fit_to_be_escape_quoted (ch))
6002             Dynarr_add (dst, ISO_CODE_ESC);
6003           Dynarr_add (dst, ch);
6004         }
6005     }
6006   else if ( (0x80 <= ch) && (ch <= 0x9f) )
6007     {
6008       charmask = (half == 0 ? 0x00 : 0x80);
6009
6010       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
6011           && fit_to_be_escape_quoted (ch))
6012         Dynarr_add (dst, ISO_CODE_ESC);
6013       /* you asked for it ... */
6014       Dynarr_add (dst, ch);
6015     }
6016   else
6017     {
6018       int reg;
6019
6020       /* Now determine which register to use. */
6021       reg = -1;
6022       for (i = 0; i < 4; i++)
6023         {
6024           if ((CHARSETP (charset = str->iso2022.charset[i])
6025                && ((code_point = charset_code_point (charset, ch, 0)) >= 0))
6026               ||
6027               (CHARSETP
6028                (charset
6029                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
6030                && ((code_point = charset_code_point (charset, ch, 0)) >= 0)))
6031             {
6032               reg = i;
6033               break;
6034             }
6035         }
6036       if (reg == -1)
6037         {
6038           Lisp_Object original_default_coded_charset_priority_list
6039             = Vdefault_coded_charset_priority_list;
6040           Vdefault_coded_charset_priority_list
6041             = CODING_SYSTEM_CCS_PRIORITY_LIST (codesys);
6042           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6043             {
6044               code_point = ENCODE_CHAR (ch, charset);
6045               if (XCHARSET_FINAL (charset))
6046                 goto found;
6047               Vdefault_coded_charset_priority_list
6048                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6049                                Vdefault_coded_charset_priority_list));
6050             }
6051           Vdefault_coded_charset_priority_list
6052             = original_default_coded_charset_priority_list;
6053           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
6054             {
6055               code_point = ENCODE_CHAR (ch, charset);
6056               if (XCHARSET_FINAL (charset))
6057                 goto found;
6058               Vdefault_coded_charset_priority_list
6059                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
6060                                Vdefault_coded_charset_priority_list));
6061             }
6062           code_point = ENCODE_CHAR (ch, charset);
6063           if (!XCHARSET_FINAL (charset))
6064             {
6065               charset = Vcharset_ascii;
6066               code_point = '~';
6067             }
6068         found:
6069           Vdefault_coded_charset_priority_list
6070             = original_default_coded_charset_priority_list;
6071         }
6072       ensure_correct_direction (XCHARSET_DIRECTION (charset),
6073                                 codesys, dst, flags, 0);
6074
6075       if (reg == -1)
6076         {
6077           if (XCHARSET_GRAPHIC (charset) != 0)
6078             {
6079               if (!NILP (str->iso2022.charset[1]) &&
6080                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
6081                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
6082                 reg = 1;
6083               else if (!NILP (str->iso2022.charset[2]))
6084                 reg = 2;
6085               else if (!NILP (str->iso2022.charset[3]))
6086                 reg = 3;
6087               else
6088                 reg = 0;
6089             }
6090           else
6091             reg = 0;
6092         }
6093
6094       iso2022_designate (charset, reg, str, dst);
6095
6096       /* Now invoke that register. */
6097       switch (reg)
6098         {
6099         case 0:
6100           ensure_normal_shift (str, dst);
6101           half = 0;
6102           break;
6103         case 1:
6104           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
6105             {
6106               ensure_shift_out (str, dst);
6107               half = 0;
6108             }
6109           else
6110             half = 1;
6111           break;
6112         case 2:
6113           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6114             {
6115               Dynarr_add (dst, ISO_CODE_ESC);
6116               Dynarr_add (dst, 'N');
6117               half = 0;
6118             }
6119           else
6120             {
6121               Dynarr_add (dst, ISO_CODE_SS2);
6122               half = 1;
6123             }
6124           break;
6125         case 3:
6126           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
6127             {
6128               Dynarr_add (dst, ISO_CODE_ESC);
6129               Dynarr_add (dst, 'O');
6130               half = 0;
6131             }
6132           else
6133             {
6134               Dynarr_add (dst, ISO_CODE_SS3);
6135               half = 1;
6136             }
6137           break;
6138         default:
6139           abort ();
6140         }
6141
6142       charmask = (half == 0 ? 0x00 : 0x80);
6143
6144       switch (XCHARSET_DIMENSION (charset))
6145         {
6146         case 1:
6147           Dynarr_add (dst, (code_point & 0xFF) | charmask);
6148           break;
6149         case 2:
6150           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
6151           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
6152           break;
6153         case 3:
6154           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6155           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
6156           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
6157           break;
6158         case 4:
6159           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
6160           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
6161           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
6162           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
6163           break;
6164         default:
6165           abort ();
6166         }
6167     }
6168   str->iso2022.current_charset = charset;
6169   str->iso2022.current_half = half;
6170 }
6171
6172 void
6173 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
6174                      unsigned int *flags)
6175 {
6176   Lisp_Coding_System* codesys = str->codesys;
6177   int i;
6178
6179   restore_left_to_right_direction (codesys, dst, flags, 0);
6180   ensure_normal_shift (str, dst);
6181   for (i = 0; i < 4; i++)
6182     {
6183       Lisp_Object initial_charset
6184         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
6185       iso2022_designate (initial_charset, i, str, dst);
6186     }
6187 }
6188 #endif /* MULE */
6189 \f
6190 /************************************************************************/
6191 /*                     No-conversion methods                            */
6192 /************************************************************************/
6193
6194 /* This is used when reading in "binary" files -- i.e. files that may
6195    contain all 256 possible byte values and that are not to be
6196    interpreted as being in any particular decoding. */
6197 static void
6198 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
6199                              unsigned_char_dynarr *dst, Lstream_data_count n)
6200 {
6201   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
6202   unsigned int flags  = str->flags;
6203   unsigned int cpos   = str->cpos;
6204   eol_type_t eol_type = str->eol_type;
6205
6206   while (n--)
6207     {
6208       unsigned char c = *(unsigned char *)src++;
6209
6210       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6211       DECODE_ADD_BINARY_CHAR (c, dst);
6212     label_continue_loop:;
6213     }
6214
6215   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
6216
6217   str->flags = flags;
6218   str->cpos  = cpos;
6219 }
6220
6221 static void
6222 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6223                              unsigned_char_dynarr *dst, Lstream_data_count n)
6224 {
6225   unsigned char c;
6226   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6227   unsigned int flags  = str->flags;
6228   unsigned int ch     = str->ch;
6229   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6230 #ifdef UTF2000
6231   unsigned char char_boundary = str->iso2022.current_char_boundary;
6232 #endif
6233
6234   while (n--)
6235     {
6236       c = *src++;
6237 #ifdef UTF2000
6238       if (char_boundary == 0)
6239         if ( c >= 0xfc )
6240           {
6241             ch = c & 0x01;
6242             char_boundary = 5;
6243           }
6244         else if ( c >= 0xf8 )
6245           {
6246             ch = c & 0x03;
6247             char_boundary = 4;
6248           }
6249         else if ( c >= 0xf0 )
6250           {
6251             ch = c & 0x07;
6252             char_boundary = 3;
6253           }
6254         else if ( c >= 0xe0 )
6255           {
6256             ch = c & 0x0f;
6257             char_boundary = 2;
6258           }
6259         else if ( c >= 0xc0 )
6260           {
6261             ch = c & 0x1f;
6262             char_boundary = 1;
6263           }
6264         else
6265           {
6266             ch = 0;
6267             if (c == '\n')
6268               {
6269                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6270                   Dynarr_add (dst, '\r');
6271                 if (eol_type != EOL_CR)
6272                   Dynarr_add (dst, c);
6273               }
6274             else
6275               Dynarr_add (dst, c);
6276             char_boundary = 0;
6277           }
6278       else if (char_boundary == 1)
6279         {
6280           ch = ( ch << 6 ) | ( c & 0x3f );
6281           Dynarr_add (dst, ch & 0xff);
6282           char_boundary = 0;
6283         }
6284       else
6285         {
6286           ch = ( ch << 6 ) | ( c & 0x3f );
6287           char_boundary--;
6288         }
6289 #else /* not UTF2000 */
6290       if (c == '\n')
6291         {
6292           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6293             Dynarr_add (dst, '\r');
6294           if (eol_type != EOL_CR)
6295             Dynarr_add (dst, '\n');
6296           ch = 0;
6297         }
6298       else if (BYTE_ASCII_P (c))
6299         {
6300           assert (ch == 0);
6301           Dynarr_add (dst, c);
6302         }
6303       else if (BUFBYTE_LEADING_BYTE_P (c))
6304         {
6305           assert (ch == 0);
6306           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6307               c == LEADING_BYTE_CONTROL_1)
6308             ch = c;
6309           else
6310             Dynarr_add (dst, '~'); /* untranslatable character */
6311         }
6312       else
6313         {
6314           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6315             Dynarr_add (dst, c);
6316           else if (ch == LEADING_BYTE_CONTROL_1)
6317             {
6318               assert (c < 0xC0);
6319               Dynarr_add (dst, c - 0x20);
6320             }
6321           /* else it should be the second or third byte of an
6322              untranslatable character, so ignore it */
6323           ch = 0;
6324         }
6325 #endif /* not UTF2000 */
6326     }
6327
6328   str->flags = flags;
6329   str->ch    = ch;
6330 #ifdef UTF2000
6331   str->iso2022.current_char_boundary = char_boundary;
6332 #endif
6333 }
6334
6335 \f
6336
6337 /************************************************************************/
6338 /*                             Initialization                           */
6339 /************************************************************************/
6340
6341 void
6342 syms_of_file_coding (void)
6343 {
6344   INIT_LRECORD_IMPLEMENTATION (coding_system);
6345
6346   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6347
6348   DEFSUBR (Fcoding_system_p);
6349   DEFSUBR (Ffind_coding_system);
6350   DEFSUBR (Fget_coding_system);
6351   DEFSUBR (Fcoding_system_list);
6352   DEFSUBR (Fcoding_system_name);
6353   DEFSUBR (Fmake_coding_system);
6354   DEFSUBR (Fcopy_coding_system);
6355   DEFSUBR (Fcoding_system_canonical_name_p);
6356   DEFSUBR (Fcoding_system_alias_p);
6357   DEFSUBR (Fcoding_system_aliasee);
6358   DEFSUBR (Fdefine_coding_system_alias);
6359   DEFSUBR (Fsubsidiary_coding_system);
6360
6361   DEFSUBR (Fcoding_system_type);
6362   DEFSUBR (Fcoding_system_doc_string);
6363 #ifdef MULE
6364   DEFSUBR (Fcoding_system_charset);
6365 #endif
6366   DEFSUBR (Fcoding_system_property);
6367
6368   DEFSUBR (Fcoding_category_list);
6369   DEFSUBR (Fset_coding_priority_list);
6370   DEFSUBR (Fcoding_priority_list);
6371   DEFSUBR (Fset_coding_category_system);
6372   DEFSUBR (Fcoding_category_system);
6373
6374   DEFSUBR (Fdetect_coding_region);
6375   DEFSUBR (Fdecode_coding_region);
6376   DEFSUBR (Fencode_coding_region);
6377 #ifdef MULE
6378   DEFSUBR (Fdecode_shift_jis_char);
6379   DEFSUBR (Fencode_shift_jis_char);
6380   DEFSUBR (Fdecode_big5_char);
6381   DEFSUBR (Fencode_big5_char);
6382 #endif /* MULE */
6383   defsymbol (&Qcoding_systemp, "coding-system-p");
6384   defsymbol (&Qno_conversion, "no-conversion");
6385   defsymbol (&Qraw_text, "raw-text");
6386 #ifdef MULE
6387   defsymbol (&Qbig5, "big5");
6388   defsymbol (&Qshift_jis, "shift-jis");
6389   defsymbol (&Qucs4, "ucs-4");
6390   defsymbol (&Qutf8, "utf-8");
6391   defsymbol (&Qutf16, "utf-16");
6392   defsymbol (&Qccl, "ccl");
6393   defsymbol (&Qiso2022, "iso2022");
6394 #endif /* MULE */
6395   defsymbol (&Qmnemonic, "mnemonic");
6396   defsymbol (&Qeol_type, "eol-type");
6397   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6398   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6399
6400   defsymbol (&Qcr, "cr");
6401   defsymbol (&Qlf, "lf");
6402   defsymbol (&Qcrlf, "crlf");
6403   defsymbol (&Qeol_cr, "eol-cr");
6404   defsymbol (&Qeol_lf, "eol-lf");
6405   defsymbol (&Qeol_crlf, "eol-crlf");
6406 #ifdef MULE
6407   defsymbol (&Qcharset_g0, "charset-g0");
6408   defsymbol (&Qcharset_g1, "charset-g1");
6409   defsymbol (&Qcharset_g2, "charset-g2");
6410   defsymbol (&Qcharset_g3, "charset-g3");
6411   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6412   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6413   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6414   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6415   defsymbol (&Qno_iso6429, "no-iso6429");
6416   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6417   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6418
6419   defsymbol (&Qshort, "short");
6420   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6421   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6422   defsymbol (&Qseven, "seven");
6423   defsymbol (&Qlock_shift, "lock-shift");
6424   defsymbol (&Qescape_quoted, "escape-quoted");
6425 #endif /* MULE */
6426 #ifdef UTF2000
6427   defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6428   defsymbol (&Qdisable_composition, "disable-composition");
6429   defsymbol (&Qccs_priority_list, "ccs-priority-list");
6430   defsymbol (&Quse_entity_reference, "use-entity-reference");
6431   defsymbol (&Qd, "d");
6432   defsymbol (&Qx, "x");
6433   defsymbol (&QX, "X");
6434 #endif
6435   defsymbol (&Qencode, "encode");
6436   defsymbol (&Qdecode, "decode");
6437
6438 #ifdef MULE
6439   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6440              "shift-jis");
6441   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6442              "big5");
6443   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6444              "ucs-4");
6445   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF16],
6446              "utf-16");
6447   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6448              "utf-8");
6449   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6450              "iso-7");
6451   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6452              "iso-8-designate");
6453   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6454              "iso-8-1");
6455   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6456              "iso-8-2");
6457   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6458              "iso-lock-shift");
6459 #endif /* MULE */
6460   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6461              "no-conversion");
6462 }
6463
6464 void
6465 lstream_type_create_file_coding (void)
6466 {
6467   LSTREAM_HAS_METHOD (decoding, reader);
6468   LSTREAM_HAS_METHOD (decoding, writer);
6469   LSTREAM_HAS_METHOD (decoding, rewinder);
6470   LSTREAM_HAS_METHOD (decoding, seekable_p);
6471   LSTREAM_HAS_METHOD (decoding, flusher);
6472   LSTREAM_HAS_METHOD (decoding, closer);
6473   LSTREAM_HAS_METHOD (decoding, marker);
6474
6475   LSTREAM_HAS_METHOD (encoding, reader);
6476   LSTREAM_HAS_METHOD (encoding, writer);
6477   LSTREAM_HAS_METHOD (encoding, rewinder);
6478   LSTREAM_HAS_METHOD (encoding, seekable_p);
6479   LSTREAM_HAS_METHOD (encoding, flusher);
6480   LSTREAM_HAS_METHOD (encoding, closer);
6481   LSTREAM_HAS_METHOD (encoding, marker);
6482 }
6483
6484 void
6485 vars_of_file_coding (void)
6486 {
6487   int i;
6488
6489   fcd = xnew (struct file_coding_dump);
6490   dump_add_root_struct_ptr (&fcd, &fcd_description);
6491
6492   /* Initialize to something reasonable ... */
6493   for (i = 0; i < CODING_CATEGORY_LAST; i++)
6494     {
6495       fcd->coding_category_system[i] = Qnil;
6496       fcd->coding_category_by_priority[i] = i;
6497     }
6498
6499   Fprovide (intern ("file-coding"));
6500
6501   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6502 Coding system used for TTY keyboard input.
6503 Not used under a windowing system.
6504 */ );
6505   Vkeyboard_coding_system = Qnil;
6506
6507   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6508 Coding system used for TTY display output.
6509 Not used under a windowing system.
6510 */ );
6511   Vterminal_coding_system = Qnil;
6512
6513   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6514 Overriding coding system used when reading from a file or process.
6515 You should bind this variable with `let', but do not set it globally.
6516 If this is non-nil, it specifies the coding system that will be used
6517 to decode input on read operations, such as from a file or process.
6518 It overrides `buffer-file-coding-system-for-read',
6519 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6520 this one for permanent changes to the environment.  */ );
6521   Vcoding_system_for_read = Qnil;
6522
6523   DEFVAR_LISP ("coding-system-for-write",
6524                &Vcoding_system_for_write /*
6525 Overriding coding system used when writing to a file or process.
6526 You should bind this variable with `let', but do not set it globally.
6527 If this is non-nil, it specifies the coding system that will be used
6528 to encode output for write operations, such as to a file or process.
6529 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6530 Use those variables instead of this one for permanent changes to the
6531 environment.  */ );
6532   Vcoding_system_for_write = Qnil;
6533
6534   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6535 Coding system used to convert pathnames when accessing files.
6536 */ );
6537   Vfile_name_coding_system = Qnil;
6538
6539   DEFVAR_LISP ("coded-charset-entity-reference-alist",
6540                &Vcoded_charset_entity_reference_alist /*
6541 Alist of coded-charset vs corresponding entity-reference.
6542 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6543 CCS is coded-charset.
6544 CODE-COLUMNS is columns of code-point of entity-reference.
6545 CODE-TYPE is format type of code-point of entity-reference.
6546 `d' means decimal value and `x' means hexadecimal value.
6547 */ );
6548   Vcoded_charset_entity_reference_alist = Qnil;
6549
6550   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6551 Non-nil means the buffer contents are regarded as multi-byte form
6552 of characters, not a binary code.  This affects the display, file I/O,
6553 and behaviors of various editing commands.
6554
6555 Setting this to nil does not do anything.
6556 */ );
6557   enable_multibyte_characters = 1;
6558 }
6559
6560 void
6561 complex_vars_of_file_coding (void)
6562 {
6563   staticpro (&Vcoding_system_hash_table);
6564   Vcoding_system_hash_table =
6565     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6566
6567   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6568   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6569
6570 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6571 {                                               \
6572   struct codesys_prop csp;                      \
6573   csp.sym = (Sym);                              \
6574   csp.prop_type = (Prop_Type);                  \
6575   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6576 } while (0)
6577
6578   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6579   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6580   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6581   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6582   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6583   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6584   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6585 #ifdef MULE
6586   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6587   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6588   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6589   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6590   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6591   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6592   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6593   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6594   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6595   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6596   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6597   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6598   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6599   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6600   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6601   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6602   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6603 #ifdef UTF2000
6604   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qccs_priority_list);
6605 #endif
6606
6607   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6608   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6609 #ifdef UTF2000
6610   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qdisable_composition);
6611   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Quse_entity_reference);
6612 #endif
6613 #endif /* MULE */
6614   /* Need to create this here or we're really screwed. */
6615   Fmake_coding_system
6616     (Qraw_text, Qno_conversion,
6617      build_string ("Raw text, which means it converts only line-break-codes."),
6618      list2 (Qmnemonic, build_string ("Raw")));
6619
6620   Fmake_coding_system
6621     (Qbinary, Qno_conversion,
6622      build_string ("Binary, which means it does not convert anything."),
6623      list4 (Qeol_type, Qlf,
6624             Qmnemonic, build_string ("Binary")));
6625
6626 #ifdef UTF2000
6627   Fmake_coding_system
6628     (Qutf_8_mcs, Qutf8,
6629      build_string
6630      ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6631      list2 (Qmnemonic, build_string ("MTF8")));
6632 #endif
6633
6634   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6635
6636   Fdefine_coding_system_alias (Qfile_name, Qbinary);
6637
6638   Fdefine_coding_system_alias (Qterminal, Qbinary);
6639   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6640
6641   /* Need this for bootstrapping */
6642   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6643     Fget_coding_system (Qraw_text);
6644
6645 #ifdef UTF2000
6646   fcd->coding_category_system[CODING_CATEGORY_UTF8]
6647    = Fget_coding_system (Qutf_8_mcs);
6648 #endif
6649
6650 #if defined(MULE) && !defined(UTF2000)
6651   {
6652     size_t i;
6653
6654     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6655       fcd->ucs_to_mule_table[i] = Qnil;
6656   }
6657   staticpro (&mule_to_ucs_table);
6658   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6659 #endif /* defined(MULE) && !defined(UTF2000) */
6660 }