src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const unsigned char *src,
 185                      unsigned_char_dynarr *dst, unsigned int n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const unsigned char *src,
 189                                unsigned int n);
 190 static void decode_coding_sjis (Lstream *decoding,
 191                                 const unsigned char *src,
 192                                 unsigned_char_dynarr *dst,
 193                                 unsigned int n);
 194 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 195                             unsigned_char_dynarr *dst, unsigned int *flags);
 196 void char_finish_shift_jis (struct encoding_stream *str,
 197                             unsigned_char_dynarr *dst, unsigned int *flags);
 198
 199 static int detect_coding_big5 (struct detection_state *st,
 200                                const unsigned char *src,
 201                                unsigned int n);
 202 static void decode_coding_big5 (Lstream *decoding,
 203                                 const unsigned char *src,
 204                                 unsigned_char_dynarr *dst, unsigned int n);
 205 static void encode_coding_big5 (Lstream *encoding,
 206                                 const unsigned char *src,
 207                                 unsigned_char_dynarr *dst, unsigned int n);
 208 static int detect_coding_ucs4 (struct detection_state *st,
 209                                const unsigned char *src,
 210                                unsigned int n);
 211 static void decode_coding_ucs4 (Lstream *decoding,
 212                                 const unsigned char *src,
 213                                 unsigned_char_dynarr *dst, unsigned int n);
 214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 215                        unsigned_char_dynarr *dst, unsigned int *flags);
 216 void char_finish_ucs4 (struct encoding_stream *str,
 217                        unsigned_char_dynarr *dst, unsigned int *flags);
 218
 219 static int detect_coding_utf8 (struct detection_state *st,
 220                                const unsigned char *src,
 221                                unsigned int n);
 222 static void decode_coding_utf8 (Lstream *decoding,
 223                                 const unsigned char *src,
 224                                 unsigned_char_dynarr *dst, unsigned int n);
 225 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 226                        unsigned_char_dynarr *dst, unsigned int *flags);
 227 void char_finish_utf8 (struct encoding_stream *str,
 228                        unsigned_char_dynarr *dst, unsigned int *flags);
 229
 230 static int postprocess_iso2022_mask (int mask);
 231 static void reset_iso2022 (Lisp_Object coding_system,
 232                            struct iso2022_decoder *iso);
 233 static int detect_coding_iso2022 (struct detection_state *st,
 234                                   const unsigned char *src,
 235                                   unsigned int n);
 236 static void decode_coding_iso2022 (Lstream *decoding,
 237                                    const unsigned char *src,
 238                                    unsigned_char_dynarr *dst, unsigned int n);
 239 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 240                           unsigned_char_dynarr *dst, unsigned int *flags);
 241 void char_finish_iso2022 (struct encoding_stream *str,
 242                           unsigned_char_dynarr *dst, unsigned int *flags);
 243 #endif /* MULE */
 244 static void decode_coding_no_conversion (Lstream *decoding,
 245                                          const unsigned char *src,
 246                                          unsigned_char_dynarr *dst,
 247                                          unsigned int n);
 248 static void encode_coding_no_conversion (Lstream *encoding,
 249                                          const unsigned char *src,
 250                                          unsigned_char_dynarr *dst,
 251                                          unsigned int n);
 252 static void mule_decode (Lstream *decoding, const unsigned char *src,
 253                          unsigned_char_dynarr *dst, unsigned int n);
 254 static void mule_encode (Lstream *encoding, const unsigned char *src,
 255                          unsigned_char_dynarr *dst, unsigned int n);
 256
 257 typedef struct codesys_prop codesys_prop;
 258 struct codesys_prop
 259 {
 260   Lisp_Object sym;
 261   int prop_type;
 262 };
 263
 264 typedef struct
 265 {
 266   Dynarr_declare (codesys_prop);
 267 } codesys_prop_dynarr;
 268
 269 static const struct lrecord_description codesys_prop_description_1[] = {
 270   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 271   { XD_END }
 272 };
 273
 274 static const struct struct_description codesys_prop_description = {
 275   sizeof (codesys_prop),
 276   codesys_prop_description_1
 277 };
 278
 279 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 280   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 281   { XD_END }
 282 };
 283
 284 static const struct struct_description codesys_prop_dynarr_description = {
 285   sizeof (codesys_prop_dynarr),
 286   codesys_prop_dynarr_description_1
 287 };
 288
 289 codesys_prop_dynarr *the_codesys_prop_dynarr;
 290
 291 enum codesys_prop_enum
 292 {
 293   CODESYS_PROP_ALL_OK,
 294   CODESYS_PROP_ISO2022,
 295   CODESYS_PROP_CCL
 296 };
 297
 298 \f
 299 /************************************************************************/
 300 /*                       Coding system functions                        */
 301 /************************************************************************/
 302
 303 static Lisp_Object mark_coding_system (Lisp_Object);
 304 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 305 static void finalize_coding_system (void *header, int for_disksave);
 306
 307 #ifdef MULE
 308 static const struct lrecord_description ccs_description_1[] = {
 309   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 310   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 311   { XD_END }
 312 };
 313
 314 static const struct struct_description ccs_description = {
 315   sizeof (charset_conversion_spec),
 316   ccs_description_1
 317 };
 318
 319 static const struct lrecord_description ccsd_description_1[] = {
 320   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 321   { XD_END }
 322 };
 323
 324 static const struct struct_description ccsd_description = {
 325   sizeof (charset_conversion_spec_dynarr),
 326   ccsd_description_1
 327 };
 328 #endif
 329
 330 static const struct lrecord_description coding_system_description[] = {
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 332   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 333   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 334   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 335   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 338   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 339 #ifdef MULE
 340   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 341   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 342   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 343   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 344   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 345 #endif
 346   { XD_END }
 347 };
 348
 349 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 350                                mark_coding_system, print_coding_system,
 351                                finalize_coding_system,
 352                                0, 0, coding_system_description,
 353                                Lisp_Coding_System);
 354
 355 static Lisp_Object
 356 mark_coding_system (Lisp_Object obj)
 357 {
 358   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 359
 360   mark_object (CODING_SYSTEM_NAME (codesys));
 361   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 362   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 363   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 364   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 365   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 366
 367   switch (CODING_SYSTEM_TYPE (codesys))
 368     {
 369 #ifdef MULE
 370       int i;
 371     case CODESYS_ISO2022:
 372       for (i = 0; i < 4; i++)
 373         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 374       if (codesys->iso2022.input_conv)
 375         {
 376           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 377             {
 378               struct charset_conversion_spec *ccs =
 379                 Dynarr_atp (codesys->iso2022.input_conv, i);
 380               mark_object (ccs->from_charset);
 381               mark_object (ccs->to_charset);
 382             }
 383         }
 384       if (codesys->iso2022.output_conv)
 385         {
 386           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 387             {
 388               struct charset_conversion_spec *ccs =
 389                 Dynarr_atp (codesys->iso2022.output_conv, i);
 390               mark_object (ccs->from_charset);
 391               mark_object (ccs->to_charset);
 392             }
 393         }
 394       break;
 395
 396     case CODESYS_CCL:
 397       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 398       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 399       break;
 400 #endif /* MULE */
 401     default:
 402       break;
 403     }
 404
 405   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 406   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 407 }
 408
 409 static void
 410 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 411                      int escapeflag)
 412 {
 413   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 414   if (print_readably)
 415     error ("printing unreadable object #<coding_system 0x%x>",
 416            c->header.uid);
 417
 418   write_c_string ("#<coding_system ", printcharfun);
 419   print_internal (c->name, printcharfun, 1);
 420   write_c_string (">", printcharfun);
 421 }
 422
 423 static void
 424 finalize_coding_system (void *header, int for_disksave)
 425 {
 426   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 427   /* Since coding systems never go away, this function is not
 428      necessary.  But it would be necessary if we changed things
 429      so that coding systems could go away. */
 430   if (!for_disksave) /* see comment in lstream.c */
 431     {
 432       switch (CODING_SYSTEM_TYPE (c))
 433         {
 434 #ifdef MULE
 435         case CODESYS_ISO2022:
 436           if (c->iso2022.input_conv)
 437             {
 438               Dynarr_free (c->iso2022.input_conv);
 439               c->iso2022.input_conv = 0;
 440             }
 441           if (c->iso2022.output_conv)
 442             {
 443               Dynarr_free (c->iso2022.output_conv);
 444               c->iso2022.output_conv = 0;
 445             }
 446           break;
 447 #endif /* MULE */
 448         default:
 449           break;
 450         }
 451     }
 452 }
 453
 454 static eol_type_t
 455 symbol_to_eol_type (Lisp_Object symbol)
 456 {
 457   CHECK_SYMBOL (symbol);
 458   if (NILP (symbol))      return EOL_AUTODETECT;
 459   if (EQ (symbol, Qlf))   return EOL_LF;
 460   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 461   if (EQ (symbol, Qcr))   return EOL_CR;
 462
 463   signal_simple_error ("Unrecognized eol type", symbol);
 464   return EOL_AUTODETECT; /* not reached */
 465 }
 466
 467 static Lisp_Object
 468 eol_type_to_symbol (eol_type_t type)
 469 {
 470   switch (type)
 471     {
 472     default: abort ();
 473     case EOL_LF:         return Qlf;
 474     case EOL_CRLF:       return Qcrlf;
 475     case EOL_CR:         return Qcr;
 476     case EOL_AUTODETECT: return Qnil;
 477     }
 478 }
 479
 480 static void
 481 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 482 {
 483   Lisp_Object codesys_obj;
 484   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 485   char *codesys_name = (char *) alloca (len + 7);
 486   int mlen = -1;
 487   char *codesys_mnemonic=0;
 488
 489   Lisp_Object codesys_name_sym, sub_codesys_obj;
 490
 491   /* kludge */
 492
 493   XSETCODING_SYSTEM (codesys_obj, codesys);
 494
 495   memcpy (codesys_name,
 496           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 497
 498   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 499     {
 500       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 501       codesys_mnemonic = (char *) alloca (mlen + 7);
 502       memcpy (codesys_mnemonic,
 503               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 504     }
 505
 506 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 507   strcpy (codesys_name + len, "-" op_sys);                                      \
 508   if (mlen != -1)                                                               \
 509     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 510   codesys_name_sym = intern (codesys_name);                                     \
 511   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 512   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 513   if (mlen != -1)                                                               \
 514     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 515       build_string (codesys_mnemonic);                                          \
 516   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 517 } while (0)
 518
 519   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 520   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 521   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 522 }
 523
 524 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 525 Return t if OBJECT is a coding system.
 526 A coding system is an object that defines how text containing multiple
 527 character sets is encoded into a stream of (typically 8-bit) bytes.
 528 The coding system is used to decode the stream into a series of
 529 characters (which may be from multiple charsets) when the text is read
 530 from a file or process, and is used to encode the text back into the
 531 same format when it is written out to a file or process.
 532
 533 For example, many ISO2022-compliant coding systems (such as Compound
 534 Text, which is used for inter-client data under the X Window System)
 535 use escape sequences to switch between different charsets -- Japanese
 536 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 537 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 538 `make-coding-system' for more information.
 539
 540 Coding systems are normally identified using a symbol, and the
 541 symbol is accepted in place of the actual coding system object whenever
 542 a coding system is called for. (This is similar to how faces work.)
 543 */
 544        (object))
 545 {
 546   return CODING_SYSTEMP (object) ? Qt : Qnil;
 547 }
 548
 549 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 550 Retrieve the coding system of the given name.
 551
 552 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 553 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 554 If there is no such coding system, nil is returned.  Otherwise the
 555 associated coding system object is returned.
 556 */
 557        (coding_system_or_name))
 558 {
 559   if (NILP (coding_system_or_name))
 560     coding_system_or_name = Qbinary;
 561   else if (CODING_SYSTEMP (coding_system_or_name))
 562     return coding_system_or_name;
 563   else
 564     CHECK_SYMBOL (coding_system_or_name);
 565
 566   while (1)
 567     {
 568       coding_system_or_name =
 569         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 570
 571       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 572         return coding_system_or_name;
 573     }
 574 }
 575
 576 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 577 Retrieve the coding system of the given name.
 578 Same as `find-coding-system' except that if there is no such
 579 coding system, an error is signaled instead of returning nil.
 580 */
 581        (name))
 582 {
 583   Lisp_Object coding_system = Ffind_coding_system (name);
 584
 585   if (NILP (coding_system))
 586     signal_simple_error ("No such coding system", name);
 587   return coding_system;
 588 }
 589
 590 /* We store the coding systems in hash tables with the names as the key and the
 591    actual coding system object as the value.  Occasionally we need to use them
 592    in a list format.  These routines provide us with that. */
 593 struct coding_system_list_closure
 594 {
 595   Lisp_Object *coding_system_list;
 596 };
 597
 598 static int
 599 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 600                                   void *coding_system_list_closure)
 601 {
 602   /* This function can GC */
 603   struct coding_system_list_closure *cscl =
 604     (struct coding_system_list_closure *) coding_system_list_closure;
 605   Lisp_Object *coding_system_list = cscl->coding_system_list;
 606
 607   *coding_system_list = Fcons (key, *coding_system_list);
 608   return 0;
 609 }
 610
 611 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 612 Return a list of the names of all defined coding systems.
 613 */
 614        ())
 615 {
 616   Lisp_Object coding_system_list = Qnil;
 617   struct gcpro gcpro1;
 618   struct coding_system_list_closure coding_system_list_closure;
 619
 620   GCPRO1 (coding_system_list);
 621   coding_system_list_closure.coding_system_list = &coding_system_list;
 622   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 623                  &coding_system_list_closure);
 624   UNGCPRO;
 625
 626   return coding_system_list;
 627 }
 628
 629 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 630 Return the name of the given coding system.
 631 */
 632        (coding_system))
 633 {
 634   coding_system = Fget_coding_system (coding_system);
 635   return XCODING_SYSTEM_NAME (coding_system);
 636 }
 637
 638 static Lisp_Coding_System *
 639 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 640 {
 641   Lisp_Coding_System *codesys =
 642     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 643
 644   zero_lcrecord (codesys);
 645   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 646   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 647   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 648   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 649   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 650   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 651   CODING_SYSTEM_TYPE     (codesys) = type;
 652   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 653 #ifdef MULE
 654   if (type == CODESYS_ISO2022)
 655     {
 656       int i;
 657       for (i = 0; i < 4; i++)
 658         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 659     }
 660   else if (type == CODESYS_CCL)
 661     {
 662       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 663       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 664     }
 665 #endif /* MULE */
 666   CODING_SYSTEM_NAME (codesys) = name;
 667
 668   return codesys;
 669 }
 670
 671 #ifdef MULE
 672 /* Given a list of charset conversion specs as specified in a Lisp
 673    program, parse it into STORE_HERE. */
 674
 675 static void
 676 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 677                                 Lisp_Object spec_list)
 678 {
 679   Lisp_Object rest;
 680
 681   EXTERNAL_LIST_LOOP (rest, spec_list)
 682     {
 683       Lisp_Object car = XCAR (rest);
 684       Lisp_Object from, to;
 685       struct charset_conversion_spec spec;
 686
 687       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 688         signal_simple_error ("Invalid charset conversion spec", car);
 689       from = Fget_charset (XCAR (car));
 690       to = Fget_charset (XCAR (XCDR (car)));
 691       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 692            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 693         signal_simple_error_2
 694           ("Attempted conversion between different charset types",
 695            from, to);
 696       spec.from_charset = from;
 697       spec.to_charset = to;
 698
 699       Dynarr_add (store_here, spec);
 700     }
 701 }
 702
 703 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 704    specs, return the equivalent as the Lisp programmer would see it.
 705
 706    If LOAD_HERE is 0, return Qnil. */
 707
 708 static Lisp_Object
 709 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 710 {
 711   int i;
 712   Lisp_Object result;
 713
 714   if (!load_here)
 715     return Qnil;
 716   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 717     {
 718       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 719       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 720     }
 721
 722   return Fnreverse (result);
 723 }
 724
 725 #endif /* MULE */
 726
 727 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 728 Register symbol NAME as a coding system.
 729
 730 TYPE describes the conversion method used and should be one of
 731
 732 nil or 'undecided
 733      Automatic conversion.  XEmacs attempts to detect the coding system
 734      used in the file.
 735 'no-conversion
 736      No conversion.  Use this for binary files and such.  On output,
 737      graphic characters that are not in ASCII or Latin-1 will be
 738      replaced by a ?. (For a no-conversion-encoded buffer, these
 739      characters will only be present if you explicitly insert them.)
 740 'shift-jis
 741      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 742 'ucs-4
 743      ISO 10646 UCS-4 encoding.
 744 'utf-8
 745      ISO 10646 UTF-8 encoding.
 746 'iso2022
 747      Any ISO2022-compliant encoding.  Among other things, this includes
 748      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 749      standard Unix encoding for Japanese and other languages), and
 750      Compound Text (the encoding used in X11).  You can specify more
 751      specific information about the conversion with the FLAGS argument.
 752 'big5
 753      Big5 (the encoding commonly used for Taiwanese).
 754 'ccl
 755      The conversion is performed using a user-written pseudo-code
 756      program.  CCL (Code Conversion Language) is the name of this
 757      pseudo-code.
 758 'internal
 759      Write out or read in the raw contents of the memory representing
 760      the buffer's text.  This is primarily useful for debugging
 761      purposes, and is only enabled when XEmacs has been compiled with
 762      DEBUG_XEMACS defined (via the --debug configure option).
 763      WARNING: Reading in a file using 'internal conversion can result
 764      in an internal inconsistency in the memory representing a
 765      buffer's text, which will produce unpredictable results and may
 766      cause XEmacs to crash.  Under normal circumstances you should
 767      never use 'internal conversion.
 768
 769 DOC-STRING is a string describing the coding system.
 770
 771 PROPS is a property list, describing the specific nature of the
 772 character set.  Recognized properties are:
 773
 774 'mnemonic
 775      String to be displayed in the modeline when this coding system is
 776      active.
 777
 778 'eol-type
 779      End-of-line conversion to be used.  It should be one of
 780
 781         nil
 782                 Automatically detect the end-of-line type (LF, CRLF,
 783                 or CR).  Also generate subsidiary coding systems named
 784                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 785                 identical to this coding system but have an EOL-TYPE
 786                 value of 'lf, 'crlf, and 'cr, respectively.
 787         'lf
 788                 The end of a line is marked externally using ASCII LF.
 789                 Since this is also the way that XEmacs represents an
 790                 end-of-line internally, specifying this option results
 791                 in no end-of-line conversion.  This is the standard
 792                 format for Unix text files.
 793         'crlf
 794                 The end of a line is marked externally using ASCII
 795                 CRLF.  This is the standard format for MS-DOS text
 796                 files.
 797         'cr
 798                 The end of a line is marked externally using ASCII CR.
 799                 This is the standard format for Macintosh text files.
 800         t
 801                 Automatically detect the end-of-line type but do not
 802                 generate subsidiary coding systems.  (This value is
 803                 converted to nil when stored internally, and
 804                 `coding-system-property' will return nil.)
 805
 806 'disable-composition
 807      If non-nil, composition/decomposition for combining characters
 808      are disabled.
 809
 810 'post-read-conversion
 811      Function called after a file has been read in, to perform the
 812      decoding.  Called with two arguments, BEG and END, denoting
 813      a region of the current buffer to be decoded.
 814
 815 'pre-write-conversion
 816      Function called before a file is written out, to perform the
 817      encoding.  Called with two arguments, BEG and END, denoting
 818      a region of the current buffer to be encoded.
 819
 820
 821 The following additional properties are recognized if TYPE is 'iso2022:
 822
 823 'charset-g0
 824 'charset-g1
 825 'charset-g2
 826 'charset-g3
 827      The character set initially designated to the G0 - G3 registers.
 828      The value should be one of
 829
 830           -- A charset object (designate that character set)
 831           -- nil (do not ever use this register)
 832           -- t (no character set is initially designated to
 833                 the register, but may be later on; this automatically
 834                 sets the corresponding `force-g*-on-output' property)
 835
 836 'force-g0-on-output
 837 'force-g1-on-output
 838 'force-g2-on-output
 839 'force-g2-on-output
 840      If non-nil, send an explicit designation sequence on output before
 841      using the specified register.
 842
 843 'short
 844      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 845      "ESC $ B" on output in place of the full designation sequences
 846      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 847
 848 'no-ascii-eol
 849      If non-nil, don't designate ASCII to G0 at each end of line on output.
 850      Setting this to non-nil also suppresses other state-resetting that
 851      normally happens at the end of a line.
 852
 853 'no-ascii-cntl
 854      If non-nil, don't designate ASCII to G0 before control chars on output.
 855
 856 'seven
 857      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 858      environment.
 859
 860 'lock-shift
 861      If non-nil, use locking-shift (SO/SI) instead of single-shift
 862      or designation by escape sequence.
 863
 864 'no-iso6429
 865      If non-nil, don't use ISO6429's direction specification.
 866
 867 'escape-quoted
 868      If non-nil, literal control characters that are the same as
 869      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 870      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 871      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 872      so that they can be properly distinguished from an escape sequence.
 873      (Note that doing this results in a non-portable encoding.) This
 874      encoding flag is used for byte-compiled files.  Note that ESC
 875      is a good choice for a quoting character because there are no
 876      escape sequences whose second byte is a character from the Control-0
 877      or Control-1 character sets; this is explicitly disallowed by the
 878      ISO2022 standard.
 879
 880 'input-charset-conversion
 881      A list of conversion specifications, specifying conversion of
 882      characters in one charset to another when decoding is performed.
 883      Each specification is a list of two elements: the source charset,
 884      and the destination charset.
 885
 886 'output-charset-conversion
 887      A list of conversion specifications, specifying conversion of
 888      characters in one charset to another when encoding is performed.
 889      The form of each specification is the same as for
 890      'input-charset-conversion.
 891
 892
 893 The following additional properties are recognized (and required)
 894 if TYPE is 'ccl:
 895
 896 'decode
 897      CCL program used for decoding (converting to internal format).
 898
 899 'encode
 900      CCL program used for encoding (converting to external format).
 901 */
 902        (name, type, doc_string, props))
 903 {
 904   Lisp_Coding_System *codesys;
 905   Lisp_Object rest, key, value;
 906   enum coding_system_type ty;
 907   int need_to_setup_eol_systems = 1;
 908
 909   /* Convert type to constant */
 910   if (NILP (type) || EQ (type, Qundecided))
 911                                       { ty = CODESYS_AUTODETECT; }
 912 #ifdef MULE
 913   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 914   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 915   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 916   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 917   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 918   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 919 #endif
 920   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 921 #ifdef DEBUG_XEMACS
 922   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 923 #endif
 924   else
 925     signal_simple_error ("Invalid coding system type", type);
 926
 927   CHECK_SYMBOL (name);
 928
 929   codesys = allocate_coding_system (ty, name);
 930
 931   if (NILP (doc_string))
 932     doc_string = build_string ("");
 933   else
 934     CHECK_STRING (doc_string);
 935   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 936
 937   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 938     {
 939       if (EQ (key, Qmnemonic))
 940         {
 941           if (!NILP (value))
 942             CHECK_STRING (value);
 943           CODING_SYSTEM_MNEMONIC (codesys) = value;
 944         }
 945
 946       else if (EQ (key, Qeol_type))
 947         {
 948           need_to_setup_eol_systems = NILP (value);
 949           if (EQ (value, Qt))
 950             value = Qnil;
 951           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 952         }
 953
 954       else if (EQ (key, Qpost_read_conversion))
 955         CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 956       else if (EQ (key, Qpre_write_conversion))
 957         CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 958 #ifdef UTF2000
 959       else if (EQ (key, Qdisable_composition))
 960         CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 961 #endif
 962 #ifdef MULE
 963       else if (ty == CODESYS_ISO2022)
 964         {
 965 #define FROB_INITIAL_CHARSET(charset_num) \
 966   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 967     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 968
 969           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 970           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 971           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 972           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 973
 974 #define FROB_FORCE_CHARSET(charset_num) \
 975   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 976
 977           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 978           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 979           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 980           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 981
 982 #define FROB_BOOLEAN_PROPERTY(prop) \
 983   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 984
 985           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 986           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 987           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 988           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 989           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 990           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 991           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 992
 993           else if (EQ (key, Qinput_charset_conversion))
 994             {
 995               codesys->iso2022.input_conv =
 996                 Dynarr_new (charset_conversion_spec);
 997               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 998                                               value);
 999             }
1000           else if (EQ (key, Qoutput_charset_conversion))
1001             {
1002               codesys->iso2022.output_conv =
1003                 Dynarr_new (charset_conversion_spec);
1004               parse_charset_conversion_specs (codesys->iso2022.output_conv,
1005                                               value);
1006             }
1007           else
1008             signal_simple_error ("Unrecognized property", key);
1009         }
1010       else if (EQ (type, Qccl))
1011         {
1012           if (EQ (key, Qdecode))
1013             {
1014               CHECK_VECTOR (value);
1015               CODING_SYSTEM_CCL_DECODE (codesys) = value;
1016             }
1017           else if (EQ (key, Qencode))
1018             {
1019               CHECK_VECTOR (value);
1020               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1021             }
1022           else
1023             signal_simple_error ("Unrecognized property", key);
1024         }
1025 #endif /* MULE */
1026       else
1027         signal_simple_error ("Unrecognized property", key);
1028     }
1029
1030   if (need_to_setup_eol_systems)
1031     setup_eol_coding_systems (codesys);
1032
1033   {
1034     Lisp_Object codesys_obj;
1035     XSETCODING_SYSTEM (codesys_obj, codesys);
1036     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1037     return codesys_obj;
1038   }
1039 }
1040
1041 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1042 Copy OLD-CODING-SYSTEM to NEW-NAME.
1043 If NEW-NAME does not name an existing coding system, a new one will
1044 be created.
1045 */
1046        (old_coding_system, new_name))
1047 {
1048   Lisp_Object new_coding_system;
1049   old_coding_system = Fget_coding_system (old_coding_system);
1050   new_coding_system = Ffind_coding_system (new_name);
1051   if (NILP (new_coding_system))
1052     {
1053       XSETCODING_SYSTEM (new_coding_system,
1054                          allocate_coding_system
1055                          (XCODING_SYSTEM_TYPE (old_coding_system),
1056                           new_name));
1057       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1058     }
1059
1060   {
1061     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1062     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1063     memcpy (((char *) to  ) + sizeof (to->header),
1064             ((char *) from) + sizeof (from->header),
1065             sizeof (*from) - sizeof (from->header));
1066     to->name = new_name;
1067   }
1068   return new_coding_system;
1069 }
1070
1071 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1072 Return t if OBJECT names a coding system, and is not a coding system alias.
1073 */
1074        (object))
1075 {
1076   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1077     ? Qt : Qnil;
1078 }
1079
1080 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1081 Return t if OBJECT is a coding system alias.
1082 All coding system aliases are created by `define-coding-system-alias'.
1083 */
1084        (object))
1085 {
1086   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1087     ? Qt : Qnil;
1088 }
1089
1090 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1091 Return the coding-system symbol for which symbol ALIAS is an alias.
1092 */
1093        (alias))
1094 {
1095   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1096   if (SYMBOLP (aliasee))
1097     return aliasee;
1098   else
1099     signal_simple_error ("Symbol is not a coding system alias", alias);
1100 }
1101
1102 static Lisp_Object
1103 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1104 {
1105   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1106                   Qnil);
1107 }
1108
1109 /* A maphash function, for removing dangling coding system aliases. */
1110 static int
1111 dangling_coding_system_alias_p (Lisp_Object alias,
1112                                 Lisp_Object aliasee,
1113                                 void *dangling_aliases)
1114 {
1115   if (SYMBOLP (aliasee)
1116       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1117     {
1118       (*(int *) dangling_aliases)++;
1119       return 1;
1120     }
1121   else
1122     return 0;
1123 }
1124
1125 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1126 Define symbol ALIAS as an alias for coding system ALIASEE.
1127
1128 You can use this function to redefine an alias that has already been defined,
1129 but you cannot redefine a name which is the canonical name for a coding system.
1130 \(a canonical name of a coding system is what is returned when you call
1131 `coding-system-name' on a coding system).
1132
1133 ALIASEE itself can be an alias, which allows you to define nested aliases.
1134
1135 You are forbidden, however, from creating alias loops or `dangling' aliases.
1136 These will be detected, and an error will be signaled if you attempt to do so.
1137
1138 If ALIASEE is nil, then ALIAS will simply be undefined.
1139
1140 See also `coding-system-alias-p', `coding-system-aliasee',
1141 and `coding-system-canonical-name-p'.
1142 */
1143        (alias, aliasee))
1144 {
1145   Lisp_Object real_coding_system, probe;
1146
1147   CHECK_SYMBOL (alias);
1148
1149   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1150     signal_simple_error
1151       ("Symbol is the canonical name of a coding system and cannot be redefined",
1152        alias);
1153
1154   if (NILP (aliasee))
1155     {
1156       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1157       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1158       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1159
1160       Fremhash (alias, Vcoding_system_hash_table);
1161
1162       /* Undefine subsidiary aliases,
1163          presumably created by a previous call to this function */
1164       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1165           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1166           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1167         {
1168           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1169           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1170           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1171         }
1172
1173       /* Undefine dangling coding system aliases. */
1174       {
1175         int dangling_aliases;
1176
1177         do {
1178           dangling_aliases = 0;
1179           elisp_map_remhash (dangling_coding_system_alias_p,
1180                              Vcoding_system_hash_table,
1181                              &dangling_aliases);
1182         } while (dangling_aliases > 0);
1183       }
1184
1185       return Qnil;
1186     }
1187
1188   if (CODING_SYSTEMP (aliasee))
1189     aliasee = XCODING_SYSTEM_NAME (aliasee);
1190
1191   /* Checks that aliasee names a coding-system */
1192   real_coding_system = Fget_coding_system (aliasee);
1193
1194   /* Check for coding system alias loops */
1195   if (EQ (alias, aliasee))
1196     alias_loop: signal_simple_error_2
1197       ("Attempt to create a coding system alias loop", alias, aliasee);
1198
1199   for (probe = aliasee;
1200        SYMBOLP (probe);
1201        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1202     {
1203       if (EQ (probe, alias))
1204         goto alias_loop;
1205     }
1206
1207   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1208
1209   /* Set up aliases for subsidiaries.
1210      #### There must be a better way to handle subsidiary coding systems. */
1211   {
1212     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1213     int i;
1214     for (i = 0; i < countof (suffixes); i++)
1215       {
1216         Lisp_Object alias_subsidiary =
1217           append_suffix_to_symbol (alias, suffixes[i]);
1218         Lisp_Object aliasee_subsidiary =
1219           append_suffix_to_symbol (aliasee, suffixes[i]);
1220
1221         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1222           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1223       }
1224   }
1225   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1226      but it doesn't look intentional, so I'd rather return something
1227      meaningful or nothing at all. */
1228   return Qnil;
1229 }
1230
1231 static Lisp_Object
1232 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1233 {
1234   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1235   Lisp_Object new_coding_system;
1236
1237   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1238     return coding_system;
1239
1240   switch (type)
1241     {
1242     case EOL_AUTODETECT: return coding_system;
1243     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1244     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1245     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1246     default:       abort ();
1247     }
1248
1249   return NILP (new_coding_system) ? coding_system : new_coding_system;
1250 }
1251
1252 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1253 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1254 */
1255        (coding_system, eol_type))
1256 {
1257   coding_system = Fget_coding_system (coding_system);
1258
1259   return subsidiary_coding_system (coding_system,
1260                                    symbol_to_eol_type (eol_type));
1261 }
1262
1263 \f
1264 /************************************************************************/
1265 /*                         Coding system accessors                      */
1266 /************************************************************************/
1267
1268 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1269 Return the doc string for CODING-SYSTEM.
1270 */
1271        (coding_system))
1272 {
1273   coding_system = Fget_coding_system (coding_system);
1274   return XCODING_SYSTEM_DOC_STRING (coding_system);
1275 }
1276
1277 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1278 Return the type of CODING-SYSTEM.
1279 */
1280        (coding_system))
1281 {
1282   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1283     {
1284     default: abort ();
1285     case CODESYS_AUTODETECT:    return Qundecided;
1286 #ifdef MULE
1287     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1288     case CODESYS_ISO2022:       return Qiso2022;
1289     case CODESYS_BIG5:          return Qbig5;
1290     case CODESYS_UCS4:          return Qucs4;
1291     case CODESYS_UTF8:          return Qutf8;
1292     case CODESYS_CCL:           return Qccl;
1293 #endif
1294     case CODESYS_NO_CONVERSION: return Qno_conversion;
1295 #ifdef DEBUG_XEMACS
1296     case CODESYS_INTERNAL:      return Qinternal;
1297 #endif
1298     }
1299 }
1300
1301 #ifdef MULE
1302 static
1303 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1304 {
1305   Lisp_Object cs
1306     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1307
1308   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1309 }
1310
1311 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1312 Return initial charset of CODING-SYSTEM designated to GNUM.
1313 GNUM allows 0 .. 3.
1314 */
1315        (coding_system, gnum))
1316 {
1317   coding_system = Fget_coding_system (coding_system);
1318   CHECK_INT (gnum);
1319
1320   return coding_system_charset (coding_system, XINT (gnum));
1321 }
1322 #endif /* MULE */
1323
1324 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1325 Return the PROP property of CODING-SYSTEM.
1326 */
1327        (coding_system, prop))
1328 {
1329   int i, ok = 0;
1330   enum coding_system_type type;
1331
1332   coding_system = Fget_coding_system (coding_system);
1333   CHECK_SYMBOL (prop);
1334   type = XCODING_SYSTEM_TYPE (coding_system);
1335
1336   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1337     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1338       {
1339         ok = 1;
1340         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1341           {
1342           case CODESYS_PROP_ALL_OK:
1343             break;
1344 #ifdef MULE
1345           case CODESYS_PROP_ISO2022:
1346             if (type != CODESYS_ISO2022)
1347               signal_simple_error
1348                 ("Property only valid in ISO2022 coding systems",
1349                  prop);
1350             break;
1351
1352           case CODESYS_PROP_CCL:
1353             if (type != CODESYS_CCL)
1354               signal_simple_error
1355                 ("Property only valid in CCL coding systems",
1356                  prop);
1357             break;
1358 #endif /* MULE */
1359           default:
1360             abort ();
1361           }
1362       }
1363
1364   if (!ok)
1365     signal_simple_error ("Unrecognized property", prop);
1366
1367   if (EQ (prop, Qname))
1368     return XCODING_SYSTEM_NAME (coding_system);
1369   else if (EQ (prop, Qtype))
1370     return Fcoding_system_type (coding_system);
1371   else if (EQ (prop, Qdoc_string))
1372     return XCODING_SYSTEM_DOC_STRING (coding_system);
1373   else if (EQ (prop, Qmnemonic))
1374     return XCODING_SYSTEM_MNEMONIC (coding_system);
1375   else if (EQ (prop, Qeol_type))
1376     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1377   else if (EQ (prop, Qeol_lf))
1378     return XCODING_SYSTEM_EOL_LF (coding_system);
1379   else if (EQ (prop, Qeol_crlf))
1380     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1381   else if (EQ (prop, Qeol_cr))
1382     return XCODING_SYSTEM_EOL_CR (coding_system);
1383   else if (EQ (prop, Qpost_read_conversion))
1384     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1385   else if (EQ (prop, Qpre_write_conversion))
1386     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1387 #ifdef MULE
1388   else if (type == CODESYS_ISO2022)
1389     {
1390       if (EQ (prop, Qcharset_g0))
1391         return coding_system_charset (coding_system, 0);
1392       else if (EQ (prop, Qcharset_g1))
1393         return coding_system_charset (coding_system, 1);
1394       else if (EQ (prop, Qcharset_g2))
1395         return coding_system_charset (coding_system, 2);
1396       else if (EQ (prop, Qcharset_g3))
1397         return coding_system_charset (coding_system, 3);
1398
1399 #define FORCE_CHARSET(charset_num) \
1400   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1401    (coding_system, charset_num) ? Qt : Qnil)
1402
1403       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1404       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1405       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1406       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1407
1408 #define LISP_BOOLEAN(prop) \
1409   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1410
1411       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1412       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1413       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1414       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1415       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1416       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1417       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1418
1419       else if (EQ (prop, Qinput_charset_conversion))
1420         return
1421           unparse_charset_conversion_specs
1422             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1423       else if (EQ (prop, Qoutput_charset_conversion))
1424         return
1425           unparse_charset_conversion_specs
1426             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1427       else
1428         abort ();
1429     }
1430   else if (type == CODESYS_CCL)
1431     {
1432       if (EQ (prop, Qdecode))
1433         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1434       else if (EQ (prop, Qencode))
1435         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1436       else
1437         abort ();
1438     }
1439 #endif /* MULE */
1440   else
1441     abort ();
1442
1443   return Qnil; /* not reached */
1444 }
1445
1446 \f
1447 /************************************************************************/
1448 /*                       Coding category functions                      */
1449 /************************************************************************/
1450
1451 static int
1452 decode_coding_category (Lisp_Object symbol)
1453 {
1454   int i;
1455
1456   CHECK_SYMBOL (symbol);
1457   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1458     if (EQ (coding_category_symbol[i], symbol))
1459       return i;
1460
1461   signal_simple_error ("Unrecognized coding category", symbol);
1462   return 0; /* not reached */
1463 }
1464
1465 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1466 Return a list of all recognized coding categories.
1467 */
1468        ())
1469 {
1470   int i;
1471   Lisp_Object list = Qnil;
1472
1473   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1474     list = Fcons (coding_category_symbol[i], list);
1475   return list;
1476 }
1477
1478 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1479 Change the priority order of the coding categories.
1480 LIST should be list of coding categories, in descending order of
1481 priority.  Unspecified coding categories will be lower in priority
1482 than all specified ones, in the same relative order they were in
1483 previously.
1484 */
1485        (list))
1486 {
1487   int category_to_priority[CODING_CATEGORY_LAST + 1];
1488   int i, j;
1489   Lisp_Object rest;
1490
1491   /* First generate a list that maps coding categories to priorities. */
1492
1493   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1494     category_to_priority[i] = -1;
1495
1496   /* Highest priority comes from the specified list. */
1497   i = 0;
1498   EXTERNAL_LIST_LOOP (rest, list)
1499     {
1500       int cat = decode_coding_category (XCAR (rest));
1501
1502       if (category_to_priority[cat] >= 0)
1503         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1504       category_to_priority[cat] = i++;
1505     }
1506
1507   /* Now go through the existing categories by priority to retrieve
1508      the categories not yet specified and preserve their priority
1509      order. */
1510   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1511     {
1512       int cat = fcd->coding_category_by_priority[j];
1513       if (category_to_priority[cat] < 0)
1514         category_to_priority[cat] = i++;
1515     }
1516
1517   /* Now we need to construct the inverse of the mapping we just
1518      constructed. */
1519
1520   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1521     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1522
1523   /* Phew!  That was confusing. */
1524   return Qnil;
1525 }
1526
1527 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1528 Return a list of coding categories in descending order of priority.
1529 */
1530        ())
1531 {
1532   int i;
1533   Lisp_Object list = Qnil;
1534
1535   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1536     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1537                   list);
1538   return list;
1539 }
1540
1541 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1542 Change the coding system associated with a coding category.
1543 */
1544        (coding_category, coding_system))
1545 {
1546   int cat = decode_coding_category (coding_category);
1547
1548   coding_system = Fget_coding_system (coding_system);
1549   fcd->coding_category_system[cat] = coding_system;
1550   return Qnil;
1551 }
1552
1553 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1554 Return the coding system associated with a coding category.
1555 */
1556        (coding_category))
1557 {
1558   int cat = decode_coding_category (coding_category);
1559   Lisp_Object sys = fcd->coding_category_system[cat];
1560
1561   if (!NILP (sys))
1562     return XCODING_SYSTEM_NAME (sys);
1563   return Qnil;
1564 }
1565
1566 \f
1567 /************************************************************************/
1568 /*                     Detecting the encoding of data                   */
1569 /************************************************************************/
1570
1571 struct detection_state
1572 {
1573   eol_type_t eol_type;
1574   int seen_non_ascii;
1575   int mask;
1576 #ifdef MULE
1577   struct
1578     {
1579       int mask;
1580       int in_second_byte;
1581     }
1582   big5;
1583
1584   struct
1585     {
1586       int mask;
1587       int in_second_byte;
1588     }
1589   shift_jis;
1590
1591   struct
1592     {
1593       int mask;
1594       int in_byte;
1595   }
1596   ucs4;
1597
1598   struct
1599     {
1600       int mask;
1601       int in_byte;
1602     }
1603   utf8;
1604
1605   struct
1606     {
1607       int mask;
1608       int initted;
1609       struct iso2022_decoder iso;
1610       unsigned int flags;
1611       int high_byte_count;
1612       unsigned int saw_single_shift:1;
1613     }
1614   iso2022;
1615 #endif
1616   struct
1617     {
1618       int seen_anything;
1619       int just_saw_cr;
1620     }
1621   eol;
1622 };
1623
1624 static int
1625 acceptable_control_char_p (int c)
1626 {
1627   switch (c)
1628     {
1629       /* Allow and ignore control characters that you might
1630          reasonably see in a text file */
1631     case '\r':
1632     case '\n':
1633     case '\t':
1634     case  7: /* bell */
1635     case  8: /* backspace */
1636     case 11: /* vertical tab */
1637     case 12: /* form feed */
1638     case 26: /* MS-DOS C-z junk */
1639     case 31: /* '^_' -- for info */
1640       return 1;
1641     default:
1642       return 0;
1643     }
1644 }
1645
1646 static int
1647 mask_has_at_most_one_bit_p (int mask)
1648 {
1649   /* Perhaps the only thing useful you learn from intensive Microsoft
1650      technical interviews */
1651   return (mask & (mask - 1)) == 0;
1652 }
1653
1654 static eol_type_t
1655 detect_eol_type (struct detection_state *st, const unsigned char *src,
1656                  unsigned int n)
1657 {
1658   int c;
1659
1660   while (n--)
1661     {
1662       c = *src++;
1663       if (c == '\n')
1664         {
1665           if (st->eol.just_saw_cr)
1666             return EOL_CRLF;
1667           else if (st->eol.seen_anything)
1668             return EOL_LF;
1669         }
1670       else if (st->eol.just_saw_cr)
1671         return EOL_CR;
1672       else if (c == '\r')
1673         st->eol.just_saw_cr = 1;
1674       else
1675         st->eol.just_saw_cr = 0;
1676       st->eol.seen_anything = 1;
1677     }
1678
1679   return EOL_AUTODETECT;
1680 }
1681
1682 /* Attempt to determine the encoding and EOL type of the given text.
1683    Before calling this function for the first type, you must initialize
1684    st->eol_type as appropriate and initialize st->mask to ~0.
1685
1686    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1687    not yet known.
1688
1689    st->mask holds the determined coding category mask, or ~0 if only
1690    ASCII has been seen so far.
1691
1692    Returns:
1693
1694    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1695         is present in st->mask
1696    1 == definitive answers are here for both st->eol_type and st->mask
1697 */
1698
1699 static int
1700 detect_coding_type (struct detection_state *st, const Extbyte *src,
1701                     unsigned int n, int just_do_eol)
1702 {
1703   int c;
1704
1705   if (st->eol_type == EOL_AUTODETECT)
1706     st->eol_type = detect_eol_type (st, src, n);
1707
1708   if (just_do_eol)
1709     return st->eol_type != EOL_AUTODETECT;
1710
1711   if (!st->seen_non_ascii)
1712     {
1713       for (; n; n--, src++)
1714         {
1715           c = *src;
1716           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1717             {
1718               st->seen_non_ascii = 1;
1719 #ifdef MULE
1720               st->shift_jis.mask = ~0;
1721               st->big5.mask = ~0;
1722               st->ucs4.mask = ~0;
1723               st->utf8.mask = ~0;
1724               st->iso2022.mask = ~0;
1725 #endif
1726               break;
1727             }
1728         }
1729     }
1730
1731   if (!n)
1732     return 0;
1733 #ifdef MULE
1734   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1735     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1736   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1737     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1738   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1739     st->big5.mask = detect_coding_big5 (st, src, n);
1740   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1741     st->utf8.mask = detect_coding_utf8 (st, src, n);
1742   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1743     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1744
1745   st->mask
1746     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1747     | st->utf8.mask | st->ucs4.mask;
1748 #endif
1749   {
1750     int retval = mask_has_at_most_one_bit_p (st->mask);
1751     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1752     return retval && st->eol_type != EOL_AUTODETECT;
1753   }
1754 }
1755
1756 static Lisp_Object
1757 coding_system_from_mask (int mask)
1758 {
1759   if (mask == ~0)
1760     {
1761       /* If the file was entirely or basically ASCII, use the
1762          default value of `buffer-file-coding-system'. */
1763       Lisp_Object retval =
1764         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1765       if (!NILP (retval))
1766         {
1767           retval = Ffind_coding_system (retval);
1768           if (NILP (retval))
1769             {
1770               warn_when_safe
1771                 (Qbad_variable, Qwarning,
1772                  "Invalid `default-buffer-file-coding-system', set to nil");
1773               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1774             }
1775         }
1776       if (NILP (retval))
1777         retval = Fget_coding_system (Qraw_text);
1778       return retval;
1779     }
1780   else
1781     {
1782       int i;
1783       int cat = -1;
1784 #ifdef MULE
1785       mask = postprocess_iso2022_mask (mask);
1786 #endif
1787       /* Look through the coding categories by priority and find
1788          the first one that is allowed. */
1789       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1790         {
1791           cat = fcd->coding_category_by_priority[i];
1792           if ((mask & (1 << cat)) &&
1793               !NILP (fcd->coding_category_system[cat]))
1794             break;
1795         }
1796       if (cat >= 0)
1797         return fcd->coding_category_system[cat];
1798       else
1799         return Fget_coding_system (Qraw_text);
1800     }
1801 }
1802
1803 /* Given a seekable read stream and potential coding system and EOL type
1804    as specified, do any autodetection that is called for.  If the
1805    coding system and/or EOL type are not `autodetect', they will be left
1806    alone; but this function will never return an autodetect coding system
1807    or EOL type.
1808
1809    This function does not automatically fetch subsidiary coding systems;
1810    that should be unnecessary with the explicit eol-type argument. */
1811
1812 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1813
1814 void
1815 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1816                               eol_type_t *eol_type_in_out)
1817 {
1818   struct detection_state decst;
1819
1820   if (*eol_type_in_out == EOL_AUTODETECT)
1821     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1822
1823   xzero (decst);
1824   decst.eol_type = *eol_type_in_out;
1825   decst.mask = ~0;
1826
1827   /* If autodetection is called for, do it now. */
1828   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1829       || *eol_type_in_out == EOL_AUTODETECT)
1830     {
1831       Extbyte buf[4096];
1832       Lisp_Object coding_system = Qnil;
1833       Extbyte *p;
1834       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1835       Extbyte *scan_end;
1836
1837       /* Look for initial "-*-"; mode line prefix */
1838       for (p = buf,
1839              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1840            p <= scan_end
1841              && *p != '\n'
1842              && *p != '\r';
1843            p++)
1844         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1845           {
1846             Extbyte *local_vars_beg = p + 3;
1847             /* Look for final "-*-"; mode line suffix */
1848             for (p = local_vars_beg,
1849                    scan_end = buf + nread - LENGTH ("-*-");
1850                  p <= scan_end
1851                    && *p != '\n'
1852                    && *p != '\r';
1853                  p++)
1854               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1855                 {
1856                   Extbyte *suffix = p;
1857                   /* Look for "coding:" */
1858                   for (p = local_vars_beg,
1859                          scan_end = suffix - LENGTH ("coding:?");
1860                        p <= scan_end;
1861                        p++)
1862                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1863                         && (p == local_vars_beg
1864                             || (*(p-1) == ' '  ||
1865                                 *(p-1) == '\t' ||
1866                                 *(p-1) == ';')))
1867                       {
1868                         Extbyte save;
1869                         int n;
1870                         p += LENGTH ("coding:");
1871                         while (*p == ' ' || *p == '\t') p++;
1872
1873                         /* Get coding system name */
1874                         save = *suffix; *suffix = '\0';
1875                         /* Characters valid in a MIME charset name (rfc 1521),
1876                            and in a Lisp symbol name. */
1877                         n = strspn ( (char *) p,
1878                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1879                                     "abcdefghijklmnopqrstuvwxyz"
1880                                     "0123456789"
1881                                     "!$%&*+-.^_{|}~");
1882                         *suffix = save;
1883                         if (n > 0)
1884                           {
1885                             save = p[n]; p[n] = '\0';
1886                             coding_system =
1887                               Ffind_coding_system (intern ((char *) p));
1888                             p[n] = save;
1889                           }
1890                         break;
1891                       }
1892                   break;
1893                 }
1894             break;
1895           }
1896
1897       if (NILP (coding_system))
1898         do
1899           {
1900             if (detect_coding_type (&decst, buf, nread,
1901                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1902                                     != CODESYS_AUTODETECT))
1903               break;
1904             nread = Lstream_read (stream, buf, sizeof (buf));
1905             if (nread == 0)
1906               break;
1907           }
1908         while (1);
1909
1910       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1911                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1912         do
1913           {
1914             if (detect_coding_type (&decst, buf, nread, 1))
1915               break;
1916             nread = Lstream_read (stream, buf, sizeof (buf));
1917             if (!nread)
1918               break;
1919           }
1920         while (1);
1921
1922       *eol_type_in_out = decst.eol_type;
1923       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1924         {
1925           if (NILP (coding_system))
1926             *codesys_in_out = coding_system_from_mask (decst.mask);
1927           else
1928             *codesys_in_out = coding_system;
1929         }
1930     }
1931
1932   /* If we absolutely can't determine the EOL type, just assume LF. */
1933   if (*eol_type_in_out == EOL_AUTODETECT)
1934     *eol_type_in_out = EOL_LF;
1935
1936   Lstream_rewind (stream);
1937 }
1938
1939 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1940 Detect coding system of the text in the region between START and END.
1941 Returned a list of possible coding systems ordered by priority.
1942 If only ASCII characters are found, it returns 'undecided or one of
1943 its subsidiary coding systems according to a detected end-of-line
1944 type.  Optional arg BUFFER defaults to the current buffer.
1945 */
1946        (start, end, buffer))
1947 {
1948   Lisp_Object val = Qnil;
1949   struct buffer *buf = decode_buffer (buffer, 0);
1950   Bufpos b, e;
1951   Lisp_Object instream, lb_instream;
1952   Lstream *istr, *lb_istr;
1953   struct detection_state decst;
1954   struct gcpro gcpro1, gcpro2;
1955
1956   get_buffer_range_char (buf, start, end, &b, &e, 0);
1957   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1958   lb_istr = XLSTREAM (lb_instream);
1959   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1960   istr = XLSTREAM (instream);
1961   GCPRO2 (instream, lb_instream);
1962   xzero (decst);
1963   decst.eol_type = EOL_AUTODETECT;
1964   decst.mask = ~0;
1965   while (1)
1966     {
1967       unsigned char random_buffer[4096];
1968       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1969
1970       if (!nread)
1971         break;
1972       if (detect_coding_type (&decst, random_buffer, nread, 0))
1973         break;
1974     }
1975
1976   if (decst.mask == ~0)
1977     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1978                                     decst.eol_type);
1979   else
1980     {
1981       int i;
1982
1983       val = Qnil;
1984 #ifdef MULE
1985       decst.mask = postprocess_iso2022_mask (decst.mask);
1986 #endif
1987       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1988         {
1989           int sys = fcd->coding_category_by_priority[i];
1990           if (decst.mask & (1 << sys))
1991             {
1992               Lisp_Object codesys = fcd->coding_category_system[sys];
1993               if (!NILP (codesys))
1994                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1995               val = Fcons (codesys, val);
1996             }
1997         }
1998     }
1999   Lstream_close (istr);
2000   UNGCPRO;
2001   Lstream_delete (istr);
2002   Lstream_delete (lb_istr);
2003   return val;
2004 }
2005
2006 \f
2007 /************************************************************************/
2008 /*           Converting to internal Mule format ("decoding")            */
2009 /************************************************************************/
2010
2011 /* A decoding stream is a stream used for decoding text (i.e.
2012    converting from some external format to internal format).
2013    The decoding-stream object keeps track of the actual coding
2014    stream, the stream that is at the other end, and data that
2015    needs to be persistent across the lifetime of the stream. */
2016
2017 /* Handle the EOL stuff related to just-read-in character C.
2018    EOL_TYPE is the EOL type of the coding stream.
2019    FLAGS is the current value of FLAGS in the coding stream, and may
2020    be modified by this macro.  (The macro only looks at the
2021    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2022    bytes are to be written.  You need to also define a local goto
2023    label "label_continue_loop" that is at the end of the main
2024    character-reading loop.
2025
2026    If C is a CR character, then this macro handles it entirely and
2027    jumps to label_continue_loop.  Otherwise, this macro does not add
2028    anything to DST, and continues normally.  You should continue
2029    processing C normally after this macro. */
2030
2031 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2032 do {                                                            \
2033   if (c == '\r')                                                \
2034     {                                                           \
2035       if (eol_type == EOL_CR)                                   \
2036         Dynarr_add (dst, '\n');                                 \
2037       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2038         Dynarr_add (dst, c);                                    \
2039       else                                                      \
2040         flags |= CODING_STATE_CR;                               \
2041       goto label_continue_loop;                                 \
2042     }                                                           \
2043   else if (flags & CODING_STATE_CR)                             \
2044     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2045       if (c != '\n')                                            \
2046         Dynarr_add (dst, '\r');                                 \
2047       flags &= ~CODING_STATE_CR;                                \
2048     }                                                           \
2049 } while (0)
2050
2051 /* C should be a binary character in the range 0 - 255; convert
2052    to internal format and add to Dynarr DST. */
2053
2054 #ifdef UTF2000
2055 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2056 do {                                            \
2057   if (BYTE_ASCII_P (c))                         \
2058     Dynarr_add (dst, c);                        \
2059   else                                          \
2060     {                                           \
2061       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2062       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2063     }                                           \
2064 } while (0)
2065
2066 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2067 INLINE_HEADER void
2068 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2069 {
2070   if ( c <= 0x7f )
2071     {
2072       Dynarr_add (dst, c);
2073     }
2074   else if ( c <= 0x7ff )
2075     {
2076       Dynarr_add (dst, (c >> 6) | 0xc0);
2077       Dynarr_add (dst, (c & 0x3f) | 0x80);
2078     }
2079   else if ( c <= 0xffff )
2080     {
2081       Dynarr_add (dst,  (c >> 12) | 0xe0);
2082       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2083       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2084     }
2085   else if ( c <= 0x1fffff )
2086     {
2087       Dynarr_add (dst,  (c >> 18) | 0xf0);
2088       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2089       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2090       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2091     }
2092   else if ( c <= 0x3ffffff )
2093     {
2094       Dynarr_add (dst,  (c >> 24) | 0xf8);
2095       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2098       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2099     }
2100   else
2101     {
2102       Dynarr_add (dst,  (c >> 30) | 0xfc);
2103       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2104       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2107       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2108     }
2109 }
2110 #else
2111 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2112 do {                                            \
2113   if (BYTE_ASCII_P (c))                         \
2114     Dynarr_add (dst, c);                        \
2115   else if (BYTE_C1_P (c))                       \
2116     {                                           \
2117       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2118       Dynarr_add (dst, c + 0x20);               \
2119     }                                           \
2120   else                                          \
2121     {                                           \
2122       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2123       Dynarr_add (dst, c);                      \
2124     }                                           \
2125 } while (0)
2126 #endif
2127
2128 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2129 do {                                    \
2130   if (ch)                               \
2131     {                                   \
2132       DECODE_ADD_BINARY_CHAR (ch, dst); \
2133       ch = 0;                           \
2134     }                                   \
2135 } while (0)
2136
2137 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2138 do {                                    \
2139   if (flags & CODING_STATE_END)         \
2140     {                                   \
2141       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2142       if (flags & CODING_STATE_CR)      \
2143         Dynarr_add (dst, '\r');         \
2144     }                                   \
2145 } while (0)
2146
2147 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2148
2149 struct decoding_stream
2150 {
2151   /* Coding system that governs the conversion. */
2152   Lisp_Coding_System *codesys;
2153
2154   /* Stream that we read the encoded data from or
2155      write the decoded data to. */
2156   Lstream *other_end;
2157
2158   /* If we are reading, then we can return only a fixed amount of
2159      data, so if the conversion resulted in too much data, we store it
2160      here for retrieval the next time around. */
2161   unsigned_char_dynarr *runoff;
2162
2163   /* FLAGS holds flags indicating the current state of the decoding.
2164      Some of these flags are dependent on the coding system. */
2165   unsigned int flags;
2166
2167   /* CPOS holds a partially built-up code-point of character. */
2168   unsigned int cpos;
2169
2170   /* EOL_TYPE specifies the type of end-of-line conversion that
2171      currently applies.  We need to keep this separate from the
2172      EOL type stored in CODESYS because the latter might indicate
2173      automatic EOL-type detection while the former will always
2174      indicate a particular EOL type. */
2175   eol_type_t eol_type;
2176 #ifdef MULE
2177   /* Additional ISO2022 information.  We define the structure above
2178      because it's also needed by the detection routines. */
2179   struct iso2022_decoder iso2022;
2180
2181   /* Additional information (the state of the running CCL program)
2182      used by the CCL decoder. */
2183   struct ccl_program ccl;
2184
2185   /* counter for UTF-8 or UCS-4 */
2186   unsigned char counter;
2187 #endif
2188 #ifdef UTF2000
2189   unsigned combined_char_count;
2190   Emchar combined_chars[16];
2191   Lisp_Object combining_table;
2192 #endif
2193   struct detection_state decst;
2194 };
2195
2196 #ifdef UTF2000
2197 extern Lisp_Object Vcharacter_composition_table;
2198
2199 INLINE_HEADER void
2200 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2201 INLINE_HEADER void
2202 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2203 {
2204   unsigned i;
2205
2206   for (i = 0; i < str->combined_char_count; i++)
2207     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2208   str->combined_char_count = 0;
2209   str->combining_table = Qnil;
2210 }
2211
2212 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2213                       unsigned_char_dynarr* dst);
2214 void
2215 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2216                  Emchar character, unsigned_char_dynarr* dst)
2217 {
2218   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2219     DECODE_ADD_UCS_CHAR (character, dst);
2220   else if (!CHAR_ID_TABLE_P (str->combining_table))
2221     {
2222       Lisp_Object ret
2223         = get_char_id_table (character, Vcharacter_composition_table);
2224
2225       if (NILP (ret))
2226         DECODE_ADD_UCS_CHAR (character, dst);
2227       else
2228         {
2229           str->combined_chars[0] = character;
2230           str->combined_char_count = 1;
2231           str->combining_table = ret;
2232         }
2233     }
2234   else
2235     {
2236       Lisp_Object ret
2237         = get_char_id_table (character, str->combining_table);
2238
2239       if (CHARP (ret))
2240         {
2241           Emchar char2 = XCHARVAL (ret);
2242           ret = get_char_id_table (char2, Vcharacter_composition_table);
2243           if (NILP (ret))
2244             {
2245               DECODE_ADD_UCS_CHAR (char2, dst);
2246               str->combined_char_count = 0;
2247               str->combining_table = Qnil;
2248             }
2249           else
2250             {
2251               str->combined_chars[0] = char2;
2252               str->combined_char_count = 1;
2253               str->combining_table = ret;
2254             }
2255         }
2256       else if (CHAR_ID_TABLE_P (ret))
2257         {
2258           str->combined_chars[str->combined_char_count++] = character;
2259           str->combining_table = ret;
2260         }
2261       else
2262         {
2263           COMPOSE_FLUSH_CHARS (str, dst);
2264           DECODE_ADD_UCS_CHAR (character, dst);
2265         }
2266     }
2267 }
2268 #else /* not UTF2000 */
2269 #define COMPOSE_FLUSH_CHARS(str, dst)
2270 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2271 #endif /* UTF2000 */
2272
2273 static ssize_t decoding_reader (Lstream *stream,
2274                                 unsigned char *data, size_t size);
2275 static ssize_t decoding_writer (Lstream *stream,
2276                                 const unsigned char *data, size_t size);
2277 static int decoding_rewinder   (Lstream *stream);
2278 static int decoding_seekable_p (Lstream *stream);
2279 static int decoding_flusher    (Lstream *stream);
2280 static int decoding_closer     (Lstream *stream);
2281
2282 static Lisp_Object decoding_marker (Lisp_Object stream);
2283
2284 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2285                                sizeof (struct decoding_stream));
2286
2287 static Lisp_Object
2288 decoding_marker (Lisp_Object stream)
2289 {
2290   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2291   Lisp_Object str_obj;
2292
2293   /* We do not need to mark the coding systems or charsets stored
2294      within the stream because they are stored in a global list
2295      and automatically marked. */
2296
2297   XSETLSTREAM (str_obj, str);
2298   mark_object (str_obj);
2299   if (str->imp->marker)
2300     return (str->imp->marker) (str_obj);
2301   else
2302     return Qnil;
2303 }
2304
2305 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2306    so we read data from the other end, decode it, and store it into DATA. */
2307
2308 static ssize_t
2309 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2310 {
2311   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2312   unsigned char *orig_data = data;
2313   ssize_t read_size;
2314   int error_occurred = 0;
2315
2316   /* We need to interface to mule_decode(), which expects to take some
2317      amount of data and store the result into a Dynarr.  We have
2318      mule_decode() store into str->runoff, and take data from there
2319      as necessary. */
2320
2321   /* We loop until we have enough data, reading chunks from the other
2322      end and decoding it. */
2323   while (1)
2324     {
2325       /* Take data from the runoff if we can.  Make sure to take at
2326          most SIZE bytes, and delete the data from the runoff. */
2327       if (Dynarr_length (str->runoff) > 0)
2328         {
2329           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2330           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2331           Dynarr_delete_many (str->runoff, 0, chunk);
2332           data += chunk;
2333           size -= chunk;
2334         }
2335
2336       if (size == 0)
2337         break; /* No more room for data */
2338
2339       if (str->flags & CODING_STATE_END)
2340         /* This means that on the previous iteration, we hit the EOF on
2341            the other end.  We loop once more so that mule_decode() can
2342            output any final stuff it may be holding, or any "go back
2343            to a sane state" escape sequences. (This latter makes sense
2344            during encoding.) */
2345         break;
2346
2347       /* Exhausted the runoff, so get some more.  DATA has at least
2348          SIZE bytes left of storage in it, so it's OK to read directly
2349          into it.  (We'll be overwriting above, after we've decoded it
2350          into the runoff.) */
2351       read_size = Lstream_read (str->other_end, data, size);
2352       if (read_size < 0)
2353         {
2354           error_occurred = 1;
2355           break;
2356         }
2357       if (read_size == 0)
2358         /* There might be some more end data produced in the translation.
2359            See the comment above. */
2360         str->flags |= CODING_STATE_END;
2361       mule_decode (stream, data, str->runoff, read_size);
2362     }
2363
2364   if (data - orig_data == 0)
2365     return error_occurred ? -1 : 0;
2366   else
2367     return data - orig_data;
2368 }
2369
2370 static ssize_t
2371 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2372 {
2373   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2374   ssize_t retval;
2375
2376   /* Decode all our data into the runoff, and then attempt to write
2377      it all out to the other end.  Remove whatever chunk we succeeded
2378      in writing. */
2379   mule_decode (stream, data, str->runoff, size);
2380   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2381                           Dynarr_length (str->runoff));
2382   if (retval > 0)
2383     Dynarr_delete_many (str->runoff, 0, retval);
2384   /* Do NOT return retval.  The return value indicates how much
2385      of the incoming data was written, not how many bytes were
2386      written. */
2387   return size;
2388 }
2389
2390 static void
2391 reset_decoding_stream (struct decoding_stream *str)
2392 {
2393 #ifdef MULE
2394   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2395     {
2396       Lisp_Object coding_system;
2397       XSETCODING_SYSTEM (coding_system, str->codesys);
2398       reset_iso2022 (coding_system, &str->iso2022);
2399     }
2400   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2401     {
2402       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2403     }
2404   str->counter = 0;
2405 #endif /* MULE */
2406 #ifdef UTF2000
2407   str->combined_char_count = 0;
2408   str->combining_table = Qnil;
2409 #endif
2410   str->flags = str->cpos = 0;
2411 }
2412
2413 static int
2414 decoding_rewinder (Lstream *stream)
2415 {
2416   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2417   reset_decoding_stream (str);
2418   Dynarr_reset (str->runoff);
2419   return Lstream_rewind (str->other_end);
2420 }
2421
2422 static int
2423 decoding_seekable_p (Lstream *stream)
2424 {
2425   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2426   return Lstream_seekable_p (str->other_end);
2427 }
2428
2429 static int
2430 decoding_flusher (Lstream *stream)
2431 {
2432   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2433   return Lstream_flush (str->other_end);
2434 }
2435
2436 static int
2437 decoding_closer (Lstream *stream)
2438 {
2439   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2440   if (stream->flags & LSTREAM_FL_WRITE)
2441     {
2442       str->flags |= CODING_STATE_END;
2443       decoding_writer (stream, 0, 0);
2444     }
2445   Dynarr_free (str->runoff);
2446 #ifdef MULE
2447 #ifdef ENABLE_COMPOSITE_CHARS
2448   if (str->iso2022.composite_chars)
2449     Dynarr_free (str->iso2022.composite_chars);
2450 #endif
2451 #endif
2452   return Lstream_close (str->other_end);
2453 }
2454
2455 Lisp_Object
2456 decoding_stream_coding_system (Lstream *stream)
2457 {
2458   Lisp_Object coding_system;
2459   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2460
2461   XSETCODING_SYSTEM (coding_system, str->codesys);
2462   return subsidiary_coding_system (coding_system, str->eol_type);
2463 }
2464
2465 void
2466 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2467 {
2468   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2469   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2470   str->codesys = cs;
2471   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2472     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2473   reset_decoding_stream (str);
2474 }
2475
2476 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2477    stream for writing, no automatic code detection will be performed.
2478    The reason for this is that automatic code detection requires a
2479    seekable input.  Things will also fail if you open a decoding
2480    stream for reading using a non-fully-specified coding system and
2481    a non-seekable input stream. */
2482
2483 static Lisp_Object
2484 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2485                         const char *mode)
2486 {
2487   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2488   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2489   Lisp_Object obj;
2490
2491   xzero (*str);
2492   str->other_end = stream;
2493   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2494   str->eol_type = EOL_AUTODETECT;
2495   if (!strcmp (mode, "r")
2496       && Lstream_seekable_p (stream))
2497     /* We can determine the coding system now. */
2498     determine_real_coding_system (stream, &codesys, &str->eol_type);
2499   set_decoding_stream_coding_system (lstr, codesys);
2500   str->decst.eol_type = str->eol_type;
2501   str->decst.mask = ~0;
2502   XSETLSTREAM (obj, lstr);
2503   return obj;
2504 }
2505
2506 Lisp_Object
2507 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2508 {
2509   return make_decoding_stream_1 (stream, codesys, "r");
2510 }
2511
2512 Lisp_Object
2513 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2514 {
2515   return make_decoding_stream_1 (stream, codesys, "w");
2516 }
2517
2518 /* Note: the decode_coding_* functions all take the same
2519    arguments as mule_decode(), which is to say some SRC data of
2520    size N, which is to be stored into dynamic array DST.
2521    DECODING is the stream within which the decoding is
2522    taking place, but no data is actually read from or
2523    written to that stream; that is handled in decoding_reader()
2524    or decoding_writer().  This allows the same functions to
2525    be used for both reading and writing. */
2526
2527 static void
2528 mule_decode (Lstream *decoding, const unsigned char *src,
2529              unsigned_char_dynarr *dst, unsigned int n)
2530 {
2531   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2532
2533   /* If necessary, do encoding-detection now.  We do this when
2534      we're a writing stream or a non-seekable reading stream,
2535      meaning that we can't just process the whole input,
2536      rewind, and start over. */
2537
2538   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2539       str->eol_type == EOL_AUTODETECT)
2540     {
2541       Lisp_Object codesys;
2542
2543       XSETCODING_SYSTEM (codesys, str->codesys);
2544       detect_coding_type (&str->decst, src, n,
2545                           CODING_SYSTEM_TYPE (str->codesys) !=
2546                           CODESYS_AUTODETECT);
2547       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2548           str->decst.mask != ~0)
2549         /* #### This is cheesy.  What we really ought to do is
2550            buffer up a certain amount of data so as to get a
2551            less random result. */
2552         codesys = coding_system_from_mask (str->decst.mask);
2553       str->eol_type = str->decst.eol_type;
2554       if (XCODING_SYSTEM (codesys) != str->codesys)
2555         {
2556           /* Preserve the CODING_STATE_END flag in case it was set.
2557              If we erase it, bad things might happen. */
2558           int was_end = str->flags & CODING_STATE_END;
2559           set_decoding_stream_coding_system (decoding, codesys);
2560           if (was_end)
2561             str->flags |= CODING_STATE_END;
2562         }
2563     }
2564
2565   switch (CODING_SYSTEM_TYPE (str->codesys))
2566     {
2567 #ifdef DEBUG_XEMACS
2568     case CODESYS_INTERNAL:
2569       Dynarr_add_many (dst, src, n);
2570       break;
2571 #endif
2572     case CODESYS_AUTODETECT:
2573       /* If we got this far and still haven't decided on the coding
2574          system, then do no conversion. */
2575     case CODESYS_NO_CONVERSION:
2576       decode_coding_no_conversion (decoding, src, dst, n);
2577       break;
2578 #ifdef MULE
2579     case CODESYS_SHIFT_JIS:
2580       decode_coding_sjis (decoding, src, dst, n);
2581       break;
2582     case CODESYS_BIG5:
2583       decode_coding_big5 (decoding, src, dst, n);
2584       break;
2585     case CODESYS_UCS4:
2586       decode_coding_ucs4 (decoding, src, dst, n);
2587       break;
2588     case CODESYS_UTF8:
2589       decode_coding_utf8 (decoding, src, dst, n);
2590       break;
2591     case CODESYS_CCL:
2592       str->ccl.last_block = str->flags & CODING_STATE_END;
2593       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2594       break;
2595     case CODESYS_ISO2022:
2596       decode_coding_iso2022 (decoding, src, dst, n);
2597       break;
2598 #endif /* MULE */
2599     default:
2600       abort ();
2601     }
2602 }
2603
2604 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2605 Decode the text between START and END which is encoded in CODING-SYSTEM.
2606 This is useful if you've read in encoded text from a file without decoding
2607 it (e.g. you read in a JIS-formatted file but used the `binary' or
2608 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2609 Return length of decoded text.
2610 BUFFER defaults to the current buffer if unspecified.
2611 */
2612        (start, end, coding_system, buffer))
2613 {
2614   Bufpos b, e;
2615   struct buffer *buf = decode_buffer (buffer, 0);
2616   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2617   Lstream *istr, *ostr;
2618   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2619
2620   get_buffer_range_char (buf, start, end, &b, &e, 0);
2621
2622   barf_if_buffer_read_only (buf, b, e);
2623
2624   coding_system = Fget_coding_system (coding_system);
2625   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2626   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2627   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2628                                               coding_system);
2629   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2630                                            Fget_coding_system (Qbinary));
2631   istr = XLSTREAM (instream);
2632   ostr = XLSTREAM (outstream);
2633   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2634
2635   /* The chain of streams looks like this:
2636
2637      [BUFFER] <----- send through
2638                      ------> [ENCODE AS BINARY]
2639                              ------> [DECODE AS SPECIFIED]
2640                                      ------> [BUFFER]
2641    */
2642
2643   while (1)
2644     {
2645       char tempbuf[1024]; /* some random amount */
2646       Bufpos newpos, even_newer_pos;
2647       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2648       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2649
2650       if (!size_in_bytes)
2651         break;
2652       newpos = lisp_buffer_stream_startpos (istr);
2653       Lstream_write (ostr, tempbuf, size_in_bytes);
2654       even_newer_pos = lisp_buffer_stream_startpos (istr);
2655       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2656                            even_newer_pos, 0);
2657     }
2658   Lstream_close (istr);
2659   Lstream_close (ostr);
2660   UNGCPRO;
2661   Lstream_delete (istr);
2662   Lstream_delete (ostr);
2663   Lstream_delete (XLSTREAM (de_outstream));
2664   Lstream_delete (XLSTREAM (lb_outstream));
2665   return Qnil;
2666 }
2667
2668 \f
2669 /************************************************************************/
2670 /*           Converting to an external encoding ("encoding")            */
2671 /************************************************************************/
2672
2673 /* An encoding stream is an output stream.  When you create the
2674    stream, you specify the coding system that governs the encoding
2675    and another stream that the resulting encoded data is to be
2676    sent to, and then start sending data to it. */
2677
2678 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2679
2680 struct encoding_stream
2681 {
2682   /* Coding system that governs the conversion. */
2683   Lisp_Coding_System *codesys;
2684
2685   /* Stream that we read the encoded data from or
2686      write the decoded data to. */
2687   Lstream *other_end;
2688
2689   /* If we are reading, then we can return only a fixed amount of
2690      data, so if the conversion resulted in too much data, we store it
2691      here for retrieval the next time around. */
2692   unsigned_char_dynarr *runoff;
2693
2694   /* FLAGS holds flags indicating the current state of the encoding.
2695      Some of these flags are dependent on the coding system. */
2696   unsigned int flags;
2697
2698   /* CH holds a partially built-up character.  Since we only deal
2699      with one- and two-byte characters at the moment, we only use
2700      this to store the first byte of a two-byte character. */
2701   unsigned int ch;
2702 #ifdef MULE
2703   /* Additional information used by the ISO2022 encoder. */
2704   struct
2705     {
2706       /* CHARSET holds the character sets currently assigned to the G0
2707          through G3 registers.  It is initialized from the array
2708          INITIAL_CHARSET in CODESYS. */
2709       Lisp_Object charset[4];
2710
2711       /* Which registers are currently invoked into the left (GL) and
2712          right (GR) halves of the 8-bit encoding space? */
2713       int register_left, register_right;
2714
2715       /* Whether we need to explicitly designate the charset in the
2716          G? register before using it.  It is initialized from the
2717          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2718       unsigned char force_charset_on_output[4];
2719
2720       /* Other state variables that need to be preserved across
2721          invocations. */
2722       Lisp_Object current_charset;
2723       int current_half;
2724       int current_char_boundary;
2725     } iso2022;
2726
2727   void (*encode_char) (struct encoding_stream *str, Emchar c,
2728                        unsigned_char_dynarr *dst, unsigned int *flags);
2729   void (*finish) (struct encoding_stream *str,
2730                   unsigned_char_dynarr *dst, unsigned int *flags);
2731
2732   /* Additional information (the state of the running CCL program)
2733      used by the CCL encoder. */
2734   struct ccl_program ccl;
2735 #endif /* MULE */
2736 };
2737
2738 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2739 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2740                                 size_t size);
2741 static int encoding_rewinder   (Lstream *stream);
2742 static int encoding_seekable_p (Lstream *stream);
2743 static int encoding_flusher    (Lstream *stream);
2744 static int encoding_closer     (Lstream *stream);
2745
2746 static Lisp_Object encoding_marker (Lisp_Object stream);
2747
2748 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2749                                sizeof (struct encoding_stream));
2750
2751 static Lisp_Object
2752 encoding_marker (Lisp_Object stream)
2753 {
2754   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2755   Lisp_Object str_obj;
2756
2757   /* We do not need to mark the coding systems or charsets stored
2758      within the stream because they are stored in a global list
2759      and automatically marked. */
2760
2761   XSETLSTREAM (str_obj, str);
2762   mark_object (str_obj);
2763   if (str->imp->marker)
2764     return (str->imp->marker) (str_obj);
2765   else
2766     return Qnil;
2767 }
2768
2769 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2770    so we read data from the other end, encode it, and store it into DATA. */
2771
2772 static ssize_t
2773 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2774 {
2775   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2776   unsigned char *orig_data = data;
2777   ssize_t read_size;
2778   int error_occurred = 0;
2779
2780   /* We need to interface to mule_encode(), which expects to take some
2781      amount of data and store the result into a Dynarr.  We have
2782      mule_encode() store into str->runoff, and take data from there
2783      as necessary. */
2784
2785   /* We loop until we have enough data, reading chunks from the other
2786      end and encoding it. */
2787   while (1)
2788     {
2789       /* Take data from the runoff if we can.  Make sure to take at
2790          most SIZE bytes, and delete the data from the runoff. */
2791       if (Dynarr_length (str->runoff) > 0)
2792         {
2793           int chunk = min ((int) size, Dynarr_length (str->runoff));
2794           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2795           Dynarr_delete_many (str->runoff, 0, chunk);
2796           data += chunk;
2797           size -= chunk;
2798         }
2799
2800       if (size == 0)
2801         break; /* No more room for data */
2802
2803       if (str->flags & CODING_STATE_END)
2804         /* This means that on the previous iteration, we hit the EOF on
2805            the other end.  We loop once more so that mule_encode() can
2806            output any final stuff it may be holding, or any "go back
2807            to a sane state" escape sequences. (This latter makes sense
2808            during encoding.) */
2809         break;
2810
2811       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2812          left of storage in it, so it's OK to read directly into it.
2813          (We'll be overwriting above, after we've encoded it into the
2814          runoff.) */
2815       read_size = Lstream_read (str->other_end, data, size);
2816       if (read_size < 0)
2817         {
2818           error_occurred = 1;
2819           break;
2820         }
2821       if (read_size == 0)
2822         /* There might be some more end data produced in the translation.
2823            See the comment above. */
2824         str->flags |= CODING_STATE_END;
2825       mule_encode (stream, data, str->runoff, read_size);
2826     }
2827
2828   if (data == orig_data)
2829     return error_occurred ? -1 : 0;
2830   else
2831     return data - orig_data;
2832 }
2833
2834 static ssize_t
2835 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2836 {
2837   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2838   ssize_t retval;
2839
2840   /* Encode all our data into the runoff, and then attempt to write
2841      it all out to the other end.  Remove whatever chunk we succeeded
2842      in writing. */
2843   mule_encode (stream, data, str->runoff, size);
2844   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2845                           Dynarr_length (str->runoff));
2846   if (retval > 0)
2847     Dynarr_delete_many (str->runoff, 0, retval);
2848   /* Do NOT return retval.  The return value indicates how much
2849      of the incoming data was written, not how many bytes were
2850      written. */
2851   return size;
2852 }
2853
2854 static void
2855 reset_encoding_stream (struct encoding_stream *str)
2856 {
2857 #ifdef MULE
2858   switch (CODING_SYSTEM_TYPE (str->codesys))
2859     {
2860     case CODESYS_ISO2022:
2861       {
2862         int i;
2863
2864         str->encode_char = &char_encode_iso2022;
2865         str->finish = &char_finish_iso2022;
2866         for (i = 0; i < 4; i++)
2867           {
2868             str->iso2022.charset[i] =
2869               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2870             str->iso2022.force_charset_on_output[i] =
2871               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2872           }
2873         str->iso2022.register_left = 0;
2874         str->iso2022.register_right = 1;
2875         str->iso2022.current_charset = Qnil;
2876         str->iso2022.current_half = 0;
2877         break;
2878       }
2879     case CODESYS_CCL:
2880       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2881       break;
2882     case CODESYS_UTF8:
2883       str->encode_char = &char_encode_utf8;
2884       str->finish = &char_finish_utf8;
2885       break;
2886     case CODESYS_UCS4:
2887       str->encode_char = &char_encode_ucs4;
2888       str->finish = &char_finish_ucs4;
2889       break;
2890     case CODESYS_SHIFT_JIS:
2891       str->encode_char = &char_encode_shift_jis;
2892       str->finish = &char_finish_shift_jis;
2893       break;
2894     default:
2895       break;
2896     }
2897 #endif /* MULE */
2898   str->iso2022.current_char_boundary = 0;
2899   str->flags = str->ch = 0;
2900 }
2901
2902 static int
2903 encoding_rewinder (Lstream *stream)
2904 {
2905   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2906   reset_encoding_stream (str);
2907   Dynarr_reset (str->runoff);
2908   return Lstream_rewind (str->other_end);
2909 }
2910
2911 static int
2912 encoding_seekable_p (Lstream *stream)
2913 {
2914   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915   return Lstream_seekable_p (str->other_end);
2916 }
2917
2918 static int
2919 encoding_flusher (Lstream *stream)
2920 {
2921   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2922   return Lstream_flush (str->other_end);
2923 }
2924
2925 static int
2926 encoding_closer (Lstream *stream)
2927 {
2928   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2929   if (stream->flags & LSTREAM_FL_WRITE)
2930     {
2931       str->flags |= CODING_STATE_END;
2932       encoding_writer (stream, 0, 0);
2933     }
2934   Dynarr_free (str->runoff);
2935   return Lstream_close (str->other_end);
2936 }
2937
2938 Lisp_Object
2939 encoding_stream_coding_system (Lstream *stream)
2940 {
2941   Lisp_Object coding_system;
2942   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2943
2944   XSETCODING_SYSTEM (coding_system, str->codesys);
2945   return coding_system;
2946 }
2947
2948 void
2949 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2950 {
2951   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2952   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2953   str->codesys = cs;
2954   reset_encoding_stream (str);
2955 }
2956
2957 static Lisp_Object
2958 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2959                         const char *mode)
2960 {
2961   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2962   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2963   Lisp_Object obj;
2964
2965   xzero (*str);
2966   str->runoff = Dynarr_new (unsigned_char);
2967   str->other_end = stream;
2968   set_encoding_stream_coding_system (lstr, codesys);
2969   XSETLSTREAM (obj, lstr);
2970   return obj;
2971 }
2972
2973 Lisp_Object
2974 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2975 {
2976   return make_encoding_stream_1 (stream, codesys, "r");
2977 }
2978
2979 Lisp_Object
2980 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2981 {
2982   return make_encoding_stream_1 (stream, codesys, "w");
2983 }
2984
2985 /* Convert N bytes of internally-formatted data stored in SRC to an
2986    external format, according to the encoding stream ENCODING.
2987    Store the encoded data into DST. */
2988
2989 static void
2990 mule_encode (Lstream *encoding, const unsigned char *src,
2991              unsigned_char_dynarr *dst, unsigned int n)
2992 {
2993   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2994
2995   switch (CODING_SYSTEM_TYPE (str->codesys))
2996     {
2997 #ifdef DEBUG_XEMACS
2998     case CODESYS_INTERNAL:
2999       Dynarr_add_many (dst, src, n);
3000       break;
3001 #endif
3002     case CODESYS_AUTODETECT:
3003       /* If we got this far and still haven't decided on the coding
3004          system, then do no conversion. */
3005     case CODESYS_NO_CONVERSION:
3006       encode_coding_no_conversion (encoding, src, dst, n);
3007       break;
3008 #ifdef MULE
3009     case CODESYS_BIG5:
3010       encode_coding_big5 (encoding, src, dst, n);
3011       break;
3012     case CODESYS_CCL:
3013       str->ccl.last_block = str->flags & CODING_STATE_END;
3014       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
3015       break;
3016 #endif /* MULE */
3017     default:
3018       text_encode_generic (encoding, src, dst, n);
3019     }
3020 }
3021
3022 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3023 Encode the text between START and END using CODING-SYSTEM.
3024 This will, for example, convert Japanese characters into stuff such as
3025 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3026 text.  BUFFER defaults to the current buffer if unspecified.
3027 */
3028        (start, end, coding_system, buffer))
3029 {
3030   Bufpos b, e;
3031   struct buffer *buf = decode_buffer (buffer, 0);
3032   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3033   Lstream *istr, *ostr;
3034   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3035
3036   get_buffer_range_char (buf, start, end, &b, &e, 0);
3037
3038   barf_if_buffer_read_only (buf, b, e);
3039
3040   coding_system = Fget_coding_system (coding_system);
3041   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3042   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3043   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3044                                               Fget_coding_system (Qbinary));
3045   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3046                                            coding_system);
3047   istr = XLSTREAM (instream);
3048   ostr = XLSTREAM (outstream);
3049   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3050   /* The chain of streams looks like this:
3051
3052      [BUFFER] <----- send through
3053                      ------> [ENCODE AS SPECIFIED]
3054                              ------> [DECODE AS BINARY]
3055                                      ------> [BUFFER]
3056    */
3057   while (1)
3058     {
3059       char tempbuf[1024]; /* some random amount */
3060       Bufpos newpos, even_newer_pos;
3061       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3062       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3063
3064       if (!size_in_bytes)
3065         break;
3066       newpos = lisp_buffer_stream_startpos (istr);
3067       Lstream_write (ostr, tempbuf, size_in_bytes);
3068       even_newer_pos = lisp_buffer_stream_startpos (istr);
3069       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3070                            even_newer_pos, 0);
3071     }
3072
3073   {
3074     Charcount retlen =
3075       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3076     Lstream_close (istr);
3077     Lstream_close (ostr);
3078     UNGCPRO;
3079     Lstream_delete (istr);
3080     Lstream_delete (ostr);
3081     Lstream_delete (XLSTREAM (de_outstream));
3082     Lstream_delete (XLSTREAM (lb_outstream));
3083     return make_int (retlen);
3084   }
3085 }
3086
3087 #ifdef MULE
3088 \f
3089 static void
3090 text_encode_generic (Lstream *encoding, const unsigned char *src,
3091                      unsigned_char_dynarr *dst, unsigned int n)
3092 {
3093   unsigned char c;
3094   unsigned char char_boundary;
3095   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3096   unsigned int flags          = str->flags;
3097   Emchar ch                   = str->ch;
3098
3099   char_boundary = str->iso2022.current_char_boundary;
3100
3101   while (n--)
3102     {
3103       c = *src++;
3104
3105       if (char_boundary == 0)
3106         {
3107           if (c >= 0xfc)
3108             {
3109               ch = c & 0x01;
3110               char_boundary = 5;
3111             }
3112           else if (c >= 0xf8)
3113             {
3114               ch = c & 0x03;
3115               char_boundary = 4;
3116             }
3117           else if (c >= 0xf0)
3118             {
3119               ch = c & 0x07;
3120               char_boundary = 3;
3121             }
3122           else if (c >= 0xe0)
3123             {
3124               ch = c & 0x0f;
3125               char_boundary = 2;
3126             }
3127           else if (c >= 0xc0)
3128             {
3129               ch = c & 0x1f;
3130               char_boundary = 1;
3131             }
3132           else
3133             (*str->encode_char) (str, c, dst, &flags);
3134         }
3135       else if (char_boundary == 1)
3136         {
3137           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3138           ch =0;
3139           char_boundary = 0;
3140         }
3141       else
3142         {
3143           ch = (ch << 6) | (c & 0x3f);
3144           char_boundary--;
3145         }
3146     }
3147
3148   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3149     {
3150       (*str->finish) (str, dst, &flags);
3151     }
3152
3153   str->flags = flags;
3154   str->ch    = ch;
3155   str->iso2022.current_char_boundary = char_boundary;
3156 }
3157
3158 \f
3159 /************************************************************************/
3160 /*                          Shift-JIS methods                           */
3161 /************************************************************************/
3162
3163 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3164    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3165    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3166    encoded by "position-code + 0x80".  A character of JISX0208
3167    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3168    position-codes are divided and shifted so that it fit in the range
3169    below.
3170
3171    --- CODE RANGE of Shift-JIS ---
3172    (character set)      (range)
3173    ASCII                0x00 .. 0x7F
3174    JISX0201-Kana        0xA0 .. 0xDF
3175    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3176             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3177    -------------------------------
3178
3179 */
3180
3181 /* Is this the first byte of a Shift-JIS two-byte char? */
3182
3183 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3184   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3185
3186 /* Is this the second byte of a Shift-JIS two-byte char? */
3187
3188 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3189   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3190
3191 #define BYTE_SJIS_KATAKANA_P(c) \
3192   ((c) >= 0xA1 && (c) <= 0xDF)
3193
3194 static int
3195 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3196                     unsigned int n)
3197 {
3198   int c;
3199
3200   while (n--)
3201     {
3202       c = *src++;
3203       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3204         return 0;
3205       if (st->shift_jis.in_second_byte)
3206         {
3207           st->shift_jis.in_second_byte = 0;
3208           if (c < 0x40)
3209             return 0;
3210         }
3211       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3212         st->shift_jis.in_second_byte = 1;
3213     }
3214   return CODING_CATEGORY_SHIFT_JIS_MASK;
3215 }
3216
3217 /* Convert Shift-JIS data to internal format. */
3218
3219 static void
3220 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3221                     unsigned_char_dynarr *dst, unsigned int n)
3222 {
3223   unsigned char c;
3224   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3225   unsigned int flags  = str->flags;
3226   unsigned int cpos   = str->cpos;
3227   eol_type_t eol_type = str->eol_type;
3228
3229   while (n--)
3230     {
3231       c = *src++;
3232
3233       if (cpos)
3234         {
3235           /* Previous character was first byte of Shift-JIS Kanji char. */
3236           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3237             {
3238               unsigned char e1, e2;
3239
3240               DECODE_SJIS (cpos, c, e1, e2);
3241 #ifdef UTF2000
3242               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3243                                             e1 & 0x7F,
3244                                             e2 & 0x7F), dst);
3245 #else
3246               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3247               Dynarr_add (dst, e1);
3248               Dynarr_add (dst, e2);
3249 #endif
3250             }
3251           else
3252             {
3253               DECODE_ADD_BINARY_CHAR (cpos, dst);
3254               DECODE_ADD_BINARY_CHAR (c, dst);
3255             }
3256           cpos = 0;
3257         }
3258       else
3259         {
3260           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3261           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3262             cpos = c;
3263           else if (BYTE_SJIS_KATAKANA_P (c))
3264             {
3265 #ifdef UTF2000
3266               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3267                                             c & 0x7F, 0), dst);
3268 #else
3269               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3270               Dynarr_add (dst, c);
3271 #endif
3272             }
3273 #ifdef UTF2000
3274           else if (c > 32)
3275             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3276                                           c, 0), dst);
3277 #endif
3278           else
3279             DECODE_ADD_BINARY_CHAR (c, dst);
3280         }
3281     label_continue_loop:;
3282     }
3283
3284   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3285
3286   str->flags = flags;
3287   str->cpos  = cpos;
3288 }
3289
3290 /* Convert internal character representation to Shift_JIS. */
3291
3292 void
3293 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3294                        unsigned_char_dynarr *dst, unsigned int *flags)
3295 {
3296   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3297
3298   if (ch == '\n')
3299     {
3300       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3301         Dynarr_add (dst, '\r');
3302       if (eol_type != EOL_CR)
3303         Dynarr_add (dst, ch);
3304     }
3305   else
3306     {
3307       Lisp_Object charset;
3308       unsigned int c1, c2, s1, s2;
3309 #ifdef UTF2000
3310       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3311
3312       if (code_point >= 0)
3313         {
3314           charset = Vcharset_latin_jisx0201;
3315           c1 = code_point;
3316           c2 = 0;
3317         }
3318       else
3319 #endif
3320         BREAKUP_CHAR (ch, charset, c1, c2);
3321
3322       if (EQ(charset, Vcharset_katakana_jisx0201))
3323         {
3324           Dynarr_add (dst, c1 | 0x80);
3325         }
3326       else if (c2 == 0)
3327         {
3328           Dynarr_add (dst, c1);
3329         }
3330       else if (EQ(charset, Vcharset_japanese_jisx0208))
3331         {
3332           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3333           Dynarr_add (dst, s1);
3334           Dynarr_add (dst, s2);
3335         }
3336       else
3337         Dynarr_add (dst, '?');
3338     }
3339 }
3340
3341 void
3342 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3343                        unsigned int *flags)
3344 {
3345 }
3346
3347 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3348 Decode a JISX0208 character of Shift-JIS coding-system.
3349 CODE is the character code in Shift-JIS as a cons of type bytes.
3350 Return the corresponding character.
3351 */
3352        (code))
3353 {
3354   unsigned char c1, c2, s1, s2;
3355
3356   CHECK_CONS (code);
3357   CHECK_INT (XCAR (code));
3358   CHECK_INT (XCDR (code));
3359   s1 = XINT (XCAR (code));
3360   s2 = XINT (XCDR (code));
3361   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3362       BYTE_SJIS_TWO_BYTE_2_P (s2))
3363     {
3364       DECODE_SJIS (s1, s2, c1, c2);
3365       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3366                                    c1 & 0x7F, c2 & 0x7F));
3367     }
3368   else
3369     return Qnil;
3370 }
3371
3372 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3373 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3374 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3375 */
3376        (ch))
3377 {
3378   Lisp_Object charset;
3379   int c1, c2, s1, s2;
3380
3381   CHECK_CHAR_COERCE_INT (ch);
3382   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3383   if (EQ (charset, Vcharset_japanese_jisx0208))
3384     {
3385       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3386       return Fcons (make_int (s1), make_int (s2));
3387     }
3388   else
3389     return Qnil;
3390 }
3391
3392 \f
3393 /************************************************************************/
3394 /*                            Big5 methods                              */
3395 /************************************************************************/
3396
3397 /* BIG5 is a coding system encoding two character sets: ASCII and
3398    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3399    character set and is encoded in two-byte.
3400
3401    --- CODE RANGE of BIG5 ---
3402    (character set)      (range)
3403    ASCII                0x00 .. 0x7F
3404    Big5 (1st byte)      0xA1 .. 0xFE
3405         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3406    --------------------------
3407
3408    Since the number of characters in Big5 is larger than maximum
3409    characters in Emacs' charset (96x96), it can't be handled as one
3410    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3411    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3412    contains frequently used characters and the latter contains less
3413    frequently used characters.  */
3414
3415 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3416   ((c) >= 0xA1 && (c) <= 0xFE)
3417
3418 /* Is this the second byte of a Shift-JIS two-byte char? */
3419
3420 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3421   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3422
3423 /* Number of Big5 characters which have the same code in 1st byte.  */
3424
3425 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3426
3427 /* Code conversion macros.  These are macros because they are used in
3428    inner loops during code conversion.
3429
3430    Note that temporary variables in macros introduce the classic
3431    dynamic-scoping problems with variable names.  We use capital-
3432    lettered variables in the assumption that XEmacs does not use
3433    capital letters in variables except in a very formalized way
3434    (e.g. Qstring). */
3435
3436 /* Convert Big5 code (b1, b2) into its internal string representation
3437    (lb, c1, c2). */
3438
3439 /* There is a much simpler way to split the Big5 charset into two.
3440    For the moment I'm going to leave the algorithm as-is because it
3441    claims to separate out the most-used characters into a single
3442    charset, which perhaps will lead to optimizations in various
3443    places.
3444
3445    The way the algorithm works is something like this:
3446
3447    Big5 can be viewed as a 94x157 charset, where the row is
3448    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3449    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3450    the split between low and high column numbers is apparently
3451    meaningless; ascending rows produce less and less frequent chars.
3452    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3453    the first charset, and the upper half (0xC9 .. 0xFE) to the
3454    second.  To do the conversion, we convert the character into
3455    a single number where 0 .. 156 is the first row, 157 .. 313
3456    is the second, etc.  That way, the characters are ordered by
3457    decreasing frequency.  Then we just chop the space in two
3458    and coerce the result into a 94x94 space.
3459    */
3460
3461 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3462 {                                                                       \
3463   int B1 = b1, B2 = b2;                                                 \
3464   unsigned int I                                                        \
3465     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3466                                                                         \
3467   if (B1 < 0xC9)                                                        \
3468     {                                                                   \
3469       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3470     }                                                                   \
3471   else                                                                  \
3472     {                                                                   \
3473       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3474       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3475     }                                                                   \
3476   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3477   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3478 } while (0)
3479
3480 /* Convert the internal string representation of a Big5 character
3481    (lb, c1, c2) into Big5 code (b1, b2). */
3482
3483 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3484 {                                                                       \
3485   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3486                                                                         \
3487   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3488     {                                                                   \
3489       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3490     }                                                                   \
3491   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3492   b2 = I % BIG5_SAME_ROW;                                               \
3493   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3494 } while (0)
3495
3496 static int
3497 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3498                     unsigned int n)
3499 {
3500   int c;
3501
3502   while (n--)
3503     {
3504       c = *src++;
3505       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3506           (c >= 0x80 && c <= 0xA0))
3507         return 0;
3508       if (st->big5.in_second_byte)
3509         {
3510           st->big5.in_second_byte = 0;
3511           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3512             return 0;
3513         }
3514       else if (c >= 0xA1)
3515         st->big5.in_second_byte = 1;
3516     }
3517   return CODING_CATEGORY_BIG5_MASK;
3518 }
3519
3520 /* Convert Big5 data to internal format. */
3521
3522 static void
3523 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3524                     unsigned_char_dynarr *dst, unsigned int n)
3525 {
3526   unsigned char c;
3527   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3528   unsigned int flags  = str->flags;
3529   unsigned int cpos   = str->cpos;
3530   eol_type_t eol_type = str->eol_type;
3531
3532   while (n--)
3533     {
3534       c = *src++;
3535       if (cpos)
3536         {
3537           /* Previous character was first byte of Big5 char. */
3538           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3539             {
3540 #ifdef UTF2000
3541               Charset_ID b1;
3542               unsigned char b2, b3;
3543               DECODE_BIG5 (cpos, c, b1, b2, b3);
3544               DECODE_ADD_UCS_CHAR (MAKE_CHAR (CHARSET_BY_LEADING_BYTE (b1),
3545                                               b2 & 0x7F, b3 & 0x7F),
3546                                    dst);
3547 #else
3548               unsigned char b1, b2, b3;
3549               DECODE_BIG5 (cpos, c, b1, b2, b3);
3550               Dynarr_add (dst, b1);
3551               Dynarr_add (dst, b2);
3552               Dynarr_add (dst, b3);
3553 #endif
3554             }
3555           else
3556             {
3557               DECODE_ADD_BINARY_CHAR (cpos, dst);
3558               DECODE_ADD_BINARY_CHAR (c, dst);
3559             }
3560           cpos = 0;
3561         }
3562       else
3563         {
3564           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3565           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3566             cpos = c;
3567           else
3568             DECODE_ADD_BINARY_CHAR (c, dst);
3569         }
3570     label_continue_loop:;
3571     }
3572
3573   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3574
3575   str->flags = flags;
3576   str->cpos  = cpos;
3577 }
3578
3579 /* Convert internally-formatted data to Big5. */
3580
3581 static void
3582 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3583                     unsigned_char_dynarr *dst, unsigned int n)
3584 {
3585 #ifndef UTF2000
3586   unsigned char c;
3587   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3588   unsigned int flags  = str->flags;
3589   unsigned int ch     = str->ch;
3590   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3591
3592   while (n--)
3593     {
3594       c = *src++;
3595       if (c == '\n')
3596         {
3597           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3598             Dynarr_add (dst, '\r');
3599           if (eol_type != EOL_CR)
3600             Dynarr_add (dst, '\n');
3601         }
3602       else if (BYTE_ASCII_P (c))
3603         {
3604           /* ASCII. */
3605           Dynarr_add (dst, c);
3606         }
3607       else if (BUFBYTE_LEADING_BYTE_P (c))
3608         {
3609           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3610               c == LEADING_BYTE_CHINESE_BIG5_2)
3611             {
3612               /* A recognized leading byte. */
3613               ch = c;
3614               continue; /* not done with this character. */
3615             }
3616           /* otherwise just ignore this character. */
3617         }
3618       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3619                ch == LEADING_BYTE_CHINESE_BIG5_2)
3620         {
3621           /* Previous char was a recognized leading byte. */
3622           ch = (ch << 8) | c;
3623           continue; /* not done with this character. */
3624         }
3625       else if (ch)
3626         {
3627           /* Encountering second byte of a Big5 character. */
3628           unsigned char b1, b2;
3629
3630           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3631           Dynarr_add (dst, b1);
3632           Dynarr_add (dst, b2);
3633         }
3634
3635       ch = 0;
3636     }
3637
3638   str->flags = flags;
3639   str->ch    = ch;
3640 #endif
3641 }
3642
3643
3644 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3645 Decode a Big5 character CODE of BIG5 coding-system.
3646 CODE is the character code in BIG5, a cons of two integers.
3647 Return the corresponding character.
3648 */
3649        (code))
3650 {
3651   unsigned char c1, c2, b1, b2;
3652
3653   CHECK_CONS (code);
3654   CHECK_INT (XCAR (code));
3655   CHECK_INT (XCDR (code));
3656   b1 = XINT (XCAR (code));
3657   b2 = XINT (XCDR (code));
3658   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3659       BYTE_BIG5_TWO_BYTE_2_P (b2))
3660     {
3661       Charset_ID leading_byte;
3662       Lisp_Object charset;
3663       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3664       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3665       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3666     }
3667   else
3668     return Qnil;
3669 }
3670
3671 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3672 Encode the Big5 character CH to BIG5 coding-system.
3673 Return the corresponding character code in Big5.
3674 */
3675        (ch))
3676 {
3677   Lisp_Object charset;
3678   int c1, c2, b1, b2;
3679
3680   CHECK_CHAR_COERCE_INT (ch);
3681   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3682   if (EQ (charset, Vcharset_chinese_big5_1) ||
3683       EQ (charset, Vcharset_chinese_big5_2))
3684     {
3685       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3686                    b1, b2);
3687       return Fcons (make_int (b1), make_int (b2));
3688     }
3689   else
3690     return Qnil;
3691 }
3692
3693 \f
3694 /************************************************************************/
3695 /*                           UCS-4 methods                              */
3696 /************************************************************************/
3697
3698 static int
3699 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3700                     unsigned int n)
3701 {
3702   while (n--)
3703     {
3704       int c = *src++;
3705       switch (st->ucs4.in_byte)
3706         {
3707         case 0:
3708           if (c >= 128)
3709             return 0;
3710           else
3711             st->ucs4.in_byte++;
3712           break;
3713         case 3:
3714           st->ucs4.in_byte = 0;
3715           break;
3716         default:
3717           st->ucs4.in_byte++;
3718         }
3719     }
3720   return CODING_CATEGORY_UCS4_MASK;
3721 }
3722
3723 static void
3724 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3725                     unsigned_char_dynarr *dst, unsigned int n)
3726 {
3727   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3728   unsigned int flags = str->flags;
3729   unsigned int cpos  = str->cpos;
3730   unsigned char counter = str->counter;
3731
3732   while (n--)
3733     {
3734       unsigned char c = *src++;
3735       switch (counter)
3736         {
3737         case 0:
3738           cpos = c;
3739           counter = 3;
3740           break;
3741         case 1:
3742           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3743           cpos = 0;
3744           counter = 0;
3745           break;
3746         default:
3747           cpos = ( cpos << 8 ) | c;
3748           counter--;
3749         }
3750     }
3751   if (counter & CODING_STATE_END)
3752     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3753
3754   str->flags    = flags;
3755   str->cpos     = cpos;
3756   str->counter  = counter;
3757 }
3758
3759 void
3760 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3761                   unsigned_char_dynarr *dst, unsigned int *flags)
3762 {
3763   Dynarr_add (dst, ch >> 24);
3764   Dynarr_add (dst, ch >> 16);
3765   Dynarr_add (dst, ch >>  8);
3766   Dynarr_add (dst, ch      );
3767 }
3768
3769 void
3770 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3771                   unsigned int *flags)
3772 {
3773 }
3774
3775 \f
3776 /************************************************************************/
3777 /*                           UTF-8 methods                              */
3778 /************************************************************************/
3779
3780 static int
3781 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3782                     unsigned int n)
3783 {
3784   while (n--)
3785     {
3786       unsigned char c = *src++;
3787       switch (st->utf8.in_byte)
3788         {
3789         case 0:
3790           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3791             return 0;
3792           else if (c >= 0xfc)
3793             st->utf8.in_byte = 5;
3794           else if (c >= 0xf8)
3795             st->utf8.in_byte = 4;
3796           else if (c >= 0xf0)
3797             st->utf8.in_byte = 3;
3798           else if (c >= 0xe0)
3799             st->utf8.in_byte = 2;
3800           else if (c >= 0xc0)
3801             st->utf8.in_byte = 1;
3802           else if (c >= 0x80)
3803             return 0;
3804           break;
3805         default:
3806           if ((c & 0xc0) != 0x80)
3807             return 0;
3808           else
3809             st->utf8.in_byte--;
3810         }
3811     }
3812   return CODING_CATEGORY_UTF8_MASK;
3813 }
3814
3815 static void
3816 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3817                     unsigned_char_dynarr *dst, unsigned int n)
3818 {
3819   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3820   unsigned int flags    = str->flags;
3821   unsigned int cpos     = str->cpos;
3822   eol_type_t eol_type   = str->eol_type;
3823   unsigned char counter = str->counter;
3824
3825   while (n--)
3826     {
3827       unsigned char c = *src++;
3828       switch (counter)
3829         {
3830         case 0:
3831           if ( c >= 0xfc )
3832             {
3833               cpos = c & 0x01;
3834               counter = 5;
3835             }
3836           else if ( c >= 0xf8 )
3837             {
3838               cpos = c & 0x03;
3839               counter = 4;
3840             }
3841           else if ( c >= 0xf0 )
3842             {
3843               cpos = c & 0x07;
3844               counter = 3;
3845             }
3846           else if ( c >= 0xe0 )
3847             {
3848               cpos = c & 0x0f;
3849               counter = 2;
3850             }
3851           else if ( c >= 0xc0 )
3852             {
3853               cpos = c & 0x1f;
3854               counter = 1;
3855             }
3856           else
3857             {
3858               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3859               DECODE_ADD_UCS_CHAR (c, dst);
3860             }
3861           break;
3862         case 1:
3863           cpos = ( cpos << 6 ) | ( c & 0x3f );
3864           DECODE_ADD_UCS_CHAR (cpos, dst);
3865           cpos = 0;
3866           counter = 0;
3867           break;
3868         default:
3869           cpos = ( cpos << 6 ) | ( c & 0x3f );
3870           counter--;
3871         }
3872     label_continue_loop:;
3873     }
3874
3875   if (flags & CODING_STATE_END)
3876     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3877
3878   str->flags    = flags;
3879   str->cpos     = cpos;
3880   str->counter  = counter;
3881 }
3882
3883 void
3884 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3885                   unsigned_char_dynarr *dst, unsigned int *flags)
3886 {
3887   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3888
3889   if (ch == '\n')
3890     {
3891       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3892         Dynarr_add (dst, '\r');
3893       if (eol_type != EOL_CR)
3894         Dynarr_add (dst, ch);
3895     }
3896   else if (ch <= 0x7f)
3897     {
3898       Dynarr_add (dst, ch);
3899     }
3900   else if (ch <= 0x7ff)
3901     {
3902       Dynarr_add (dst, (ch >> 6) | 0xc0);
3903       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3904     }
3905   else if (ch <= 0xffff)
3906     {
3907       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3908       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3909       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3910     }
3911   else if (ch <= 0x1fffff)
3912     {
3913       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3914       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3915       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3916       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3917     }
3918   else if (ch <= 0x3ffffff)
3919     {
3920       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3921       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3922       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3923       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3924       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3925     }
3926   else
3927     {
3928       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3929       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3930       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3931       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3932       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3933       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3934     }
3935 }
3936
3937 void
3938 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3939                   unsigned int *flags)
3940 {
3941 }
3942
3943 \f
3944 /************************************************************************/
3945 /*                           ISO2022 methods                            */
3946 /************************************************************************/
3947
3948 /* The following note describes the coding system ISO2022 briefly.
3949    Since the intention of this note is to help understand the
3950    functions in this file, some parts are NOT ACCURATE or OVERLY
3951    SIMPLIFIED.  For thorough understanding, please refer to the
3952    original document of ISO2022.
3953
3954    ISO2022 provides many mechanisms to encode several character sets
3955    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3956    is encoded using bytes less than 128.  This may make the encoded
3957    text a little bit longer, but the text passes more easily through
3958    several gateways, some of which strip off MSB (Most Signigant Bit).
3959
3960    There are two kinds of character sets: control character set and
3961    graphic character set.  The former contains control characters such
3962    as `newline' and `escape' to provide control functions (control
3963    functions are also provided by escape sequences).  The latter
3964    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3965    two control character sets and many graphic character sets.
3966
3967    Graphic character sets are classified into one of the following
3968    four classes, according to the number of bytes (DIMENSION) and
3969    number of characters in one dimension (CHARS) of the set:
3970    - DIMENSION1_CHARS94
3971    - DIMENSION1_CHARS96
3972    - DIMENSION2_CHARS94
3973    - DIMENSION2_CHARS96
3974
3975    In addition, each character set is assigned an identification tag,
3976    unique for each set, called "final character" (denoted as <F>
3977    hereafter).  The <F> of each character set is decided by ECMA(*)
3978    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3979    (0x30..0x3F are for private use only).
3980
3981    Note (*): ECMA = European Computer Manufacturers Association
3982
3983    Here are examples of graphic character set [NAME(<F>)]:
3984         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3985         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3986         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3987         o DIMENSION2_CHARS96 -- none for the moment
3988
3989    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3990         C0 [0x00..0x1F] -- control character plane 0
3991         GL [0x20..0x7F] -- graphic character plane 0
3992         C1 [0x80..0x9F] -- control character plane 1
3993         GR [0xA0..0xFF] -- graphic character plane 1
3994
3995    A control character set is directly designated and invoked to C0 or
3996    C1 by an escape sequence.  The most common case is that:
3997    - ISO646's  control character set is designated/invoked to C0, and
3998    - ISO6429's control character set is designated/invoked to C1,
3999    and usually these designations/invocations are omitted in encoded
4000    text.  In a 7-bit environment, only C0 can be used, and a control
4001    character for C1 is encoded by an appropriate escape sequence to
4002    fit into the environment.  All control characters for C1 are
4003    defined to have corresponding escape sequences.
4004
4005    A graphic character set is at first designated to one of four
4006    graphic registers (G0 through G3), then these graphic registers are
4007    invoked to GL or GR.  These designations and invocations can be
4008    done independently.  The most common case is that G0 is invoked to
4009    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4010    these invocations and designations are omitted in encoded text.
4011    In a 7-bit environment, only GL can be used.
4012
4013    When a graphic character set of CHARS94 is invoked to GL, codes
4014    0x20 and 0x7F of the GL area work as control characters SPACE and
4015    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4016    be used.
4017
4018    There are two ways of invocation: locking-shift and single-shift.
4019    With locking-shift, the invocation lasts until the next different
4020    invocation, whereas with single-shift, the invocation affects the
4021    following character only and doesn't affect the locking-shift
4022    state.  Invocations are done by the following control characters or
4023    escape sequences:
4024
4025    ----------------------------------------------------------------------
4026    abbrev  function                  cntrl escape seq   description
4027    ----------------------------------------------------------------------
4028    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4029    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4030    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4031    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4032    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4033    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4034    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4035    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4036    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4037    ----------------------------------------------------------------------
4038    (*) These are not used by any known coding system.
4039
4040    Control characters for these functions are defined by macros
4041    ISO_CODE_XXX in `coding.h'.
4042
4043    Designations are done by the following escape sequences:
4044    ----------------------------------------------------------------------
4045    escape sequence      description
4046    ----------------------------------------------------------------------
4047    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4048    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4049    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4050    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4051    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4052    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4053    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4054    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4055    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4056    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4057    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4058    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4059    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4060    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4061    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4062    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4063    ----------------------------------------------------------------------
4064
4065    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4066    of dimension 1, chars 94, and final character <F>, etc...
4067
4068    Note (*): Although these designations are not allowed in ISO2022,
4069    Emacs accepts them on decoding, and produces them on encoding
4070    CHARS96 character sets in a coding system which is characterized as
4071    7-bit environment, non-locking-shift, and non-single-shift.
4072
4073    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4074    '(' can be omitted.  We refer to this as "short-form" hereafter.
4075
4076    Now you may notice that there are a lot of ways for encoding the
4077    same multilingual text in ISO2022.  Actually, there exist many
4078    coding systems such as Compound Text (used in X11's inter client
4079    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4080    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4081    localized platforms), and all of these are variants of ISO2022.
4082
4083    In addition to the above, Emacs handles two more kinds of escape
4084    sequences: ISO6429's direction specification and Emacs' private
4085    sequence for specifying character composition.
4086
4087    ISO6429's direction specification takes the following form:
4088         o CSI ']'      -- end of the current direction
4089         o CSI '0' ']'  -- end of the current direction
4090         o CSI '1' ']'  -- start of left-to-right text
4091         o CSI '2' ']'  -- start of right-to-left text
4092    The control character CSI (0x9B: control sequence introducer) is
4093    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4094
4095    Character composition specification takes the following form:
4096         o ESC '0' -- start character composition
4097         o ESC '1' -- end character composition
4098    Since these are not standard escape sequences of any ISO standard,
4099    their use with these meanings is restricted to Emacs only.  */
4100
4101 static void
4102 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4103 {
4104   int i;
4105
4106   for (i = 0; i < 4; i++)
4107     {
4108       if (!NILP (coding_system))
4109         iso->charset[i] =
4110           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4111       else
4112         iso->charset[i] = Qt;
4113       iso->invalid_designated[i] = 0;
4114     }
4115   iso->esc = ISO_ESC_NOTHING;
4116   iso->esc_bytes_index = 0;
4117   iso->register_left = 0;
4118   iso->register_right = 1;
4119   iso->switched_dir_and_no_valid_charset_yet = 0;
4120   iso->invalid_switch_dir = 0;
4121   iso->output_direction_sequence = 0;
4122   iso->output_literally = 0;
4123 #ifdef ENABLE_COMPOSITE_CHARS
4124   if (iso->composite_chars)
4125     Dynarr_reset (iso->composite_chars);
4126 #endif
4127 }
4128
4129 static int
4130 fit_to_be_escape_quoted (unsigned char c)
4131 {
4132   switch (c)
4133     {
4134     case ISO_CODE_ESC:
4135     case ISO_CODE_CSI:
4136     case ISO_CODE_SS2:
4137     case ISO_CODE_SS3:
4138     case ISO_CODE_SO:
4139     case ISO_CODE_SI:
4140       return 1;
4141
4142     default:
4143       return 0;
4144     }
4145 }
4146
4147 /* Parse one byte of an ISO2022 escape sequence.
4148    If the result is an invalid escape sequence, return 0 and
4149    do not change anything in STR.  Otherwise, if the result is
4150    an incomplete escape sequence, update ISO2022.ESC and
4151    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4152    all the state variables (but not ISO2022.ESC_BYTES) and
4153    return 1.
4154
4155    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4156    or invocation of an invalid character set and treat that as
4157    an unrecognized escape sequence. */
4158
4159 static int
4160 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4161                    unsigned char c, unsigned int *flags,
4162                    int check_invalid_charsets)
4163 {
4164   /* (1) If we're at the end of a designation sequence, CS is the
4165      charset being designated and REG is the register to designate
4166      it to.
4167
4168      (2) If we're at the end of a locking-shift sequence, REG is
4169      the register to invoke and HALF (0 == left, 1 == right) is
4170      the half to invoke it into.
4171
4172      (3) If we're at the end of a single-shift sequence, REG is
4173      the register to invoke. */
4174   Lisp_Object cs = Qnil;
4175   int reg, half;
4176
4177   /* NOTE: This code does goto's all over the fucking place.
4178      The reason for this is that we're basically implementing
4179      a state machine here, and hierarchical languages like C
4180      don't really provide a clean way of doing this. */
4181
4182   if (! (*flags & CODING_STATE_ESCAPE))
4183     /* At beginning of escape sequence; we need to reset our
4184        escape-state variables. */
4185     iso->esc = ISO_ESC_NOTHING;
4186
4187   iso->output_literally = 0;
4188   iso->output_direction_sequence = 0;
4189
4190   switch (iso->esc)
4191     {
4192     case ISO_ESC_NOTHING:
4193       iso->esc_bytes_index = 0;
4194       switch (c)
4195         {
4196         case ISO_CODE_ESC:      /* Start escape sequence */
4197           *flags |= CODING_STATE_ESCAPE;
4198           iso->esc = ISO_ESC;
4199           goto not_done;
4200
4201         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4202           *flags |= CODING_STATE_ESCAPE;
4203           iso->esc = ISO_ESC_5_11;
4204           goto not_done;
4205
4206         case ISO_CODE_SO:       /* locking shift 1 */
4207           reg = 1; half = 0;
4208           goto locking_shift;
4209         case ISO_CODE_SI:       /* locking shift 0 */
4210           reg = 0; half = 0;
4211           goto locking_shift;
4212
4213         case ISO_CODE_SS2:      /* single shift */
4214           reg = 2;
4215           goto single_shift;
4216         case ISO_CODE_SS3:      /* single shift */
4217           reg = 3;
4218           goto single_shift;
4219
4220         default:                        /* Other control characters */
4221           return 0;
4222         }
4223
4224     case ISO_ESC:
4225       switch (c)
4226         {
4227           /**** single shift ****/
4228
4229         case 'N':       /* single shift 2 */
4230           reg = 2;
4231           goto single_shift;
4232         case 'O':       /* single shift 3 */
4233           reg = 3;
4234           goto single_shift;
4235
4236           /**** locking shift ****/
4237
4238         case '~':       /* locking shift 1 right */
4239           reg = 1; half = 1;
4240           goto locking_shift;
4241         case 'n':       /* locking shift 2 */
4242           reg = 2; half = 0;
4243           goto locking_shift;
4244         case '}':       /* locking shift 2 right */
4245           reg = 2; half = 1;
4246           goto locking_shift;
4247         case 'o':       /* locking shift 3 */
4248           reg = 3; half = 0;
4249           goto locking_shift;
4250         case '|':       /* locking shift 3 right */
4251           reg = 3; half = 1;
4252           goto locking_shift;
4253
4254 #ifdef ENABLE_COMPOSITE_CHARS
4255           /**** composite ****/
4256
4257         case '0':
4258           iso->esc = ISO_ESC_START_COMPOSITE;
4259           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4260             CODING_STATE_COMPOSITE;
4261           return 1;
4262
4263         case '1':
4264           iso->esc = ISO_ESC_END_COMPOSITE;
4265           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4266             ~CODING_STATE_COMPOSITE;
4267           return 1;
4268 #endif /* ENABLE_COMPOSITE_CHARS */
4269
4270           /**** directionality ****/
4271
4272         case '[':
4273           iso->esc = ISO_ESC_5_11;
4274           goto not_done;
4275
4276           /**** designation ****/
4277
4278         case '$':       /* multibyte charset prefix */
4279           iso->esc = ISO_ESC_2_4;
4280           goto not_done;
4281
4282         default:
4283           if (0x28 <= c && c <= 0x2F)
4284             {
4285               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4286               goto not_done;
4287             }
4288
4289           /* This function is called with CODESYS equal to nil when
4290              doing coding-system detection. */
4291           if (!NILP (codesys)
4292               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4293               && fit_to_be_escape_quoted (c))
4294             {
4295               iso->esc = ISO_ESC_LITERAL;
4296               *flags &= CODING_STATE_ISO2022_LOCK;
4297               return 1;
4298             }
4299
4300           /* bzzzt! */
4301           return 0;
4302         }
4303
4304
4305
4306       /**** directionality ****/
4307
4308     case ISO_ESC_5_11:          /* ISO6429 direction control */
4309       if (c == ']')
4310         {
4311           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4312           goto directionality;
4313         }
4314       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4315       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4316       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4317       else               return 0;
4318       goto not_done;
4319
4320     case ISO_ESC_5_11_0:
4321       if (c == ']')
4322         {
4323           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4324           goto directionality;
4325         }
4326       return 0;
4327
4328     case ISO_ESC_5_11_1:
4329       if (c == ']')
4330         {
4331           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4332           goto directionality;
4333         }
4334       return 0;
4335
4336     case ISO_ESC_5_11_2:
4337       if (c == ']')
4338         {
4339           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4340           goto directionality;
4341         }
4342       return 0;
4343
4344     directionality:
4345       iso->esc = ISO_ESC_DIRECTIONALITY;
4346       /* Various junk here to attempt to preserve the direction sequences
4347          literally in the text if they would otherwise be swallowed due
4348          to invalid designations that don't show up as actual charset
4349          changes in the text. */
4350       if (iso->invalid_switch_dir)
4351         {
4352           /* We already inserted a direction switch literally into the
4353              text.  We assume (#### this may not be right) that the
4354              next direction switch is the one going the other way,
4355              and we need to output that literally as well. */
4356           iso->output_literally = 1;
4357           iso->invalid_switch_dir = 0;
4358         }
4359       else
4360         {
4361           int jj;
4362
4363           /* If we are in the thrall of an invalid designation,
4364            then stick the directionality sequence literally into the
4365            output stream so it ends up in the original text again. */
4366           for (jj = 0; jj < 4; jj++)
4367             if (iso->invalid_designated[jj])
4368               break;
4369           if (jj < 4)
4370             {
4371               iso->output_literally = 1;
4372               iso->invalid_switch_dir = 1;
4373             }
4374           else
4375             /* Indicate that we haven't yet seen a valid designation,
4376                so that if a switch-dir is directly followed by an
4377                invalid designation, both get inserted literally. */
4378             iso->switched_dir_and_no_valid_charset_yet = 1;
4379         }
4380       return 1;
4381
4382
4383       /**** designation ****/
4384
4385     case ISO_ESC_2_4:
4386       if (0x28 <= c && c <= 0x2F)
4387         {
4388           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4389           goto not_done;
4390         }
4391       if (0x40 <= c && c <= 0x42)
4392         {
4393           /* 94^n-set */
4394           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4395                                       *flags & CODING_STATE_R2L ?
4396                                       CHARSET_RIGHT_TO_LEFT :
4397                                       CHARSET_LEFT_TO_RIGHT);
4398           reg = 0;
4399           goto designated;
4400         }
4401       return 0;
4402
4403     default:
4404       {
4405         int chars = 0;
4406         int single = 0;
4407
4408         if (c < '0' || c > '~')
4409           return 0; /* bad final byte */
4410
4411         if (iso->esc >= ISO_ESC_2_8 &&
4412             iso->esc <= ISO_ESC_2_15)
4413           {
4414             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4415             single = 1; /* single-byte */
4416             reg = (iso->esc - ISO_ESC_2_8) & 3;
4417           }
4418         else if (iso->esc >= ISO_ESC_2_4_8 &&
4419                  iso->esc <= ISO_ESC_2_4_15)
4420           {
4421             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4422             single = -1; /* multi-byte */
4423             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4424           }
4425         else
4426           {
4427             /* Can this ever be reached? -slb */
4428             abort();
4429           }
4430
4431         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4432                                     *flags & CODING_STATE_R2L ?
4433                                     CHARSET_RIGHT_TO_LEFT :
4434                                     CHARSET_LEFT_TO_RIGHT);
4435         goto designated;
4436       }
4437     }
4438
4439  not_done:
4440   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4441   return -1;
4442
4443  single_shift:
4444   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4445     /* can't invoke something that ain't there. */
4446     return 0;
4447   iso->esc = ISO_ESC_SINGLE_SHIFT;
4448   *flags &= CODING_STATE_ISO2022_LOCK;
4449   if (reg == 2)
4450     *flags |= CODING_STATE_SS2;
4451   else
4452     *flags |= CODING_STATE_SS3;
4453   return 1;
4454
4455  locking_shift:
4456   if (check_invalid_charsets &&
4457       !CHARSETP (iso->charset[reg]))
4458     /* can't invoke something that ain't there. */
4459     return 0;
4460   if (half)
4461     iso->register_right = reg;
4462   else
4463     iso->register_left = reg;
4464   *flags &= CODING_STATE_ISO2022_LOCK;
4465   iso->esc = ISO_ESC_LOCKING_SHIFT;
4466   return 1;
4467
4468  designated:
4469   if (NILP (cs) && check_invalid_charsets)
4470     {
4471       iso->invalid_designated[reg] = 1;
4472       iso->charset[reg] = Vcharset_ascii;
4473       iso->esc = ISO_ESC_DESIGNATE;
4474       *flags &= CODING_STATE_ISO2022_LOCK;
4475       iso->output_literally = 1;
4476       if (iso->switched_dir_and_no_valid_charset_yet)
4477         {
4478           /* We encountered a switch-direction followed by an
4479              invalid designation.  Ensure that the switch-direction
4480              gets outputted; otherwise it will probably get eaten
4481              when the text is written out again. */
4482           iso->switched_dir_and_no_valid_charset_yet = 0;
4483           iso->output_direction_sequence = 1;
4484           /* And make sure that the switch-dir going the other
4485              way gets outputted, as well. */
4486           iso->invalid_switch_dir = 1;
4487         }
4488       return 1;
4489     }
4490   /* This function is called with CODESYS equal to nil when
4491      doing coding-system detection. */
4492   if (!NILP (codesys))
4493     {
4494       charset_conversion_spec_dynarr *dyn =
4495         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4496
4497       if (dyn)
4498         {
4499           int i;
4500
4501           for (i = 0; i < Dynarr_length (dyn); i++)
4502             {
4503               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4504               if (EQ (cs, spec->from_charset))
4505                 cs = spec->to_charset;
4506             }
4507         }
4508     }
4509
4510   iso->charset[reg] = cs;
4511   iso->esc = ISO_ESC_DESIGNATE;
4512   *flags &= CODING_STATE_ISO2022_LOCK;
4513   if (iso->invalid_designated[reg])
4514     {
4515       iso->invalid_designated[reg] = 0;
4516       iso->output_literally = 1;
4517     }
4518   if (iso->switched_dir_and_no_valid_charset_yet)
4519     iso->switched_dir_and_no_valid_charset_yet = 0;
4520   return 1;
4521 }
4522
4523 static int
4524 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4525                        unsigned int n)
4526 {
4527   int mask;
4528
4529   /* #### There are serious deficiencies in the recognition mechanism
4530      here.  This needs to be much smarter if it's going to cut it.
4531      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4532      it should be detected as Latin-1.
4533      All the ISO2022 stuff in this file should be synced up with the
4534      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4535      Perhaps we should wait till R2L works in FSF Emacs? */
4536
4537   if (!st->iso2022.initted)
4538     {
4539       reset_iso2022 (Qnil, &st->iso2022.iso);
4540       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4541                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4542                           CODING_CATEGORY_ISO_8_1_MASK |
4543                           CODING_CATEGORY_ISO_8_2_MASK |
4544                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4545       st->iso2022.flags = 0;
4546       st->iso2022.high_byte_count = 0;
4547       st->iso2022.saw_single_shift = 0;
4548       st->iso2022.initted = 1;
4549     }
4550
4551   mask = st->iso2022.mask;
4552
4553   while (n--)
4554     {
4555       int c = *src++;
4556       if (c >= 0xA0)
4557         {
4558           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4559           st->iso2022.high_byte_count++;
4560         }
4561       else
4562         {
4563           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4564             {
4565               if (st->iso2022.high_byte_count & 1)
4566                 /* odd number of high bytes; assume not iso-8-2 */
4567                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4568             }
4569           st->iso2022.high_byte_count = 0;
4570           st->iso2022.saw_single_shift = 0;
4571           if (c > 0x80)
4572             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4573         }
4574       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4575           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4576         { /* control chars */
4577           switch (c)
4578             {
4579               /* Allow and ignore control characters that you might
4580                  reasonably see in a text file */
4581             case '\r':
4582             case '\n':
4583             case '\t':
4584             case  7: /* bell */
4585             case  8: /* backspace */
4586             case 11: /* vertical tab */
4587             case 12: /* form feed */
4588             case 26: /* MS-DOS C-z junk */
4589             case 31: /* '^_' -- for info */
4590               goto label_continue_loop;
4591
4592             default:
4593               break;
4594             }
4595         }
4596
4597       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4598           || BYTE_C1_P (c))
4599         {
4600           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4601                                  &st->iso2022.flags, 0))
4602             {
4603               switch (st->iso2022.iso.esc)
4604                 {
4605                 case ISO_ESC_DESIGNATE:
4606                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4607                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4608                   break;
4609                 case ISO_ESC_LOCKING_SHIFT:
4610                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4611                   goto ran_out_of_chars;
4612                 case ISO_ESC_SINGLE_SHIFT:
4613                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4614                   st->iso2022.saw_single_shift = 1;
4615                   break;
4616                 default:
4617                   break;
4618                 }
4619             }
4620           else
4621             {
4622               mask = 0;
4623               goto ran_out_of_chars;
4624             }
4625         }
4626     label_continue_loop:;
4627     }
4628
4629  ran_out_of_chars:
4630
4631   return mask;
4632 }
4633
4634 static int
4635 postprocess_iso2022_mask (int mask)
4636 {
4637   /* #### kind of cheesy */
4638   /* If seven-bit ISO is allowed, then assume that the encoding is
4639      entirely seven-bit and turn off the eight-bit ones. */
4640   if (mask & CODING_CATEGORY_ISO_7_MASK)
4641     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4642                CODING_CATEGORY_ISO_8_1_MASK |
4643                CODING_CATEGORY_ISO_8_2_MASK);
4644   return mask;
4645 }
4646
4647 /* If FLAGS is a null pointer or specifies right-to-left motion,
4648    output a switch-dir-to-left-to-right sequence to DST.
4649    Also update FLAGS if it is not a null pointer.
4650    If INTERNAL_P is set, we are outputting in internal format and
4651    need to handle the CSI differently. */
4652
4653 static void
4654 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4655                                  unsigned_char_dynarr *dst,
4656                                  unsigned int *flags,
4657                                  int internal_p)
4658 {
4659   if (!flags || (*flags & CODING_STATE_R2L))
4660     {
4661       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4662         {
4663           Dynarr_add (dst, ISO_CODE_ESC);
4664           Dynarr_add (dst, '[');
4665         }
4666       else if (internal_p)
4667         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4668       else
4669         Dynarr_add (dst, ISO_CODE_CSI);
4670       Dynarr_add (dst, '0');
4671       Dynarr_add (dst, ']');
4672       if (flags)
4673         *flags &= ~CODING_STATE_R2L;
4674     }
4675 }
4676
4677 /* If FLAGS is a null pointer or specifies a direction different from
4678    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4679    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4680    sequence to DST.  Also update FLAGS if it is not a null pointer.
4681    If INTERNAL_P is set, we are outputting in internal format and
4682    need to handle the CSI differently. */
4683
4684 static void
4685 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4686                           unsigned_char_dynarr *dst, unsigned int *flags,
4687                           int internal_p)
4688 {
4689   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4690       direction == CHARSET_LEFT_TO_RIGHT)
4691     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4692   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4693            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4694            direction == CHARSET_RIGHT_TO_LEFT)
4695     {
4696       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4697         {
4698           Dynarr_add (dst, ISO_CODE_ESC);
4699           Dynarr_add (dst, '[');
4700         }
4701       else if (internal_p)
4702         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4703       else
4704         Dynarr_add (dst, ISO_CODE_CSI);
4705       Dynarr_add (dst, '2');
4706       Dynarr_add (dst, ']');
4707       if (flags)
4708         *flags |= CODING_STATE_R2L;
4709     }
4710 }
4711
4712 /* Convert ISO2022-format data to internal format. */
4713
4714 static void
4715 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4716                        unsigned_char_dynarr *dst, unsigned int n)
4717 {
4718   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4719   unsigned int flags    = str->flags;
4720   unsigned int cpos     = str->cpos;
4721   unsigned char counter = str->counter;
4722   eol_type_t eol_type   = str->eol_type;
4723 #ifdef ENABLE_COMPOSITE_CHARS
4724   unsigned_char_dynarr *real_dst = dst;
4725 #endif
4726   Lisp_Object coding_system;
4727
4728   XSETCODING_SYSTEM (coding_system, str->codesys);
4729
4730 #ifdef ENABLE_COMPOSITE_CHARS
4731   if (flags & CODING_STATE_COMPOSITE)
4732     dst = str->iso2022.composite_chars;
4733 #endif /* ENABLE_COMPOSITE_CHARS */
4734
4735   while (n--)
4736     {
4737       unsigned char c = *src++;
4738       if (flags & CODING_STATE_ESCAPE)
4739         {       /* Within ESC sequence */
4740           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4741                                           c, &flags, 1);
4742
4743           if (retval)
4744             {
4745               switch (str->iso2022.esc)
4746                 {
4747 #ifdef ENABLE_COMPOSITE_CHARS
4748                 case ISO_ESC_START_COMPOSITE:
4749                   if (str->iso2022.composite_chars)
4750                     Dynarr_reset (str->iso2022.composite_chars);
4751                   else
4752                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4753                   dst = str->iso2022.composite_chars;
4754                   break;
4755                 case ISO_ESC_END_COMPOSITE:
4756                   {
4757                     Bufbyte comstr[MAX_EMCHAR_LEN];
4758                     Bytecount len;
4759                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4760                                                          Dynarr_length (dst));
4761                     dst = real_dst;
4762                     len = set_charptr_emchar (comstr, emch);
4763                     Dynarr_add_many (dst, comstr, len);
4764                     break;
4765                   }
4766 #endif /* ENABLE_COMPOSITE_CHARS */
4767
4768                 case ISO_ESC_LITERAL:
4769                   COMPOSE_FLUSH_CHARS (str, dst);
4770                   DECODE_ADD_BINARY_CHAR (c, dst);
4771                   break;
4772
4773                 default:
4774                   /* Everything else handled already */
4775                   break;
4776                 }
4777             }
4778
4779           /* Attempted error recovery. */
4780           if (str->iso2022.output_direction_sequence)
4781             ensure_correct_direction (flags & CODING_STATE_R2L ?
4782                                       CHARSET_RIGHT_TO_LEFT :
4783                                       CHARSET_LEFT_TO_RIGHT,
4784                                       str->codesys, dst, 0, 1);
4785           /* More error recovery. */
4786           if (!retval || str->iso2022.output_literally)
4787             {
4788               /* Output the (possibly invalid) sequence */
4789               int i;
4790               COMPOSE_FLUSH_CHARS (str, dst);
4791               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4792                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4793               flags &= CODING_STATE_ISO2022_LOCK;
4794               if (!retval)
4795                 n++, src--;/* Repeat the loop with the same character. */
4796               else
4797                 {
4798                   /* No sense in reprocessing the final byte of the
4799                      escape sequence; it could mess things up anyway.
4800                      Just add it now. */
4801                   COMPOSE_FLUSH_CHARS (str, dst);
4802                   DECODE_ADD_BINARY_CHAR (c, dst);
4803                 }
4804             }
4805           cpos = 0;
4806           counter = 0;
4807         }
4808       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4809         { /* Control characters */
4810
4811           /***** Error-handling *****/
4812
4813           /* If we were in the middle of a character, dump out the
4814              partial character. */
4815           if (counter)
4816             {
4817               COMPOSE_FLUSH_CHARS (str, dst);
4818               while (counter > 0)
4819                 {
4820                   counter--;
4821                   DECODE_ADD_BINARY_CHAR
4822                     ((unsigned char)(cpos >> (counter * 8)), dst);
4823                 }
4824               cpos = 0;
4825             }
4826
4827           /* If we just saw a single-shift character, dump it out.
4828              This may dump out the wrong sort of single-shift character,
4829              but least it will give an indication that something went
4830              wrong. */
4831           if (flags & CODING_STATE_SS2)
4832             {
4833               COMPOSE_FLUSH_CHARS (str, dst);
4834               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4835               flags &= ~CODING_STATE_SS2;
4836             }
4837           if (flags & CODING_STATE_SS3)
4838             {
4839               COMPOSE_FLUSH_CHARS (str, dst);
4840               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4841               flags &= ~CODING_STATE_SS3;
4842             }
4843
4844           /***** Now handle the control characters. *****/
4845
4846           /* Handle CR/LF */
4847 #ifdef UTF2000
4848           if (c == '\r')
4849             {
4850               COMPOSE_FLUSH_CHARS (str, dst);
4851               if (eol_type == EOL_CR)
4852                 Dynarr_add (dst, '\n');
4853               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4854                 Dynarr_add (dst, c);
4855               else
4856                 flags |= CODING_STATE_CR;
4857               goto label_continue_loop;
4858             }
4859           else if (flags & CODING_STATE_CR)
4860             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4861               if (c != '\n')
4862                 Dynarr_add (dst, '\r');
4863               flags &= ~CODING_STATE_CR;
4864             }
4865 #else
4866           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4867 #endif
4868
4869           flags &= CODING_STATE_ISO2022_LOCK;
4870
4871           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4872             {
4873               COMPOSE_FLUSH_CHARS (str, dst);
4874               DECODE_ADD_BINARY_CHAR (c, dst);
4875             }
4876         }
4877       else
4878         {                       /* Graphic characters */
4879           Lisp_Object charset;
4880 #ifndef UTF2000
4881           Charset_ID lb;
4882 #endif
4883           int reg;
4884
4885 #ifdef UTF2000
4886           if (c == '\r')
4887             {
4888               COMPOSE_FLUSH_CHARS (str, dst);
4889               if (eol_type == EOL_CR)
4890                 Dynarr_add (dst, '\n');
4891               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4892                 Dynarr_add (dst, c);
4893               else
4894                 flags |= CODING_STATE_CR;
4895               goto label_continue_loop;
4896             }
4897           else if (flags & CODING_STATE_CR)
4898             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4899               if (c != '\n')
4900                 Dynarr_add (dst, '\r');
4901               flags &= ~CODING_STATE_CR;
4902             }
4903 #else
4904           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4905 #endif
4906
4907           /* Now determine the charset. */
4908           reg = ((flags & CODING_STATE_SS2) ? 2
4909                  : (flags & CODING_STATE_SS3) ? 3
4910                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4911                  : str->iso2022.register_left);
4912           charset = str->iso2022.charset[reg];
4913
4914           /* Error checking: */
4915           if (! CHARSETP (charset)
4916               || str->iso2022.invalid_designated[reg]
4917               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4918                   && XCHARSET_CHARS (charset) == 94))
4919             /* Mrmph.  We are trying to invoke a register that has no
4920                or an invalid charset in it, or trying to add a character
4921                outside the range of the charset.  Insert that char literally
4922                to preserve it for the output. */
4923             {
4924               COMPOSE_FLUSH_CHARS (str, dst);
4925               while (counter > 0)
4926                 {
4927                   counter--;
4928                   DECODE_ADD_BINARY_CHAR
4929                     ((unsigned char)(cpos >> (counter * 8)), dst);
4930                 }
4931               cpos = 0;
4932               DECODE_ADD_BINARY_CHAR (c, dst);
4933             }
4934
4935           else
4936             {
4937               /* Things are probably hunky-dorey. */
4938
4939               /* Fetch reverse charset, maybe. */
4940               if (((flags & CODING_STATE_R2L) &&
4941                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4942                   ||
4943                   (!(flags & CODING_STATE_R2L) &&
4944                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4945                 {
4946                   Lisp_Object new_charset =
4947                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4948                   if (!NILP (new_charset))
4949                     charset = new_charset;
4950                 }
4951
4952 #ifdef UTF2000
4953               counter++;
4954               if (XCHARSET_DIMENSION (charset) == counter)
4955                 {
4956                   COMPOSE_ADD_CHAR (str,
4957                                     DECODE_CHAR (charset,
4958                                                  ((cpos & 0x7F7F7F) << 8)
4959                                                  | (c & 0x7F)),
4960                                     dst);
4961                   cpos = 0;
4962                   counter = 0;
4963                 }
4964               else
4965                 cpos = (cpos << 8) | c;
4966 #else
4967               lb = XCHARSET_LEADING_BYTE (charset);
4968               switch (XCHARSET_REP_BYTES (charset))
4969                 {
4970                 case 1: /* ASCII */
4971                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4972                   Dynarr_add (dst, c & 0x7F);
4973                   break;
4974
4975                 case 2: /* one-byte official */
4976                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4977                   Dynarr_add (dst, lb);
4978                   Dynarr_add (dst, c | 0x80);
4979                   break;
4980
4981                 case 3: /* one-byte private or two-byte official */
4982                   if (XCHARSET_PRIVATE_P (charset))
4983                     {
4984                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4985                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4986                       Dynarr_add (dst, lb);
4987                       Dynarr_add (dst, c | 0x80);
4988                     }
4989                   else
4990                     {
4991                       if (ch)
4992                         {
4993                           Dynarr_add (dst, lb);
4994                           Dynarr_add (dst, ch | 0x80);
4995                           Dynarr_add (dst, c | 0x80);
4996                           ch = 0;
4997                         }
4998                       else
4999                         ch = c;
5000                     }
5001                   break;
5002
5003                 default:        /* two-byte private */
5004                   if (ch)
5005                     {
5006                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5007                       Dynarr_add (dst, lb);
5008                       Dynarr_add (dst, ch | 0x80);
5009                       Dynarr_add (dst, c | 0x80);
5010                       ch = 0;
5011                     }
5012                   else
5013                     ch = c;
5014                 }
5015 #endif
5016             }
5017
5018           if (!cpos)
5019             flags &= CODING_STATE_ISO2022_LOCK;
5020         }
5021
5022     label_continue_loop:;
5023     }
5024
5025   if (flags & CODING_STATE_END)
5026     {
5027       COMPOSE_FLUSH_CHARS (str, dst);
5028       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5029     }
5030   str->flags   = flags;
5031   str->cpos    = cpos;
5032   str->counter = counter;
5033 }
5034
5035
5036 /***** ISO2022 encoder *****/
5037
5038 /* Designate CHARSET into register REG. */
5039
5040 static void
5041 iso2022_designate (Lisp_Object charset, unsigned char reg,
5042                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5043 {
5044   static const char inter94[] = "()*+";
5045   static const char inter96[] = ",-./";
5046   unsigned short chars;
5047   unsigned char dimension;
5048   unsigned char final;
5049   Lisp_Object old_charset = str->iso2022.charset[reg];
5050
5051   str->iso2022.charset[reg] = charset;
5052   if (!CHARSETP (charset))
5053     /* charset might be an initial nil or t. */
5054     return;
5055   chars = XCHARSET_CHARS (charset);
5056   dimension = XCHARSET_DIMENSION (charset);
5057   final = XCHARSET_FINAL (charset);
5058   if (!str->iso2022.force_charset_on_output[reg] &&
5059       CHARSETP (old_charset) &&
5060       XCHARSET_CHARS (old_charset) == chars &&
5061       XCHARSET_DIMENSION (old_charset) == dimension &&
5062       XCHARSET_FINAL (old_charset) == final)
5063     return;
5064
5065   str->iso2022.force_charset_on_output[reg] = 0;
5066
5067   {
5068     charset_conversion_spec_dynarr *dyn =
5069       str->codesys->iso2022.output_conv;
5070
5071     if (dyn)
5072       {
5073         int i;
5074
5075         for (i = 0; i < Dynarr_length (dyn); i++)
5076           {
5077             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5078             if (EQ (charset, spec->from_charset))
5079                 charset = spec->to_charset;
5080           }
5081       }
5082   }
5083
5084   Dynarr_add (dst, ISO_CODE_ESC);
5085   switch (chars)
5086     {
5087     case 94:
5088       if (dimension == 1)
5089         Dynarr_add (dst, inter94[reg]);
5090       else
5091         {
5092           Dynarr_add (dst, '$');
5093           if (reg != 0
5094               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5095               || final < '@'
5096               || final > 'B')
5097             Dynarr_add (dst, inter94[reg]);
5098         }
5099       break;
5100     case 96:
5101       if (dimension == 1)
5102         Dynarr_add (dst, inter96[reg]);
5103       else
5104         {
5105           Dynarr_add (dst, '$');
5106           Dynarr_add (dst, inter96[reg]);
5107         }
5108       break;
5109     }
5110   Dynarr_add (dst, final);
5111 }
5112
5113 static void
5114 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5115 {
5116   if (str->iso2022.register_left != 0)
5117     {
5118       Dynarr_add (dst, ISO_CODE_SI);
5119       str->iso2022.register_left = 0;
5120     }
5121 }
5122
5123 static void
5124 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5125 {
5126   if (str->iso2022.register_left != 1)
5127     {
5128       Dynarr_add (dst, ISO_CODE_SO);
5129       str->iso2022.register_left = 1;
5130     }
5131 }
5132
5133 void
5134 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5135                      unsigned_char_dynarr *dst, unsigned int *flags)
5136 {
5137   unsigned char charmask;
5138   Lisp_Coding_System* codesys = str->codesys;
5139   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5140   int i;
5141   Lisp_Object charset = str->iso2022.current_charset;
5142   int half = str->iso2022.current_half;
5143   int code_point = -1;
5144
5145   if (ch <= 0x7F)
5146     {
5147       restore_left_to_right_direction (codesys, dst, flags, 0);
5148
5149       /* Make sure G0 contains ASCII */
5150       if ((ch > ' ' && ch < ISO_CODE_DEL)
5151           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5152         {
5153           ensure_normal_shift (str, dst);
5154           iso2022_designate (Vcharset_ascii, 0, str, dst);
5155         }
5156
5157       /* If necessary, restore everything to the default state
5158          at end-of-line */
5159       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5160         {
5161           restore_left_to_right_direction (codesys, dst, flags, 0);
5162
5163           ensure_normal_shift (str, dst);
5164
5165           for (i = 0; i < 4; i++)
5166             {
5167               Lisp_Object initial_charset =
5168                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5169               iso2022_designate (initial_charset, i, str, dst);
5170             }
5171         }
5172       if (ch == '\n')
5173         {
5174           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5175             Dynarr_add (dst, '\r');
5176           if (eol_type != EOL_CR)
5177             Dynarr_add (dst, ch);
5178         }
5179       else
5180         {
5181           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5182               && fit_to_be_escape_quoted (ch))
5183             Dynarr_add (dst, ISO_CODE_ESC);
5184           Dynarr_add (dst, ch);
5185         }
5186     }
5187   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5188     {
5189       charmask = (half == 0 ? 0x00 : 0x80);
5190
5191       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5192           && fit_to_be_escape_quoted (ch))
5193         Dynarr_add (dst, ISO_CODE_ESC);
5194       /* you asked for it ... */
5195       Dynarr_add (dst, ch);
5196     }
5197   else
5198     {
5199       int reg;
5200
5201       /* Now determine which register to use. */
5202       reg = -1;
5203       for (i = 0; i < 4; i++)
5204         {
5205           if ((CHARSETP (charset = str->iso2022.charset[i])
5206                && ((code_point = charset_code_point (charset, ch)) >= 0))
5207               ||
5208               (CHARSETP
5209                (charset
5210                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5211                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5212             {
5213               reg = i;
5214               break;
5215             }
5216         }
5217       if (reg == -1)
5218         {
5219           Lisp_Object original_default_coded_charset_priority_list
5220             = Vdefault_coded_charset_priority_list;
5221
5222           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5223             {
5224               code_point = ENCODE_CHAR (ch, charset);
5225               if (XCHARSET_FINAL (charset))
5226                 goto found;
5227               Vdefault_coded_charset_priority_list
5228                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5229                                Vdefault_coded_charset_priority_list));
5230             }
5231           code_point = ENCODE_CHAR (ch, charset);
5232           if (!XCHARSET_FINAL (charset))
5233             {
5234               charset = Vcharset_ascii;
5235               code_point = '~';
5236             }
5237         found:
5238           Vdefault_coded_charset_priority_list
5239             = original_default_coded_charset_priority_list;
5240         }
5241       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5242                                 codesys, dst, flags, 0);
5243
5244       if (reg == -1)
5245         {
5246           if (XCHARSET_GRAPHIC (charset) != 0)
5247             {
5248               if (!NILP (str->iso2022.charset[1]) &&
5249                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5250                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5251                 reg = 1;
5252               else if (!NILP (str->iso2022.charset[2]))
5253                 reg = 2;
5254               else if (!NILP (str->iso2022.charset[3]))
5255                 reg = 3;
5256               else
5257                 reg = 0;
5258             }
5259           else
5260             reg = 0;
5261         }
5262
5263       iso2022_designate (charset, reg, str, dst);
5264
5265       /* Now invoke that register. */
5266       switch (reg)
5267         {
5268         case 0:
5269           ensure_normal_shift (str, dst);
5270           half = 0;
5271           break;
5272         case 1:
5273           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5274             {
5275               ensure_shift_out (str, dst);
5276               half = 0;
5277             }
5278           else
5279             half = 1;
5280           break;
5281         case 2:
5282           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5283             {
5284               Dynarr_add (dst, ISO_CODE_ESC);
5285               Dynarr_add (dst, 'N');
5286               half = 0;
5287             }
5288           else
5289             {
5290               Dynarr_add (dst, ISO_CODE_SS2);
5291               half = 1;
5292             }
5293           break;
5294         case 3:
5295           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5296             {
5297               Dynarr_add (dst, ISO_CODE_ESC);
5298               Dynarr_add (dst, 'O');
5299               half = 0;
5300             }
5301           else
5302             {
5303               Dynarr_add (dst, ISO_CODE_SS3);
5304               half = 1;
5305             }
5306           break;
5307         default:
5308           abort ();
5309         }
5310
5311       charmask = (half == 0 ? 0x00 : 0x80);
5312
5313       switch (XCHARSET_DIMENSION (charset))
5314         {
5315         case 1:
5316           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5317           break;
5318         case 2:
5319           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5320           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5321           break;
5322         case 3:
5323           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5324           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5325           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5326           break;
5327         case 4:
5328           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5329           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5330           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5331           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5332           break;
5333         default:
5334           abort ();
5335         }
5336     }
5337   str->iso2022.current_charset = charset;
5338   str->iso2022.current_half = half;
5339 }
5340
5341 void
5342 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5343                      unsigned int *flags)
5344 {
5345   Lisp_Coding_System* codesys = str->codesys;
5346   int i;
5347
5348   restore_left_to_right_direction (codesys, dst, flags, 0);
5349   ensure_normal_shift (str, dst);
5350   for (i = 0; i < 4; i++)
5351     {
5352       Lisp_Object initial_charset
5353         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5354       iso2022_designate (initial_charset, i, str, dst);
5355     }
5356 }
5357 #endif /* MULE */
5358 \f
5359 /************************************************************************/
5360 /*                     No-conversion methods                            */
5361 /************************************************************************/
5362
5363 /* This is used when reading in "binary" files -- i.e. files that may
5364    contain all 256 possible byte values and that are not to be
5365    interpreted as being in any particular decoding. */
5366 static void
5367 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5368                              unsigned_char_dynarr *dst, unsigned int n)
5369 {
5370   unsigned char c;
5371   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5372   unsigned int flags  = str->flags;
5373   unsigned int cpos   = str->cpos;
5374   eol_type_t eol_type = str->eol_type;
5375
5376   while (n--)
5377     {
5378       c = *src++;
5379
5380       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5381       DECODE_ADD_BINARY_CHAR (c, dst);
5382     label_continue_loop:;
5383     }
5384
5385   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5386
5387   str->flags = flags;
5388   str->cpos  = cpos;
5389 }
5390
5391 static void
5392 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5393                              unsigned_char_dynarr *dst, unsigned int n)
5394 {
5395   unsigned char c;
5396   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5397   unsigned int flags  = str->flags;
5398   unsigned int ch     = str->ch;
5399   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5400 #ifdef UTF2000
5401   unsigned char char_boundary = str->iso2022.current_char_boundary;
5402 #endif
5403
5404   while (n--)
5405     {
5406       c = *src++;
5407 #ifdef UTF2000
5408       if (char_boundary == 0)
5409         if ( c >= 0xfc )
5410           {
5411             ch = c & 0x01;
5412             char_boundary = 5;
5413           }
5414         else if ( c >= 0xf8 )
5415           {
5416             ch = c & 0x03;
5417             char_boundary = 4;
5418           }
5419         else if ( c >= 0xf0 )
5420           {
5421             ch = c & 0x07;
5422             char_boundary = 3;
5423           }
5424         else if ( c >= 0xe0 )
5425           {
5426             ch = c & 0x0f;
5427             char_boundary = 2;
5428           }
5429         else if ( c >= 0xc0 )
5430           {
5431             ch = c & 0x1f;
5432             char_boundary = 1;
5433           }
5434         else
5435           {
5436             ch = 0;
5437             if (c == '\n')
5438               {
5439                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5440                   Dynarr_add (dst, '\r');
5441                 if (eol_type != EOL_CR)
5442                   Dynarr_add (dst, c);
5443               }
5444             else
5445               Dynarr_add (dst, c);
5446             char_boundary = 0;
5447           }
5448       else if (char_boundary == 1)
5449         {
5450           ch = ( ch << 6 ) | ( c & 0x3f );
5451           Dynarr_add (dst, ch & 0xff);
5452           char_boundary = 0;
5453         }
5454       else
5455         {
5456           ch = ( ch << 6 ) | ( c & 0x3f );
5457           char_boundary--;
5458         }
5459 #else /* not UTF2000 */
5460       if (c == '\n')
5461         {
5462           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5463             Dynarr_add (dst, '\r');
5464           if (eol_type != EOL_CR)
5465             Dynarr_add (dst, '\n');
5466           ch = 0;
5467         }
5468       else if (BYTE_ASCII_P (c))
5469         {
5470           assert (ch == 0);
5471           Dynarr_add (dst, c);
5472         }
5473       else if (BUFBYTE_LEADING_BYTE_P (c))
5474         {
5475           assert (ch == 0);
5476           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5477               c == LEADING_BYTE_CONTROL_1)
5478             ch = c;
5479           else
5480             Dynarr_add (dst, '~'); /* untranslatable character */
5481         }
5482       else
5483         {
5484           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5485             Dynarr_add (dst, c);
5486           else if (ch == LEADING_BYTE_CONTROL_1)
5487             {
5488               assert (c < 0xC0);
5489               Dynarr_add (dst, c - 0x20);
5490             }
5491           /* else it should be the second or third byte of an
5492              untranslatable character, so ignore it */
5493           ch = 0;
5494         }
5495 #endif /* not UTF2000 */
5496     }
5497
5498   str->flags = flags;
5499   str->ch    = ch;
5500 #ifdef UTF2000
5501   str->iso2022.current_char_boundary = char_boundary;
5502 #endif
5503 }
5504
5505 \f
5506
5507 /************************************************************************/
5508 /*                             Initialization                           */
5509 /************************************************************************/
5510
5511 void
5512 syms_of_file_coding (void)
5513 {
5514   INIT_LRECORD_IMPLEMENTATION (coding_system);
5515
5516   deferror (&Qcoding_system_error, "coding-system-error",
5517             "Coding-system error", Qio_error);
5518
5519   DEFSUBR (Fcoding_system_p);
5520   DEFSUBR (Ffind_coding_system);
5521   DEFSUBR (Fget_coding_system);
5522   DEFSUBR (Fcoding_system_list);
5523   DEFSUBR (Fcoding_system_name);
5524   DEFSUBR (Fmake_coding_system);
5525   DEFSUBR (Fcopy_coding_system);
5526   DEFSUBR (Fcoding_system_canonical_name_p);
5527   DEFSUBR (Fcoding_system_alias_p);
5528   DEFSUBR (Fcoding_system_aliasee);
5529   DEFSUBR (Fdefine_coding_system_alias);
5530   DEFSUBR (Fsubsidiary_coding_system);
5531
5532   DEFSUBR (Fcoding_system_type);
5533   DEFSUBR (Fcoding_system_doc_string);
5534 #ifdef MULE
5535   DEFSUBR (Fcoding_system_charset);
5536 #endif
5537   DEFSUBR (Fcoding_system_property);
5538
5539   DEFSUBR (Fcoding_category_list);
5540   DEFSUBR (Fset_coding_priority_list);
5541   DEFSUBR (Fcoding_priority_list);
5542   DEFSUBR (Fset_coding_category_system);
5543   DEFSUBR (Fcoding_category_system);
5544
5545   DEFSUBR (Fdetect_coding_region);
5546   DEFSUBR (Fdecode_coding_region);
5547   DEFSUBR (Fencode_coding_region);
5548 #ifdef MULE
5549   DEFSUBR (Fdecode_shift_jis_char);
5550   DEFSUBR (Fencode_shift_jis_char);
5551   DEFSUBR (Fdecode_big5_char);
5552   DEFSUBR (Fencode_big5_char);
5553 #endif /* MULE */
5554   defsymbol (&Qcoding_systemp, "coding-system-p");
5555   defsymbol (&Qno_conversion, "no-conversion");
5556   defsymbol (&Qraw_text, "raw-text");
5557 #ifdef MULE
5558   defsymbol (&Qbig5, "big5");
5559   defsymbol (&Qshift_jis, "shift-jis");
5560   defsymbol (&Qucs4, "ucs-4");
5561   defsymbol (&Qutf8, "utf-8");
5562   defsymbol (&Qccl, "ccl");
5563   defsymbol (&Qiso2022, "iso2022");
5564 #endif /* MULE */
5565   defsymbol (&Qmnemonic, "mnemonic");
5566   defsymbol (&Qeol_type, "eol-type");
5567   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5568   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5569
5570   defsymbol (&Qcr, "cr");
5571   defsymbol (&Qlf, "lf");
5572   defsymbol (&Qcrlf, "crlf");
5573   defsymbol (&Qeol_cr, "eol-cr");
5574   defsymbol (&Qeol_lf, "eol-lf");
5575   defsymbol (&Qeol_crlf, "eol-crlf");
5576 #ifdef MULE
5577   defsymbol (&Qcharset_g0, "charset-g0");
5578   defsymbol (&Qcharset_g1, "charset-g1");
5579   defsymbol (&Qcharset_g2, "charset-g2");
5580   defsymbol (&Qcharset_g3, "charset-g3");
5581   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5582   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5583   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5584   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5585   defsymbol (&Qno_iso6429, "no-iso6429");
5586   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5587   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5588
5589   defsymbol (&Qshort, "short");
5590   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5591   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5592   defsymbol (&Qseven, "seven");
5593   defsymbol (&Qlock_shift, "lock-shift");
5594   defsymbol (&Qescape_quoted, "escape-quoted");
5595 #endif /* MULE */
5596 #ifdef UTF2000
5597   defsymbol (&Qdisable_composition, "disable-composition");
5598 #endif
5599   defsymbol (&Qencode, "encode");
5600   defsymbol (&Qdecode, "decode");
5601
5602 #ifdef MULE
5603   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5604              "shift-jis");
5605   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5606              "big5");
5607   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5608              "ucs-4");
5609   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5610              "utf-8");
5611   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5612              "iso-7");
5613   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5614              "iso-8-designate");
5615   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5616              "iso-8-1");
5617   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5618              "iso-8-2");
5619   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5620              "iso-lock-shift");
5621 #endif /* MULE */
5622   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5623              "no-conversion");
5624 }
5625
5626 void
5627 lstream_type_create_file_coding (void)
5628 {
5629   LSTREAM_HAS_METHOD (decoding, reader);
5630   LSTREAM_HAS_METHOD (decoding, writer);
5631   LSTREAM_HAS_METHOD (decoding, rewinder);
5632   LSTREAM_HAS_METHOD (decoding, seekable_p);
5633   LSTREAM_HAS_METHOD (decoding, flusher);
5634   LSTREAM_HAS_METHOD (decoding, closer);
5635   LSTREAM_HAS_METHOD (decoding, marker);
5636
5637   LSTREAM_HAS_METHOD (encoding, reader);
5638   LSTREAM_HAS_METHOD (encoding, writer);
5639   LSTREAM_HAS_METHOD (encoding, rewinder);
5640   LSTREAM_HAS_METHOD (encoding, seekable_p);
5641   LSTREAM_HAS_METHOD (encoding, flusher);
5642   LSTREAM_HAS_METHOD (encoding, closer);
5643   LSTREAM_HAS_METHOD (encoding, marker);
5644 }
5645
5646 void
5647 vars_of_file_coding (void)
5648 {
5649   int i;
5650
5651   fcd = xnew (struct file_coding_dump);
5652   dumpstruct (&fcd, &fcd_description);
5653
5654   /* Initialize to something reasonable ... */
5655   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5656     {
5657       fcd->coding_category_system[i] = Qnil;
5658       fcd->coding_category_by_priority[i] = i;
5659     }
5660
5661   Fprovide (intern ("file-coding"));
5662
5663   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5664 Coding system used for TTY keyboard input.
5665 Not used under a windowing system.
5666 */ );
5667   Vkeyboard_coding_system = Qnil;
5668
5669   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5670 Coding system used for TTY display output.
5671 Not used under a windowing system.
5672 */ );
5673   Vterminal_coding_system = Qnil;
5674
5675   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5676 Overriding coding system used when reading from a file or process.
5677 You should bind this variable with `let', but do not set it globally.
5678 If this is non-nil, it specifies the coding system that will be used
5679 to decode input on read operations, such as from a file or process.
5680 It overrides `buffer-file-coding-system-for-read',
5681 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5682 this one for permanent changes to the environment.  */ );
5683   Vcoding_system_for_read = Qnil;
5684
5685   DEFVAR_LISP ("coding-system-for-write",
5686                &Vcoding_system_for_write /*
5687 Overriding coding system used when writing to a file or process.
5688 You should bind this variable with `let', but do not set it globally.
5689 If this is non-nil, it specifies the coding system that will be used
5690 to encode output for write operations, such as to a file or process.
5691 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5692 Use those variables instead of this one for permanent changes to the
5693 environment.  */ );
5694   Vcoding_system_for_write = Qnil;
5695
5696   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5697 Coding system used to convert pathnames when accessing files.
5698 */ );
5699   Vfile_name_coding_system = Qnil;
5700
5701   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5702 Non-nil means the buffer contents are regarded as multi-byte form
5703 of characters, not a binary code.  This affects the display, file I/O,
5704 and behaviors of various editing commands.
5705
5706 Setting this to nil does not do anything.
5707 */ );
5708   enable_multibyte_characters = 1;
5709 }
5710
5711 void
5712 complex_vars_of_file_coding (void)
5713 {
5714   staticpro (&Vcoding_system_hash_table);
5715   Vcoding_system_hash_table =
5716     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5717
5718   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5719   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5720
5721 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5722 {                                               \
5723   struct codesys_prop csp;                      \
5724   csp.sym = (Sym);                              \
5725   csp.prop_type = (Prop_Type);                  \
5726   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5727 } while (0)
5728
5729   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5730   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5731   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5732   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5733   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5734   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5735   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5736 #ifdef MULE
5737   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5738   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5739   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5740   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5741   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5742   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5743   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5744   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5745   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5746   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5747   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5748   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5749   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5750   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5751   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5752   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5753   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5754
5755   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5756   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5757 #endif /* MULE */
5758   /* Need to create this here or we're really screwed. */
5759   Fmake_coding_system
5760     (Qraw_text, Qno_conversion,
5761      build_string ("Raw text, which means it converts only line-break-codes."),
5762      list2 (Qmnemonic, build_string ("Raw")));
5763
5764   Fmake_coding_system
5765     (Qbinary, Qno_conversion,
5766      build_string ("Binary, which means it does not convert anything."),
5767      list4 (Qeol_type, Qlf,
5768             Qmnemonic, build_string ("Binary")));
5769
5770 #ifdef UTF2000
5771   Fmake_coding_system
5772     (Qutf8, Qutf8,
5773      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5774      list2 (Qmnemonic, build_string ("UTF8")));
5775 #endif
5776
5777   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5778
5779   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5780
5781   Fdefine_coding_system_alias (Qterminal, Qbinary);
5782   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5783
5784   /* Need this for bootstrapping */
5785   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5786     Fget_coding_system (Qraw_text);
5787
5788 #ifdef UTF2000
5789   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5790    = Fget_coding_system (Qutf8);
5791 #endif
5792
5793 #if defined(MULE) && !defined(UTF2000)
5794   {
5795     unsigned int i;
5796
5797     for (i = 0; i < 65536; i++)
5798       fcd->ucs_to_mule_table[i] = Qnil;
5799   }
5800   staticpro (&mule_to_ucs_table);
5801   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5802 #endif /* defined(MULE) && !defined(UTF2000) */
5803 }