src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const unsigned char *src,
 185                      unsigned_char_dynarr *dst, unsigned int n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const unsigned char *src,
 189                                unsigned int n);
 190 static void decode_coding_sjis (Lstream *decoding,
 191                                 const unsigned char *src,
 192                                 unsigned_char_dynarr *dst,
 193                                 unsigned int n);
 194 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 195                             unsigned_char_dynarr *dst, unsigned int *flags);
 196 void char_finish_shift_jis (struct encoding_stream *str,
 197                             unsigned_char_dynarr *dst, unsigned int *flags);
 198
 199 static int detect_coding_big5 (struct detection_state *st,
 200                                const unsigned char *src,
 201                                unsigned int n);
 202 static void decode_coding_big5 (Lstream *decoding,
 203                                 const unsigned char *src,
 204                                 unsigned_char_dynarr *dst, unsigned int n);
 205 static void encode_coding_big5 (Lstream *encoding,
 206                                 const unsigned char *src,
 207                                 unsigned_char_dynarr *dst, unsigned int n);
 208 static int detect_coding_ucs4 (struct detection_state *st,
 209                                const unsigned char *src,
 210                                unsigned int n);
 211 static void decode_coding_ucs4 (Lstream *decoding,
 212                                 const unsigned char *src,
 213                                 unsigned_char_dynarr *dst, unsigned int n);
 214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 215                        unsigned_char_dynarr *dst, unsigned int *flags);
 216 void char_finish_ucs4 (struct encoding_stream *str,
 217                        unsigned_char_dynarr *dst, unsigned int *flags);
 218
 219 static int detect_coding_utf8 (struct detection_state *st,
 220                                const unsigned char *src,
 221                                unsigned int n);
 222 static void decode_coding_utf8 (Lstream *decoding,
 223                                 const unsigned char *src,
 224                                 unsigned_char_dynarr *dst, unsigned int n);
 225 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 226                        unsigned_char_dynarr *dst, unsigned int *flags);
 227 void char_finish_utf8 (struct encoding_stream *str,
 228                        unsigned_char_dynarr *dst, unsigned int *flags);
 229
 230 static int postprocess_iso2022_mask (int mask);
 231 static void reset_iso2022 (Lisp_Object coding_system,
 232                            struct iso2022_decoder *iso);
 233 static int detect_coding_iso2022 (struct detection_state *st,
 234                                   const unsigned char *src,
 235                                   unsigned int n);
 236 static void decode_coding_iso2022 (Lstream *decoding,
 237                                    const unsigned char *src,
 238                                    unsigned_char_dynarr *dst, unsigned int n);
 239 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 240                           unsigned_char_dynarr *dst, unsigned int *flags);
 241 void char_finish_iso2022 (struct encoding_stream *str,
 242                           unsigned_char_dynarr *dst, unsigned int *flags);
 243 #endif /* MULE */
 244 static void decode_coding_no_conversion (Lstream *decoding,
 245                                          const unsigned char *src,
 246                                          unsigned_char_dynarr *dst,
 247                                          unsigned int n);
 248 static void encode_coding_no_conversion (Lstream *encoding,
 249                                          const unsigned char *src,
 250                                          unsigned_char_dynarr *dst,
 251                                          unsigned int n);
 252 static void mule_decode (Lstream *decoding, const unsigned char *src,
 253                          unsigned_char_dynarr *dst, unsigned int n);
 254 static void mule_encode (Lstream *encoding, const unsigned char *src,
 255                          unsigned_char_dynarr *dst, unsigned int n);
 256
 257 typedef struct codesys_prop codesys_prop;
 258 struct codesys_prop
 259 {
 260   Lisp_Object sym;
 261   int prop_type;
 262 };
 263
 264 typedef struct
 265 {
 266   Dynarr_declare (codesys_prop);
 267 } codesys_prop_dynarr;
 268
 269 static const struct lrecord_description codesys_prop_description_1[] = {
 270   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 271   { XD_END }
 272 };
 273
 274 static const struct struct_description codesys_prop_description = {
 275   sizeof (codesys_prop),
 276   codesys_prop_description_1
 277 };
 278
 279 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 280   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 281   { XD_END }
 282 };
 283
 284 static const struct struct_description codesys_prop_dynarr_description = {
 285   sizeof (codesys_prop_dynarr),
 286   codesys_prop_dynarr_description_1
 287 };
 288
 289 codesys_prop_dynarr *the_codesys_prop_dynarr;
 290
 291 enum codesys_prop_enum
 292 {
 293   CODESYS_PROP_ALL_OK,
 294   CODESYS_PROP_ISO2022,
 295   CODESYS_PROP_CCL
 296 };
 297
 298 \f
 299 /************************************************************************/
 300 /*                       Coding system functions                        */
 301 /************************************************************************/
 302
 303 static Lisp_Object mark_coding_system (Lisp_Object);
 304 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 305 static void finalize_coding_system (void *header, int for_disksave);
 306
 307 #ifdef MULE
 308 static const struct lrecord_description ccs_description_1[] = {
 309   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 310   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 311   { XD_END }
 312 };
 313
 314 static const struct struct_description ccs_description = {
 315   sizeof (charset_conversion_spec),
 316   ccs_description_1
 317 };
 318
 319 static const struct lrecord_description ccsd_description_1[] = {
 320   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 321   { XD_END }
 322 };
 323
 324 static const struct struct_description ccsd_description = {
 325   sizeof (charset_conversion_spec_dynarr),
 326   ccsd_description_1
 327 };
 328 #endif
 329
 330 static const struct lrecord_description coding_system_description[] = {
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 332   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 333   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 334   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 335   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 338   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 339 #ifdef MULE
 340   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 341   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 342   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 343   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 344   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 345 #endif
 346   { XD_END }
 347 };
 348
 349 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 350                                mark_coding_system, print_coding_system,
 351                                finalize_coding_system,
 352                                0, 0, coding_system_description,
 353                                Lisp_Coding_System);
 354
 355 static Lisp_Object
 356 mark_coding_system (Lisp_Object obj)
 357 {
 358   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 359
 360   mark_object (CODING_SYSTEM_NAME (codesys));
 361   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 362   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 363   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 364   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 365   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 366
 367   switch (CODING_SYSTEM_TYPE (codesys))
 368     {
 369 #ifdef MULE
 370       int i;
 371     case CODESYS_ISO2022:
 372       for (i = 0; i < 4; i++)
 373         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 374       if (codesys->iso2022.input_conv)
 375         {
 376           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 377             {
 378               struct charset_conversion_spec *ccs =
 379                 Dynarr_atp (codesys->iso2022.input_conv, i);
 380               mark_object (ccs->from_charset);
 381               mark_object (ccs->to_charset);
 382             }
 383         }
 384       if (codesys->iso2022.output_conv)
 385         {
 386           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 387             {
 388               struct charset_conversion_spec *ccs =
 389                 Dynarr_atp (codesys->iso2022.output_conv, i);
 390               mark_object (ccs->from_charset);
 391               mark_object (ccs->to_charset);
 392             }
 393         }
 394       break;
 395
 396     case CODESYS_CCL:
 397       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 398       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 399       break;
 400 #endif /* MULE */
 401     default:
 402       break;
 403     }
 404
 405   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 406   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 407 }
 408
 409 static void
 410 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 411                      int escapeflag)
 412 {
 413   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 414   if (print_readably)
 415     error ("printing unreadable object #<coding_system 0x%x>",
 416            c->header.uid);
 417
 418   write_c_string ("#<coding_system ", printcharfun);
 419   print_internal (c->name, printcharfun, 1);
 420   write_c_string (">", printcharfun);
 421 }
 422
 423 static void
 424 finalize_coding_system (void *header, int for_disksave)
 425 {
 426   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 427   /* Since coding systems never go away, this function is not
 428      necessary.  But it would be necessary if we changed things
 429      so that coding systems could go away. */
 430   if (!for_disksave) /* see comment in lstream.c */
 431     {
 432       switch (CODING_SYSTEM_TYPE (c))
 433         {
 434 #ifdef MULE
 435         case CODESYS_ISO2022:
 436           if (c->iso2022.input_conv)
 437             {
 438               Dynarr_free (c->iso2022.input_conv);
 439               c->iso2022.input_conv = 0;
 440             }
 441           if (c->iso2022.output_conv)
 442             {
 443               Dynarr_free (c->iso2022.output_conv);
 444               c->iso2022.output_conv = 0;
 445             }
 446           break;
 447 #endif /* MULE */
 448         default:
 449           break;
 450         }
 451     }
 452 }
 453
 454 static eol_type_t
 455 symbol_to_eol_type (Lisp_Object symbol)
 456 {
 457   CHECK_SYMBOL (symbol);
 458   if (NILP (symbol))      return EOL_AUTODETECT;
 459   if (EQ (symbol, Qlf))   return EOL_LF;
 460   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 461   if (EQ (symbol, Qcr))   return EOL_CR;
 462
 463   signal_simple_error ("Unrecognized eol type", symbol);
 464   return EOL_AUTODETECT; /* not reached */
 465 }
 466
 467 static Lisp_Object
 468 eol_type_to_symbol (eol_type_t type)
 469 {
 470   switch (type)
 471     {
 472     default: abort ();
 473     case EOL_LF:         return Qlf;
 474     case EOL_CRLF:       return Qcrlf;
 475     case EOL_CR:         return Qcr;
 476     case EOL_AUTODETECT: return Qnil;
 477     }
 478 }
 479
 480 static void
 481 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 482 {
 483   Lisp_Object codesys_obj;
 484   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 485   char *codesys_name = (char *) alloca (len + 7);
 486   int mlen = -1;
 487   char *codesys_mnemonic=0;
 488
 489   Lisp_Object codesys_name_sym, sub_codesys_obj;
 490
 491   /* kludge */
 492
 493   XSETCODING_SYSTEM (codesys_obj, codesys);
 494
 495   memcpy (codesys_name,
 496           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 497
 498   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 499     {
 500       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 501       codesys_mnemonic = (char *) alloca (mlen + 7);
 502       memcpy (codesys_mnemonic,
 503               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 504     }
 505
 506 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 507   strcpy (codesys_name + len, "-" op_sys);                                      \
 508   if (mlen != -1)                                                               \
 509     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 510   codesys_name_sym = intern (codesys_name);                                     \
 511   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 512   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 513   if (mlen != -1)                                                               \
 514     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 515       build_string (codesys_mnemonic);                                          \
 516   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 517 } while (0)
 518
 519   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 520   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 521   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 522 }
 523
 524 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 525 Return t if OBJECT is a coding system.
 526 A coding system is an object that defines how text containing multiple
 527 character sets is encoded into a stream of (typically 8-bit) bytes.
 528 The coding system is used to decode the stream into a series of
 529 characters (which may be from multiple charsets) when the text is read
 530 from a file or process, and is used to encode the text back into the
 531 same format when it is written out to a file or process.
 532
 533 For example, many ISO2022-compliant coding systems (such as Compound
 534 Text, which is used for inter-client data under the X Window System)
 535 use escape sequences to switch between different charsets -- Japanese
 536 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 537 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 538 `make-coding-system' for more information.
 539
 540 Coding systems are normally identified using a symbol, and the
 541 symbol is accepted in place of the actual coding system object whenever
 542 a coding system is called for. (This is similar to how faces work.)
 543 */
 544        (object))
 545 {
 546   return CODING_SYSTEMP (object) ? Qt : Qnil;
 547 }
 548
 549 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 550 Retrieve the coding system of the given name.
 551
 552 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 553 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 554 If there is no such coding system, nil is returned.  Otherwise the
 555 associated coding system object is returned.
 556 */
 557        (coding_system_or_name))
 558 {
 559   if (NILP (coding_system_or_name))
 560     coding_system_or_name = Qbinary;
 561   else if (CODING_SYSTEMP (coding_system_or_name))
 562     return coding_system_or_name;
 563   else
 564     CHECK_SYMBOL (coding_system_or_name);
 565
 566   while (1)
 567     {
 568       coding_system_or_name =
 569         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 570
 571       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 572         return coding_system_or_name;
 573     }
 574 }
 575
 576 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 577 Retrieve the coding system of the given name.
 578 Same as `find-coding-system' except that if there is no such
 579 coding system, an error is signaled instead of returning nil.
 580 */
 581        (name))
 582 {
 583   Lisp_Object coding_system = Ffind_coding_system (name);
 584
 585   if (NILP (coding_system))
 586     signal_simple_error ("No such coding system", name);
 587   return coding_system;
 588 }
 589
 590 /* We store the coding systems in hash tables with the names as the key and the
 591    actual coding system object as the value.  Occasionally we need to use them
 592    in a list format.  These routines provide us with that. */
 593 struct coding_system_list_closure
 594 {
 595   Lisp_Object *coding_system_list;
 596 };
 597
 598 static int
 599 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 600                                   void *coding_system_list_closure)
 601 {
 602   /* This function can GC */
 603   struct coding_system_list_closure *cscl =
 604     (struct coding_system_list_closure *) coding_system_list_closure;
 605   Lisp_Object *coding_system_list = cscl->coding_system_list;
 606
 607   *coding_system_list = Fcons (key, *coding_system_list);
 608   return 0;
 609 }
 610
 611 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 612 Return a list of the names of all defined coding systems.
 613 */
 614        ())
 615 {
 616   Lisp_Object coding_system_list = Qnil;
 617   struct gcpro gcpro1;
 618   struct coding_system_list_closure coding_system_list_closure;
 619
 620   GCPRO1 (coding_system_list);
 621   coding_system_list_closure.coding_system_list = &coding_system_list;
 622   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 623                  &coding_system_list_closure);
 624   UNGCPRO;
 625
 626   return coding_system_list;
 627 }
 628
 629 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 630 Return the name of the given coding system.
 631 */
 632        (coding_system))
 633 {
 634   coding_system = Fget_coding_system (coding_system);
 635   return XCODING_SYSTEM_NAME (coding_system);
 636 }
 637
 638 static Lisp_Coding_System *
 639 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 640 {
 641   Lisp_Coding_System *codesys =
 642     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 643
 644   zero_lcrecord (codesys);
 645   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 646   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 647   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 648   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 649   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 650   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 651   CODING_SYSTEM_TYPE     (codesys) = type;
 652   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 653 #ifdef MULE
 654   if (type == CODESYS_ISO2022)
 655     {
 656       int i;
 657       for (i = 0; i < 4; i++)
 658         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 659     }
 660   else if (type == CODESYS_CCL)
 661     {
 662       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 663       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 664     }
 665 #endif /* MULE */
 666   CODING_SYSTEM_NAME (codesys) = name;
 667
 668   return codesys;
 669 }
 670
 671 #ifdef MULE
 672 /* Given a list of charset conversion specs as specified in a Lisp
 673    program, parse it into STORE_HERE. */
 674
 675 static void
 676 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 677                                 Lisp_Object spec_list)
 678 {
 679   Lisp_Object rest;
 680
 681   EXTERNAL_LIST_LOOP (rest, spec_list)
 682     {
 683       Lisp_Object car = XCAR (rest);
 684       Lisp_Object from, to;
 685       struct charset_conversion_spec spec;
 686
 687       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 688         signal_simple_error ("Invalid charset conversion spec", car);
 689       from = Fget_charset (XCAR (car));
 690       to = Fget_charset (XCAR (XCDR (car)));
 691       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 692            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 693         signal_simple_error_2
 694           ("Attempted conversion between different charset types",
 695            from, to);
 696       spec.from_charset = from;
 697       spec.to_charset = to;
 698
 699       Dynarr_add (store_here, spec);
 700     }
 701 }
 702
 703 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 704    specs, return the equivalent as the Lisp programmer would see it.
 705
 706    If LOAD_HERE is 0, return Qnil. */
 707
 708 static Lisp_Object
 709 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 710 {
 711   int i;
 712   Lisp_Object result;
 713
 714   if (!load_here)
 715     return Qnil;
 716   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 717     {
 718       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 719       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 720     }
 721
 722   return Fnreverse (result);
 723 }
 724
 725 #endif /* MULE */
 726
 727 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 728 Register symbol NAME as a coding system.
 729
 730 TYPE describes the conversion method used and should be one of
 731
 732 nil or 'undecided
 733      Automatic conversion.  XEmacs attempts to detect the coding system
 734      used in the file.
 735 'no-conversion
 736      No conversion.  Use this for binary files and such.  On output,
 737      graphic characters that are not in ASCII or Latin-1 will be
 738      replaced by a ?. (For a no-conversion-encoded buffer, these
 739      characters will only be present if you explicitly insert them.)
 740 'shift-jis
 741      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 742 'ucs-4
 743      ISO 10646 UCS-4 encoding.
 744 'utf-8
 745      ISO 10646 UTF-8 encoding.
 746 'iso2022
 747      Any ISO2022-compliant encoding.  Among other things, this includes
 748      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 749      standard Unix encoding for Japanese and other languages), and
 750      Compound Text (the encoding used in X11).  You can specify more
 751      specific information about the conversion with the FLAGS argument.
 752 'big5
 753      Big5 (the encoding commonly used for Taiwanese).
 754 'ccl
 755      The conversion is performed using a user-written pseudo-code
 756      program.  CCL (Code Conversion Language) is the name of this
 757      pseudo-code.
 758 'internal
 759      Write out or read in the raw contents of the memory representing
 760      the buffer's text.  This is primarily useful for debugging
 761      purposes, and is only enabled when XEmacs has been compiled with
 762      DEBUG_XEMACS defined (via the --debug configure option).
 763      WARNING: Reading in a file using 'internal conversion can result
 764      in an internal inconsistency in the memory representing a
 765      buffer's text, which will produce unpredictable results and may
 766      cause XEmacs to crash.  Under normal circumstances you should
 767      never use 'internal conversion.
 768
 769 DOC-STRING is a string describing the coding system.
 770
 771 PROPS is a property list, describing the specific nature of the
 772 character set.  Recognized properties are:
 773
 774 'mnemonic
 775      String to be displayed in the modeline when this coding system is
 776      active.
 777
 778 'eol-type
 779      End-of-line conversion to be used.  It should be one of
 780
 781         nil
 782                 Automatically detect the end-of-line type (LF, CRLF,
 783                 or CR).  Also generate subsidiary coding systems named
 784                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 785                 identical to this coding system but have an EOL-TYPE
 786                 value of 'lf, 'crlf, and 'cr, respectively.
 787         'lf
 788                 The end of a line is marked externally using ASCII LF.
 789                 Since this is also the way that XEmacs represents an
 790                 end-of-line internally, specifying this option results
 791                 in no end-of-line conversion.  This is the standard
 792                 format for Unix text files.
 793         'crlf
 794                 The end of a line is marked externally using ASCII
 795                 CRLF.  This is the standard format for MS-DOS text
 796                 files.
 797         'cr
 798                 The end of a line is marked externally using ASCII CR.
 799                 This is the standard format for Macintosh text files.
 800         t
 801                 Automatically detect the end-of-line type but do not
 802                 generate subsidiary coding systems.  (This value is
 803                 converted to nil when stored internally, and
 804                 `coding-system-property' will return nil.)
 805
 806 'disable-composition
 807      If non-nil, composition/decomposition for combining characters
 808      are disabled.
 809
 810 'post-read-conversion
 811      Function called after a file has been read in, to perform the
 812      decoding.  Called with two arguments, BEG and END, denoting
 813      a region of the current buffer to be decoded.
 814
 815 'pre-write-conversion
 816      Function called before a file is written out, to perform the
 817      encoding.  Called with two arguments, BEG and END, denoting
 818      a region of the current buffer to be encoded.
 819
 820
 821 The following additional properties are recognized if TYPE is 'iso2022:
 822
 823 'charset-g0
 824 'charset-g1
 825 'charset-g2
 826 'charset-g3
 827      The character set initially designated to the G0 - G3 registers.
 828      The value should be one of
 829
 830           -- A charset object (designate that character set)
 831           -- nil (do not ever use this register)
 832           -- t (no character set is initially designated to
 833                 the register, but may be later on; this automatically
 834                 sets the corresponding `force-g*-on-output' property)
 835
 836 'force-g0-on-output
 837 'force-g1-on-output
 838 'force-g2-on-output
 839 'force-g2-on-output
 840      If non-nil, send an explicit designation sequence on output before
 841      using the specified register.
 842
 843 'short
 844      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 845      "ESC $ B" on output in place of the full designation sequences
 846      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 847
 848 'no-ascii-eol
 849      If non-nil, don't designate ASCII to G0 at each end of line on output.
 850      Setting this to non-nil also suppresses other state-resetting that
 851      normally happens at the end of a line.
 852
 853 'no-ascii-cntl
 854      If non-nil, don't designate ASCII to G0 before control chars on output.
 855
 856 'seven
 857      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 858      environment.
 859
 860 'lock-shift
 861      If non-nil, use locking-shift (SO/SI) instead of single-shift
 862      or designation by escape sequence.
 863
 864 'no-iso6429
 865      If non-nil, don't use ISO6429's direction specification.
 866
 867 'escape-quoted
 868      If non-nil, literal control characters that are the same as
 869      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 870      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 871      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 872      so that they can be properly distinguished from an escape sequence.
 873      (Note that doing this results in a non-portable encoding.) This
 874      encoding flag is used for byte-compiled files.  Note that ESC
 875      is a good choice for a quoting character because there are no
 876      escape sequences whose second byte is a character from the Control-0
 877      or Control-1 character sets; this is explicitly disallowed by the
 878      ISO2022 standard.
 879
 880 'input-charset-conversion
 881      A list of conversion specifications, specifying conversion of
 882      characters in one charset to another when decoding is performed.
 883      Each specification is a list of two elements: the source charset,
 884      and the destination charset.
 885
 886 'output-charset-conversion
 887      A list of conversion specifications, specifying conversion of
 888      characters in one charset to another when encoding is performed.
 889      The form of each specification is the same as for
 890      'input-charset-conversion.
 891
 892
 893 The following additional properties are recognized (and required)
 894 if TYPE is 'ccl:
 895
 896 'decode
 897      CCL program used for decoding (converting to internal format).
 898
 899 'encode
 900      CCL program used for encoding (converting to external format).
 901 */
 902        (name, type, doc_string, props))
 903 {
 904   Lisp_Coding_System *codesys;
 905   Lisp_Object rest, key, value;
 906   enum coding_system_type ty;
 907   int need_to_setup_eol_systems = 1;
 908
 909   /* Convert type to constant */
 910   if (NILP (type) || EQ (type, Qundecided))
 911                                       { ty = CODESYS_AUTODETECT; }
 912 #ifdef MULE
 913   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 914   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 915   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 916   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 917   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 918   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 919 #endif
 920   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 921 #ifdef DEBUG_XEMACS
 922   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 923 #endif
 924   else
 925     signal_simple_error ("Invalid coding system type", type);
 926
 927   CHECK_SYMBOL (name);
 928
 929   codesys = allocate_coding_system (ty, name);
 930
 931   if (NILP (doc_string))
 932     doc_string = build_string ("");
 933   else
 934     CHECK_STRING (doc_string);
 935   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 936
 937   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 938     {
 939       if (EQ (key, Qmnemonic))
 940         {
 941           if (!NILP (value))
 942             CHECK_STRING (value);
 943           CODING_SYSTEM_MNEMONIC (codesys) = value;
 944         }
 945
 946       else if (EQ (key, Qeol_type))
 947         {
 948           need_to_setup_eol_systems = NILP (value);
 949           if (EQ (value, Qt))
 950             value = Qnil;
 951           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 952         }
 953
 954       else if (EQ (key, Qpost_read_conversion))
 955         CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 956       else if (EQ (key, Qpre_write_conversion))
 957         CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 958 #ifdef UTF2000
 959       else if (EQ (key, Qdisable_composition))
 960         CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 961 #endif
 962 #ifdef MULE
 963       else if (ty == CODESYS_ISO2022)
 964         {
 965 #define FROB_INITIAL_CHARSET(charset_num) \
 966   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 967     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 968
 969           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 970           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 971           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 972           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 973
 974 #define FROB_FORCE_CHARSET(charset_num) \
 975   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 976
 977           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 978           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 979           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 980           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 981
 982 #define FROB_BOOLEAN_PROPERTY(prop) \
 983   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 984
 985           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 986           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 987           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 988           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 989           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 990           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 991           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 992
 993           else if (EQ (key, Qinput_charset_conversion))
 994             {
 995               codesys->iso2022.input_conv =
 996                 Dynarr_new (charset_conversion_spec);
 997               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 998                                               value);
 999             }
1000           else if (EQ (key, Qoutput_charset_conversion))
1001             {
1002               codesys->iso2022.output_conv =
1003                 Dynarr_new (charset_conversion_spec);
1004               parse_charset_conversion_specs (codesys->iso2022.output_conv,
1005                                               value);
1006             }
1007           else
1008             signal_simple_error ("Unrecognized property", key);
1009         }
1010       else if (EQ (type, Qccl))
1011         {
1012           if (EQ (key, Qdecode))
1013             {
1014               CHECK_VECTOR (value);
1015               CODING_SYSTEM_CCL_DECODE (codesys) = value;
1016             }
1017           else if (EQ (key, Qencode))
1018             {
1019               CHECK_VECTOR (value);
1020               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1021             }
1022           else
1023             signal_simple_error ("Unrecognized property", key);
1024         }
1025 #endif /* MULE */
1026       else
1027         signal_simple_error ("Unrecognized property", key);
1028     }
1029
1030   if (need_to_setup_eol_systems)
1031     setup_eol_coding_systems (codesys);
1032
1033   {
1034     Lisp_Object codesys_obj;
1035     XSETCODING_SYSTEM (codesys_obj, codesys);
1036     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1037     return codesys_obj;
1038   }
1039 }
1040
1041 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1042 Copy OLD-CODING-SYSTEM to NEW-NAME.
1043 If NEW-NAME does not name an existing coding system, a new one will
1044 be created.
1045 */
1046        (old_coding_system, new_name))
1047 {
1048   Lisp_Object new_coding_system;
1049   old_coding_system = Fget_coding_system (old_coding_system);
1050   new_coding_system = Ffind_coding_system (new_name);
1051   if (NILP (new_coding_system))
1052     {
1053       XSETCODING_SYSTEM (new_coding_system,
1054                          allocate_coding_system
1055                          (XCODING_SYSTEM_TYPE (old_coding_system),
1056                           new_name));
1057       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1058     }
1059
1060   {
1061     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1062     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1063     memcpy (((char *) to  ) + sizeof (to->header),
1064             ((char *) from) + sizeof (from->header),
1065             sizeof (*from) - sizeof (from->header));
1066     to->name = new_name;
1067   }
1068   return new_coding_system;
1069 }
1070
1071 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1072 Return t if OBJECT names a coding system, and is not a coding system alias.
1073 */
1074        (object))
1075 {
1076   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1077     ? Qt : Qnil;
1078 }
1079
1080 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1081 Return t if OBJECT is a coding system alias.
1082 All coding system aliases are created by `define-coding-system-alias'.
1083 */
1084        (object))
1085 {
1086   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1087     ? Qt : Qnil;
1088 }
1089
1090 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1091 Return the coding-system symbol for which symbol ALIAS is an alias.
1092 */
1093        (alias))
1094 {
1095   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1096   if (SYMBOLP (aliasee))
1097     return aliasee;
1098   else
1099     signal_simple_error ("Symbol is not a coding system alias", alias);
1100 }
1101
1102 static Lisp_Object
1103 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1104 {
1105   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1106                   Qnil);
1107 }
1108
1109 /* A maphash function, for removing dangling coding system aliases. */
1110 static int
1111 dangling_coding_system_alias_p (Lisp_Object alias,
1112                                 Lisp_Object aliasee,
1113                                 void *dangling_aliases)
1114 {
1115   if (SYMBOLP (aliasee)
1116       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1117     {
1118       (*(int *) dangling_aliases)++;
1119       return 1;
1120     }
1121   else
1122     return 0;
1123 }
1124
1125 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1126 Define symbol ALIAS as an alias for coding system ALIASEE.
1127
1128 You can use this function to redefine an alias that has already been defined,
1129 but you cannot redefine a name which is the canonical name for a coding system.
1130 \(a canonical name of a coding system is what is returned when you call
1131 `coding-system-name' on a coding system).
1132
1133 ALIASEE itself can be an alias, which allows you to define nested aliases.
1134
1135 You are forbidden, however, from creating alias loops or `dangling' aliases.
1136 These will be detected, and an error will be signaled if you attempt to do so.
1137
1138 If ALIASEE is nil, then ALIAS will simply be undefined.
1139
1140 See also `coding-system-alias-p', `coding-system-aliasee',
1141 and `coding-system-canonical-name-p'.
1142 */
1143        (alias, aliasee))
1144 {
1145   Lisp_Object real_coding_system, probe;
1146
1147   CHECK_SYMBOL (alias);
1148
1149   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1150     signal_simple_error
1151       ("Symbol is the canonical name of a coding system and cannot be redefined",
1152        alias);
1153
1154   if (NILP (aliasee))
1155     {
1156       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1157       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1158       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1159
1160       Fremhash (alias, Vcoding_system_hash_table);
1161
1162       /* Undefine subsidiary aliases,
1163          presumably created by a previous call to this function */
1164       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1165           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1166           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1167         {
1168           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1169           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1170           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1171         }
1172
1173       /* Undefine dangling coding system aliases. */
1174       {
1175         int dangling_aliases;
1176
1177         do {
1178           dangling_aliases = 0;
1179           elisp_map_remhash (dangling_coding_system_alias_p,
1180                              Vcoding_system_hash_table,
1181                              &dangling_aliases);
1182         } while (dangling_aliases > 0);
1183       }
1184
1185       return Qnil;
1186     }
1187
1188   if (CODING_SYSTEMP (aliasee))
1189     aliasee = XCODING_SYSTEM_NAME (aliasee);
1190
1191   /* Checks that aliasee names a coding-system */
1192   real_coding_system = Fget_coding_system (aliasee);
1193
1194   /* Check for coding system alias loops */
1195   if (EQ (alias, aliasee))
1196     alias_loop: signal_simple_error_2
1197       ("Attempt to create a coding system alias loop", alias, aliasee);
1198
1199   for (probe = aliasee;
1200        SYMBOLP (probe);
1201        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1202     {
1203       if (EQ (probe, alias))
1204         goto alias_loop;
1205     }
1206
1207   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1208
1209   /* Set up aliases for subsidiaries.
1210      #### There must be a better way to handle subsidiary coding systems. */
1211   {
1212     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1213     int i;
1214     for (i = 0; i < countof (suffixes); i++)
1215       {
1216         Lisp_Object alias_subsidiary =
1217           append_suffix_to_symbol (alias, suffixes[i]);
1218         Lisp_Object aliasee_subsidiary =
1219           append_suffix_to_symbol (aliasee, suffixes[i]);
1220
1221         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1222           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1223       }
1224   }
1225   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1226      but it doesn't look intentional, so I'd rather return something
1227      meaningful or nothing at all. */
1228   return Qnil;
1229 }
1230
1231 static Lisp_Object
1232 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1233 {
1234   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1235   Lisp_Object new_coding_system;
1236
1237   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1238     return coding_system;
1239
1240   switch (type)
1241     {
1242     case EOL_AUTODETECT: return coding_system;
1243     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1244     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1245     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1246     default:       abort ();
1247     }
1248
1249   return NILP (new_coding_system) ? coding_system : new_coding_system;
1250 }
1251
1252 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1253 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1254 */
1255        (coding_system, eol_type))
1256 {
1257   coding_system = Fget_coding_system (coding_system);
1258
1259   return subsidiary_coding_system (coding_system,
1260                                    symbol_to_eol_type (eol_type));
1261 }
1262
1263 \f
1264 /************************************************************************/
1265 /*                         Coding system accessors                      */
1266 /************************************************************************/
1267
1268 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1269 Return the doc string for CODING-SYSTEM.
1270 */
1271        (coding_system))
1272 {
1273   coding_system = Fget_coding_system (coding_system);
1274   return XCODING_SYSTEM_DOC_STRING (coding_system);
1275 }
1276
1277 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1278 Return the type of CODING-SYSTEM.
1279 */
1280        (coding_system))
1281 {
1282   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1283     {
1284     default: abort ();
1285     case CODESYS_AUTODETECT:    return Qundecided;
1286 #ifdef MULE
1287     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1288     case CODESYS_ISO2022:       return Qiso2022;
1289     case CODESYS_BIG5:          return Qbig5;
1290     case CODESYS_UCS4:          return Qucs4;
1291     case CODESYS_UTF8:          return Qutf8;
1292     case CODESYS_CCL:           return Qccl;
1293 #endif
1294     case CODESYS_NO_CONVERSION: return Qno_conversion;
1295 #ifdef DEBUG_XEMACS
1296     case CODESYS_INTERNAL:      return Qinternal;
1297 #endif
1298     }
1299 }
1300
1301 #ifdef MULE
1302 static
1303 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1304 {
1305   Lisp_Object cs
1306     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1307
1308   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1309 }
1310
1311 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1312 Return initial charset of CODING-SYSTEM designated to GNUM.
1313 GNUM allows 0 .. 3.
1314 */
1315        (coding_system, gnum))
1316 {
1317   coding_system = Fget_coding_system (coding_system);
1318   CHECK_INT (gnum);
1319
1320   return coding_system_charset (coding_system, XINT (gnum));
1321 }
1322 #endif /* MULE */
1323
1324 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1325 Return the PROP property of CODING-SYSTEM.
1326 */
1327        (coding_system, prop))
1328 {
1329   int i, ok = 0;
1330   enum coding_system_type type;
1331
1332   coding_system = Fget_coding_system (coding_system);
1333   CHECK_SYMBOL (prop);
1334   type = XCODING_SYSTEM_TYPE (coding_system);
1335
1336   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1337     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1338       {
1339         ok = 1;
1340         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1341           {
1342           case CODESYS_PROP_ALL_OK:
1343             break;
1344 #ifdef MULE
1345           case CODESYS_PROP_ISO2022:
1346             if (type != CODESYS_ISO2022)
1347               signal_simple_error
1348                 ("Property only valid in ISO2022 coding systems",
1349                  prop);
1350             break;
1351
1352           case CODESYS_PROP_CCL:
1353             if (type != CODESYS_CCL)
1354               signal_simple_error
1355                 ("Property only valid in CCL coding systems",
1356                  prop);
1357             break;
1358 #endif /* MULE */
1359           default:
1360             abort ();
1361           }
1362       }
1363
1364   if (!ok)
1365     signal_simple_error ("Unrecognized property", prop);
1366
1367   if (EQ (prop, Qname))
1368     return XCODING_SYSTEM_NAME (coding_system);
1369   else if (EQ (prop, Qtype))
1370     return Fcoding_system_type (coding_system);
1371   else if (EQ (prop, Qdoc_string))
1372     return XCODING_SYSTEM_DOC_STRING (coding_system);
1373   else if (EQ (prop, Qmnemonic))
1374     return XCODING_SYSTEM_MNEMONIC (coding_system);
1375   else if (EQ (prop, Qeol_type))
1376     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1377   else if (EQ (prop, Qeol_lf))
1378     return XCODING_SYSTEM_EOL_LF (coding_system);
1379   else if (EQ (prop, Qeol_crlf))
1380     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1381   else if (EQ (prop, Qeol_cr))
1382     return XCODING_SYSTEM_EOL_CR (coding_system);
1383   else if (EQ (prop, Qpost_read_conversion))
1384     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1385   else if (EQ (prop, Qpre_write_conversion))
1386     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1387 #ifdef MULE
1388   else if (type == CODESYS_ISO2022)
1389     {
1390       if (EQ (prop, Qcharset_g0))
1391         return coding_system_charset (coding_system, 0);
1392       else if (EQ (prop, Qcharset_g1))
1393         return coding_system_charset (coding_system, 1);
1394       else if (EQ (prop, Qcharset_g2))
1395         return coding_system_charset (coding_system, 2);
1396       else if (EQ (prop, Qcharset_g3))
1397         return coding_system_charset (coding_system, 3);
1398
1399 #define FORCE_CHARSET(charset_num) \
1400   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1401    (coding_system, charset_num) ? Qt : Qnil)
1402
1403       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1404       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1405       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1406       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1407
1408 #define LISP_BOOLEAN(prop) \
1409   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1410
1411       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1412       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1413       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1414       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1415       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1416       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1417       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1418
1419       else if (EQ (prop, Qinput_charset_conversion))
1420         return
1421           unparse_charset_conversion_specs
1422             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1423       else if (EQ (prop, Qoutput_charset_conversion))
1424         return
1425           unparse_charset_conversion_specs
1426             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1427       else
1428         abort ();
1429     }
1430   else if (type == CODESYS_CCL)
1431     {
1432       if (EQ (prop, Qdecode))
1433         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1434       else if (EQ (prop, Qencode))
1435         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1436       else
1437         abort ();
1438     }
1439 #endif /* MULE */
1440   else
1441     abort ();
1442
1443   return Qnil; /* not reached */
1444 }
1445
1446 \f
1447 /************************************************************************/
1448 /*                       Coding category functions                      */
1449 /************************************************************************/
1450
1451 static int
1452 decode_coding_category (Lisp_Object symbol)
1453 {
1454   int i;
1455
1456   CHECK_SYMBOL (symbol);
1457   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1458     if (EQ (coding_category_symbol[i], symbol))
1459       return i;
1460
1461   signal_simple_error ("Unrecognized coding category", symbol);
1462   return 0; /* not reached */
1463 }
1464
1465 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1466 Return a list of all recognized coding categories.
1467 */
1468        ())
1469 {
1470   int i;
1471   Lisp_Object list = Qnil;
1472
1473   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1474     list = Fcons (coding_category_symbol[i], list);
1475   return list;
1476 }
1477
1478 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1479 Change the priority order of the coding categories.
1480 LIST should be list of coding categories, in descending order of
1481 priority.  Unspecified coding categories will be lower in priority
1482 than all specified ones, in the same relative order they were in
1483 previously.
1484 */
1485        (list))
1486 {
1487   int category_to_priority[CODING_CATEGORY_LAST + 1];
1488   int i, j;
1489   Lisp_Object rest;
1490
1491   /* First generate a list that maps coding categories to priorities. */
1492
1493   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1494     category_to_priority[i] = -1;
1495
1496   /* Highest priority comes from the specified list. */
1497   i = 0;
1498   EXTERNAL_LIST_LOOP (rest, list)
1499     {
1500       int cat = decode_coding_category (XCAR (rest));
1501
1502       if (category_to_priority[cat] >= 0)
1503         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1504       category_to_priority[cat] = i++;
1505     }
1506
1507   /* Now go through the existing categories by priority to retrieve
1508      the categories not yet specified and preserve their priority
1509      order. */
1510   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1511     {
1512       int cat = fcd->coding_category_by_priority[j];
1513       if (category_to_priority[cat] < 0)
1514         category_to_priority[cat] = i++;
1515     }
1516
1517   /* Now we need to construct the inverse of the mapping we just
1518      constructed. */
1519
1520   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1521     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1522
1523   /* Phew!  That was confusing. */
1524   return Qnil;
1525 }
1526
1527 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1528 Return a list of coding categories in descending order of priority.
1529 */
1530        ())
1531 {
1532   int i;
1533   Lisp_Object list = Qnil;
1534
1535   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1536     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1537                   list);
1538   return list;
1539 }
1540
1541 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1542 Change the coding system associated with a coding category.
1543 */
1544        (coding_category, coding_system))
1545 {
1546   int cat = decode_coding_category (coding_category);
1547
1548   coding_system = Fget_coding_system (coding_system);
1549   fcd->coding_category_system[cat] = coding_system;
1550   return Qnil;
1551 }
1552
1553 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1554 Return the coding system associated with a coding category.
1555 */
1556        (coding_category))
1557 {
1558   int cat = decode_coding_category (coding_category);
1559   Lisp_Object sys = fcd->coding_category_system[cat];
1560
1561   if (!NILP (sys))
1562     return XCODING_SYSTEM_NAME (sys);
1563   return Qnil;
1564 }
1565
1566 \f
1567 /************************************************************************/
1568 /*                     Detecting the encoding of data                   */
1569 /************************************************************************/
1570
1571 struct detection_state
1572 {
1573   eol_type_t eol_type;
1574   int seen_non_ascii;
1575   int mask;
1576 #ifdef MULE
1577   struct
1578     {
1579       int mask;
1580       int in_second_byte;
1581     }
1582   big5;
1583
1584   struct
1585     {
1586       int mask;
1587       int in_second_byte;
1588     }
1589   shift_jis;
1590
1591   struct
1592     {
1593       int mask;
1594       int in_byte;
1595   }
1596   ucs4;
1597
1598   struct
1599     {
1600       int mask;
1601       int in_byte;
1602     }
1603   utf8;
1604
1605   struct
1606     {
1607       int mask;
1608       int initted;
1609       struct iso2022_decoder iso;
1610       unsigned int flags;
1611       int high_byte_count;
1612       unsigned int saw_single_shift:1;
1613     }
1614   iso2022;
1615 #endif
1616   struct
1617     {
1618       int seen_anything;
1619       int just_saw_cr;
1620     }
1621   eol;
1622 };
1623
1624 static int
1625 acceptable_control_char_p (int c)
1626 {
1627   switch (c)
1628     {
1629       /* Allow and ignore control characters that you might
1630          reasonably see in a text file */
1631     case '\r':
1632     case '\n':
1633     case '\t':
1634     case  7: /* bell */
1635     case  8: /* backspace */
1636     case 11: /* vertical tab */
1637     case 12: /* form feed */
1638     case 26: /* MS-DOS C-z junk */
1639     case 31: /* '^_' -- for info */
1640       return 1;
1641     default:
1642       return 0;
1643     }
1644 }
1645
1646 static int
1647 mask_has_at_most_one_bit_p (int mask)
1648 {
1649   /* Perhaps the only thing useful you learn from intensive Microsoft
1650      technical interviews */
1651   return (mask & (mask - 1)) == 0;
1652 }
1653
1654 static eol_type_t
1655 detect_eol_type (struct detection_state *st, const unsigned char *src,
1656                  unsigned int n)
1657 {
1658   int c;
1659
1660   while (n--)
1661     {
1662       c = *src++;
1663       if (c == '\n')
1664         {
1665           if (st->eol.just_saw_cr)
1666             return EOL_CRLF;
1667           else if (st->eol.seen_anything)
1668             return EOL_LF;
1669         }
1670       else if (st->eol.just_saw_cr)
1671         return EOL_CR;
1672       else if (c == '\r')
1673         st->eol.just_saw_cr = 1;
1674       else
1675         st->eol.just_saw_cr = 0;
1676       st->eol.seen_anything = 1;
1677     }
1678
1679   return EOL_AUTODETECT;
1680 }
1681
1682 /* Attempt to determine the encoding and EOL type of the given text.
1683    Before calling this function for the first type, you must initialize
1684    st->eol_type as appropriate and initialize st->mask to ~0.
1685
1686    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1687    not yet known.
1688
1689    st->mask holds the determined coding category mask, or ~0 if only
1690    ASCII has been seen so far.
1691
1692    Returns:
1693
1694    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1695         is present in st->mask
1696    1 == definitive answers are here for both st->eol_type and st->mask
1697 */
1698
1699 static int
1700 detect_coding_type (struct detection_state *st, const Extbyte *src,
1701                     unsigned int n, int just_do_eol)
1702 {
1703   int c;
1704
1705   if (st->eol_type == EOL_AUTODETECT)
1706     st->eol_type = detect_eol_type (st, src, n);
1707
1708   if (just_do_eol)
1709     return st->eol_type != EOL_AUTODETECT;
1710
1711   if (!st->seen_non_ascii)
1712     {
1713       for (; n; n--, src++)
1714         {
1715           c = *src;
1716           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1717             {
1718               st->seen_non_ascii = 1;
1719 #ifdef MULE
1720               st->shift_jis.mask = ~0;
1721               st->big5.mask = ~0;
1722               st->ucs4.mask = ~0;
1723               st->utf8.mask = ~0;
1724               st->iso2022.mask = ~0;
1725 #endif
1726               break;
1727             }
1728         }
1729     }
1730
1731   if (!n)
1732     return 0;
1733 #ifdef MULE
1734   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1735     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1736   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1737     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1738   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1739     st->big5.mask = detect_coding_big5 (st, src, n);
1740   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1741     st->utf8.mask = detect_coding_utf8 (st, src, n);
1742   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1743     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1744
1745   st->mask
1746     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1747     | st->utf8.mask | st->ucs4.mask;
1748 #endif
1749   {
1750     int retval = mask_has_at_most_one_bit_p (st->mask);
1751     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1752     return retval && st->eol_type != EOL_AUTODETECT;
1753   }
1754 }
1755
1756 static Lisp_Object
1757 coding_system_from_mask (int mask)
1758 {
1759   if (mask == ~0)
1760     {
1761       /* If the file was entirely or basically ASCII, use the
1762          default value of `buffer-file-coding-system'. */
1763       Lisp_Object retval =
1764         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1765       if (!NILP (retval))
1766         {
1767           retval = Ffind_coding_system (retval);
1768           if (NILP (retval))
1769             {
1770               warn_when_safe
1771                 (Qbad_variable, Qwarning,
1772                  "Invalid `default-buffer-file-coding-system', set to nil");
1773               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1774             }
1775         }
1776       if (NILP (retval))
1777         retval = Fget_coding_system (Qraw_text);
1778       return retval;
1779     }
1780   else
1781     {
1782       int i;
1783       int cat = -1;
1784 #ifdef MULE
1785       mask = postprocess_iso2022_mask (mask);
1786 #endif
1787       /* Look through the coding categories by priority and find
1788          the first one that is allowed. */
1789       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1790         {
1791           cat = fcd->coding_category_by_priority[i];
1792           if ((mask & (1 << cat)) &&
1793               !NILP (fcd->coding_category_system[cat]))
1794             break;
1795         }
1796       if (cat >= 0)
1797         return fcd->coding_category_system[cat];
1798       else
1799         return Fget_coding_system (Qraw_text);
1800     }
1801 }
1802
1803 /* Given a seekable read stream and potential coding system and EOL type
1804    as specified, do any autodetection that is called for.  If the
1805    coding system and/or EOL type are not `autodetect', they will be left
1806    alone; but this function will never return an autodetect coding system
1807    or EOL type.
1808
1809    This function does not automatically fetch subsidiary coding systems;
1810    that should be unnecessary with the explicit eol-type argument. */
1811
1812 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1813
1814 void
1815 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1816                               eol_type_t *eol_type_in_out)
1817 {
1818   struct detection_state decst;
1819
1820   if (*eol_type_in_out == EOL_AUTODETECT)
1821     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1822
1823   xzero (decst);
1824   decst.eol_type = *eol_type_in_out;
1825   decst.mask = ~0;
1826
1827   /* If autodetection is called for, do it now. */
1828   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1829       || *eol_type_in_out == EOL_AUTODETECT)
1830     {
1831       Extbyte buf[4096];
1832       Lisp_Object coding_system = Qnil;
1833       Extbyte *p;
1834       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1835       Extbyte *scan_end;
1836
1837       /* Look for initial "-*-"; mode line prefix */
1838       for (p = buf,
1839              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1840            p <= scan_end
1841              && *p != '\n'
1842              && *p != '\r';
1843            p++)
1844         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1845           {
1846             Extbyte *local_vars_beg = p + 3;
1847             /* Look for final "-*-"; mode line suffix */
1848             for (p = local_vars_beg,
1849                    scan_end = buf + nread - LENGTH ("-*-");
1850                  p <= scan_end
1851                    && *p != '\n'
1852                    && *p != '\r';
1853                  p++)
1854               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1855                 {
1856                   Extbyte *suffix = p;
1857                   /* Look for "coding:" */
1858                   for (p = local_vars_beg,
1859                          scan_end = suffix - LENGTH ("coding:?");
1860                        p <= scan_end;
1861                        p++)
1862                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1863                         && (p == local_vars_beg
1864                             || (*(p-1) == ' '  ||
1865                                 *(p-1) == '\t' ||
1866                                 *(p-1) == ';')))
1867                       {
1868                         Extbyte save;
1869                         int n;
1870                         p += LENGTH ("coding:");
1871                         while (*p == ' ' || *p == '\t') p++;
1872
1873                         /* Get coding system name */
1874                         save = *suffix; *suffix = '\0';
1875                         /* Characters valid in a MIME charset name (rfc 1521),
1876                            and in a Lisp symbol name. */
1877                         n = strspn ( (char *) p,
1878                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1879                                     "abcdefghijklmnopqrstuvwxyz"
1880                                     "0123456789"
1881                                     "!$%&*+-.^_{|}~");
1882                         *suffix = save;
1883                         if (n > 0)
1884                           {
1885                             save = p[n]; p[n] = '\0';
1886                             coding_system =
1887                               Ffind_coding_system (intern ((char *) p));
1888                             p[n] = save;
1889                           }
1890                         break;
1891                       }
1892                   break;
1893                 }
1894             break;
1895           }
1896
1897       if (NILP (coding_system))
1898         do
1899           {
1900             if (detect_coding_type (&decst, buf, nread,
1901                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1902                                     != CODESYS_AUTODETECT))
1903               break;
1904             nread = Lstream_read (stream, buf, sizeof (buf));
1905             if (nread == 0)
1906               break;
1907           }
1908         while (1);
1909
1910       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1911                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1912         do
1913           {
1914             if (detect_coding_type (&decst, buf, nread, 1))
1915               break;
1916             nread = Lstream_read (stream, buf, sizeof (buf));
1917             if (!nread)
1918               break;
1919           }
1920         while (1);
1921
1922       *eol_type_in_out = decst.eol_type;
1923       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1924         {
1925           if (NILP (coding_system))
1926             *codesys_in_out = coding_system_from_mask (decst.mask);
1927           else
1928             *codesys_in_out = coding_system;
1929         }
1930     }
1931
1932   /* If we absolutely can't determine the EOL type, just assume LF. */
1933   if (*eol_type_in_out == EOL_AUTODETECT)
1934     *eol_type_in_out = EOL_LF;
1935
1936   Lstream_rewind (stream);
1937 }
1938
1939 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1940 Detect coding system of the text in the region between START and END.
1941 Returned a list of possible coding systems ordered by priority.
1942 If only ASCII characters are found, it returns 'undecided or one of
1943 its subsidiary coding systems according to a detected end-of-line
1944 type.  Optional arg BUFFER defaults to the current buffer.
1945 */
1946        (start, end, buffer))
1947 {
1948   Lisp_Object val = Qnil;
1949   struct buffer *buf = decode_buffer (buffer, 0);
1950   Bufpos b, e;
1951   Lisp_Object instream, lb_instream;
1952   Lstream *istr, *lb_istr;
1953   struct detection_state decst;
1954   struct gcpro gcpro1, gcpro2;
1955
1956   get_buffer_range_char (buf, start, end, &b, &e, 0);
1957   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1958   lb_istr = XLSTREAM (lb_instream);
1959   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1960   istr = XLSTREAM (instream);
1961   GCPRO2 (instream, lb_instream);
1962   xzero (decst);
1963   decst.eol_type = EOL_AUTODETECT;
1964   decst.mask = ~0;
1965   while (1)
1966     {
1967       unsigned char random_buffer[4096];
1968       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1969
1970       if (!nread)
1971         break;
1972       if (detect_coding_type (&decst, random_buffer, nread, 0))
1973         break;
1974     }
1975
1976   if (decst.mask == ~0)
1977     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1978                                     decst.eol_type);
1979   else
1980     {
1981       int i;
1982
1983       val = Qnil;
1984 #ifdef MULE
1985       decst.mask = postprocess_iso2022_mask (decst.mask);
1986 #endif
1987       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1988         {
1989           int sys = fcd->coding_category_by_priority[i];
1990           if (decst.mask & (1 << sys))
1991             {
1992               Lisp_Object codesys = fcd->coding_category_system[sys];
1993               if (!NILP (codesys))
1994                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1995               val = Fcons (codesys, val);
1996             }
1997         }
1998     }
1999   Lstream_close (istr);
2000   UNGCPRO;
2001   Lstream_delete (istr);
2002   Lstream_delete (lb_istr);
2003   return val;
2004 }
2005
2006 \f
2007 /************************************************************************/
2008 /*           Converting to internal Mule format ("decoding")            */
2009 /************************************************************************/
2010
2011 /* A decoding stream is a stream used for decoding text (i.e.
2012    converting from some external format to internal format).
2013    The decoding-stream object keeps track of the actual coding
2014    stream, the stream that is at the other end, and data that
2015    needs to be persistent across the lifetime of the stream. */
2016
2017 /* Handle the EOL stuff related to just-read-in character C.
2018    EOL_TYPE is the EOL type of the coding stream.
2019    FLAGS is the current value of FLAGS in the coding stream, and may
2020    be modified by this macro.  (The macro only looks at the
2021    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2022    bytes are to be written.  You need to also define a local goto
2023    label "label_continue_loop" that is at the end of the main
2024    character-reading loop.
2025
2026    If C is a CR character, then this macro handles it entirely and
2027    jumps to label_continue_loop.  Otherwise, this macro does not add
2028    anything to DST, and continues normally.  You should continue
2029    processing C normally after this macro. */
2030
2031 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2032 do {                                                            \
2033   if (c == '\r')                                                \
2034     {                                                           \
2035       if (eol_type == EOL_CR)                                   \
2036         Dynarr_add (dst, '\n');                                 \
2037       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2038         Dynarr_add (dst, c);                                    \
2039       else                                                      \
2040         flags |= CODING_STATE_CR;                               \
2041       goto label_continue_loop;                                 \
2042     }                                                           \
2043   else if (flags & CODING_STATE_CR)                             \
2044     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2045       if (c != '\n')                                            \
2046         Dynarr_add (dst, '\r');                                 \
2047       flags &= ~CODING_STATE_CR;                                \
2048     }                                                           \
2049 } while (0)
2050
2051 /* C should be a binary character in the range 0 - 255; convert
2052    to internal format and add to Dynarr DST. */
2053
2054 #ifdef UTF2000
2055 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2056 do {                                            \
2057   if (BYTE_ASCII_P (c))                         \
2058     Dynarr_add (dst, c);                        \
2059   else                                          \
2060     {                                           \
2061       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2062       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2063     }                                           \
2064 } while (0)
2065
2066 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2067 INLINE_HEADER void
2068 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2069 {
2070   if ( c <= 0x7f )
2071     {
2072       Dynarr_add (dst, c);
2073     }
2074   else if ( c <= 0x7ff )
2075     {
2076       Dynarr_add (dst, (c >> 6) | 0xc0);
2077       Dynarr_add (dst, (c & 0x3f) | 0x80);
2078     }
2079   else if ( c <= 0xffff )
2080     {
2081       Dynarr_add (dst,  (c >> 12) | 0xe0);
2082       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2083       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2084     }
2085   else if ( c <= 0x1fffff )
2086     {
2087       Dynarr_add (dst,  (c >> 18) | 0xf0);
2088       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2089       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2090       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2091     }
2092   else if ( c <= 0x3ffffff )
2093     {
2094       Dynarr_add (dst,  (c >> 24) | 0xf8);
2095       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2098       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2099     }
2100   else
2101     {
2102       Dynarr_add (dst,  (c >> 30) | 0xfc);
2103       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2104       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2107       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2108     }
2109 }
2110 #else
2111 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2112 do {                                            \
2113   if (BYTE_ASCII_P (c))                         \
2114     Dynarr_add (dst, c);                        \
2115   else if (BYTE_C1_P (c))                       \
2116     {                                           \
2117       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2118       Dynarr_add (dst, c + 0x20);               \
2119     }                                           \
2120   else                                          \
2121     {                                           \
2122       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2123       Dynarr_add (dst, c);                      \
2124     }                                           \
2125 } while (0)
2126 #endif
2127
2128 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2129 do {                                    \
2130   if (ch)                               \
2131     {                                   \
2132       DECODE_ADD_BINARY_CHAR (ch, dst); \
2133       ch = 0;                           \
2134     }                                   \
2135 } while (0)
2136
2137 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2138 do {                                    \
2139   if (flags & CODING_STATE_END)         \
2140     {                                   \
2141       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2142       if (flags & CODING_STATE_CR)      \
2143         Dynarr_add (dst, '\r');         \
2144     }                                   \
2145 } while (0)
2146
2147 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2148
2149 struct decoding_stream
2150 {
2151   /* Coding system that governs the conversion. */
2152   Lisp_Coding_System *codesys;
2153
2154   /* Stream that we read the encoded data from or
2155      write the decoded data to. */
2156   Lstream *other_end;
2157
2158   /* If we are reading, then we can return only a fixed amount of
2159      data, so if the conversion resulted in too much data, we store it
2160      here for retrieval the next time around. */
2161   unsigned_char_dynarr *runoff;
2162
2163   /* FLAGS holds flags indicating the current state of the decoding.
2164      Some of these flags are dependent on the coding system. */
2165   unsigned int flags;
2166
2167   /* CPOS holds a partially built-up code-point of character. */
2168   unsigned int cpos;
2169
2170   /* EOL_TYPE specifies the type of end-of-line conversion that
2171      currently applies.  We need to keep this separate from the
2172      EOL type stored in CODESYS because the latter might indicate
2173      automatic EOL-type detection while the former will always
2174      indicate a particular EOL type. */
2175   eol_type_t eol_type;
2176 #ifdef MULE
2177   /* Additional ISO2022 information.  We define the structure above
2178      because it's also needed by the detection routines. */
2179   struct iso2022_decoder iso2022;
2180
2181   /* Additional information (the state of the running CCL program)
2182      used by the CCL decoder. */
2183   struct ccl_program ccl;
2184
2185   /* counter for UTF-8 or UCS-4 */
2186   unsigned char counter;
2187 #endif
2188 #ifdef UTF2000
2189   unsigned combined_char_count;
2190   Emchar combined_chars[16];
2191   Lisp_Object combining_table;
2192 #endif
2193   struct detection_state decst;
2194 };
2195
2196 #ifdef UTF2000
2197 extern Lisp_Object Vcharacter_composition_table;
2198
2199 INLINE_HEADER void
2200 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2201 INLINE_HEADER void
2202 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2203 {
2204   unsigned i;
2205
2206   for (i = 0; i < str->combined_char_count; i++)
2207     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2208   str->combined_char_count = 0;
2209   str->combining_table = Qnil;
2210 }
2211
2212 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2213                       unsigned_char_dynarr* dst);
2214 void
2215 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2216                  Emchar character, unsigned_char_dynarr* dst)
2217 {
2218   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2219     DECODE_ADD_UCS_CHAR (character, dst);
2220   else if (!CHAR_ID_TABLE_P (str->combining_table))
2221     {
2222       Lisp_Object ret
2223         = get_char_id_table (character, Vcharacter_composition_table);
2224
2225       if (NILP (ret))
2226         DECODE_ADD_UCS_CHAR (character, dst);
2227       else
2228         {
2229           str->combined_chars[0] = character;
2230           str->combined_char_count = 1;
2231           str->combining_table = ret;
2232         }
2233     }
2234   else
2235     {
2236       Lisp_Object ret
2237         = get_char_id_table (character, str->combining_table);
2238
2239       if (CHARP (ret))
2240         {
2241           Emchar char2 = XCHARVAL (ret);
2242           ret = get_char_id_table (char2, Vcharacter_composition_table);
2243           if (NILP (ret))
2244             {
2245               DECODE_ADD_UCS_CHAR (char2, dst);
2246               str->combined_char_count = 0;
2247               str->combining_table = Qnil;
2248             }
2249           else
2250             {
2251               str->combined_chars[0] = char2;
2252               str->combined_char_count = 1;
2253               str->combining_table = ret;
2254             }
2255         }
2256       else if (CHAR_ID_TABLE_P (ret))
2257         {
2258           str->combined_chars[str->combined_char_count++] = character;
2259           str->combining_table = ret;
2260         }
2261       else
2262         {
2263           COMPOSE_FLUSH_CHARS (str, dst);
2264           DECODE_ADD_UCS_CHAR (character, dst);
2265         }
2266     }
2267 }
2268 #else /* not UTF2000 */
2269 #define COMPOSE_FLUSH_CHARS(str, dst)
2270 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2271 #endif /* UTF2000 */
2272
2273 static ssize_t decoding_reader (Lstream *stream,
2274                                 unsigned char *data, size_t size);
2275 static ssize_t decoding_writer (Lstream *stream,
2276                                 const unsigned char *data, size_t size);
2277 static int decoding_rewinder   (Lstream *stream);
2278 static int decoding_seekable_p (Lstream *stream);
2279 static int decoding_flusher    (Lstream *stream);
2280 static int decoding_closer     (Lstream *stream);
2281
2282 static Lisp_Object decoding_marker (Lisp_Object stream);
2283
2284 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2285                                sizeof (struct decoding_stream));
2286
2287 static Lisp_Object
2288 decoding_marker (Lisp_Object stream)
2289 {
2290   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2291   Lisp_Object str_obj;
2292
2293   /* We do not need to mark the coding systems or charsets stored
2294      within the stream because they are stored in a global list
2295      and automatically marked. */
2296
2297   XSETLSTREAM (str_obj, str);
2298   mark_object (str_obj);
2299   if (str->imp->marker)
2300     return (str->imp->marker) (str_obj);
2301   else
2302     return Qnil;
2303 }
2304
2305 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2306    so we read data from the other end, decode it, and store it into DATA. */
2307
2308 static ssize_t
2309 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2310 {
2311   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2312   unsigned char *orig_data = data;
2313   ssize_t read_size;
2314   int error_occurred = 0;
2315
2316   /* We need to interface to mule_decode(), which expects to take some
2317      amount of data and store the result into a Dynarr.  We have
2318      mule_decode() store into str->runoff, and take data from there
2319      as necessary. */
2320
2321   /* We loop until we have enough data, reading chunks from the other
2322      end and decoding it. */
2323   while (1)
2324     {
2325       /* Take data from the runoff if we can.  Make sure to take at
2326          most SIZE bytes, and delete the data from the runoff. */
2327       if (Dynarr_length (str->runoff) > 0)
2328         {
2329           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2330           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2331           Dynarr_delete_many (str->runoff, 0, chunk);
2332           data += chunk;
2333           size -= chunk;
2334         }
2335
2336       if (size == 0)
2337         break; /* No more room for data */
2338
2339       if (str->flags & CODING_STATE_END)
2340         /* This means that on the previous iteration, we hit the EOF on
2341            the other end.  We loop once more so that mule_decode() can
2342            output any final stuff it may be holding, or any "go back
2343            to a sane state" escape sequences. (This latter makes sense
2344            during encoding.) */
2345         break;
2346
2347       /* Exhausted the runoff, so get some more.  DATA has at least
2348          SIZE bytes left of storage in it, so it's OK to read directly
2349          into it.  (We'll be overwriting above, after we've decoded it
2350          into the runoff.) */
2351       read_size = Lstream_read (str->other_end, data, size);
2352       if (read_size < 0)
2353         {
2354           error_occurred = 1;
2355           break;
2356         }
2357       if (read_size == 0)
2358         /* There might be some more end data produced in the translation.
2359            See the comment above. */
2360         str->flags |= CODING_STATE_END;
2361       mule_decode (stream, data, str->runoff, read_size);
2362     }
2363
2364   if (data - orig_data == 0)
2365     return error_occurred ? -1 : 0;
2366   else
2367     return data - orig_data;
2368 }
2369
2370 static ssize_t
2371 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2372 {
2373   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2374   ssize_t retval;
2375
2376   /* Decode all our data into the runoff, and then attempt to write
2377      it all out to the other end.  Remove whatever chunk we succeeded
2378      in writing. */
2379   mule_decode (stream, data, str->runoff, size);
2380   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2381                           Dynarr_length (str->runoff));
2382   if (retval > 0)
2383     Dynarr_delete_many (str->runoff, 0, retval);
2384   /* Do NOT return retval.  The return value indicates how much
2385      of the incoming data was written, not how many bytes were
2386      written. */
2387   return size;
2388 }
2389
2390 static void
2391 reset_decoding_stream (struct decoding_stream *str)
2392 {
2393 #ifdef MULE
2394   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2395     {
2396       Lisp_Object coding_system;
2397       XSETCODING_SYSTEM (coding_system, str->codesys);
2398       reset_iso2022 (coding_system, &str->iso2022);
2399     }
2400   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2401     {
2402       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2403     }
2404   str->counter = 0;
2405 #endif /* MULE */
2406 #ifdef UTF2000
2407   str->combined_char_count = 0;
2408   str->combining_table = Qnil;
2409 #endif
2410   str->flags = str->cpos = 0;
2411 }
2412
2413 static int
2414 decoding_rewinder (Lstream *stream)
2415 {
2416   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2417   reset_decoding_stream (str);
2418   Dynarr_reset (str->runoff);
2419   return Lstream_rewind (str->other_end);
2420 }
2421
2422 static int
2423 decoding_seekable_p (Lstream *stream)
2424 {
2425   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2426   return Lstream_seekable_p (str->other_end);
2427 }
2428
2429 static int
2430 decoding_flusher (Lstream *stream)
2431 {
2432   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2433   return Lstream_flush (str->other_end);
2434 }
2435
2436 static int
2437 decoding_closer (Lstream *stream)
2438 {
2439   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2440   if (stream->flags & LSTREAM_FL_WRITE)
2441     {
2442       str->flags |= CODING_STATE_END;
2443       decoding_writer (stream, 0, 0);
2444     }
2445   Dynarr_free (str->runoff);
2446 #ifdef MULE
2447 #ifdef ENABLE_COMPOSITE_CHARS
2448   if (str->iso2022.composite_chars)
2449     Dynarr_free (str->iso2022.composite_chars);
2450 #endif
2451 #endif
2452   return Lstream_close (str->other_end);
2453 }
2454
2455 Lisp_Object
2456 decoding_stream_coding_system (Lstream *stream)
2457 {
2458   Lisp_Object coding_system;
2459   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2460
2461   XSETCODING_SYSTEM (coding_system, str->codesys);
2462   return subsidiary_coding_system (coding_system, str->eol_type);
2463 }
2464
2465 void
2466 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2467 {
2468   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2469   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2470   str->codesys = cs;
2471   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2472     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2473   reset_decoding_stream (str);
2474 }
2475
2476 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2477    stream for writing, no automatic code detection will be performed.
2478    The reason for this is that automatic code detection requires a
2479    seekable input.  Things will also fail if you open a decoding
2480    stream for reading using a non-fully-specified coding system and
2481    a non-seekable input stream. */
2482
2483 static Lisp_Object
2484 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2485                         const char *mode)
2486 {
2487   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2488   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2489   Lisp_Object obj;
2490
2491   xzero (*str);
2492   str->other_end = stream;
2493   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2494   str->eol_type = EOL_AUTODETECT;
2495   if (!strcmp (mode, "r")
2496       && Lstream_seekable_p (stream))
2497     /* We can determine the coding system now. */
2498     determine_real_coding_system (stream, &codesys, &str->eol_type);
2499   set_decoding_stream_coding_system (lstr, codesys);
2500   str->decst.eol_type = str->eol_type;
2501   str->decst.mask = ~0;
2502   XSETLSTREAM (obj, lstr);
2503   return obj;
2504 }
2505
2506 Lisp_Object
2507 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2508 {
2509   return make_decoding_stream_1 (stream, codesys, "r");
2510 }
2511
2512 Lisp_Object
2513 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2514 {
2515   return make_decoding_stream_1 (stream, codesys, "w");
2516 }
2517
2518 /* Note: the decode_coding_* functions all take the same
2519    arguments as mule_decode(), which is to say some SRC data of
2520    size N, which is to be stored into dynamic array DST.
2521    DECODING is the stream within which the decoding is
2522    taking place, but no data is actually read from or
2523    written to that stream; that is handled in decoding_reader()
2524    or decoding_writer().  This allows the same functions to
2525    be used for both reading and writing. */
2526
2527 static void
2528 mule_decode (Lstream *decoding, const unsigned char *src,
2529              unsigned_char_dynarr *dst, unsigned int n)
2530 {
2531   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2532
2533   /* If necessary, do encoding-detection now.  We do this when
2534      we're a writing stream or a non-seekable reading stream,
2535      meaning that we can't just process the whole input,
2536      rewind, and start over. */
2537
2538   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2539       str->eol_type == EOL_AUTODETECT)
2540     {
2541       Lisp_Object codesys;
2542
2543       XSETCODING_SYSTEM (codesys, str->codesys);
2544       detect_coding_type (&str->decst, src, n,
2545                           CODING_SYSTEM_TYPE (str->codesys) !=
2546                           CODESYS_AUTODETECT);
2547       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2548           str->decst.mask != ~0)
2549         /* #### This is cheesy.  What we really ought to do is
2550            buffer up a certain amount of data so as to get a
2551            less random result. */
2552         codesys = coding_system_from_mask (str->decst.mask);
2553       str->eol_type = str->decst.eol_type;
2554       if (XCODING_SYSTEM (codesys) != str->codesys)
2555         {
2556           /* Preserve the CODING_STATE_END flag in case it was set.
2557              If we erase it, bad things might happen. */
2558           int was_end = str->flags & CODING_STATE_END;
2559           set_decoding_stream_coding_system (decoding, codesys);
2560           if (was_end)
2561             str->flags |= CODING_STATE_END;
2562         }
2563     }
2564
2565   switch (CODING_SYSTEM_TYPE (str->codesys))
2566     {
2567 #ifdef DEBUG_XEMACS
2568     case CODESYS_INTERNAL:
2569       Dynarr_add_many (dst, src, n);
2570       break;
2571 #endif
2572     case CODESYS_AUTODETECT:
2573       /* If we got this far and still haven't decided on the coding
2574          system, then do no conversion. */
2575     case CODESYS_NO_CONVERSION:
2576       decode_coding_no_conversion (decoding, src, dst, n);
2577       break;
2578 #ifdef MULE
2579     case CODESYS_SHIFT_JIS:
2580       decode_coding_sjis (decoding, src, dst, n);
2581       break;
2582     case CODESYS_BIG5:
2583       decode_coding_big5 (decoding, src, dst, n);
2584       break;
2585     case CODESYS_UCS4:
2586       decode_coding_ucs4 (decoding, src, dst, n);
2587       break;
2588     case CODESYS_UTF8:
2589       decode_coding_utf8 (decoding, src, dst, n);
2590       break;
2591     case CODESYS_CCL:
2592       str->ccl.last_block = str->flags & CODING_STATE_END;
2593       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2594       break;
2595     case CODESYS_ISO2022:
2596       decode_coding_iso2022 (decoding, src, dst, n);
2597       break;
2598 #endif /* MULE */
2599     default:
2600       abort ();
2601     }
2602 }
2603
2604 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2605 Decode the text between START and END which is encoded in CODING-SYSTEM.
2606 This is useful if you've read in encoded text from a file without decoding
2607 it (e.g. you read in a JIS-formatted file but used the `binary' or
2608 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2609 Return length of decoded text.
2610 BUFFER defaults to the current buffer if unspecified.
2611 */
2612        (start, end, coding_system, buffer))
2613 {
2614   Bufpos b, e;
2615   struct buffer *buf = decode_buffer (buffer, 0);
2616   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2617   Lstream *istr, *ostr;
2618   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2619
2620   get_buffer_range_char (buf, start, end, &b, &e, 0);
2621
2622   barf_if_buffer_read_only (buf, b, e);
2623
2624   coding_system = Fget_coding_system (coding_system);
2625   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2626   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2627   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2628                                               coding_system);
2629   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2630                                            Fget_coding_system (Qbinary));
2631   istr = XLSTREAM (instream);
2632   ostr = XLSTREAM (outstream);
2633   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2634
2635   /* The chain of streams looks like this:
2636
2637      [BUFFER] <----- send through
2638                      ------> [ENCODE AS BINARY]
2639                              ------> [DECODE AS SPECIFIED]
2640                                      ------> [BUFFER]
2641    */
2642
2643   while (1)
2644     {
2645       char tempbuf[1024]; /* some random amount */
2646       Bufpos newpos, even_newer_pos;
2647       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2648       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2649
2650       if (!size_in_bytes)
2651         break;
2652       newpos = lisp_buffer_stream_startpos (istr);
2653       Lstream_write (ostr, tempbuf, size_in_bytes);
2654       even_newer_pos = lisp_buffer_stream_startpos (istr);
2655       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2656                            even_newer_pos, 0);
2657     }
2658   Lstream_close (istr);
2659   Lstream_close (ostr);
2660   UNGCPRO;
2661   Lstream_delete (istr);
2662   Lstream_delete (ostr);
2663   Lstream_delete (XLSTREAM (de_outstream));
2664   Lstream_delete (XLSTREAM (lb_outstream));
2665   return Qnil;
2666 }
2667
2668 \f
2669 /************************************************************************/
2670 /*           Converting to an external encoding ("encoding")            */
2671 /************************************************************************/
2672
2673 /* An encoding stream is an output stream.  When you create the
2674    stream, you specify the coding system that governs the encoding
2675    and another stream that the resulting encoded data is to be
2676    sent to, and then start sending data to it. */
2677
2678 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2679
2680 struct encoding_stream
2681 {
2682   /* Coding system that governs the conversion. */
2683   Lisp_Coding_System *codesys;
2684
2685   /* Stream that we read the encoded data from or
2686      write the decoded data to. */
2687   Lstream *other_end;
2688
2689   /* If we are reading, then we can return only a fixed amount of
2690      data, so if the conversion resulted in too much data, we store it
2691      here for retrieval the next time around. */
2692   unsigned_char_dynarr *runoff;
2693
2694   /* FLAGS holds flags indicating the current state of the encoding.
2695      Some of these flags are dependent on the coding system. */
2696   unsigned int flags;
2697
2698   /* CH holds a partially built-up character.  Since we only deal
2699      with one- and two-byte characters at the moment, we only use
2700      this to store the first byte of a two-byte character. */
2701   unsigned int ch;
2702 #ifdef MULE
2703   /* Additional information used by the ISO2022 encoder. */
2704   struct
2705     {
2706       /* CHARSET holds the character sets currently assigned to the G0
2707          through G3 registers.  It is initialized from the array
2708          INITIAL_CHARSET in CODESYS. */
2709       Lisp_Object charset[4];
2710
2711       /* Which registers are currently invoked into the left (GL) and
2712          right (GR) halves of the 8-bit encoding space? */
2713       int register_left, register_right;
2714
2715       /* Whether we need to explicitly designate the charset in the
2716          G? register before using it.  It is initialized from the
2717          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2718       unsigned char force_charset_on_output[4];
2719
2720       /* Other state variables that need to be preserved across
2721          invocations. */
2722       Lisp_Object current_charset;
2723       int current_half;
2724       int current_char_boundary;
2725     } iso2022;
2726
2727   void (*encode_char) (struct encoding_stream *str, Emchar c,
2728                        unsigned_char_dynarr *dst, unsigned int *flags);
2729   void (*finish) (struct encoding_stream *str,
2730                   unsigned_char_dynarr *dst, unsigned int *flags);
2731
2732   /* Additional information (the state of the running CCL program)
2733      used by the CCL encoder. */
2734   struct ccl_program ccl;
2735 #endif /* MULE */
2736 };
2737
2738 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2739 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2740                                 size_t size);
2741 static int encoding_rewinder   (Lstream *stream);
2742 static int encoding_seekable_p (Lstream *stream);
2743 static int encoding_flusher    (Lstream *stream);
2744 static int encoding_closer     (Lstream *stream);
2745
2746 static Lisp_Object encoding_marker (Lisp_Object stream);
2747
2748 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2749                                sizeof (struct encoding_stream));
2750
2751 static Lisp_Object
2752 encoding_marker (Lisp_Object stream)
2753 {
2754   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2755   Lisp_Object str_obj;
2756
2757   /* We do not need to mark the coding systems or charsets stored
2758      within the stream because they are stored in a global list
2759      and automatically marked. */
2760
2761   XSETLSTREAM (str_obj, str);
2762   mark_object (str_obj);
2763   if (str->imp->marker)
2764     return (str->imp->marker) (str_obj);
2765   else
2766     return Qnil;
2767 }
2768
2769 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2770    so we read data from the other end, encode it, and store it into DATA. */
2771
2772 static ssize_t
2773 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2774 {
2775   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2776   unsigned char *orig_data = data;
2777   ssize_t read_size;
2778   int error_occurred = 0;
2779
2780   /* We need to interface to mule_encode(), which expects to take some
2781      amount of data and store the result into a Dynarr.  We have
2782      mule_encode() store into str->runoff, and take data from there
2783      as necessary. */
2784
2785   /* We loop until we have enough data, reading chunks from the other
2786      end and encoding it. */
2787   while (1)
2788     {
2789       /* Take data from the runoff if we can.  Make sure to take at
2790          most SIZE bytes, and delete the data from the runoff. */
2791       if (Dynarr_length (str->runoff) > 0)
2792         {
2793           int chunk = min ((int) size, Dynarr_length (str->runoff));
2794           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2795           Dynarr_delete_many (str->runoff, 0, chunk);
2796           data += chunk;
2797           size -= chunk;
2798         }
2799
2800       if (size == 0)
2801         break; /* No more room for data */
2802
2803       if (str->flags & CODING_STATE_END)
2804         /* This means that on the previous iteration, we hit the EOF on
2805            the other end.  We loop once more so that mule_encode() can
2806            output any final stuff it may be holding, or any "go back
2807            to a sane state" escape sequences. (This latter makes sense
2808            during encoding.) */
2809         break;
2810
2811       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2812          left of storage in it, so it's OK to read directly into it.
2813          (We'll be overwriting above, after we've encoded it into the
2814          runoff.) */
2815       read_size = Lstream_read (str->other_end, data, size);
2816       if (read_size < 0)
2817         {
2818           error_occurred = 1;
2819           break;
2820         }
2821       if (read_size == 0)
2822         /* There might be some more end data produced in the translation.
2823            See the comment above. */
2824         str->flags |= CODING_STATE_END;
2825       mule_encode (stream, data, str->runoff, read_size);
2826     }
2827
2828   if (data == orig_data)
2829     return error_occurred ? -1 : 0;
2830   else
2831     return data - orig_data;
2832 }
2833
2834 static ssize_t
2835 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2836 {
2837   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2838   ssize_t retval;
2839
2840   /* Encode all our data into the runoff, and then attempt to write
2841      it all out to the other end.  Remove whatever chunk we succeeded
2842      in writing. */
2843   mule_encode (stream, data, str->runoff, size);
2844   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2845                           Dynarr_length (str->runoff));
2846   if (retval > 0)
2847     Dynarr_delete_many (str->runoff, 0, retval);
2848   /* Do NOT return retval.  The return value indicates how much
2849      of the incoming data was written, not how many bytes were
2850      written. */
2851   return size;
2852 }
2853
2854 static void
2855 reset_encoding_stream (struct encoding_stream *str)
2856 {
2857 #ifdef MULE
2858   switch (CODING_SYSTEM_TYPE (str->codesys))
2859     {
2860     case CODESYS_ISO2022:
2861       {
2862         int i;
2863
2864         str->encode_char = &char_encode_iso2022;
2865         str->finish = &char_finish_iso2022;
2866         for (i = 0; i < 4; i++)
2867           {
2868             str->iso2022.charset[i] =
2869               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2870             str->iso2022.force_charset_on_output[i] =
2871               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2872           }
2873         str->iso2022.register_left = 0;
2874         str->iso2022.register_right = 1;
2875         str->iso2022.current_charset = Qnil;
2876         str->iso2022.current_half = 0;
2877         break;
2878       }
2879     case CODESYS_CCL:
2880       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2881       break;
2882     case CODESYS_UTF8:
2883       str->encode_char = &char_encode_utf8;
2884       str->finish = &char_finish_utf8;
2885       break;
2886     case CODESYS_UCS4:
2887       str->encode_char = &char_encode_ucs4;
2888       str->finish = &char_finish_ucs4;
2889       break;
2890     case CODESYS_SHIFT_JIS:
2891       str->encode_char = &char_encode_shift_jis;
2892       str->finish = &char_finish_shift_jis;
2893       break;
2894     default:
2895       break;
2896     }
2897 #endif /* MULE */
2898   str->iso2022.current_char_boundary = 0;
2899   str->flags = str->ch = 0;
2900 }
2901
2902 static int
2903 encoding_rewinder (Lstream *stream)
2904 {
2905   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2906   reset_encoding_stream (str);
2907   Dynarr_reset (str->runoff);
2908   return Lstream_rewind (str->other_end);
2909 }
2910
2911 static int
2912 encoding_seekable_p (Lstream *stream)
2913 {
2914   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2915   return Lstream_seekable_p (str->other_end);
2916 }
2917
2918 static int
2919 encoding_flusher (Lstream *stream)
2920 {
2921   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2922   return Lstream_flush (str->other_end);
2923 }
2924
2925 static int
2926 encoding_closer (Lstream *stream)
2927 {
2928   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2929   if (stream->flags & LSTREAM_FL_WRITE)
2930     {
2931       str->flags |= CODING_STATE_END;
2932       encoding_writer (stream, 0, 0);
2933     }
2934   Dynarr_free (str->runoff);
2935   return Lstream_close (str->other_end);
2936 }
2937
2938 Lisp_Object
2939 encoding_stream_coding_system (Lstream *stream)
2940 {
2941   Lisp_Object coding_system;
2942   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2943
2944   XSETCODING_SYSTEM (coding_system, str->codesys);
2945   return coding_system;
2946 }
2947
2948 void
2949 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2950 {
2951   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2952   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2953   str->codesys = cs;
2954   reset_encoding_stream (str);
2955 }
2956
2957 static Lisp_Object
2958 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2959                         const char *mode)
2960 {
2961   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2962   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2963   Lisp_Object obj;
2964
2965   xzero (*str);
2966   str->runoff = Dynarr_new (unsigned_char);
2967   str->other_end = stream;
2968   set_encoding_stream_coding_system (lstr, codesys);
2969   XSETLSTREAM (obj, lstr);
2970   return obj;
2971 }
2972
2973 Lisp_Object
2974 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2975 {
2976   return make_encoding_stream_1 (stream, codesys, "r");
2977 }
2978
2979 Lisp_Object
2980 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2981 {
2982   return make_encoding_stream_1 (stream, codesys, "w");
2983 }
2984
2985 /* Convert N bytes of internally-formatted data stored in SRC to an
2986    external format, according to the encoding stream ENCODING.
2987    Store the encoded data into DST. */
2988
2989 static void
2990 mule_encode (Lstream *encoding, const unsigned char *src,
2991              unsigned_char_dynarr *dst, unsigned int n)
2992 {
2993   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2994
2995   switch (CODING_SYSTEM_TYPE (str->codesys))
2996     {
2997 #ifdef DEBUG_XEMACS
2998     case CODESYS_INTERNAL:
2999       Dynarr_add_many (dst, src, n);
3000       break;
3001 #endif
3002     case CODESYS_AUTODETECT:
3003       /* If we got this far and still haven't decided on the coding
3004          system, then do no conversion. */
3005     case CODESYS_NO_CONVERSION:
3006       encode_coding_no_conversion (encoding, src, dst, n);
3007       break;
3008 #ifdef MULE
3009     case CODESYS_BIG5:
3010       encode_coding_big5 (encoding, src, dst, n);
3011       break;
3012     case CODESYS_CCL:
3013       str->ccl.last_block = str->flags & CODING_STATE_END;
3014       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
3015       break;
3016 #endif /* MULE */
3017     default:
3018       text_encode_generic (encoding, src, dst, n);
3019     }
3020 }
3021
3022 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3023 Encode the text between START and END using CODING-SYSTEM.
3024 This will, for example, convert Japanese characters into stuff such as
3025 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3026 text.  BUFFER defaults to the current buffer if unspecified.
3027 */
3028        (start, end, coding_system, buffer))
3029 {
3030   Bufpos b, e;
3031   struct buffer *buf = decode_buffer (buffer, 0);
3032   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3033   Lstream *istr, *ostr;
3034   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3035
3036   get_buffer_range_char (buf, start, end, &b, &e, 0);
3037
3038   barf_if_buffer_read_only (buf, b, e);
3039
3040   coding_system = Fget_coding_system (coding_system);
3041   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3042   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3043   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3044                                               Fget_coding_system (Qbinary));
3045   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3046                                            coding_system);
3047   istr = XLSTREAM (instream);
3048   ostr = XLSTREAM (outstream);
3049   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3050   /* The chain of streams looks like this:
3051
3052      [BUFFER] <----- send through
3053                      ------> [ENCODE AS SPECIFIED]
3054                              ------> [DECODE AS BINARY]
3055                                      ------> [BUFFER]
3056    */
3057   while (1)
3058     {
3059       char tempbuf[1024]; /* some random amount */
3060       Bufpos newpos, even_newer_pos;
3061       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3062       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3063
3064       if (!size_in_bytes)
3065         break;
3066       newpos = lisp_buffer_stream_startpos (istr);
3067       Lstream_write (ostr, tempbuf, size_in_bytes);
3068       even_newer_pos = lisp_buffer_stream_startpos (istr);
3069       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3070                            even_newer_pos, 0);
3071     }
3072
3073   {
3074     Charcount retlen =
3075       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3076     Lstream_close (istr);
3077     Lstream_close (ostr);
3078     UNGCPRO;
3079     Lstream_delete (istr);
3080     Lstream_delete (ostr);
3081     Lstream_delete (XLSTREAM (de_outstream));
3082     Lstream_delete (XLSTREAM (lb_outstream));
3083     return make_int (retlen);
3084   }
3085 }
3086
3087 #ifdef MULE
3088 \f
3089 static void
3090 text_encode_generic (Lstream *encoding, const unsigned char *src,
3091                      unsigned_char_dynarr *dst, unsigned int n)
3092 {
3093   unsigned char c;
3094   unsigned char char_boundary;
3095   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3096   unsigned int flags          = str->flags;
3097   Emchar ch                   = str->ch;
3098
3099   char_boundary = str->iso2022.current_char_boundary;
3100
3101   while (n--)
3102     {
3103       c = *src++;
3104
3105       if (char_boundary == 0)
3106         {
3107           if (c >= 0xfc)
3108             {
3109               ch = c & 0x01;
3110               char_boundary = 5;
3111             }
3112           else if (c >= 0xf8)
3113             {
3114               ch = c & 0x03;
3115               char_boundary = 4;
3116             }
3117           else if (c >= 0xf0)
3118             {
3119               ch = c & 0x07;
3120               char_boundary = 3;
3121             }
3122           else if (c >= 0xe0)
3123             {
3124               ch = c & 0x0f;
3125               char_boundary = 2;
3126             }
3127           else if (c >= 0xc0)
3128             {
3129               ch = c & 0x1f;
3130               char_boundary = 1;
3131             }
3132           else
3133             (*str->encode_char) (str, c, dst, &flags);
3134         }
3135       else if (char_boundary == 1)
3136         {
3137           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3138           ch =0;
3139           char_boundary = 0;
3140         }
3141       else
3142         {
3143           ch = (ch << 6) | (c & 0x3f);
3144           char_boundary--;
3145         }
3146     }
3147
3148   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3149     {
3150       (*str->finish) (str, dst, &flags);
3151     }
3152
3153   str->flags = flags;
3154   str->ch    = ch;
3155   str->iso2022.current_char_boundary = char_boundary;
3156 }
3157
3158 \f
3159 /************************************************************************/
3160 /*                          Shift-JIS methods                           */
3161 /************************************************************************/
3162
3163 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3164    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3165    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3166    encoded by "position-code + 0x80".  A character of JISX0208
3167    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3168    position-codes are divided and shifted so that it fit in the range
3169    below.
3170
3171    --- CODE RANGE of Shift-JIS ---
3172    (character set)      (range)
3173    ASCII                0x00 .. 0x7F
3174    JISX0201-Kana        0xA0 .. 0xDF
3175    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3176             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3177    -------------------------------
3178
3179 */
3180
3181 /* Is this the first byte of a Shift-JIS two-byte char? */
3182
3183 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3184   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3185
3186 /* Is this the second byte of a Shift-JIS two-byte char? */
3187
3188 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3189   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3190
3191 #define BYTE_SJIS_KATAKANA_P(c) \
3192   ((c) >= 0xA1 && (c) <= 0xDF)
3193
3194 static int
3195 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3196                     unsigned int n)
3197 {
3198   int c;
3199
3200   while (n--)
3201     {
3202       c = *src++;
3203       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3204         return 0;
3205       if (st->shift_jis.in_second_byte)
3206         {
3207           st->shift_jis.in_second_byte = 0;
3208           if (c < 0x40)
3209             return 0;
3210         }
3211       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3212         st->shift_jis.in_second_byte = 1;
3213     }
3214   return CODING_CATEGORY_SHIFT_JIS_MASK;
3215 }
3216
3217 /* Convert Shift-JIS data to internal format. */
3218
3219 static void
3220 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3221                     unsigned_char_dynarr *dst, unsigned int n)
3222 {
3223   unsigned char c;
3224   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3225   unsigned int flags  = str->flags;
3226   unsigned int cpos   = str->cpos;
3227   eol_type_t eol_type = str->eol_type;
3228
3229   while (n--)
3230     {
3231       c = *src++;
3232
3233       if (cpos)
3234         {
3235           /* Previous character was first byte of Shift-JIS Kanji char. */
3236           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3237             {
3238               unsigned char e1, e2;
3239
3240               DECODE_SJIS (cpos, c, e1, e2);
3241 #ifdef UTF2000
3242               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3243                                             e1 & 0x7F,
3244                                             e2 & 0x7F), dst);
3245 #else
3246               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3247               Dynarr_add (dst, e1);
3248               Dynarr_add (dst, e2);
3249 #endif
3250             }
3251           else
3252             {
3253               DECODE_ADD_BINARY_CHAR (cpos, dst);
3254               DECODE_ADD_BINARY_CHAR (c, dst);
3255             }
3256           cpos = 0;
3257         }
3258       else
3259         {
3260           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3261           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3262             cpos = c;
3263           else if (BYTE_SJIS_KATAKANA_P (c))
3264             {
3265 #ifdef UTF2000
3266               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3267                                             c & 0x7F, 0), dst);
3268 #else
3269               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3270               Dynarr_add (dst, c);
3271 #endif
3272             }
3273 #ifdef UTF2000
3274           else if (c > 32)
3275             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3276                                           c, 0), dst);
3277 #endif
3278           else
3279             DECODE_ADD_BINARY_CHAR (c, dst);
3280         }
3281     label_continue_loop:;
3282     }
3283
3284   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3285
3286   str->flags = flags;
3287   str->cpos  = cpos;
3288 }
3289
3290 /* Convert internal character representation to Shift_JIS. */
3291
3292 void
3293 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3294                        unsigned_char_dynarr *dst, unsigned int *flags)
3295 {
3296   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3297
3298   if (ch == '\n')
3299     {
3300       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3301         Dynarr_add (dst, '\r');
3302       if (eol_type != EOL_CR)
3303         Dynarr_add (dst, ch);
3304     }
3305   else
3306     {
3307       unsigned int s1, s2;
3308 #ifdef UTF2000
3309       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3310
3311       if (code_point >= 0)
3312         Dynarr_add (dst, code_point);
3313       else if ((code_point
3314                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3315                >= 0)
3316         {
3317           ENCODE_SJIS ((code_point >> 8) | 0x80,
3318                        (code_point & 0xFF) | 0x80, s1, s2);
3319           Dynarr_add (dst, s1);
3320           Dynarr_add (dst, s2);
3321         }
3322       else if ((code_point
3323                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3324                >= 0)
3325         Dynarr_add (dst, code_point | 0x80);
3326       else if ((code_point
3327                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3328                >= 0)
3329         {
3330           ENCODE_SJIS ((code_point >> 8) | 0x80,
3331                        (code_point & 0xFF) | 0x80, s1, s2);
3332           Dynarr_add (dst, s1);
3333           Dynarr_add (dst, s2);
3334         }
3335       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3336                >= 0)
3337         Dynarr_add (dst, code_point);
3338       else
3339         Dynarr_add (dst, '?');
3340 #else
3341       Lisp_Object charset;
3342       unsigned int c1, c2;
3343
3344       BREAKUP_CHAR (ch, charset, c1, c2);
3345
3346       if (EQ(charset, Vcharset_katakana_jisx0201))
3347         {
3348           Dynarr_add (dst, c1 | 0x80);
3349         }
3350       else if (c2 == 0)
3351         {
3352           Dynarr_add (dst, c1);
3353         }
3354       else if (EQ(charset, Vcharset_japanese_jisx0208))
3355         {
3356           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3357           Dynarr_add (dst, s1);
3358           Dynarr_add (dst, s2);
3359         }
3360       else
3361         Dynarr_add (dst, '?');
3362 #endif
3363     }
3364 }
3365
3366 void
3367 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3368                        unsigned int *flags)
3369 {
3370 }
3371
3372 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3373 Decode a JISX0208 character of Shift-JIS coding-system.
3374 CODE is the character code in Shift-JIS as a cons of type bytes.
3375 Return the corresponding character.
3376 */
3377        (code))
3378 {
3379   unsigned char c1, c2, s1, s2;
3380
3381   CHECK_CONS (code);
3382   CHECK_INT (XCAR (code));
3383   CHECK_INT (XCDR (code));
3384   s1 = XINT (XCAR (code));
3385   s2 = XINT (XCDR (code));
3386   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3387       BYTE_SJIS_TWO_BYTE_2_P (s2))
3388     {
3389       DECODE_SJIS (s1, s2, c1, c2);
3390       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3391                                    c1 & 0x7F, c2 & 0x7F));
3392     }
3393   else
3394     return Qnil;
3395 }
3396
3397 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3398 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3399 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3400 */
3401        (ch))
3402 {
3403   Lisp_Object charset;
3404   int c1, c2, s1, s2;
3405
3406   CHECK_CHAR_COERCE_INT (ch);
3407   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3408   if (EQ (charset, Vcharset_japanese_jisx0208))
3409     {
3410       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3411       return Fcons (make_int (s1), make_int (s2));
3412     }
3413   else
3414     return Qnil;
3415 }
3416
3417 \f
3418 /************************************************************************/
3419 /*                            Big5 methods                              */
3420 /************************************************************************/
3421
3422 /* BIG5 is a coding system encoding two character sets: ASCII and
3423    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3424    character set and is encoded in two-byte.
3425
3426    --- CODE RANGE of BIG5 ---
3427    (character set)      (range)
3428    ASCII                0x00 .. 0x7F
3429    Big5 (1st byte)      0xA1 .. 0xFE
3430         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3431    --------------------------
3432
3433    Since the number of characters in Big5 is larger than maximum
3434    characters in Emacs' charset (96x96), it can't be handled as one
3435    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3436    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3437    contains frequently used characters and the latter contains less
3438    frequently used characters.  */
3439
3440 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3441   ((c) >= 0xA1 && (c) <= 0xFE)
3442
3443 /* Is this the second byte of a Shift-JIS two-byte char? */
3444
3445 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3446   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3447
3448 /* Number of Big5 characters which have the same code in 1st byte.  */
3449
3450 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3451
3452 /* Code conversion macros.  These are macros because they are used in
3453    inner loops during code conversion.
3454
3455    Note that temporary variables in macros introduce the classic
3456    dynamic-scoping problems with variable names.  We use capital-
3457    lettered variables in the assumption that XEmacs does not use
3458    capital letters in variables except in a very formalized way
3459    (e.g. Qstring). */
3460
3461 /* Convert Big5 code (b1, b2) into its internal string representation
3462    (lb, c1, c2). */
3463
3464 /* There is a much simpler way to split the Big5 charset into two.
3465    For the moment I'm going to leave the algorithm as-is because it
3466    claims to separate out the most-used characters into a single
3467    charset, which perhaps will lead to optimizations in various
3468    places.
3469
3470    The way the algorithm works is something like this:
3471
3472    Big5 can be viewed as a 94x157 charset, where the row is
3473    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3474    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3475    the split between low and high column numbers is apparently
3476    meaningless; ascending rows produce less and less frequent chars.
3477    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3478    the first charset, and the upper half (0xC9 .. 0xFE) to the
3479    second.  To do the conversion, we convert the character into
3480    a single number where 0 .. 156 is the first row, 157 .. 313
3481    is the second, etc.  That way, the characters are ordered by
3482    decreasing frequency.  Then we just chop the space in two
3483    and coerce the result into a 94x94 space.
3484    */
3485
3486 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3487 {                                                                       \
3488   int B1 = b1, B2 = b2;                                                 \
3489   unsigned int I                                                        \
3490     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3491                                                                         \
3492   if (B1 < 0xC9)                                                        \
3493     {                                                                   \
3494       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3495     }                                                                   \
3496   else                                                                  \
3497     {                                                                   \
3498       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3499       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3500     }                                                                   \
3501   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3502   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3503 } while (0)
3504
3505 /* Convert the internal string representation of a Big5 character
3506    (lb, c1, c2) into Big5 code (b1, b2). */
3507
3508 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3509 {                                                                       \
3510   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3511                                                                         \
3512   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3513     {                                                                   \
3514       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3515     }                                                                   \
3516   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3517   b2 = I % BIG5_SAME_ROW;                                               \
3518   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3519 } while (0)
3520
3521 static int
3522 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3523                     unsigned int n)
3524 {
3525   int c;
3526
3527   while (n--)
3528     {
3529       c = *src++;
3530       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3531           (c >= 0x80 && c <= 0xA0))
3532         return 0;
3533       if (st->big5.in_second_byte)
3534         {
3535           st->big5.in_second_byte = 0;
3536           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3537             return 0;
3538         }
3539       else if (c >= 0xA1)
3540         st->big5.in_second_byte = 1;
3541     }
3542   return CODING_CATEGORY_BIG5_MASK;
3543 }
3544
3545 /* Convert Big5 data to internal format. */
3546
3547 static void
3548 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3549                     unsigned_char_dynarr *dst, unsigned int n)
3550 {
3551   unsigned char c;
3552   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3553   unsigned int flags  = str->flags;
3554   unsigned int cpos   = str->cpos;
3555   eol_type_t eol_type = str->eol_type;
3556
3557   while (n--)
3558     {
3559       c = *src++;
3560       if (cpos)
3561         {
3562           /* Previous character was first byte of Big5 char. */
3563           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3564             {
3565 #ifdef UTF2000
3566               DECODE_ADD_UCS_CHAR
3567                 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3568                  dst);
3569 #else
3570               unsigned char b1, b2, b3;
3571               DECODE_BIG5 (cpos, c, b1, b2, b3);
3572               Dynarr_add (dst, b1);
3573               Dynarr_add (dst, b2);
3574               Dynarr_add (dst, b3);
3575 #endif
3576             }
3577           else
3578             {
3579               DECODE_ADD_BINARY_CHAR (cpos, dst);
3580               DECODE_ADD_BINARY_CHAR (c, dst);
3581             }
3582           cpos = 0;
3583         }
3584       else
3585         {
3586           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3587           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3588             cpos = c;
3589           else
3590             DECODE_ADD_BINARY_CHAR (c, dst);
3591         }
3592     label_continue_loop:;
3593     }
3594
3595   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3596
3597   str->flags = flags;
3598   str->cpos  = cpos;
3599 }
3600
3601 /* Convert internally-formatted data to Big5. */
3602
3603 static void
3604 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3605                     unsigned_char_dynarr *dst, unsigned int n)
3606 {
3607 #ifndef UTF2000
3608   unsigned char c;
3609   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3610   unsigned int flags  = str->flags;
3611   unsigned int ch     = str->ch;
3612   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3613
3614   while (n--)
3615     {
3616       c = *src++;
3617       if (c == '\n')
3618         {
3619           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3620             Dynarr_add (dst, '\r');
3621           if (eol_type != EOL_CR)
3622             Dynarr_add (dst, '\n');
3623         }
3624       else if (BYTE_ASCII_P (c))
3625         {
3626           /* ASCII. */
3627           Dynarr_add (dst, c);
3628         }
3629       else if (BUFBYTE_LEADING_BYTE_P (c))
3630         {
3631           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3632               c == LEADING_BYTE_CHINESE_BIG5_2)
3633             {
3634               /* A recognized leading byte. */
3635               ch = c;
3636               continue; /* not done with this character. */
3637             }
3638           /* otherwise just ignore this character. */
3639         }
3640       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3641                ch == LEADING_BYTE_CHINESE_BIG5_2)
3642         {
3643           /* Previous char was a recognized leading byte. */
3644           ch = (ch << 8) | c;
3645           continue; /* not done with this character. */
3646         }
3647       else if (ch)
3648         {
3649           /* Encountering second byte of a Big5 character. */
3650           unsigned char b1, b2;
3651
3652           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3653           Dynarr_add (dst, b1);
3654           Dynarr_add (dst, b2);
3655         }
3656
3657       ch = 0;
3658     }
3659
3660   str->flags = flags;
3661   str->ch    = ch;
3662 #endif
3663 }
3664
3665
3666 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3667 Decode a Big5 character CODE of BIG5 coding-system.
3668 CODE is the character code in BIG5, a cons of two integers.
3669 Return the corresponding character.
3670 */
3671        (code))
3672 {
3673   unsigned char c1, c2, b1, b2;
3674
3675   CHECK_CONS (code);
3676   CHECK_INT (XCAR (code));
3677   CHECK_INT (XCDR (code));
3678   b1 = XINT (XCAR (code));
3679   b2 = XINT (XCDR (code));
3680   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3681       BYTE_BIG5_TWO_BYTE_2_P (b2))
3682     {
3683       Charset_ID leading_byte;
3684       Lisp_Object charset;
3685       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3686       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3687       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3688     }
3689   else
3690     return Qnil;
3691 }
3692
3693 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3694 Encode the Big5 character CH to BIG5 coding-system.
3695 Return the corresponding character code in Big5.
3696 */
3697        (ch))
3698 {
3699   Lisp_Object charset;
3700   int c1, c2, b1, b2;
3701
3702   CHECK_CHAR_COERCE_INT (ch);
3703   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3704   if (EQ (charset, Vcharset_chinese_big5_1) ||
3705       EQ (charset, Vcharset_chinese_big5_2))
3706     {
3707       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3708                    b1, b2);
3709       return Fcons (make_int (b1), make_int (b2));
3710     }
3711   else
3712     return Qnil;
3713 }
3714
3715 \f
3716 /************************************************************************/
3717 /*                           UCS-4 methods                              */
3718 /************************************************************************/
3719
3720 static int
3721 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3722                     unsigned int n)
3723 {
3724   while (n--)
3725     {
3726       int c = *src++;
3727       switch (st->ucs4.in_byte)
3728         {
3729         case 0:
3730           if (c >= 128)
3731             return 0;
3732           else
3733             st->ucs4.in_byte++;
3734           break;
3735         case 3:
3736           st->ucs4.in_byte = 0;
3737           break;
3738         default:
3739           st->ucs4.in_byte++;
3740         }
3741     }
3742   return CODING_CATEGORY_UCS4_MASK;
3743 }
3744
3745 static void
3746 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3747                     unsigned_char_dynarr *dst, unsigned int n)
3748 {
3749   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3750   unsigned int flags = str->flags;
3751   unsigned int cpos  = str->cpos;
3752   unsigned char counter = str->counter;
3753
3754   while (n--)
3755     {
3756       unsigned char c = *src++;
3757       switch (counter)
3758         {
3759         case 0:
3760           cpos = c;
3761           counter = 3;
3762           break;
3763         case 1:
3764           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3765           cpos = 0;
3766           counter = 0;
3767           break;
3768         default:
3769           cpos = ( cpos << 8 ) | c;
3770           counter--;
3771         }
3772     }
3773   if (counter & CODING_STATE_END)
3774     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3775
3776   str->flags    = flags;
3777   str->cpos     = cpos;
3778   str->counter  = counter;
3779 }
3780
3781 void
3782 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3783                   unsigned_char_dynarr *dst, unsigned int *flags)
3784 {
3785   Dynarr_add (dst, ch >> 24);
3786   Dynarr_add (dst, ch >> 16);
3787   Dynarr_add (dst, ch >>  8);
3788   Dynarr_add (dst, ch      );
3789 }
3790
3791 void
3792 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3793                   unsigned int *flags)
3794 {
3795 }
3796
3797 \f
3798 /************************************************************************/
3799 /*                           UTF-8 methods                              */
3800 /************************************************************************/
3801
3802 static int
3803 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3804                     unsigned int n)
3805 {
3806   while (n--)
3807     {
3808       unsigned char c = *src++;
3809       switch (st->utf8.in_byte)
3810         {
3811         case 0:
3812           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3813             return 0;
3814           else if (c >= 0xfc)
3815             st->utf8.in_byte = 5;
3816           else if (c >= 0xf8)
3817             st->utf8.in_byte = 4;
3818           else if (c >= 0xf0)
3819             st->utf8.in_byte = 3;
3820           else if (c >= 0xe0)
3821             st->utf8.in_byte = 2;
3822           else if (c >= 0xc0)
3823             st->utf8.in_byte = 1;
3824           else if (c >= 0x80)
3825             return 0;
3826           break;
3827         default:
3828           if ((c & 0xc0) != 0x80)
3829             return 0;
3830           else
3831             st->utf8.in_byte--;
3832         }
3833     }
3834   return CODING_CATEGORY_UTF8_MASK;
3835 }
3836
3837 static void
3838 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3839                     unsigned_char_dynarr *dst, unsigned int n)
3840 {
3841   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3842   unsigned int flags    = str->flags;
3843   unsigned int cpos     = str->cpos;
3844   eol_type_t eol_type   = str->eol_type;
3845   unsigned char counter = str->counter;
3846
3847   while (n--)
3848     {
3849       unsigned char c = *src++;
3850       switch (counter)
3851         {
3852         case 0:
3853           if ( c >= 0xfc )
3854             {
3855               cpos = c & 0x01;
3856               counter = 5;
3857             }
3858           else if ( c >= 0xf8 )
3859             {
3860               cpos = c & 0x03;
3861               counter = 4;
3862             }
3863           else if ( c >= 0xf0 )
3864             {
3865               cpos = c & 0x07;
3866               counter = 3;
3867             }
3868           else if ( c >= 0xe0 )
3869             {
3870               cpos = c & 0x0f;
3871               counter = 2;
3872             }
3873           else if ( c >= 0xc0 )
3874             {
3875               cpos = c & 0x1f;
3876               counter = 1;
3877             }
3878           else
3879             {
3880               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3881               DECODE_ADD_UCS_CHAR (c, dst);
3882             }
3883           break;
3884         case 1:
3885           cpos = ( cpos << 6 ) | ( c & 0x3f );
3886           DECODE_ADD_UCS_CHAR (cpos, dst);
3887           cpos = 0;
3888           counter = 0;
3889           break;
3890         default:
3891           cpos = ( cpos << 6 ) | ( c & 0x3f );
3892           counter--;
3893         }
3894     label_continue_loop:;
3895     }
3896
3897   if (flags & CODING_STATE_END)
3898     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3899
3900   str->flags    = flags;
3901   str->cpos     = cpos;
3902   str->counter  = counter;
3903 }
3904
3905 void
3906 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3907                   unsigned_char_dynarr *dst, unsigned int *flags)
3908 {
3909   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3910
3911   if (ch == '\n')
3912     {
3913       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3914         Dynarr_add (dst, '\r');
3915       if (eol_type != EOL_CR)
3916         Dynarr_add (dst, ch);
3917     }
3918   else if (ch <= 0x7f)
3919     {
3920       Dynarr_add (dst, ch);
3921     }
3922   else if (ch <= 0x7ff)
3923     {
3924       Dynarr_add (dst, (ch >> 6) | 0xc0);
3925       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3926     }
3927   else if (ch <= 0xffff)
3928     {
3929       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3930       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3931       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3932     }
3933   else if (ch <= 0x1fffff)
3934     {
3935       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3936       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3937       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3938       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3939     }
3940   else if (ch <= 0x3ffffff)
3941     {
3942       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3943       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3944       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3945       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3946       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3947     }
3948   else
3949     {
3950       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3951       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3952       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3953       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3954       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3955       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3956     }
3957 }
3958
3959 void
3960 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3961                   unsigned int *flags)
3962 {
3963 }
3964
3965 \f
3966 /************************************************************************/
3967 /*                           ISO2022 methods                            */
3968 /************************************************************************/
3969
3970 /* The following note describes the coding system ISO2022 briefly.
3971    Since the intention of this note is to help understand the
3972    functions in this file, some parts are NOT ACCURATE or OVERLY
3973    SIMPLIFIED.  For thorough understanding, please refer to the
3974    original document of ISO2022.
3975
3976    ISO2022 provides many mechanisms to encode several character sets
3977    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3978    is encoded using bytes less than 128.  This may make the encoded
3979    text a little bit longer, but the text passes more easily through
3980    several gateways, some of which strip off MSB (Most Signigant Bit).
3981
3982    There are two kinds of character sets: control character set and
3983    graphic character set.  The former contains control characters such
3984    as `newline' and `escape' to provide control functions (control
3985    functions are also provided by escape sequences).  The latter
3986    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3987    two control character sets and many graphic character sets.
3988
3989    Graphic character sets are classified into one of the following
3990    four classes, according to the number of bytes (DIMENSION) and
3991    number of characters in one dimension (CHARS) of the set:
3992    - DIMENSION1_CHARS94
3993    - DIMENSION1_CHARS96
3994    - DIMENSION2_CHARS94
3995    - DIMENSION2_CHARS96
3996
3997    In addition, each character set is assigned an identification tag,
3998    unique for each set, called "final character" (denoted as <F>
3999    hereafter).  The <F> of each character set is decided by ECMA(*)
4000    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4001    (0x30..0x3F are for private use only).
4002
4003    Note (*): ECMA = European Computer Manufacturers Association
4004
4005    Here are examples of graphic character set [NAME(<F>)]:
4006         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4007         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4008         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4009         o DIMENSION2_CHARS96 -- none for the moment
4010
4011    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4012         C0 [0x00..0x1F] -- control character plane 0
4013         GL [0x20..0x7F] -- graphic character plane 0
4014         C1 [0x80..0x9F] -- control character plane 1
4015         GR [0xA0..0xFF] -- graphic character plane 1
4016
4017    A control character set is directly designated and invoked to C0 or
4018    C1 by an escape sequence.  The most common case is that:
4019    - ISO646's  control character set is designated/invoked to C0, and
4020    - ISO6429's control character set is designated/invoked to C1,
4021    and usually these designations/invocations are omitted in encoded
4022    text.  In a 7-bit environment, only C0 can be used, and a control
4023    character for C1 is encoded by an appropriate escape sequence to
4024    fit into the environment.  All control characters for C1 are
4025    defined to have corresponding escape sequences.
4026
4027    A graphic character set is at first designated to one of four
4028    graphic registers (G0 through G3), then these graphic registers are
4029    invoked to GL or GR.  These designations and invocations can be
4030    done independently.  The most common case is that G0 is invoked to
4031    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4032    these invocations and designations are omitted in encoded text.
4033    In a 7-bit environment, only GL can be used.
4034
4035    When a graphic character set of CHARS94 is invoked to GL, codes
4036    0x20 and 0x7F of the GL area work as control characters SPACE and
4037    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4038    be used.
4039
4040    There are two ways of invocation: locking-shift and single-shift.
4041    With locking-shift, the invocation lasts until the next different
4042    invocation, whereas with single-shift, the invocation affects the
4043    following character only and doesn't affect the locking-shift
4044    state.  Invocations are done by the following control characters or
4045    escape sequences:
4046
4047    ----------------------------------------------------------------------
4048    abbrev  function                  cntrl escape seq   description
4049    ----------------------------------------------------------------------
4050    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4051    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4052    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4053    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4054    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4055    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4056    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4057    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4058    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4059    ----------------------------------------------------------------------
4060    (*) These are not used by any known coding system.
4061
4062    Control characters for these functions are defined by macros
4063    ISO_CODE_XXX in `coding.h'.
4064
4065    Designations are done by the following escape sequences:
4066    ----------------------------------------------------------------------
4067    escape sequence      description
4068    ----------------------------------------------------------------------
4069    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4070    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4071    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4072    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4073    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4074    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4075    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4076    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4077    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4078    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4079    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4080    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4081    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4082    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4083    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4084    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4085    ----------------------------------------------------------------------
4086
4087    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4088    of dimension 1, chars 94, and final character <F>, etc...
4089
4090    Note (*): Although these designations are not allowed in ISO2022,
4091    Emacs accepts them on decoding, and produces them on encoding
4092    CHARS96 character sets in a coding system which is characterized as
4093    7-bit environment, non-locking-shift, and non-single-shift.
4094
4095    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4096    '(' can be omitted.  We refer to this as "short-form" hereafter.
4097
4098    Now you may notice that there are a lot of ways for encoding the
4099    same multilingual text in ISO2022.  Actually, there exist many
4100    coding systems such as Compound Text (used in X11's inter client
4101    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4102    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4103    localized platforms), and all of these are variants of ISO2022.
4104
4105    In addition to the above, Emacs handles two more kinds of escape
4106    sequences: ISO6429's direction specification and Emacs' private
4107    sequence for specifying character composition.
4108
4109    ISO6429's direction specification takes the following form:
4110         o CSI ']'      -- end of the current direction
4111         o CSI '0' ']'  -- end of the current direction
4112         o CSI '1' ']'  -- start of left-to-right text
4113         o CSI '2' ']'  -- start of right-to-left text
4114    The control character CSI (0x9B: control sequence introducer) is
4115    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4116
4117    Character composition specification takes the following form:
4118         o ESC '0' -- start character composition
4119         o ESC '1' -- end character composition
4120    Since these are not standard escape sequences of any ISO standard,
4121    their use with these meanings is restricted to Emacs only.  */
4122
4123 static void
4124 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4125 {
4126   int i;
4127
4128   for (i = 0; i < 4; i++)
4129     {
4130       if (!NILP (coding_system))
4131         iso->charset[i] =
4132           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4133       else
4134         iso->charset[i] = Qt;
4135       iso->invalid_designated[i] = 0;
4136     }
4137   iso->esc = ISO_ESC_NOTHING;
4138   iso->esc_bytes_index = 0;
4139   iso->register_left = 0;
4140   iso->register_right = 1;
4141   iso->switched_dir_and_no_valid_charset_yet = 0;
4142   iso->invalid_switch_dir = 0;
4143   iso->output_direction_sequence = 0;
4144   iso->output_literally = 0;
4145 #ifdef ENABLE_COMPOSITE_CHARS
4146   if (iso->composite_chars)
4147     Dynarr_reset (iso->composite_chars);
4148 #endif
4149 }
4150
4151 static int
4152 fit_to_be_escape_quoted (unsigned char c)
4153 {
4154   switch (c)
4155     {
4156     case ISO_CODE_ESC:
4157     case ISO_CODE_CSI:
4158     case ISO_CODE_SS2:
4159     case ISO_CODE_SS3:
4160     case ISO_CODE_SO:
4161     case ISO_CODE_SI:
4162       return 1;
4163
4164     default:
4165       return 0;
4166     }
4167 }
4168
4169 /* Parse one byte of an ISO2022 escape sequence.
4170    If the result is an invalid escape sequence, return 0 and
4171    do not change anything in STR.  Otherwise, if the result is
4172    an incomplete escape sequence, update ISO2022.ESC and
4173    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4174    all the state variables (but not ISO2022.ESC_BYTES) and
4175    return 1.
4176
4177    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4178    or invocation of an invalid character set and treat that as
4179    an unrecognized escape sequence. */
4180
4181 static int
4182 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4183                    unsigned char c, unsigned int *flags,
4184                    int check_invalid_charsets)
4185 {
4186   /* (1) If we're at the end of a designation sequence, CS is the
4187      charset being designated and REG is the register to designate
4188      it to.
4189
4190      (2) If we're at the end of a locking-shift sequence, REG is
4191      the register to invoke and HALF (0 == left, 1 == right) is
4192      the half to invoke it into.
4193
4194      (3) If we're at the end of a single-shift sequence, REG is
4195      the register to invoke. */
4196   Lisp_Object cs = Qnil;
4197   int reg, half;
4198
4199   /* NOTE: This code does goto's all over the fucking place.
4200      The reason for this is that we're basically implementing
4201      a state machine here, and hierarchical languages like C
4202      don't really provide a clean way of doing this. */
4203
4204   if (! (*flags & CODING_STATE_ESCAPE))
4205     /* At beginning of escape sequence; we need to reset our
4206        escape-state variables. */
4207     iso->esc = ISO_ESC_NOTHING;
4208
4209   iso->output_literally = 0;
4210   iso->output_direction_sequence = 0;
4211
4212   switch (iso->esc)
4213     {
4214     case ISO_ESC_NOTHING:
4215       iso->esc_bytes_index = 0;
4216       switch (c)
4217         {
4218         case ISO_CODE_ESC:      /* Start escape sequence */
4219           *flags |= CODING_STATE_ESCAPE;
4220           iso->esc = ISO_ESC;
4221           goto not_done;
4222
4223         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4224           *flags |= CODING_STATE_ESCAPE;
4225           iso->esc = ISO_ESC_5_11;
4226           goto not_done;
4227
4228         case ISO_CODE_SO:       /* locking shift 1 */
4229           reg = 1; half = 0;
4230           goto locking_shift;
4231         case ISO_CODE_SI:       /* locking shift 0 */
4232           reg = 0; half = 0;
4233           goto locking_shift;
4234
4235         case ISO_CODE_SS2:      /* single shift */
4236           reg = 2;
4237           goto single_shift;
4238         case ISO_CODE_SS3:      /* single shift */
4239           reg = 3;
4240           goto single_shift;
4241
4242         default:                        /* Other control characters */
4243           return 0;
4244         }
4245
4246     case ISO_ESC:
4247       switch (c)
4248         {
4249           /**** single shift ****/
4250
4251         case 'N':       /* single shift 2 */
4252           reg = 2;
4253           goto single_shift;
4254         case 'O':       /* single shift 3 */
4255           reg = 3;
4256           goto single_shift;
4257
4258           /**** locking shift ****/
4259
4260         case '~':       /* locking shift 1 right */
4261           reg = 1; half = 1;
4262           goto locking_shift;
4263         case 'n':       /* locking shift 2 */
4264           reg = 2; half = 0;
4265           goto locking_shift;
4266         case '}':       /* locking shift 2 right */
4267           reg = 2; half = 1;
4268           goto locking_shift;
4269         case 'o':       /* locking shift 3 */
4270           reg = 3; half = 0;
4271           goto locking_shift;
4272         case '|':       /* locking shift 3 right */
4273           reg = 3; half = 1;
4274           goto locking_shift;
4275
4276 #ifdef ENABLE_COMPOSITE_CHARS
4277           /**** composite ****/
4278
4279         case '0':
4280           iso->esc = ISO_ESC_START_COMPOSITE;
4281           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4282             CODING_STATE_COMPOSITE;
4283           return 1;
4284
4285         case '1':
4286           iso->esc = ISO_ESC_END_COMPOSITE;
4287           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4288             ~CODING_STATE_COMPOSITE;
4289           return 1;
4290 #endif /* ENABLE_COMPOSITE_CHARS */
4291
4292           /**** directionality ****/
4293
4294         case '[':
4295           iso->esc = ISO_ESC_5_11;
4296           goto not_done;
4297
4298           /**** designation ****/
4299
4300         case '$':       /* multibyte charset prefix */
4301           iso->esc = ISO_ESC_2_4;
4302           goto not_done;
4303
4304         default:
4305           if (0x28 <= c && c <= 0x2F)
4306             {
4307               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4308               goto not_done;
4309             }
4310
4311           /* This function is called with CODESYS equal to nil when
4312              doing coding-system detection. */
4313           if (!NILP (codesys)
4314               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4315               && fit_to_be_escape_quoted (c))
4316             {
4317               iso->esc = ISO_ESC_LITERAL;
4318               *flags &= CODING_STATE_ISO2022_LOCK;
4319               return 1;
4320             }
4321
4322           /* bzzzt! */
4323           return 0;
4324         }
4325
4326
4327
4328       /**** directionality ****/
4329
4330     case ISO_ESC_5_11:          /* ISO6429 direction control */
4331       if (c == ']')
4332         {
4333           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4334           goto directionality;
4335         }
4336       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4337       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4338       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4339       else               return 0;
4340       goto not_done;
4341
4342     case ISO_ESC_5_11_0:
4343       if (c == ']')
4344         {
4345           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4346           goto directionality;
4347         }
4348       return 0;
4349
4350     case ISO_ESC_5_11_1:
4351       if (c == ']')
4352         {
4353           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4354           goto directionality;
4355         }
4356       return 0;
4357
4358     case ISO_ESC_5_11_2:
4359       if (c == ']')
4360         {
4361           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4362           goto directionality;
4363         }
4364       return 0;
4365
4366     directionality:
4367       iso->esc = ISO_ESC_DIRECTIONALITY;
4368       /* Various junk here to attempt to preserve the direction sequences
4369          literally in the text if they would otherwise be swallowed due
4370          to invalid designations that don't show up as actual charset
4371          changes in the text. */
4372       if (iso->invalid_switch_dir)
4373         {
4374           /* We already inserted a direction switch literally into the
4375              text.  We assume (#### this may not be right) that the
4376              next direction switch is the one going the other way,
4377              and we need to output that literally as well. */
4378           iso->output_literally = 1;
4379           iso->invalid_switch_dir = 0;
4380         }
4381       else
4382         {
4383           int jj;
4384
4385           /* If we are in the thrall of an invalid designation,
4386            then stick the directionality sequence literally into the
4387            output stream so it ends up in the original text again. */
4388           for (jj = 0; jj < 4; jj++)
4389             if (iso->invalid_designated[jj])
4390               break;
4391           if (jj < 4)
4392             {
4393               iso->output_literally = 1;
4394               iso->invalid_switch_dir = 1;
4395             }
4396           else
4397             /* Indicate that we haven't yet seen a valid designation,
4398                so that if a switch-dir is directly followed by an
4399                invalid designation, both get inserted literally. */
4400             iso->switched_dir_and_no_valid_charset_yet = 1;
4401         }
4402       return 1;
4403
4404
4405       /**** designation ****/
4406
4407     case ISO_ESC_2_4:
4408       if (0x28 <= c && c <= 0x2F)
4409         {
4410           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4411           goto not_done;
4412         }
4413       if (0x40 <= c && c <= 0x42)
4414         {
4415           /* 94^n-set */
4416           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4417                                       *flags & CODING_STATE_R2L ?
4418                                       CHARSET_RIGHT_TO_LEFT :
4419                                       CHARSET_LEFT_TO_RIGHT);
4420           reg = 0;
4421           goto designated;
4422         }
4423       return 0;
4424
4425     default:
4426       {
4427         int chars = 0;
4428         int single = 0;
4429
4430         if (c < '0' || c > '~')
4431           return 0; /* bad final byte */
4432
4433         if (iso->esc >= ISO_ESC_2_8 &&
4434             iso->esc <= ISO_ESC_2_15)
4435           {
4436             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4437             single = 1; /* single-byte */
4438             reg = (iso->esc - ISO_ESC_2_8) & 3;
4439           }
4440         else if (iso->esc >= ISO_ESC_2_4_8 &&
4441                  iso->esc <= ISO_ESC_2_4_15)
4442           {
4443             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4444             single = -1; /* multi-byte */
4445             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4446           }
4447         else
4448           {
4449             /* Can this ever be reached? -slb */
4450             abort();
4451           }
4452
4453         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4454                                     *flags & CODING_STATE_R2L ?
4455                                     CHARSET_RIGHT_TO_LEFT :
4456                                     CHARSET_LEFT_TO_RIGHT);
4457         goto designated;
4458       }
4459     }
4460
4461  not_done:
4462   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4463   return -1;
4464
4465  single_shift:
4466   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4467     /* can't invoke something that ain't there. */
4468     return 0;
4469   iso->esc = ISO_ESC_SINGLE_SHIFT;
4470   *flags &= CODING_STATE_ISO2022_LOCK;
4471   if (reg == 2)
4472     *flags |= CODING_STATE_SS2;
4473   else
4474     *flags |= CODING_STATE_SS3;
4475   return 1;
4476
4477  locking_shift:
4478   if (check_invalid_charsets &&
4479       !CHARSETP (iso->charset[reg]))
4480     /* can't invoke something that ain't there. */
4481     return 0;
4482   if (half)
4483     iso->register_right = reg;
4484   else
4485     iso->register_left = reg;
4486   *flags &= CODING_STATE_ISO2022_LOCK;
4487   iso->esc = ISO_ESC_LOCKING_SHIFT;
4488   return 1;
4489
4490  designated:
4491   if (NILP (cs) && check_invalid_charsets)
4492     {
4493       iso->invalid_designated[reg] = 1;
4494       iso->charset[reg] = Vcharset_ascii;
4495       iso->esc = ISO_ESC_DESIGNATE;
4496       *flags &= CODING_STATE_ISO2022_LOCK;
4497       iso->output_literally = 1;
4498       if (iso->switched_dir_and_no_valid_charset_yet)
4499         {
4500           /* We encountered a switch-direction followed by an
4501              invalid designation.  Ensure that the switch-direction
4502              gets outputted; otherwise it will probably get eaten
4503              when the text is written out again. */
4504           iso->switched_dir_and_no_valid_charset_yet = 0;
4505           iso->output_direction_sequence = 1;
4506           /* And make sure that the switch-dir going the other
4507              way gets outputted, as well. */
4508           iso->invalid_switch_dir = 1;
4509         }
4510       return 1;
4511     }
4512   /* This function is called with CODESYS equal to nil when
4513      doing coding-system detection. */
4514   if (!NILP (codesys))
4515     {
4516       charset_conversion_spec_dynarr *dyn =
4517         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4518
4519       if (dyn)
4520         {
4521           int i;
4522
4523           for (i = 0; i < Dynarr_length (dyn); i++)
4524             {
4525               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4526               if (EQ (cs, spec->from_charset))
4527                 cs = spec->to_charset;
4528             }
4529         }
4530     }
4531
4532   iso->charset[reg] = cs;
4533   iso->esc = ISO_ESC_DESIGNATE;
4534   *flags &= CODING_STATE_ISO2022_LOCK;
4535   if (iso->invalid_designated[reg])
4536     {
4537       iso->invalid_designated[reg] = 0;
4538       iso->output_literally = 1;
4539     }
4540   if (iso->switched_dir_and_no_valid_charset_yet)
4541     iso->switched_dir_and_no_valid_charset_yet = 0;
4542   return 1;
4543 }
4544
4545 static int
4546 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4547                        unsigned int n)
4548 {
4549   int mask;
4550
4551   /* #### There are serious deficiencies in the recognition mechanism
4552      here.  This needs to be much smarter if it's going to cut it.
4553      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4554      it should be detected as Latin-1.
4555      All the ISO2022 stuff in this file should be synced up with the
4556      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4557      Perhaps we should wait till R2L works in FSF Emacs? */
4558
4559   if (!st->iso2022.initted)
4560     {
4561       reset_iso2022 (Qnil, &st->iso2022.iso);
4562       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4563                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4564                           CODING_CATEGORY_ISO_8_1_MASK |
4565                           CODING_CATEGORY_ISO_8_2_MASK |
4566                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4567       st->iso2022.flags = 0;
4568       st->iso2022.high_byte_count = 0;
4569       st->iso2022.saw_single_shift = 0;
4570       st->iso2022.initted = 1;
4571     }
4572
4573   mask = st->iso2022.mask;
4574
4575   while (n--)
4576     {
4577       int c = *src++;
4578       if (c >= 0xA0)
4579         {
4580           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4581           st->iso2022.high_byte_count++;
4582         }
4583       else
4584         {
4585           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4586             {
4587               if (st->iso2022.high_byte_count & 1)
4588                 /* odd number of high bytes; assume not iso-8-2 */
4589                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4590             }
4591           st->iso2022.high_byte_count = 0;
4592           st->iso2022.saw_single_shift = 0;
4593           if (c > 0x80)
4594             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4595         }
4596       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4597           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4598         { /* control chars */
4599           switch (c)
4600             {
4601               /* Allow and ignore control characters that you might
4602                  reasonably see in a text file */
4603             case '\r':
4604             case '\n':
4605             case '\t':
4606             case  7: /* bell */
4607             case  8: /* backspace */
4608             case 11: /* vertical tab */
4609             case 12: /* form feed */
4610             case 26: /* MS-DOS C-z junk */
4611             case 31: /* '^_' -- for info */
4612               goto label_continue_loop;
4613
4614             default:
4615               break;
4616             }
4617         }
4618
4619       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4620           || BYTE_C1_P (c))
4621         {
4622           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4623                                  &st->iso2022.flags, 0))
4624             {
4625               switch (st->iso2022.iso.esc)
4626                 {
4627                 case ISO_ESC_DESIGNATE:
4628                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4629                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4630                   break;
4631                 case ISO_ESC_LOCKING_SHIFT:
4632                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4633                   goto ran_out_of_chars;
4634                 case ISO_ESC_SINGLE_SHIFT:
4635                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4636                   st->iso2022.saw_single_shift = 1;
4637                   break;
4638                 default:
4639                   break;
4640                 }
4641             }
4642           else
4643             {
4644               mask = 0;
4645               goto ran_out_of_chars;
4646             }
4647         }
4648     label_continue_loop:;
4649     }
4650
4651  ran_out_of_chars:
4652
4653   return mask;
4654 }
4655
4656 static int
4657 postprocess_iso2022_mask (int mask)
4658 {
4659   /* #### kind of cheesy */
4660   /* If seven-bit ISO is allowed, then assume that the encoding is
4661      entirely seven-bit and turn off the eight-bit ones. */
4662   if (mask & CODING_CATEGORY_ISO_7_MASK)
4663     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4664                CODING_CATEGORY_ISO_8_1_MASK |
4665                CODING_CATEGORY_ISO_8_2_MASK);
4666   return mask;
4667 }
4668
4669 /* If FLAGS is a null pointer or specifies right-to-left motion,
4670    output a switch-dir-to-left-to-right sequence to DST.
4671    Also update FLAGS if it is not a null pointer.
4672    If INTERNAL_P is set, we are outputting in internal format and
4673    need to handle the CSI differently. */
4674
4675 static void
4676 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4677                                  unsigned_char_dynarr *dst,
4678                                  unsigned int *flags,
4679                                  int internal_p)
4680 {
4681   if (!flags || (*flags & CODING_STATE_R2L))
4682     {
4683       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4684         {
4685           Dynarr_add (dst, ISO_CODE_ESC);
4686           Dynarr_add (dst, '[');
4687         }
4688       else if (internal_p)
4689         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4690       else
4691         Dynarr_add (dst, ISO_CODE_CSI);
4692       Dynarr_add (dst, '0');
4693       Dynarr_add (dst, ']');
4694       if (flags)
4695         *flags &= ~CODING_STATE_R2L;
4696     }
4697 }
4698
4699 /* If FLAGS is a null pointer or specifies a direction different from
4700    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4701    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4702    sequence to DST.  Also update FLAGS if it is not a null pointer.
4703    If INTERNAL_P is set, we are outputting in internal format and
4704    need to handle the CSI differently. */
4705
4706 static void
4707 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4708                           unsigned_char_dynarr *dst, unsigned int *flags,
4709                           int internal_p)
4710 {
4711   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4712       direction == CHARSET_LEFT_TO_RIGHT)
4713     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4714   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4715            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4716            direction == CHARSET_RIGHT_TO_LEFT)
4717     {
4718       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4719         {
4720           Dynarr_add (dst, ISO_CODE_ESC);
4721           Dynarr_add (dst, '[');
4722         }
4723       else if (internal_p)
4724         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4725       else
4726         Dynarr_add (dst, ISO_CODE_CSI);
4727       Dynarr_add (dst, '2');
4728       Dynarr_add (dst, ']');
4729       if (flags)
4730         *flags |= CODING_STATE_R2L;
4731     }
4732 }
4733
4734 /* Convert ISO2022-format data to internal format. */
4735
4736 static void
4737 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4738                        unsigned_char_dynarr *dst, unsigned int n)
4739 {
4740   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4741   unsigned int flags    = str->flags;
4742   unsigned int cpos     = str->cpos;
4743   unsigned char counter = str->counter;
4744   eol_type_t eol_type   = str->eol_type;
4745 #ifdef ENABLE_COMPOSITE_CHARS
4746   unsigned_char_dynarr *real_dst = dst;
4747 #endif
4748   Lisp_Object coding_system;
4749
4750   XSETCODING_SYSTEM (coding_system, str->codesys);
4751
4752 #ifdef ENABLE_COMPOSITE_CHARS
4753   if (flags & CODING_STATE_COMPOSITE)
4754     dst = str->iso2022.composite_chars;
4755 #endif /* ENABLE_COMPOSITE_CHARS */
4756
4757   while (n--)
4758     {
4759       unsigned char c = *src++;
4760       if (flags & CODING_STATE_ESCAPE)
4761         {       /* Within ESC sequence */
4762           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4763                                           c, &flags, 1);
4764
4765           if (retval)
4766             {
4767               switch (str->iso2022.esc)
4768                 {
4769 #ifdef ENABLE_COMPOSITE_CHARS
4770                 case ISO_ESC_START_COMPOSITE:
4771                   if (str->iso2022.composite_chars)
4772                     Dynarr_reset (str->iso2022.composite_chars);
4773                   else
4774                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4775                   dst = str->iso2022.composite_chars;
4776                   break;
4777                 case ISO_ESC_END_COMPOSITE:
4778                   {
4779                     Bufbyte comstr[MAX_EMCHAR_LEN];
4780                     Bytecount len;
4781                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4782                                                          Dynarr_length (dst));
4783                     dst = real_dst;
4784                     len = set_charptr_emchar (comstr, emch);
4785                     Dynarr_add_many (dst, comstr, len);
4786                     break;
4787                   }
4788 #endif /* ENABLE_COMPOSITE_CHARS */
4789
4790                 case ISO_ESC_LITERAL:
4791                   COMPOSE_FLUSH_CHARS (str, dst);
4792                   DECODE_ADD_BINARY_CHAR (c, dst);
4793                   break;
4794
4795                 default:
4796                   /* Everything else handled already */
4797                   break;
4798                 }
4799             }
4800
4801           /* Attempted error recovery. */
4802           if (str->iso2022.output_direction_sequence)
4803             ensure_correct_direction (flags & CODING_STATE_R2L ?
4804                                       CHARSET_RIGHT_TO_LEFT :
4805                                       CHARSET_LEFT_TO_RIGHT,
4806                                       str->codesys, dst, 0, 1);
4807           /* More error recovery. */
4808           if (!retval || str->iso2022.output_literally)
4809             {
4810               /* Output the (possibly invalid) sequence */
4811               int i;
4812               COMPOSE_FLUSH_CHARS (str, dst);
4813               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4814                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4815               flags &= CODING_STATE_ISO2022_LOCK;
4816               if (!retval)
4817                 n++, src--;/* Repeat the loop with the same character. */
4818               else
4819                 {
4820                   /* No sense in reprocessing the final byte of the
4821                      escape sequence; it could mess things up anyway.
4822                      Just add it now. */
4823                   COMPOSE_FLUSH_CHARS (str, dst);
4824                   DECODE_ADD_BINARY_CHAR (c, dst);
4825                 }
4826             }
4827           cpos = 0;
4828           counter = 0;
4829         }
4830       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4831         { /* Control characters */
4832
4833           /***** Error-handling *****/
4834
4835           /* If we were in the middle of a character, dump out the
4836              partial character. */
4837           if (counter)
4838             {
4839               COMPOSE_FLUSH_CHARS (str, dst);
4840               while (counter > 0)
4841                 {
4842                   counter--;
4843                   DECODE_ADD_BINARY_CHAR
4844                     ((unsigned char)(cpos >> (counter * 8)), dst);
4845                 }
4846               cpos = 0;
4847             }
4848
4849           /* If we just saw a single-shift character, dump it out.
4850              This may dump out the wrong sort of single-shift character,
4851              but least it will give an indication that something went
4852              wrong. */
4853           if (flags & CODING_STATE_SS2)
4854             {
4855               COMPOSE_FLUSH_CHARS (str, dst);
4856               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4857               flags &= ~CODING_STATE_SS2;
4858             }
4859           if (flags & CODING_STATE_SS3)
4860             {
4861               COMPOSE_FLUSH_CHARS (str, dst);
4862               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4863               flags &= ~CODING_STATE_SS3;
4864             }
4865
4866           /***** Now handle the control characters. *****/
4867
4868           /* Handle CR/LF */
4869 #ifdef UTF2000
4870           if (c == '\r')
4871             {
4872               COMPOSE_FLUSH_CHARS (str, dst);
4873               if (eol_type == EOL_CR)
4874                 Dynarr_add (dst, '\n');
4875               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4876                 Dynarr_add (dst, c);
4877               else
4878                 flags |= CODING_STATE_CR;
4879               goto label_continue_loop;
4880             }
4881           else if (flags & CODING_STATE_CR)
4882             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4883               if (c != '\n')
4884                 Dynarr_add (dst, '\r');
4885               flags &= ~CODING_STATE_CR;
4886             }
4887 #else
4888           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4889 #endif
4890
4891           flags &= CODING_STATE_ISO2022_LOCK;
4892
4893           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4894             {
4895               COMPOSE_FLUSH_CHARS (str, dst);
4896               DECODE_ADD_BINARY_CHAR (c, dst);
4897             }
4898         }
4899       else
4900         {                       /* Graphic characters */
4901           Lisp_Object charset;
4902 #ifndef UTF2000
4903           Charset_ID lb;
4904 #endif
4905           int reg;
4906
4907 #ifdef UTF2000
4908           if (c == '\r')
4909             {
4910               COMPOSE_FLUSH_CHARS (str, dst);
4911               if (eol_type == EOL_CR)
4912                 Dynarr_add (dst, '\n');
4913               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4914                 Dynarr_add (dst, c);
4915               else
4916                 flags |= CODING_STATE_CR;
4917               goto label_continue_loop;
4918             }
4919           else if (flags & CODING_STATE_CR)
4920             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4921               if (c != '\n')
4922                 Dynarr_add (dst, '\r');
4923               flags &= ~CODING_STATE_CR;
4924             }
4925 #else
4926           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4927 #endif
4928
4929           /* Now determine the charset. */
4930           reg = ((flags & CODING_STATE_SS2) ? 2
4931                  : (flags & CODING_STATE_SS3) ? 3
4932                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4933                  : str->iso2022.register_left);
4934           charset = str->iso2022.charset[reg];
4935
4936           /* Error checking: */
4937           if (! CHARSETP (charset)
4938               || str->iso2022.invalid_designated[reg]
4939               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4940                   && XCHARSET_CHARS (charset) == 94))
4941             /* Mrmph.  We are trying to invoke a register that has no
4942                or an invalid charset in it, or trying to add a character
4943                outside the range of the charset.  Insert that char literally
4944                to preserve it for the output. */
4945             {
4946               COMPOSE_FLUSH_CHARS (str, dst);
4947               while (counter > 0)
4948                 {
4949                   counter--;
4950                   DECODE_ADD_BINARY_CHAR
4951                     ((unsigned char)(cpos >> (counter * 8)), dst);
4952                 }
4953               cpos = 0;
4954               DECODE_ADD_BINARY_CHAR (c, dst);
4955             }
4956
4957           else
4958             {
4959               /* Things are probably hunky-dorey. */
4960
4961               /* Fetch reverse charset, maybe. */
4962               if (((flags & CODING_STATE_R2L) &&
4963                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4964                   ||
4965                   (!(flags & CODING_STATE_R2L) &&
4966                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4967                 {
4968                   Lisp_Object new_charset =
4969                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4970                   if (!NILP (new_charset))
4971                     charset = new_charset;
4972                 }
4973
4974 #ifdef UTF2000
4975               counter++;
4976               if (XCHARSET_DIMENSION (charset) == counter)
4977                 {
4978                   COMPOSE_ADD_CHAR (str,
4979                                     DECODE_CHAR (charset,
4980                                                  ((cpos & 0x7F7F7F) << 8)
4981                                                  | (c & 0x7F)),
4982                                     dst);
4983                   cpos = 0;
4984                   counter = 0;
4985                 }
4986               else
4987                 cpos = (cpos << 8) | c;
4988 #else
4989               lb = XCHARSET_LEADING_BYTE (charset);
4990               switch (XCHARSET_REP_BYTES (charset))
4991                 {
4992                 case 1: /* ASCII */
4993                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4994                   Dynarr_add (dst, c & 0x7F);
4995                   break;
4996
4997                 case 2: /* one-byte official */
4998                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4999                   Dynarr_add (dst, lb);
5000                   Dynarr_add (dst, c | 0x80);
5001                   break;
5002
5003                 case 3: /* one-byte private or two-byte official */
5004                   if (XCHARSET_PRIVATE_P (charset))
5005                     {
5006                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5007                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5008                       Dynarr_add (dst, lb);
5009                       Dynarr_add (dst, c | 0x80);
5010                     }
5011                   else
5012                     {
5013                       if (ch)
5014                         {
5015                           Dynarr_add (dst, lb);
5016                           Dynarr_add (dst, ch | 0x80);
5017                           Dynarr_add (dst, c | 0x80);
5018                           ch = 0;
5019                         }
5020                       else
5021                         ch = c;
5022                     }
5023                   break;
5024
5025                 default:        /* two-byte private */
5026                   if (ch)
5027                     {
5028                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5029                       Dynarr_add (dst, lb);
5030                       Dynarr_add (dst, ch | 0x80);
5031                       Dynarr_add (dst, c | 0x80);
5032                       ch = 0;
5033                     }
5034                   else
5035                     ch = c;
5036                 }
5037 #endif
5038             }
5039
5040           if (!cpos)
5041             flags &= CODING_STATE_ISO2022_LOCK;
5042         }
5043
5044     label_continue_loop:;
5045     }
5046
5047   if (flags & CODING_STATE_END)
5048     {
5049       COMPOSE_FLUSH_CHARS (str, dst);
5050       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5051     }
5052   str->flags   = flags;
5053   str->cpos    = cpos;
5054   str->counter = counter;
5055 }
5056
5057
5058 /***** ISO2022 encoder *****/
5059
5060 /* Designate CHARSET into register REG. */
5061
5062 static void
5063 iso2022_designate (Lisp_Object charset, unsigned char reg,
5064                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5065 {
5066   static const char inter94[] = "()*+";
5067   static const char inter96[] = ",-./";
5068   unsigned short chars;
5069   unsigned char dimension;
5070   unsigned char final;
5071   Lisp_Object old_charset = str->iso2022.charset[reg];
5072
5073   str->iso2022.charset[reg] = charset;
5074   if (!CHARSETP (charset))
5075     /* charset might be an initial nil or t. */
5076     return;
5077   chars = XCHARSET_CHARS (charset);
5078   dimension = XCHARSET_DIMENSION (charset);
5079   final = XCHARSET_FINAL (charset);
5080   if (!str->iso2022.force_charset_on_output[reg] &&
5081       CHARSETP (old_charset) &&
5082       XCHARSET_CHARS (old_charset) == chars &&
5083       XCHARSET_DIMENSION (old_charset) == dimension &&
5084       XCHARSET_FINAL (old_charset) == final)
5085     return;
5086
5087   str->iso2022.force_charset_on_output[reg] = 0;
5088
5089   {
5090     charset_conversion_spec_dynarr *dyn =
5091       str->codesys->iso2022.output_conv;
5092
5093     if (dyn)
5094       {
5095         int i;
5096
5097         for (i = 0; i < Dynarr_length (dyn); i++)
5098           {
5099             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5100             if (EQ (charset, spec->from_charset))
5101                 charset = spec->to_charset;
5102           }
5103       }
5104   }
5105
5106   Dynarr_add (dst, ISO_CODE_ESC);
5107   switch (chars)
5108     {
5109     case 94:
5110       if (dimension == 1)
5111         Dynarr_add (dst, inter94[reg]);
5112       else
5113         {
5114           Dynarr_add (dst, '$');
5115           if (reg != 0
5116               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5117               || final < '@'
5118               || final > 'B')
5119             Dynarr_add (dst, inter94[reg]);
5120         }
5121       break;
5122     case 96:
5123       if (dimension == 1)
5124         Dynarr_add (dst, inter96[reg]);
5125       else
5126         {
5127           Dynarr_add (dst, '$');
5128           Dynarr_add (dst, inter96[reg]);
5129         }
5130       break;
5131     }
5132   Dynarr_add (dst, final);
5133 }
5134
5135 static void
5136 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5137 {
5138   if (str->iso2022.register_left != 0)
5139     {
5140       Dynarr_add (dst, ISO_CODE_SI);
5141       str->iso2022.register_left = 0;
5142     }
5143 }
5144
5145 static void
5146 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5147 {
5148   if (str->iso2022.register_left != 1)
5149     {
5150       Dynarr_add (dst, ISO_CODE_SO);
5151       str->iso2022.register_left = 1;
5152     }
5153 }
5154
5155 void
5156 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5157                      unsigned_char_dynarr *dst, unsigned int *flags)
5158 {
5159   unsigned char charmask;
5160   Lisp_Coding_System* codesys = str->codesys;
5161   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5162   int i;
5163   Lisp_Object charset = str->iso2022.current_charset;
5164   int half = str->iso2022.current_half;
5165   int code_point = -1;
5166
5167   if (ch <= 0x7F)
5168     {
5169       restore_left_to_right_direction (codesys, dst, flags, 0);
5170
5171       /* Make sure G0 contains ASCII */
5172       if ((ch > ' ' && ch < ISO_CODE_DEL)
5173           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5174         {
5175           ensure_normal_shift (str, dst);
5176           iso2022_designate (Vcharset_ascii, 0, str, dst);
5177         }
5178
5179       /* If necessary, restore everything to the default state
5180          at end-of-line */
5181       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5182         {
5183           restore_left_to_right_direction (codesys, dst, flags, 0);
5184
5185           ensure_normal_shift (str, dst);
5186
5187           for (i = 0; i < 4; i++)
5188             {
5189               Lisp_Object initial_charset =
5190                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5191               iso2022_designate (initial_charset, i, str, dst);
5192             }
5193         }
5194       if (ch == '\n')
5195         {
5196           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5197             Dynarr_add (dst, '\r');
5198           if (eol_type != EOL_CR)
5199             Dynarr_add (dst, ch);
5200         }
5201       else
5202         {
5203           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5204               && fit_to_be_escape_quoted (ch))
5205             Dynarr_add (dst, ISO_CODE_ESC);
5206           Dynarr_add (dst, ch);
5207         }
5208     }
5209   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5210     {
5211       charmask = (half == 0 ? 0x00 : 0x80);
5212
5213       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5214           && fit_to_be_escape_quoted (ch))
5215         Dynarr_add (dst, ISO_CODE_ESC);
5216       /* you asked for it ... */
5217       Dynarr_add (dst, ch);
5218     }
5219   else
5220     {
5221       int reg;
5222
5223       /* Now determine which register to use. */
5224       reg = -1;
5225       for (i = 0; i < 4; i++)
5226         {
5227           if ((CHARSETP (charset = str->iso2022.charset[i])
5228                && ((code_point = charset_code_point (charset, ch)) >= 0))
5229               ||
5230               (CHARSETP
5231                (charset
5232                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5233                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5234             {
5235               reg = i;
5236               break;
5237             }
5238         }
5239       if (reg == -1)
5240         {
5241           Lisp_Object original_default_coded_charset_priority_list
5242             = Vdefault_coded_charset_priority_list;
5243
5244           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5245             {
5246               code_point = ENCODE_CHAR (ch, charset);
5247               if (XCHARSET_FINAL (charset))
5248                 goto found;
5249               Vdefault_coded_charset_priority_list
5250                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5251                                Vdefault_coded_charset_priority_list));
5252             }
5253           code_point = ENCODE_CHAR (ch, charset);
5254           if (!XCHARSET_FINAL (charset))
5255             {
5256               charset = Vcharset_ascii;
5257               code_point = '~';
5258             }
5259         found:
5260           Vdefault_coded_charset_priority_list
5261             = original_default_coded_charset_priority_list;
5262         }
5263       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5264                                 codesys, dst, flags, 0);
5265
5266       if (reg == -1)
5267         {
5268           if (XCHARSET_GRAPHIC (charset) != 0)
5269             {
5270               if (!NILP (str->iso2022.charset[1]) &&
5271                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5272                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5273                 reg = 1;
5274               else if (!NILP (str->iso2022.charset[2]))
5275                 reg = 2;
5276               else if (!NILP (str->iso2022.charset[3]))
5277                 reg = 3;
5278               else
5279                 reg = 0;
5280             }
5281           else
5282             reg = 0;
5283         }
5284
5285       iso2022_designate (charset, reg, str, dst);
5286
5287       /* Now invoke that register. */
5288       switch (reg)
5289         {
5290         case 0:
5291           ensure_normal_shift (str, dst);
5292           half = 0;
5293           break;
5294         case 1:
5295           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5296             {
5297               ensure_shift_out (str, dst);
5298               half = 0;
5299             }
5300           else
5301             half = 1;
5302           break;
5303         case 2:
5304           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5305             {
5306               Dynarr_add (dst, ISO_CODE_ESC);
5307               Dynarr_add (dst, 'N');
5308               half = 0;
5309             }
5310           else
5311             {
5312               Dynarr_add (dst, ISO_CODE_SS2);
5313               half = 1;
5314             }
5315           break;
5316         case 3:
5317           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5318             {
5319               Dynarr_add (dst, ISO_CODE_ESC);
5320               Dynarr_add (dst, 'O');
5321               half = 0;
5322             }
5323           else
5324             {
5325               Dynarr_add (dst, ISO_CODE_SS3);
5326               half = 1;
5327             }
5328           break;
5329         default:
5330           abort ();
5331         }
5332
5333       charmask = (half == 0 ? 0x00 : 0x80);
5334
5335       switch (XCHARSET_DIMENSION (charset))
5336         {
5337         case 1:
5338           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5339           break;
5340         case 2:
5341           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5342           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5343           break;
5344         case 3:
5345           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5346           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5347           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5348           break;
5349         case 4:
5350           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5351           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5352           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5353           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5354           break;
5355         default:
5356           abort ();
5357         }
5358     }
5359   str->iso2022.current_charset = charset;
5360   str->iso2022.current_half = half;
5361 }
5362
5363 void
5364 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5365                      unsigned int *flags)
5366 {
5367   Lisp_Coding_System* codesys = str->codesys;
5368   int i;
5369
5370   restore_left_to_right_direction (codesys, dst, flags, 0);
5371   ensure_normal_shift (str, dst);
5372   for (i = 0; i < 4; i++)
5373     {
5374       Lisp_Object initial_charset
5375         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5376       iso2022_designate (initial_charset, i, str, dst);
5377     }
5378 }
5379 #endif /* MULE */
5380 \f
5381 /************************************************************************/
5382 /*                     No-conversion methods                            */
5383 /************************************************************************/
5384
5385 /* This is used when reading in "binary" files -- i.e. files that may
5386    contain all 256 possible byte values and that are not to be
5387    interpreted as being in any particular decoding. */
5388 static void
5389 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5390                              unsigned_char_dynarr *dst, unsigned int n)
5391 {
5392   unsigned char c;
5393   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5394   unsigned int flags  = str->flags;
5395   unsigned int cpos   = str->cpos;
5396   eol_type_t eol_type = str->eol_type;
5397
5398   while (n--)
5399     {
5400       c = *src++;
5401
5402       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5403       DECODE_ADD_BINARY_CHAR (c, dst);
5404     label_continue_loop:;
5405     }
5406
5407   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5408
5409   str->flags = flags;
5410   str->cpos  = cpos;
5411 }
5412
5413 static void
5414 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5415                              unsigned_char_dynarr *dst, unsigned int n)
5416 {
5417   unsigned char c;
5418   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5419   unsigned int flags  = str->flags;
5420   unsigned int ch     = str->ch;
5421   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5422 #ifdef UTF2000
5423   unsigned char char_boundary = str->iso2022.current_char_boundary;
5424 #endif
5425
5426   while (n--)
5427     {
5428       c = *src++;
5429 #ifdef UTF2000
5430       if (char_boundary == 0)
5431         if ( c >= 0xfc )
5432           {
5433             ch = c & 0x01;
5434             char_boundary = 5;
5435           }
5436         else if ( c >= 0xf8 )
5437           {
5438             ch = c & 0x03;
5439             char_boundary = 4;
5440           }
5441         else if ( c >= 0xf0 )
5442           {
5443             ch = c & 0x07;
5444             char_boundary = 3;
5445           }
5446         else if ( c >= 0xe0 )
5447           {
5448             ch = c & 0x0f;
5449             char_boundary = 2;
5450           }
5451         else if ( c >= 0xc0 )
5452           {
5453             ch = c & 0x1f;
5454             char_boundary = 1;
5455           }
5456         else
5457           {
5458             ch = 0;
5459             if (c == '\n')
5460               {
5461                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5462                   Dynarr_add (dst, '\r');
5463                 if (eol_type != EOL_CR)
5464                   Dynarr_add (dst, c);
5465               }
5466             else
5467               Dynarr_add (dst, c);
5468             char_boundary = 0;
5469           }
5470       else if (char_boundary == 1)
5471         {
5472           ch = ( ch << 6 ) | ( c & 0x3f );
5473           Dynarr_add (dst, ch & 0xff);
5474           char_boundary = 0;
5475         }
5476       else
5477         {
5478           ch = ( ch << 6 ) | ( c & 0x3f );
5479           char_boundary--;
5480         }
5481 #else /* not UTF2000 */
5482       if (c == '\n')
5483         {
5484           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5485             Dynarr_add (dst, '\r');
5486           if (eol_type != EOL_CR)
5487             Dynarr_add (dst, '\n');
5488           ch = 0;
5489         }
5490       else if (BYTE_ASCII_P (c))
5491         {
5492           assert (ch == 0);
5493           Dynarr_add (dst, c);
5494         }
5495       else if (BUFBYTE_LEADING_BYTE_P (c))
5496         {
5497           assert (ch == 0);
5498           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5499               c == LEADING_BYTE_CONTROL_1)
5500             ch = c;
5501           else
5502             Dynarr_add (dst, '~'); /* untranslatable character */
5503         }
5504       else
5505         {
5506           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5507             Dynarr_add (dst, c);
5508           else if (ch == LEADING_BYTE_CONTROL_1)
5509             {
5510               assert (c < 0xC0);
5511               Dynarr_add (dst, c - 0x20);
5512             }
5513           /* else it should be the second or third byte of an
5514              untranslatable character, so ignore it */
5515           ch = 0;
5516         }
5517 #endif /* not UTF2000 */
5518     }
5519
5520   str->flags = flags;
5521   str->ch    = ch;
5522 #ifdef UTF2000
5523   str->iso2022.current_char_boundary = char_boundary;
5524 #endif
5525 }
5526
5527 \f
5528
5529 /************************************************************************/
5530 /*                             Initialization                           */
5531 /************************************************************************/
5532
5533 void
5534 syms_of_file_coding (void)
5535 {
5536   INIT_LRECORD_IMPLEMENTATION (coding_system);
5537
5538   deferror (&Qcoding_system_error, "coding-system-error",
5539             "Coding-system error", Qio_error);
5540
5541   DEFSUBR (Fcoding_system_p);
5542   DEFSUBR (Ffind_coding_system);
5543   DEFSUBR (Fget_coding_system);
5544   DEFSUBR (Fcoding_system_list);
5545   DEFSUBR (Fcoding_system_name);
5546   DEFSUBR (Fmake_coding_system);
5547   DEFSUBR (Fcopy_coding_system);
5548   DEFSUBR (Fcoding_system_canonical_name_p);
5549   DEFSUBR (Fcoding_system_alias_p);
5550   DEFSUBR (Fcoding_system_aliasee);
5551   DEFSUBR (Fdefine_coding_system_alias);
5552   DEFSUBR (Fsubsidiary_coding_system);
5553
5554   DEFSUBR (Fcoding_system_type);
5555   DEFSUBR (Fcoding_system_doc_string);
5556 #ifdef MULE
5557   DEFSUBR (Fcoding_system_charset);
5558 #endif
5559   DEFSUBR (Fcoding_system_property);
5560
5561   DEFSUBR (Fcoding_category_list);
5562   DEFSUBR (Fset_coding_priority_list);
5563   DEFSUBR (Fcoding_priority_list);
5564   DEFSUBR (Fset_coding_category_system);
5565   DEFSUBR (Fcoding_category_system);
5566
5567   DEFSUBR (Fdetect_coding_region);
5568   DEFSUBR (Fdecode_coding_region);
5569   DEFSUBR (Fencode_coding_region);
5570 #ifdef MULE
5571   DEFSUBR (Fdecode_shift_jis_char);
5572   DEFSUBR (Fencode_shift_jis_char);
5573   DEFSUBR (Fdecode_big5_char);
5574   DEFSUBR (Fencode_big5_char);
5575 #endif /* MULE */
5576   defsymbol (&Qcoding_systemp, "coding-system-p");
5577   defsymbol (&Qno_conversion, "no-conversion");
5578   defsymbol (&Qraw_text, "raw-text");
5579 #ifdef MULE
5580   defsymbol (&Qbig5, "big5");
5581   defsymbol (&Qshift_jis, "shift-jis");
5582   defsymbol (&Qucs4, "ucs-4");
5583   defsymbol (&Qutf8, "utf-8");
5584   defsymbol (&Qccl, "ccl");
5585   defsymbol (&Qiso2022, "iso2022");
5586 #endif /* MULE */
5587   defsymbol (&Qmnemonic, "mnemonic");
5588   defsymbol (&Qeol_type, "eol-type");
5589   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5590   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5591
5592   defsymbol (&Qcr, "cr");
5593   defsymbol (&Qlf, "lf");
5594   defsymbol (&Qcrlf, "crlf");
5595   defsymbol (&Qeol_cr, "eol-cr");
5596   defsymbol (&Qeol_lf, "eol-lf");
5597   defsymbol (&Qeol_crlf, "eol-crlf");
5598 #ifdef MULE
5599   defsymbol (&Qcharset_g0, "charset-g0");
5600   defsymbol (&Qcharset_g1, "charset-g1");
5601   defsymbol (&Qcharset_g2, "charset-g2");
5602   defsymbol (&Qcharset_g3, "charset-g3");
5603   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5604   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5605   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5606   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5607   defsymbol (&Qno_iso6429, "no-iso6429");
5608   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5609   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5610
5611   defsymbol (&Qshort, "short");
5612   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5613   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5614   defsymbol (&Qseven, "seven");
5615   defsymbol (&Qlock_shift, "lock-shift");
5616   defsymbol (&Qescape_quoted, "escape-quoted");
5617 #endif /* MULE */
5618 #ifdef UTF2000
5619   defsymbol (&Qdisable_composition, "disable-composition");
5620 #endif
5621   defsymbol (&Qencode, "encode");
5622   defsymbol (&Qdecode, "decode");
5623
5624 #ifdef MULE
5625   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5626              "shift-jis");
5627   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5628              "big5");
5629   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5630              "ucs-4");
5631   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5632              "utf-8");
5633   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5634              "iso-7");
5635   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5636              "iso-8-designate");
5637   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5638              "iso-8-1");
5639   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5640              "iso-8-2");
5641   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5642              "iso-lock-shift");
5643 #endif /* MULE */
5644   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5645              "no-conversion");
5646 }
5647
5648 void
5649 lstream_type_create_file_coding (void)
5650 {
5651   LSTREAM_HAS_METHOD (decoding, reader);
5652   LSTREAM_HAS_METHOD (decoding, writer);
5653   LSTREAM_HAS_METHOD (decoding, rewinder);
5654   LSTREAM_HAS_METHOD (decoding, seekable_p);
5655   LSTREAM_HAS_METHOD (decoding, flusher);
5656   LSTREAM_HAS_METHOD (decoding, closer);
5657   LSTREAM_HAS_METHOD (decoding, marker);
5658
5659   LSTREAM_HAS_METHOD (encoding, reader);
5660   LSTREAM_HAS_METHOD (encoding, writer);
5661   LSTREAM_HAS_METHOD (encoding, rewinder);
5662   LSTREAM_HAS_METHOD (encoding, seekable_p);
5663   LSTREAM_HAS_METHOD (encoding, flusher);
5664   LSTREAM_HAS_METHOD (encoding, closer);
5665   LSTREAM_HAS_METHOD (encoding, marker);
5666 }
5667
5668 void
5669 vars_of_file_coding (void)
5670 {
5671   int i;
5672
5673   fcd = xnew (struct file_coding_dump);
5674   dumpstruct (&fcd, &fcd_description);
5675
5676   /* Initialize to something reasonable ... */
5677   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5678     {
5679       fcd->coding_category_system[i] = Qnil;
5680       fcd->coding_category_by_priority[i] = i;
5681     }
5682
5683   Fprovide (intern ("file-coding"));
5684
5685   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5686 Coding system used for TTY keyboard input.
5687 Not used under a windowing system.
5688 */ );
5689   Vkeyboard_coding_system = Qnil;
5690
5691   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5692 Coding system used for TTY display output.
5693 Not used under a windowing system.
5694 */ );
5695   Vterminal_coding_system = Qnil;
5696
5697   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5698 Overriding coding system used when reading from a file or process.
5699 You should bind this variable with `let', but do not set it globally.
5700 If this is non-nil, it specifies the coding system that will be used
5701 to decode input on read operations, such as from a file or process.
5702 It overrides `buffer-file-coding-system-for-read',
5703 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5704 this one for permanent changes to the environment.  */ );
5705   Vcoding_system_for_read = Qnil;
5706
5707   DEFVAR_LISP ("coding-system-for-write",
5708                &Vcoding_system_for_write /*
5709 Overriding coding system used when writing to a file or process.
5710 You should bind this variable with `let', but do not set it globally.
5711 If this is non-nil, it specifies the coding system that will be used
5712 to encode output for write operations, such as to a file or process.
5713 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5714 Use those variables instead of this one for permanent changes to the
5715 environment.  */ );
5716   Vcoding_system_for_write = Qnil;
5717
5718   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5719 Coding system used to convert pathnames when accessing files.
5720 */ );
5721   Vfile_name_coding_system = Qnil;
5722
5723   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5724 Non-nil means the buffer contents are regarded as multi-byte form
5725 of characters, not a binary code.  This affects the display, file I/O,
5726 and behaviors of various editing commands.
5727
5728 Setting this to nil does not do anything.
5729 */ );
5730   enable_multibyte_characters = 1;
5731 }
5732
5733 void
5734 complex_vars_of_file_coding (void)
5735 {
5736   staticpro (&Vcoding_system_hash_table);
5737   Vcoding_system_hash_table =
5738     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5739
5740   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5741   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5742
5743 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5744 {                                               \
5745   struct codesys_prop csp;                      \
5746   csp.sym = (Sym);                              \
5747   csp.prop_type = (Prop_Type);                  \
5748   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5749 } while (0)
5750
5751   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5752   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5753   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5754   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5755   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5756   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5757   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5758 #ifdef MULE
5759   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5760   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5761   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5762   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5763   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5764   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5765   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5766   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5767   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5768   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5769   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5770   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5771   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5772   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5773   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5774   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5775   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5776
5777   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5778   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5779 #endif /* MULE */
5780   /* Need to create this here or we're really screwed. */
5781   Fmake_coding_system
5782     (Qraw_text, Qno_conversion,
5783      build_string ("Raw text, which means it converts only line-break-codes."),
5784      list2 (Qmnemonic, build_string ("Raw")));
5785
5786   Fmake_coding_system
5787     (Qbinary, Qno_conversion,
5788      build_string ("Binary, which means it does not convert anything."),
5789      list4 (Qeol_type, Qlf,
5790             Qmnemonic, build_string ("Binary")));
5791
5792 #ifdef UTF2000
5793   Fmake_coding_system
5794     (Qutf8, Qutf8,
5795      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5796      list2 (Qmnemonic, build_string ("UTF8")));
5797 #endif
5798
5799   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5800
5801   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5802
5803   Fdefine_coding_system_alias (Qterminal, Qbinary);
5804   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5805
5806   /* Need this for bootstrapping */
5807   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5808     Fget_coding_system (Qraw_text);
5809
5810 #ifdef UTF2000
5811   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5812    = Fget_coding_system (Qutf8);
5813 #endif
5814
5815 #if defined(MULE) && !defined(UTF2000)
5816   {
5817     unsigned int i;
5818
5819     for (i = 0; i < 65536; i++)
5820       fcd->ucs_to_mule_table[i] = Qnil;
5821   }
5822   staticpro (&mule_to_ucs_table);
5823   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5824 #endif /* defined(MULE) && !defined(UTF2000) */
5825 }