src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 185                      unsigned_char_dynarr *dst, size_t n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const Extbyte *src, size_t n);
 189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 190                                 unsigned_char_dynarr *dst, size_t n);
 191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 192                             unsigned_char_dynarr *dst, unsigned int *flags);
 193 void char_finish_shift_jis (struct encoding_stream *str,
 194                             unsigned_char_dynarr *dst, unsigned int *flags);
 195
 196 static int detect_coding_big5 (struct detection_state *st,
 197                                const Extbyte *src, size_t n);
 198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, size_t n);
 200 void char_encode_big5 (struct encoding_stream *str, Emchar c,
 201                        unsigned_char_dynarr *dst, unsigned int *flags);
 202 void char_finish_big5 (struct encoding_stream *str,
 203                        unsigned_char_dynarr *dst, unsigned int *flags);
 204
 205 static int detect_coding_ucs4 (struct detection_state *st,
 206                                const Extbyte *src, size_t n);
 207 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 208                                 unsigned_char_dynarr *dst, size_t n);
 209 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 210                        unsigned_char_dynarr *dst, unsigned int *flags);
 211 void char_finish_ucs4 (struct encoding_stream *str,
 212                        unsigned_char_dynarr *dst, unsigned int *flags);
 213
 214 static int detect_coding_utf8 (struct detection_state *st,
 215                                const Extbyte *src, size_t n);
 216 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 217                                 unsigned_char_dynarr *dst, size_t n);
 218 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 219                        unsigned_char_dynarr *dst, unsigned int *flags);
 220 void char_finish_utf8 (struct encoding_stream *str,
 221                        unsigned_char_dynarr *dst, unsigned int *flags);
 222
 223 static int postprocess_iso2022_mask (int mask);
 224 static void reset_iso2022 (Lisp_Object coding_system,
 225                            struct iso2022_decoder *iso);
 226 static int detect_coding_iso2022 (struct detection_state *st,
 227                                   const Extbyte *src, size_t n);
 228 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 229                                    unsigned_char_dynarr *dst, size_t n);
 230 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 231                           unsigned_char_dynarr *dst, unsigned int *flags);
 232 void char_finish_iso2022 (struct encoding_stream *str,
 233                           unsigned_char_dynarr *dst, unsigned int *flags);
 234 #endif /* MULE */
 235 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 236                                          unsigned_char_dynarr *dst, size_t n);
 237 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 238                                          unsigned_char_dynarr *dst, size_t n);
 239 static void mule_decode (Lstream *decoding, const Extbyte *src,
 240                          unsigned_char_dynarr *dst, size_t n);
 241 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 242                          unsigned_char_dynarr *dst, size_t n);
 243
 244 typedef struct codesys_prop codesys_prop;
 245 struct codesys_prop
 246 {
 247   Lisp_Object sym;
 248   int prop_type;
 249 };
 250
 251 typedef struct
 252 {
 253   Dynarr_declare (codesys_prop);
 254 } codesys_prop_dynarr;
 255
 256 static const struct lrecord_description codesys_prop_description_1[] = {
 257   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 258   { XD_END }
 259 };
 260
 261 static const struct struct_description codesys_prop_description = {
 262   sizeof (codesys_prop),
 263   codesys_prop_description_1
 264 };
 265
 266 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 267   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 268   { XD_END }
 269 };
 270
 271 static const struct struct_description codesys_prop_dynarr_description = {
 272   sizeof (codesys_prop_dynarr),
 273   codesys_prop_dynarr_description_1
 274 };
 275
 276 codesys_prop_dynarr *the_codesys_prop_dynarr;
 277
 278 enum codesys_prop_enum
 279 {
 280   CODESYS_PROP_ALL_OK,
 281   CODESYS_PROP_ISO2022,
 282   CODESYS_PROP_CCL
 283 };
 284
 285 \f
 286 /************************************************************************/
 287 /*                       Coding system functions                        */
 288 /************************************************************************/
 289
 290 static Lisp_Object mark_coding_system (Lisp_Object);
 291 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 292 static void finalize_coding_system (void *header, int for_disksave);
 293
 294 #ifdef MULE
 295 static const struct lrecord_description ccs_description_1[] = {
 296   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 297   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 298   { XD_END }
 299 };
 300
 301 static const struct struct_description ccs_description = {
 302   sizeof (charset_conversion_spec),
 303   ccs_description_1
 304 };
 305
 306 static const struct lrecord_description ccsd_description_1[] = {
 307   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 308   { XD_END }
 309 };
 310
 311 static const struct struct_description ccsd_description = {
 312   sizeof (charset_conversion_spec_dynarr),
 313   ccsd_description_1
 314 };
 315 #endif
 316
 317 static const struct lrecord_description coding_system_description[] = {
 318   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 319   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 320   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 321   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 322   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 323   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 324   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 325   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 326 #ifdef MULE
 327   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 328   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 329   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 330   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 332 #endif
 333   { XD_END }
 334 };
 335
 336 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 337                                mark_coding_system, print_coding_system,
 338                                finalize_coding_system,
 339                                0, 0, coding_system_description,
 340                                Lisp_Coding_System);
 341
 342 static Lisp_Object
 343 mark_coding_system (Lisp_Object obj)
 344 {
 345   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 346
 347   mark_object (CODING_SYSTEM_NAME (codesys));
 348   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 349   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 350   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 351   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 352   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 353
 354   switch (CODING_SYSTEM_TYPE (codesys))
 355     {
 356 #ifdef MULE
 357       int i;
 358     case CODESYS_ISO2022:
 359       for (i = 0; i < 4; i++)
 360         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 361       if (codesys->iso2022.input_conv)
 362         {
 363           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 364             {
 365               struct charset_conversion_spec *ccs =
 366                 Dynarr_atp (codesys->iso2022.input_conv, i);
 367               mark_object (ccs->from_charset);
 368               mark_object (ccs->to_charset);
 369             }
 370         }
 371       if (codesys->iso2022.output_conv)
 372         {
 373           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 374             {
 375               struct charset_conversion_spec *ccs =
 376                 Dynarr_atp (codesys->iso2022.output_conv, i);
 377               mark_object (ccs->from_charset);
 378               mark_object (ccs->to_charset);
 379             }
 380         }
 381       break;
 382
 383     case CODESYS_CCL:
 384       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 385       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 386       break;
 387 #endif /* MULE */
 388     default:
 389       break;
 390     }
 391
 392   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 393   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 394 }
 395
 396 static void
 397 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 398                      int escapeflag)
 399 {
 400   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 401   if (print_readably)
 402     error ("printing unreadable object #<coding_system 0x%x>",
 403            c->header.uid);
 404
 405   write_c_string ("#<coding_system ", printcharfun);
 406   print_internal (c->name, printcharfun, 1);
 407   write_c_string (">", printcharfun);
 408 }
 409
 410 static void
 411 finalize_coding_system (void *header, int for_disksave)
 412 {
 413   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 414   /* Since coding systems never go away, this function is not
 415      necessary.  But it would be necessary if we changed things
 416      so that coding systems could go away. */
 417   if (!for_disksave) /* see comment in lstream.c */
 418     {
 419       switch (CODING_SYSTEM_TYPE (c))
 420         {
 421 #ifdef MULE
 422         case CODESYS_ISO2022:
 423           if (c->iso2022.input_conv)
 424             {
 425               Dynarr_free (c->iso2022.input_conv);
 426               c->iso2022.input_conv = 0;
 427             }
 428           if (c->iso2022.output_conv)
 429             {
 430               Dynarr_free (c->iso2022.output_conv);
 431               c->iso2022.output_conv = 0;
 432             }
 433           break;
 434 #endif /* MULE */
 435         default:
 436           break;
 437         }
 438     }
 439 }
 440
 441 static eol_type_t
 442 symbol_to_eol_type (Lisp_Object symbol)
 443 {
 444   CHECK_SYMBOL (symbol);
 445   if (NILP (symbol))      return EOL_AUTODETECT;
 446   if (EQ (symbol, Qlf))   return EOL_LF;
 447   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 448   if (EQ (symbol, Qcr))   return EOL_CR;
 449
 450   signal_simple_error ("Unrecognized eol type", symbol);
 451   return EOL_AUTODETECT; /* not reached */
 452 }
 453
 454 static Lisp_Object
 455 eol_type_to_symbol (eol_type_t type)
 456 {
 457   switch (type)
 458     {
 459     default: abort ();
 460     case EOL_LF:         return Qlf;
 461     case EOL_CRLF:       return Qcrlf;
 462     case EOL_CR:         return Qcr;
 463     case EOL_AUTODETECT: return Qnil;
 464     }
 465 }
 466
 467 static void
 468 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 469 {
 470   Lisp_Object codesys_obj;
 471   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 472   char *codesys_name = (char *) alloca (len + 7);
 473   int mlen = -1;
 474   char *codesys_mnemonic=0;
 475
 476   Lisp_Object codesys_name_sym, sub_codesys_obj;
 477
 478   /* kludge */
 479
 480   XSETCODING_SYSTEM (codesys_obj, codesys);
 481
 482   memcpy (codesys_name,
 483           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 484
 485   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 486     {
 487       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 488       codesys_mnemonic = (char *) alloca (mlen + 7);
 489       memcpy (codesys_mnemonic,
 490               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 491     }
 492
 493 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 494   strcpy (codesys_name + len, "-" op_sys);                                      \
 495   if (mlen != -1)                                                               \
 496     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 497   codesys_name_sym = intern (codesys_name);                                     \
 498   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 499   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 500   if (mlen != -1)                                                               \
 501     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 502       build_string (codesys_mnemonic);                                          \
 503   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 504 } while (0)
 505
 506   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 507   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 508   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 509 }
 510
 511 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 512 Return t if OBJECT is a coding system.
 513 A coding system is an object that defines how text containing multiple
 514 character sets is encoded into a stream of (typically 8-bit) bytes.
 515 The coding system is used to decode the stream into a series of
 516 characters (which may be from multiple charsets) when the text is read
 517 from a file or process, and is used to encode the text back into the
 518 same format when it is written out to a file or process.
 519
 520 For example, many ISO2022-compliant coding systems (such as Compound
 521 Text, which is used for inter-client data under the X Window System)
 522 use escape sequences to switch between different charsets -- Japanese
 523 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 524 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 525 `make-coding-system' for more information.
 526
 527 Coding systems are normally identified using a symbol, and the
 528 symbol is accepted in place of the actual coding system object whenever
 529 a coding system is called for. (This is similar to how faces work.)
 530 */
 531        (object))
 532 {
 533   return CODING_SYSTEMP (object) ? Qt : Qnil;
 534 }
 535
 536 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 537 Retrieve the coding system of the given name.
 538
 539 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 540 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 541 If there is no such coding system, nil is returned.  Otherwise the
 542 associated coding system object is returned.
 543 */
 544        (coding_system_or_name))
 545 {
 546   if (NILP (coding_system_or_name))
 547     coding_system_or_name = Qbinary;
 548   else if (CODING_SYSTEMP (coding_system_or_name))
 549     return coding_system_or_name;
 550   else
 551     CHECK_SYMBOL (coding_system_or_name);
 552
 553   while (1)
 554     {
 555       coding_system_or_name =
 556         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 557
 558       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 559         return coding_system_or_name;
 560     }
 561 }
 562
 563 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 564 Retrieve the coding system of the given name.
 565 Same as `find-coding-system' except that if there is no such
 566 coding system, an error is signaled instead of returning nil.
 567 */
 568        (name))
 569 {
 570   Lisp_Object coding_system = Ffind_coding_system (name);
 571
 572   if (NILP (coding_system))
 573     signal_simple_error ("No such coding system", name);
 574   return coding_system;
 575 }
 576
 577 /* We store the coding systems in hash tables with the names as the key and the
 578    actual coding system object as the value.  Occasionally we need to use them
 579    in a list format.  These routines provide us with that. */
 580 struct coding_system_list_closure
 581 {
 582   Lisp_Object *coding_system_list;
 583 };
 584
 585 static int
 586 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 587                                   void *coding_system_list_closure)
 588 {
 589   /* This function can GC */
 590   struct coding_system_list_closure *cscl =
 591     (struct coding_system_list_closure *) coding_system_list_closure;
 592   Lisp_Object *coding_system_list = cscl->coding_system_list;
 593
 594   *coding_system_list = Fcons (key, *coding_system_list);
 595   return 0;
 596 }
 597
 598 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 599 Return a list of the names of all defined coding systems.
 600 */
 601        ())
 602 {
 603   Lisp_Object coding_system_list = Qnil;
 604   struct gcpro gcpro1;
 605   struct coding_system_list_closure coding_system_list_closure;
 606
 607   GCPRO1 (coding_system_list);
 608   coding_system_list_closure.coding_system_list = &coding_system_list;
 609   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 610                  &coding_system_list_closure);
 611   UNGCPRO;
 612
 613   return coding_system_list;
 614 }
 615
 616 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 617 Return the name of the given coding system.
 618 */
 619        (coding_system))
 620 {
 621   coding_system = Fget_coding_system (coding_system);
 622   return XCODING_SYSTEM_NAME (coding_system);
 623 }
 624
 625 static Lisp_Coding_System *
 626 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 627 {
 628   Lisp_Coding_System *codesys =
 629     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 630
 631   zero_lcrecord (codesys);
 632   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 633   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 634   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 635   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 636   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 637   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 638   CODING_SYSTEM_TYPE     (codesys) = type;
 639   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 640 #ifdef MULE
 641   if (type == CODESYS_ISO2022)
 642     {
 643       int i;
 644       for (i = 0; i < 4; i++)
 645         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 646     }
 647   else if (type == CODESYS_CCL)
 648     {
 649       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 650       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 651     }
 652 #endif /* MULE */
 653   CODING_SYSTEM_NAME (codesys) = name;
 654
 655   return codesys;
 656 }
 657
 658 #ifdef MULE
 659 /* Given a list of charset conversion specs as specified in a Lisp
 660    program, parse it into STORE_HERE. */
 661
 662 static void
 663 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 664                                 Lisp_Object spec_list)
 665 {
 666   Lisp_Object rest;
 667
 668   EXTERNAL_LIST_LOOP (rest, spec_list)
 669     {
 670       Lisp_Object car = XCAR (rest);
 671       Lisp_Object from, to;
 672       struct charset_conversion_spec spec;
 673
 674       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 675         signal_simple_error ("Invalid charset conversion spec", car);
 676       from = Fget_charset (XCAR (car));
 677       to = Fget_charset (XCAR (XCDR (car)));
 678       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 679            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 680         signal_simple_error_2
 681           ("Attempted conversion between different charset types",
 682            from, to);
 683       spec.from_charset = from;
 684       spec.to_charset = to;
 685
 686       Dynarr_add (store_here, spec);
 687     }
 688 }
 689
 690 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 691    specs, return the equivalent as the Lisp programmer would see it.
 692
 693    If LOAD_HERE is 0, return Qnil. */
 694
 695 static Lisp_Object
 696 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 697 {
 698   int i;
 699   Lisp_Object result;
 700
 701   if (!load_here)
 702     return Qnil;
 703   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 704     {
 705       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 706       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 707     }
 708
 709   return Fnreverse (result);
 710 }
 711
 712 #endif /* MULE */
 713
 714 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 715 Register symbol NAME as a coding system.
 716
 717 TYPE describes the conversion method used and should be one of
 718
 719 nil or 'undecided
 720      Automatic conversion.  XEmacs attempts to detect the coding system
 721      used in the file.
 722 'no-conversion
 723      No conversion.  Use this for binary files and such.  On output,
 724      graphic characters that are not in ASCII or Latin-1 will be
 725      replaced by a ?. (For a no-conversion-encoded buffer, these
 726      characters will only be present if you explicitly insert them.)
 727 'shift-jis
 728      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 729 'ucs-4
 730      ISO 10646 UCS-4 encoding.
 731 'utf-8
 732      ISO 10646 UTF-8 encoding.
 733 'iso2022
 734      Any ISO2022-compliant encoding.  Among other things, this includes
 735      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 736      standard Unix encoding for Japanese and other languages), and
 737      Compound Text (the encoding used in X11).  You can specify more
 738      specific information about the conversion with the PROPS argument.
 739 'big5
 740      Big5 (the encoding commonly used for Taiwanese).
 741 'ccl
 742      The conversion is performed using a user-written pseudo-code
 743      program.  CCL (Code Conversion Language) is the name of this
 744      pseudo-code.
 745 'internal
 746      Write out or read in the raw contents of the memory representing
 747      the buffer's text.  This is primarily useful for debugging
 748      purposes, and is only enabled when XEmacs has been compiled with
 749      DEBUG_XEMACS defined (via the --debug configure option).
 750      WARNING: Reading in a file using 'internal conversion can result
 751      in an internal inconsistency in the memory representing a
 752      buffer's text, which will produce unpredictable results and may
 753      cause XEmacs to crash.  Under normal circumstances you should
 754      never use 'internal conversion.
 755
 756 DOC-STRING is a string describing the coding system.
 757
 758 PROPS is a property list, describing the specific nature of the
 759 character set.  Recognized properties are:
 760
 761 'mnemonic
 762      String to be displayed in the modeline when this coding system is
 763      active.
 764
 765 'eol-type
 766      End-of-line conversion to be used.  It should be one of
 767
 768         nil
 769                 Automatically detect the end-of-line type (LF, CRLF,
 770                 or CR).  Also generate subsidiary coding systems named
 771                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 772                 identical to this coding system but have an EOL-TYPE
 773                 value of 'lf, 'crlf, and 'cr, respectively.
 774         'lf
 775                 The end of a line is marked externally using ASCII LF.
 776                 Since this is also the way that XEmacs represents an
 777                 end-of-line internally, specifying this option results
 778                 in no end-of-line conversion.  This is the standard
 779                 format for Unix text files.
 780         'crlf
 781                 The end of a line is marked externally using ASCII
 782                 CRLF.  This is the standard format for MS-DOS text
 783                 files.
 784         'cr
 785                 The end of a line is marked externally using ASCII CR.
 786                 This is the standard format for Macintosh text files.
 787         t
 788                 Automatically detect the end-of-line type but do not
 789                 generate subsidiary coding systems.  (This value is
 790                 converted to nil when stored internally, and
 791                 `coding-system-property' will return nil.)
 792
 793 'disable-composition
 794      If non-nil, composition/decomposition for combining characters
 795      are disabled.
 796
 797 'post-read-conversion
 798      Function called after a file has been read in, to perform the
 799      decoding.  Called with two arguments, START and END, denoting
 800      a region of the current buffer to be decoded.
 801
 802 'pre-write-conversion
 803      Function called before a file is written out, to perform the
 804      encoding.  Called with two arguments, START and END, denoting
 805      a region of the current buffer to be encoded.
 806
 807
 808 The following additional properties are recognized if TYPE is 'iso2022:
 809
 810 'charset-g0
 811 'charset-g1
 812 'charset-g2
 813 'charset-g3
 814      The character set initially designated to the G0 - G3 registers.
 815      The value should be one of
 816
 817           -- A charset object (designate that character set)
 818           -- nil (do not ever use this register)
 819           -- t (no character set is initially designated to
 820                 the register, but may be later on; this automatically
 821                 sets the corresponding `force-g*-on-output' property)
 822
 823 'force-g0-on-output
 824 'force-g1-on-output
 825 'force-g2-on-output
 826 'force-g2-on-output
 827      If non-nil, send an explicit designation sequence on output before
 828      using the specified register.
 829
 830 'short
 831      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 832      "ESC $ B" on output in place of the full designation sequences
 833      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 834
 835 'no-ascii-eol
 836      If non-nil, don't designate ASCII to G0 at each end of line on output.
 837      Setting this to non-nil also suppresses other state-resetting that
 838      normally happens at the end of a line.
 839
 840 'no-ascii-cntl
 841      If non-nil, don't designate ASCII to G0 before control chars on output.
 842
 843 'seven
 844      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 845      environment.
 846
 847 'lock-shift
 848      If non-nil, use locking-shift (SO/SI) instead of single-shift
 849      or designation by escape sequence.
 850
 851 'no-iso6429
 852      If non-nil, don't use ISO6429's direction specification.
 853
 854 'escape-quoted
 855      If non-nil, literal control characters that are the same as
 856      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 857      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 858      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 859      so that they can be properly distinguished from an escape sequence.
 860      (Note that doing this results in a non-portable encoding.) This
 861      encoding flag is used for byte-compiled files.  Note that ESC
 862      is a good choice for a quoting character because there are no
 863      escape sequences whose second byte is a character from the Control-0
 864      or Control-1 character sets; this is explicitly disallowed by the
 865      ISO2022 standard.
 866
 867 'input-charset-conversion
 868      A list of conversion specifications, specifying conversion of
 869      characters in one charset to another when decoding is performed.
 870      Each specification is a list of two elements: the source charset,
 871      and the destination charset.
 872
 873 'output-charset-conversion
 874      A list of conversion specifications, specifying conversion of
 875      characters in one charset to another when encoding is performed.
 876      The form of each specification is the same as for
 877      'input-charset-conversion.
 878
 879
 880 The following additional properties are recognized (and required)
 881 if TYPE is 'ccl:
 882
 883 'decode
 884      CCL program used for decoding (converting to internal format).
 885
 886 'encode
 887      CCL program used for encoding (converting to external format).
 888 */
 889        (name, type, doc_string, props))
 890 {
 891   Lisp_Coding_System *codesys;
 892   enum coding_system_type ty;
 893   int need_to_setup_eol_systems = 1;
 894
 895   /* Convert type to constant */
 896   if (NILP (type) || EQ (type, Qundecided))
 897                                       { ty = CODESYS_AUTODETECT; }
 898 #ifdef MULE
 899   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 900   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 901   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 902   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 903   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 904   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 905 #endif
 906   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 907 #ifdef DEBUG_XEMACS
 908   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 909 #endif
 910   else
 911     signal_simple_error ("Invalid coding system type", type);
 912
 913   CHECK_SYMBOL (name);
 914
 915   codesys = allocate_coding_system (ty, name);
 916
 917   if (NILP (doc_string))
 918     doc_string = build_string ("");
 919   else
 920     CHECK_STRING (doc_string);
 921   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 922
 923   {
 924     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 925       {
 926         if (EQ (key, Qmnemonic))
 927           {
 928             if (!NILP (value))
 929               CHECK_STRING (value);
 930             CODING_SYSTEM_MNEMONIC (codesys) = value;
 931           }
 932
 933         else if (EQ (key, Qeol_type))
 934           {
 935             need_to_setup_eol_systems = NILP (value);
 936             if (EQ (value, Qt))
 937               value = Qnil;
 938             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 939           }
 940
 941         else if (EQ (key, Qpost_read_conversion))
 942           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 943         else if (EQ (key, Qpre_write_conversion))
 944           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 945 #ifdef UTF2000
 946         else if (EQ (key, Qdisable_composition))
 947           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 948 #endif
 949 #ifdef MULE
 950         else if (ty == CODESYS_ISO2022)
 951           {
 952 #define FROB_INITIAL_CHARSET(charset_num) \
 953   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 954     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 955
 956             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 957             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 958             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 959             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 960
 961 #define FROB_FORCE_CHARSET(charset_num) \
 962   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 963
 964             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 965             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 966             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 967             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 968
 969 #define FROB_BOOLEAN_PROPERTY(prop) \
 970   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 971
 972             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 973             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 974             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 975             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 976             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 977             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 978             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 979
 980             else if (EQ (key, Qinput_charset_conversion))
 981               {
 982                 codesys->iso2022.input_conv =
 983                   Dynarr_new (charset_conversion_spec);
 984                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 985                                                 value);
 986               }
 987             else if (EQ (key, Qoutput_charset_conversion))
 988               {
 989                 codesys->iso2022.output_conv =
 990                   Dynarr_new (charset_conversion_spec);
 991                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 992                                                 value);
 993               }
 994             else
 995               signal_simple_error ("Unrecognized property", key);
 996           }
 997         else if (EQ (type, Qccl))
 998           {
 999             Lisp_Object sym;
1000             struct ccl_program test_ccl;
1001             Extbyte *suffix;
1002
1003             /* Check key first.  */
1004             if (EQ (key, Qdecode))
1005               suffix = "-ccl-decode";
1006             else if (EQ (key, Qencode))
1007               suffix = "-ccl-encode";
1008             else
1009               signal_simple_error ("Unrecognized property", key);
1010
1011             /* If value is vector, register it as a ccl program
1012                associated with an newly created symbol for
1013                backward compatibility.  */
1014             if (VECTORP (value))
1015               {
1016                 sym = Fintern (concat2 (Fsymbol_name (name),
1017                                         build_string (suffix)),
1018                                Qnil);
1019                 Fregister_ccl_program (sym, value);
1020               }
1021             else
1022               {
1023                 CHECK_SYMBOL (value);
1024                 sym = value;
1025               }
1026             /* check if the given ccl programs are valid.  */
1027             if (setup_ccl_program (&test_ccl, sym) < 0)
1028               signal_simple_error ("Invalid CCL program", value);
1029
1030             if (EQ (key, Qdecode))
1031               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1032             else if (EQ (key, Qencode))
1033               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1034
1035           }
1036 #endif /* MULE */
1037         else
1038           signal_simple_error ("Unrecognized property", key);
1039       }
1040   }
1041
1042   if (need_to_setup_eol_systems)
1043     setup_eol_coding_systems (codesys);
1044
1045   {
1046     Lisp_Object codesys_obj;
1047     XSETCODING_SYSTEM (codesys_obj, codesys);
1048     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1049     return codesys_obj;
1050   }
1051 }
1052
1053 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1054 Copy OLD-CODING-SYSTEM to NEW-NAME.
1055 If NEW-NAME does not name an existing coding system, a new one will
1056 be created.
1057 */
1058        (old_coding_system, new_name))
1059 {
1060   Lisp_Object new_coding_system;
1061   old_coding_system = Fget_coding_system (old_coding_system);
1062   new_coding_system = Ffind_coding_system (new_name);
1063   if (NILP (new_coding_system))
1064     {
1065       XSETCODING_SYSTEM (new_coding_system,
1066                          allocate_coding_system
1067                          (XCODING_SYSTEM_TYPE (old_coding_system),
1068                           new_name));
1069       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1070     }
1071
1072   {
1073     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1074     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1075     memcpy (((char *) to  ) + sizeof (to->header),
1076             ((char *) from) + sizeof (from->header),
1077             sizeof (*from) - sizeof (from->header));
1078     to->name = new_name;
1079   }
1080   return new_coding_system;
1081 }
1082
1083 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1084 Return t if OBJECT names a coding system, and is not a coding system alias.
1085 */
1086        (object))
1087 {
1088   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1089     ? Qt : Qnil;
1090 }
1091
1092 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1093 Return t if OBJECT is a coding system alias.
1094 All coding system aliases are created by `define-coding-system-alias'.
1095 */
1096        (object))
1097 {
1098   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1099     ? Qt : Qnil;
1100 }
1101
1102 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1103 Return the coding-system symbol for which symbol ALIAS is an alias.
1104 */
1105        (alias))
1106 {
1107   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1108   if (SYMBOLP (aliasee))
1109     return aliasee;
1110   else
1111     signal_simple_error ("Symbol is not a coding system alias", alias);
1112   return Qnil;          /* To keep the compiler happy */
1113 }
1114
1115 static Lisp_Object
1116 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1117 {
1118   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1119                   Qnil);
1120 }
1121
1122 /* A maphash function, for removing dangling coding system aliases. */
1123 static int
1124 dangling_coding_system_alias_p (Lisp_Object alias,
1125                                 Lisp_Object aliasee,
1126                                 void *dangling_aliases)
1127 {
1128   if (SYMBOLP (aliasee)
1129       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1130     {
1131       (*(int *) dangling_aliases)++;
1132       return 1;
1133     }
1134   else
1135     return 0;
1136 }
1137
1138 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1139 Define symbol ALIAS as an alias for coding system ALIASEE.
1140
1141 You can use this function to redefine an alias that has already been defined,
1142 but you cannot redefine a name which is the canonical name for a coding system.
1143 \(a canonical name of a coding system is what is returned when you call
1144 `coding-system-name' on a coding system).
1145
1146 ALIASEE itself can be an alias, which allows you to define nested aliases.
1147
1148 You are forbidden, however, from creating alias loops or `dangling' aliases.
1149 These will be detected, and an error will be signaled if you attempt to do so.
1150
1151 If ALIASEE is nil, then ALIAS will simply be undefined.
1152
1153 See also `coding-system-alias-p', `coding-system-aliasee',
1154 and `coding-system-canonical-name-p'.
1155 */
1156        (alias, aliasee))
1157 {
1158   Lisp_Object real_coding_system, probe;
1159
1160   CHECK_SYMBOL (alias);
1161
1162   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1163     signal_simple_error
1164       ("Symbol is the canonical name of a coding system and cannot be redefined",
1165        alias);
1166
1167   if (NILP (aliasee))
1168     {
1169       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1170       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1171       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1172
1173       Fremhash (alias, Vcoding_system_hash_table);
1174
1175       /* Undefine subsidiary aliases,
1176          presumably created by a previous call to this function */
1177       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1178           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1179           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1180         {
1181           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1182           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1183           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1184         }
1185
1186       /* Undefine dangling coding system aliases. */
1187       {
1188         int dangling_aliases;
1189
1190         do {
1191           dangling_aliases = 0;
1192           elisp_map_remhash (dangling_coding_system_alias_p,
1193                              Vcoding_system_hash_table,
1194                              &dangling_aliases);
1195         } while (dangling_aliases > 0);
1196       }
1197
1198       return Qnil;
1199     }
1200
1201   if (CODING_SYSTEMP (aliasee))
1202     aliasee = XCODING_SYSTEM_NAME (aliasee);
1203
1204   /* Checks that aliasee names a coding-system */
1205   real_coding_system = Fget_coding_system (aliasee);
1206
1207   /* Check for coding system alias loops */
1208   if (EQ (alias, aliasee))
1209     alias_loop: signal_simple_error_2
1210       ("Attempt to create a coding system alias loop", alias, aliasee);
1211
1212   for (probe = aliasee;
1213        SYMBOLP (probe);
1214        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1215     {
1216       if (EQ (probe, alias))
1217         goto alias_loop;
1218     }
1219
1220   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1221
1222   /* Set up aliases for subsidiaries.
1223      #### There must be a better way to handle subsidiary coding systems. */
1224   {
1225     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1226     int i;
1227     for (i = 0; i < countof (suffixes); i++)
1228       {
1229         Lisp_Object alias_subsidiary =
1230           append_suffix_to_symbol (alias, suffixes[i]);
1231         Lisp_Object aliasee_subsidiary =
1232           append_suffix_to_symbol (aliasee, suffixes[i]);
1233
1234         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1235           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1236       }
1237   }
1238   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1239      but it doesn't look intentional, so I'd rather return something
1240      meaningful or nothing at all. */
1241   return Qnil;
1242 }
1243
1244 static Lisp_Object
1245 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1246 {
1247   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1248   Lisp_Object new_coding_system;
1249
1250   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1251     return coding_system;
1252
1253   switch (type)
1254     {
1255     case EOL_AUTODETECT: return coding_system;
1256     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1257     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1258     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1259     default:       abort (); return Qnil;
1260     }
1261
1262   return NILP (new_coding_system) ? coding_system : new_coding_system;
1263 }
1264
1265 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1266 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1267 */
1268        (coding_system, eol_type))
1269 {
1270   coding_system = Fget_coding_system (coding_system);
1271
1272   return subsidiary_coding_system (coding_system,
1273                                    symbol_to_eol_type (eol_type));
1274 }
1275
1276 \f
1277 /************************************************************************/
1278 /*                         Coding system accessors                      */
1279 /************************************************************************/
1280
1281 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1282 Return the doc string for CODING-SYSTEM.
1283 */
1284        (coding_system))
1285 {
1286   coding_system = Fget_coding_system (coding_system);
1287   return XCODING_SYSTEM_DOC_STRING (coding_system);
1288 }
1289
1290 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1291 Return the type of CODING-SYSTEM.
1292 */
1293        (coding_system))
1294 {
1295   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1296     {
1297     default: abort ();
1298     case CODESYS_AUTODETECT:    return Qundecided;
1299 #ifdef MULE
1300     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1301     case CODESYS_ISO2022:       return Qiso2022;
1302     case CODESYS_BIG5:          return Qbig5;
1303     case CODESYS_UCS4:          return Qucs4;
1304     case CODESYS_UTF8:          return Qutf8;
1305     case CODESYS_CCL:           return Qccl;
1306 #endif
1307     case CODESYS_NO_CONVERSION: return Qno_conversion;
1308 #ifdef DEBUG_XEMACS
1309     case CODESYS_INTERNAL:      return Qinternal;
1310 #endif
1311     }
1312 }
1313
1314 #ifdef MULE
1315 static
1316 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1317 {
1318   Lisp_Object cs
1319     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1320
1321   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1322 }
1323
1324 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1325 Return initial charset of CODING-SYSTEM designated to GNUM.
1326 GNUM allows 0 .. 3.
1327 */
1328        (coding_system, gnum))
1329 {
1330   coding_system = Fget_coding_system (coding_system);
1331   CHECK_INT (gnum);
1332
1333   return coding_system_charset (coding_system, XINT (gnum));
1334 }
1335 #endif /* MULE */
1336
1337 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1338 Return the PROP property of CODING-SYSTEM.
1339 */
1340        (coding_system, prop))
1341 {
1342   int i, ok = 0;
1343   enum coding_system_type type;
1344
1345   coding_system = Fget_coding_system (coding_system);
1346   CHECK_SYMBOL (prop);
1347   type = XCODING_SYSTEM_TYPE (coding_system);
1348
1349   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1350     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1351       {
1352         ok = 1;
1353         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1354           {
1355           case CODESYS_PROP_ALL_OK:
1356             break;
1357 #ifdef MULE
1358           case CODESYS_PROP_ISO2022:
1359             if (type != CODESYS_ISO2022)
1360               signal_simple_error
1361                 ("Property only valid in ISO2022 coding systems",
1362                  prop);
1363             break;
1364
1365           case CODESYS_PROP_CCL:
1366             if (type != CODESYS_CCL)
1367               signal_simple_error
1368                 ("Property only valid in CCL coding systems",
1369                  prop);
1370             break;
1371 #endif /* MULE */
1372           default:
1373             abort ();
1374           }
1375       }
1376
1377   if (!ok)
1378     signal_simple_error ("Unrecognized property", prop);
1379
1380   if (EQ (prop, Qname))
1381     return XCODING_SYSTEM_NAME (coding_system);
1382   else if (EQ (prop, Qtype))
1383     return Fcoding_system_type (coding_system);
1384   else if (EQ (prop, Qdoc_string))
1385     return XCODING_SYSTEM_DOC_STRING (coding_system);
1386   else if (EQ (prop, Qmnemonic))
1387     return XCODING_SYSTEM_MNEMONIC (coding_system);
1388   else if (EQ (prop, Qeol_type))
1389     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1390   else if (EQ (prop, Qeol_lf))
1391     return XCODING_SYSTEM_EOL_LF (coding_system);
1392   else if (EQ (prop, Qeol_crlf))
1393     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1394   else if (EQ (prop, Qeol_cr))
1395     return XCODING_SYSTEM_EOL_CR (coding_system);
1396   else if (EQ (prop, Qpost_read_conversion))
1397     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1398   else if (EQ (prop, Qpre_write_conversion))
1399     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1400 #ifdef MULE
1401   else if (type == CODESYS_ISO2022)
1402     {
1403       if (EQ (prop, Qcharset_g0))
1404         return coding_system_charset (coding_system, 0);
1405       else if (EQ (prop, Qcharset_g1))
1406         return coding_system_charset (coding_system, 1);
1407       else if (EQ (prop, Qcharset_g2))
1408         return coding_system_charset (coding_system, 2);
1409       else if (EQ (prop, Qcharset_g3))
1410         return coding_system_charset (coding_system, 3);
1411
1412 #define FORCE_CHARSET(charset_num) \
1413   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1414    (coding_system, charset_num) ? Qt : Qnil)
1415
1416       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1417       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1418       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1419       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1420
1421 #define LISP_BOOLEAN(prop) \
1422   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1423
1424       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1425       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1426       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1427       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1428       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1429       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1430       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1431
1432       else if (EQ (prop, Qinput_charset_conversion))
1433         return
1434           unparse_charset_conversion_specs
1435             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1436       else if (EQ (prop, Qoutput_charset_conversion))
1437         return
1438           unparse_charset_conversion_specs
1439             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1440       else
1441         abort ();
1442     }
1443   else if (type == CODESYS_CCL)
1444     {
1445       if (EQ (prop, Qdecode))
1446         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1447       else if (EQ (prop, Qencode))
1448         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1449       else
1450         abort ();
1451     }
1452 #endif /* MULE */
1453   else
1454     abort ();
1455
1456   return Qnil; /* not reached */
1457 }
1458
1459 \f
1460 /************************************************************************/
1461 /*                       Coding category functions                      */
1462 /************************************************************************/
1463
1464 static int
1465 decode_coding_category (Lisp_Object symbol)
1466 {
1467   int i;
1468
1469   CHECK_SYMBOL (symbol);
1470   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1471     if (EQ (coding_category_symbol[i], symbol))
1472       return i;
1473
1474   signal_simple_error ("Unrecognized coding category", symbol);
1475   return 0; /* not reached */
1476 }
1477
1478 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1479 Return a list of all recognized coding categories.
1480 */
1481        ())
1482 {
1483   int i;
1484   Lisp_Object list = Qnil;
1485
1486   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1487     list = Fcons (coding_category_symbol[i], list);
1488   return list;
1489 }
1490
1491 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1492 Change the priority order of the coding categories.
1493 LIST should be list of coding categories, in descending order of
1494 priority.  Unspecified coding categories will be lower in priority
1495 than all specified ones, in the same relative order they were in
1496 previously.
1497 */
1498        (list))
1499 {
1500   int category_to_priority[CODING_CATEGORY_LAST];
1501   int i, j;
1502   Lisp_Object rest;
1503
1504   /* First generate a list that maps coding categories to priorities. */
1505
1506   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1507     category_to_priority[i] = -1;
1508
1509   /* Highest priority comes from the specified list. */
1510   i = 0;
1511   EXTERNAL_LIST_LOOP (rest, list)
1512     {
1513       int cat = decode_coding_category (XCAR (rest));
1514
1515       if (category_to_priority[cat] >= 0)
1516         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1517       category_to_priority[cat] = i++;
1518     }
1519
1520   /* Now go through the existing categories by priority to retrieve
1521      the categories not yet specified and preserve their priority
1522      order. */
1523   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1524     {
1525       int cat = fcd->coding_category_by_priority[j];
1526       if (category_to_priority[cat] < 0)
1527         category_to_priority[cat] = i++;
1528     }
1529
1530   /* Now we need to construct the inverse of the mapping we just
1531      constructed. */
1532
1533   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1534     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1535
1536   /* Phew!  That was confusing. */
1537   return Qnil;
1538 }
1539
1540 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1541 Return a list of coding categories in descending order of priority.
1542 */
1543        ())
1544 {
1545   int i;
1546   Lisp_Object list = Qnil;
1547
1548   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1549     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1550                   list);
1551   return list;
1552 }
1553
1554 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1555 Change the coding system associated with a coding category.
1556 */
1557        (coding_category, coding_system))
1558 {
1559   int cat = decode_coding_category (coding_category);
1560
1561   coding_system = Fget_coding_system (coding_system);
1562   fcd->coding_category_system[cat] = coding_system;
1563   return Qnil;
1564 }
1565
1566 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1567 Return the coding system associated with a coding category.
1568 */
1569        (coding_category))
1570 {
1571   int cat = decode_coding_category (coding_category);
1572   Lisp_Object sys = fcd->coding_category_system[cat];
1573
1574   if (!NILP (sys))
1575     return XCODING_SYSTEM_NAME (sys);
1576   return Qnil;
1577 }
1578
1579 \f
1580 /************************************************************************/
1581 /*                     Detecting the encoding of data                   */
1582 /************************************************************************/
1583
1584 struct detection_state
1585 {
1586   eol_type_t eol_type;
1587   int seen_non_ascii;
1588   int mask;
1589 #ifdef MULE
1590   struct
1591     {
1592       int mask;
1593       int in_second_byte;
1594     }
1595   big5;
1596
1597   struct
1598     {
1599       int mask;
1600       int in_second_byte;
1601     }
1602   shift_jis;
1603
1604   struct
1605     {
1606       int mask;
1607       int in_byte;
1608   }
1609   ucs4;
1610
1611   struct
1612     {
1613       int mask;
1614       int in_byte;
1615     }
1616   utf8;
1617
1618   struct
1619     {
1620       int mask;
1621       int initted;
1622       struct iso2022_decoder iso;
1623       unsigned int flags;
1624       int high_byte_count;
1625       unsigned int saw_single_shift:1;
1626     }
1627   iso2022;
1628 #endif
1629   struct
1630     {
1631       int seen_anything;
1632       int just_saw_cr;
1633     }
1634   eol;
1635 };
1636
1637 static int
1638 acceptable_control_char_p (int c)
1639 {
1640   switch (c)
1641     {
1642       /* Allow and ignore control characters that you might
1643          reasonably see in a text file */
1644     case '\r':
1645     case '\n':
1646     case '\t':
1647     case  7: /* bell */
1648     case  8: /* backspace */
1649     case 11: /* vertical tab */
1650     case 12: /* form feed */
1651     case 26: /* MS-DOS C-z junk */
1652     case 31: /* '^_' -- for info */
1653       return 1;
1654     default:
1655       return 0;
1656     }
1657 }
1658
1659 static int
1660 mask_has_at_most_one_bit_p (int mask)
1661 {
1662   /* Perhaps the only thing useful you learn from intensive Microsoft
1663      technical interviews */
1664   return (mask & (mask - 1)) == 0;
1665 }
1666
1667 static eol_type_t
1668 detect_eol_type (struct detection_state *st, const Extbyte *src,
1669                  size_t n)
1670 {
1671   while (n--)
1672     {
1673       unsigned char c = *(unsigned char *)src++;
1674       if (c == '\n')
1675         {
1676           if (st->eol.just_saw_cr)
1677             return EOL_CRLF;
1678           else if (st->eol.seen_anything)
1679             return EOL_LF;
1680         }
1681       else if (st->eol.just_saw_cr)
1682         return EOL_CR;
1683       else if (c == '\r')
1684         st->eol.just_saw_cr = 1;
1685       else
1686         st->eol.just_saw_cr = 0;
1687       st->eol.seen_anything = 1;
1688     }
1689
1690   return EOL_AUTODETECT;
1691 }
1692
1693 /* Attempt to determine the encoding and EOL type of the given text.
1694    Before calling this function for the first type, you must initialize
1695    st->eol_type as appropriate and initialize st->mask to ~0.
1696
1697    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1698    not yet known.
1699
1700    st->mask holds the determined coding category mask, or ~0 if only
1701    ASCII has been seen so far.
1702
1703    Returns:
1704
1705    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1706         is present in st->mask
1707    1 == definitive answers are here for both st->eol_type and st->mask
1708 */
1709
1710 static int
1711 detect_coding_type (struct detection_state *st, const Extbyte *src,
1712                     size_t n, int just_do_eol)
1713 {
1714   if (st->eol_type == EOL_AUTODETECT)
1715     st->eol_type = detect_eol_type (st, src, n);
1716
1717   if (just_do_eol)
1718     return st->eol_type != EOL_AUTODETECT;
1719
1720   if (!st->seen_non_ascii)
1721     {
1722       for (; n; n--, src++)
1723         {
1724           unsigned char c = *(unsigned char *) src;
1725           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1726             {
1727               st->seen_non_ascii = 1;
1728 #ifdef MULE
1729               st->shift_jis.mask = ~0;
1730               st->big5.mask = ~0;
1731               st->ucs4.mask = ~0;
1732               st->utf8.mask = ~0;
1733               st->iso2022.mask = ~0;
1734 #endif
1735               break;
1736             }
1737         }
1738     }
1739
1740   if (!n)
1741     return 0;
1742 #ifdef MULE
1743   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1744     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1745   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1746     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1747   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1748     st->big5.mask = detect_coding_big5 (st, src, n);
1749   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1750     st->utf8.mask = detect_coding_utf8 (st, src, n);
1751   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1752     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1753
1754   st->mask
1755     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1756     | st->utf8.mask | st->ucs4.mask;
1757 #endif
1758   {
1759     int retval = mask_has_at_most_one_bit_p (st->mask);
1760     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1761     return retval && st->eol_type != EOL_AUTODETECT;
1762   }
1763 }
1764
1765 static Lisp_Object
1766 coding_system_from_mask (int mask)
1767 {
1768   if (mask == ~0)
1769     {
1770       /* If the file was entirely or basically ASCII, use the
1771          default value of `buffer-file-coding-system'. */
1772       Lisp_Object retval =
1773         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1774       if (!NILP (retval))
1775         {
1776           retval = Ffind_coding_system (retval);
1777           if (NILP (retval))
1778             {
1779               warn_when_safe
1780                 (Qbad_variable, Qwarning,
1781                  "Invalid `default-buffer-file-coding-system', set to nil");
1782               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1783             }
1784         }
1785       if (NILP (retval))
1786         retval = Fget_coding_system (Qraw_text);
1787       return retval;
1788     }
1789   else
1790     {
1791       int i;
1792       int cat = -1;
1793 #ifdef MULE
1794       mask = postprocess_iso2022_mask (mask);
1795 #endif
1796       /* Look through the coding categories by priority and find
1797          the first one that is allowed. */
1798       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1799         {
1800           cat = fcd->coding_category_by_priority[i];
1801           if ((mask & (1 << cat)) &&
1802               !NILP (fcd->coding_category_system[cat]))
1803             break;
1804         }
1805       if (cat >= 0)
1806         return fcd->coding_category_system[cat];
1807       else
1808         return Fget_coding_system (Qraw_text);
1809     }
1810 }
1811
1812 /* Given a seekable read stream and potential coding system and EOL type
1813    as specified, do any autodetection that is called for.  If the
1814    coding system and/or EOL type are not `autodetect', they will be left
1815    alone; but this function will never return an autodetect coding system
1816    or EOL type.
1817
1818    This function does not automatically fetch subsidiary coding systems;
1819    that should be unnecessary with the explicit eol-type argument. */
1820
1821 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1822
1823 void
1824 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1825                               eol_type_t *eol_type_in_out)
1826 {
1827   struct detection_state decst;
1828
1829   if (*eol_type_in_out == EOL_AUTODETECT)
1830     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1831
1832   xzero (decst);
1833   decst.eol_type = *eol_type_in_out;
1834   decst.mask = ~0;
1835
1836   /* If autodetection is called for, do it now. */
1837   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1838       || *eol_type_in_out == EOL_AUTODETECT)
1839     {
1840       Extbyte buf[4096];
1841       Lisp_Object coding_system = Qnil;
1842       Extbyte *p;
1843       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1844       Extbyte *scan_end;
1845
1846       /* Look for initial "-*-"; mode line prefix */
1847       for (p = buf,
1848              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1849            p <= scan_end
1850              && *p != '\n'
1851              && *p != '\r';
1852            p++)
1853         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1854           {
1855             Extbyte *local_vars_beg = p + 3;
1856             /* Look for final "-*-"; mode line suffix */
1857             for (p = local_vars_beg,
1858                    scan_end = buf + nread - LENGTH ("-*-");
1859                  p <= scan_end
1860                    && *p != '\n'
1861                    && *p != '\r';
1862                  p++)
1863               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1864                 {
1865                   Extbyte *suffix = p;
1866                   /* Look for "coding:" */
1867                   for (p = local_vars_beg,
1868                          scan_end = suffix - LENGTH ("coding:?");
1869                        p <= scan_end;
1870                        p++)
1871                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1872                         && (p == local_vars_beg
1873                             || (*(p-1) == ' '  ||
1874                                 *(p-1) == '\t' ||
1875                                 *(p-1) == ';')))
1876                       {
1877                         Extbyte save;
1878                         int n;
1879                         p += LENGTH ("coding:");
1880                         while (*p == ' ' || *p == '\t') p++;
1881
1882                         /* Get coding system name */
1883                         save = *suffix; *suffix = '\0';
1884                         /* Characters valid in a MIME charset name (rfc 1521),
1885                            and in a Lisp symbol name. */
1886                         n = strspn ( (char *) p,
1887                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1888                                     "abcdefghijklmnopqrstuvwxyz"
1889                                     "0123456789"
1890                                     "!$%&*+-.^_{|}~");
1891                         *suffix = save;
1892                         if (n > 0)
1893                           {
1894                             save = p[n]; p[n] = '\0';
1895                             coding_system =
1896                               Ffind_coding_system (intern ((char *) p));
1897                             p[n] = save;
1898                           }
1899                         break;
1900                       }
1901                   break;
1902                 }
1903             break;
1904           }
1905
1906       if (NILP (coding_system))
1907         do
1908           {
1909             if (detect_coding_type (&decst, buf, nread,
1910                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1911                                     != CODESYS_AUTODETECT))
1912               break;
1913             nread = Lstream_read (stream, buf, sizeof (buf));
1914             if (nread == 0)
1915               break;
1916           }
1917         while (1);
1918
1919       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1920                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1921         do
1922           {
1923             if (detect_coding_type (&decst, buf, nread, 1))
1924               break;
1925             nread = Lstream_read (stream, buf, sizeof (buf));
1926             if (!nread)
1927               break;
1928           }
1929         while (1);
1930
1931       *eol_type_in_out = decst.eol_type;
1932       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1933         {
1934           if (NILP (coding_system))
1935             *codesys_in_out = coding_system_from_mask (decst.mask);
1936           else
1937             *codesys_in_out = coding_system;
1938         }
1939     }
1940
1941   /* If we absolutely can't determine the EOL type, just assume LF. */
1942   if (*eol_type_in_out == EOL_AUTODETECT)
1943     *eol_type_in_out = EOL_LF;
1944
1945   Lstream_rewind (stream);
1946 }
1947
1948 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1949 Detect coding system of the text in the region between START and END.
1950 Return a list of possible coding systems ordered by priority.
1951 If only ASCII characters are found, return 'undecided or one of
1952 its subsidiary coding systems according to a detected end-of-line
1953 type.  Optional arg BUFFER defaults to the current buffer.
1954 */
1955        (start, end, buffer))
1956 {
1957   Lisp_Object val = Qnil;
1958   struct buffer *buf = decode_buffer (buffer, 0);
1959   Bufpos b, e;
1960   Lisp_Object instream, lb_instream;
1961   Lstream *istr, *lb_istr;
1962   struct detection_state decst;
1963   struct gcpro gcpro1, gcpro2;
1964
1965   get_buffer_range_char (buf, start, end, &b, &e, 0);
1966   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1967   lb_istr = XLSTREAM (lb_instream);
1968   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1969   istr = XLSTREAM (instream);
1970   GCPRO2 (instream, lb_instream);
1971   xzero (decst);
1972   decst.eol_type = EOL_AUTODETECT;
1973   decst.mask = ~0;
1974   while (1)
1975     {
1976       Extbyte random_buffer[4096];
1977       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1978
1979       if (!nread)
1980         break;
1981       if (detect_coding_type (&decst, random_buffer, nread, 0))
1982         break;
1983     }
1984
1985   if (decst.mask == ~0)
1986     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1987                                     decst.eol_type);
1988   else
1989     {
1990       int i;
1991
1992       val = Qnil;
1993 #ifdef MULE
1994       decst.mask = postprocess_iso2022_mask (decst.mask);
1995 #endif
1996       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1997         {
1998           int sys = fcd->coding_category_by_priority[i];
1999           if (decst.mask & (1 << sys))
2000             {
2001               Lisp_Object codesys = fcd->coding_category_system[sys];
2002               if (!NILP (codesys))
2003                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2004               val = Fcons (codesys, val);
2005             }
2006         }
2007     }
2008   Lstream_close (istr);
2009   UNGCPRO;
2010   Lstream_delete (istr);
2011   Lstream_delete (lb_istr);
2012   return val;
2013 }
2014
2015 \f
2016 /************************************************************************/
2017 /*           Converting to internal Mule format ("decoding")            */
2018 /************************************************************************/
2019
2020 /* A decoding stream is a stream used for decoding text (i.e.
2021    converting from some external format to internal format).
2022    The decoding-stream object keeps track of the actual coding
2023    stream, the stream that is at the other end, and data that
2024    needs to be persistent across the lifetime of the stream. */
2025
2026 /* Handle the EOL stuff related to just-read-in character C.
2027    EOL_TYPE is the EOL type of the coding stream.
2028    FLAGS is the current value of FLAGS in the coding stream, and may
2029    be modified by this macro.  (The macro only looks at the
2030    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2031    bytes are to be written.  You need to also define a local goto
2032    label "label_continue_loop" that is at the end of the main
2033    character-reading loop.
2034
2035    If C is a CR character, then this macro handles it entirely and
2036    jumps to label_continue_loop.  Otherwise, this macro does not add
2037    anything to DST, and continues normally.  You should continue
2038    processing C normally after this macro. */
2039
2040 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2041 do {                                                            \
2042   if (c == '\r')                                                \
2043     {                                                           \
2044       if (eol_type == EOL_CR)                                   \
2045         Dynarr_add (dst, '\n');                                 \
2046       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2047         Dynarr_add (dst, c);                                    \
2048       else                                                      \
2049         flags |= CODING_STATE_CR;                               \
2050       goto label_continue_loop;                                 \
2051     }                                                           \
2052   else if (flags & CODING_STATE_CR)                             \
2053     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2054       if (c != '\n')                                            \
2055         Dynarr_add (dst, '\r');                                 \
2056       flags &= ~CODING_STATE_CR;                                \
2057     }                                                           \
2058 } while (0)
2059
2060 /* C should be a binary character in the range 0 - 255; convert
2061    to internal format and add to Dynarr DST. */
2062
2063 #ifdef UTF2000
2064 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2065 do {                                            \
2066   if (BYTE_ASCII_P (c))                         \
2067     Dynarr_add (dst, c);                        \
2068   else                                          \
2069     {                                           \
2070       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2071       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2072     }                                           \
2073 } while (0)
2074
2075 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2076 INLINE_HEADER void
2077 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2078 {
2079   if ( c <= 0x7f )
2080     {
2081       Dynarr_add (dst, c);
2082     }
2083   else if ( c <= 0x7ff )
2084     {
2085       Dynarr_add (dst, (c >> 6) | 0xc0);
2086       Dynarr_add (dst, (c & 0x3f) | 0x80);
2087     }
2088   else if ( c <= 0xffff )
2089     {
2090       Dynarr_add (dst,  (c >> 12) | 0xe0);
2091       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2092       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2093     }
2094   else if ( c <= 0x1fffff )
2095     {
2096       Dynarr_add (dst,  (c >> 18) | 0xf0);
2097       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2098       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2099       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2100     }
2101   else if ( c <= 0x3ffffff )
2102     {
2103       Dynarr_add (dst,  (c >> 24) | 0xf8);
2104       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2107       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2108     }
2109   else
2110     {
2111       Dynarr_add (dst,  (c >> 30) | 0xfc);
2112       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2113       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2114       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2115       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2116       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2117     }
2118 }
2119 #else
2120 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2121 do {                                            \
2122   if (BYTE_ASCII_P (c))                         \
2123     Dynarr_add (dst, c);                        \
2124   else if (BYTE_C1_P (c))                       \
2125     {                                           \
2126       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2127       Dynarr_add (dst, c + 0x20);               \
2128     }                                           \
2129   else                                          \
2130     {                                           \
2131       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2132       Dynarr_add (dst, c);                      \
2133     }                                           \
2134 } while (0)
2135 #endif
2136
2137 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2138 do {                                    \
2139   if (ch)                               \
2140     {                                   \
2141       DECODE_ADD_BINARY_CHAR (ch, dst); \
2142       ch = 0;                           \
2143     }                                   \
2144 } while (0)
2145
2146 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2147 do {                                    \
2148   if (flags & CODING_STATE_END)         \
2149     {                                   \
2150       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2151       if (flags & CODING_STATE_CR)      \
2152         Dynarr_add (dst, '\r');         \
2153     }                                   \
2154 } while (0)
2155
2156 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2157
2158 struct decoding_stream
2159 {
2160   /* Coding system that governs the conversion. */
2161   Lisp_Coding_System *codesys;
2162
2163   /* Stream that we read the encoded data from or
2164      write the decoded data to. */
2165   Lstream *other_end;
2166
2167   /* If we are reading, then we can return only a fixed amount of
2168      data, so if the conversion resulted in too much data, we store it
2169      here for retrieval the next time around. */
2170   unsigned_char_dynarr *runoff;
2171
2172   /* FLAGS holds flags indicating the current state of the decoding.
2173      Some of these flags are dependent on the coding system. */
2174   unsigned int flags;
2175
2176   /* CPOS holds a partially built-up code-point of character. */
2177   unsigned int cpos;
2178
2179   /* EOL_TYPE specifies the type of end-of-line conversion that
2180      currently applies.  We need to keep this separate from the
2181      EOL type stored in CODESYS because the latter might indicate
2182      automatic EOL-type detection while the former will always
2183      indicate a particular EOL type. */
2184   eol_type_t eol_type;
2185 #ifdef MULE
2186   /* Additional ISO2022 information.  We define the structure above
2187      because it's also needed by the detection routines. */
2188   struct iso2022_decoder iso2022;
2189
2190   /* Additional information (the state of the running CCL program)
2191      used by the CCL decoder. */
2192   struct ccl_program ccl;
2193
2194   /* counter for UTF-8 or UCS-4 */
2195   unsigned char counter;
2196 #endif
2197 #ifdef UTF2000
2198   unsigned combined_char_count;
2199   Emchar combined_chars[16];
2200   Lisp_Object combining_table;
2201 #endif
2202   struct detection_state decst;
2203 };
2204
2205 #ifdef UTF2000
2206 extern Lisp_Object Vcharacter_composition_table;
2207
2208 INLINE_HEADER void
2209 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2210 INLINE_HEADER void
2211 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2212 {
2213   unsigned i;
2214
2215   for (i = 0; i < str->combined_char_count; i++)
2216     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2217   str->combined_char_count = 0;
2218   str->combining_table = Qnil;
2219 }
2220
2221 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2222                       unsigned_char_dynarr* dst);
2223 void
2224 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2225                  Emchar character, unsigned_char_dynarr* dst)
2226 {
2227   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2228     DECODE_ADD_UCS_CHAR (character, dst);
2229   else if (!CHAR_ID_TABLE_P (str->combining_table))
2230     {
2231       Lisp_Object ret
2232         = get_char_id_table (XCHAR_ID_TABLE(Vcharacter_composition_table),
2233                              character);
2234
2235       if (NILP (ret))
2236         DECODE_ADD_UCS_CHAR (character, dst);
2237       else
2238         {
2239           str->combined_chars[0] = character;
2240           str->combined_char_count = 1;
2241           str->combining_table = ret;
2242         }
2243     }
2244   else
2245     {
2246       Lisp_Object ret
2247         = get_char_id_table (XCHAR_ID_TABLE(str->combining_table),
2248                              character);
2249
2250       if (CHARP (ret))
2251         {
2252           Emchar char2 = XCHARVAL (ret);
2253           ret =
2254             get_char_id_table (XCHAR_ID_TABLE(Vcharacter_composition_table),
2255                                char2);
2256           if (NILP (ret))
2257             {
2258               DECODE_ADD_UCS_CHAR (char2, dst);
2259               str->combined_char_count = 0;
2260               str->combining_table = Qnil;
2261             }
2262           else
2263             {
2264               str->combined_chars[0] = char2;
2265               str->combined_char_count = 1;
2266               str->combining_table = ret;
2267             }
2268         }
2269       else if (CHAR_ID_TABLE_P (ret))
2270         {
2271           str->combined_chars[str->combined_char_count++] = character;
2272           str->combining_table = ret;
2273         }
2274       else
2275         {
2276           COMPOSE_FLUSH_CHARS (str, dst);
2277           DECODE_ADD_UCS_CHAR (character, dst);
2278         }
2279     }
2280 }
2281 #else /* not UTF2000 */
2282 #define COMPOSE_FLUSH_CHARS(str, dst)
2283 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2284 #endif /* UTF2000 */
2285
2286 static ssize_t decoding_reader (Lstream *stream,
2287                                 unsigned char *data, size_t size);
2288 static ssize_t decoding_writer (Lstream *stream,
2289                                 const unsigned char *data, size_t size);
2290 static int decoding_rewinder   (Lstream *stream);
2291 static int decoding_seekable_p (Lstream *stream);
2292 static int decoding_flusher    (Lstream *stream);
2293 static int decoding_closer     (Lstream *stream);
2294
2295 static Lisp_Object decoding_marker (Lisp_Object stream);
2296
2297 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2298                                sizeof (struct decoding_stream));
2299
2300 static Lisp_Object
2301 decoding_marker (Lisp_Object stream)
2302 {
2303   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2304   Lisp_Object str_obj;
2305
2306   /* We do not need to mark the coding systems or charsets stored
2307      within the stream because they are stored in a global list
2308      and automatically marked. */
2309
2310   XSETLSTREAM (str_obj, str);
2311   mark_object (str_obj);
2312   if (str->imp->marker)
2313     return (str->imp->marker) (str_obj);
2314   else
2315     return Qnil;
2316 }
2317
2318 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2319    so we read data from the other end, decode it, and store it into DATA. */
2320
2321 static ssize_t
2322 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2323 {
2324   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2325   unsigned char *orig_data = data;
2326   ssize_t read_size;
2327   int error_occurred = 0;
2328
2329   /* We need to interface to mule_decode(), which expects to take some
2330      amount of data and store the result into a Dynarr.  We have
2331      mule_decode() store into str->runoff, and take data from there
2332      as necessary. */
2333
2334   /* We loop until we have enough data, reading chunks from the other
2335      end and decoding it. */
2336   while (1)
2337     {
2338       /* Take data from the runoff if we can.  Make sure to take at
2339          most SIZE bytes, and delete the data from the runoff. */
2340       if (Dynarr_length (str->runoff) > 0)
2341         {
2342           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2343           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2344           Dynarr_delete_many (str->runoff, 0, chunk);
2345           data += chunk;
2346           size -= chunk;
2347         }
2348
2349       if (size == 0)
2350         break; /* No more room for data */
2351
2352       if (str->flags & CODING_STATE_END)
2353         /* This means that on the previous iteration, we hit the EOF on
2354            the other end.  We loop once more so that mule_decode() can
2355            output any final stuff it may be holding, or any "go back
2356            to a sane state" escape sequences. (This latter makes sense
2357            during encoding.) */
2358         break;
2359
2360       /* Exhausted the runoff, so get some more.  DATA has at least
2361          SIZE bytes left of storage in it, so it's OK to read directly
2362          into it.  (We'll be overwriting above, after we've decoded it
2363          into the runoff.) */
2364       read_size = Lstream_read (str->other_end, data, size);
2365       if (read_size < 0)
2366         {
2367           error_occurred = 1;
2368           break;
2369         }
2370       if (read_size == 0)
2371         /* There might be some more end data produced in the translation.
2372            See the comment above. */
2373         str->flags |= CODING_STATE_END;
2374       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2375     }
2376
2377   if (data - orig_data == 0)
2378     return error_occurred ? -1 : 0;
2379   else
2380     return data - orig_data;
2381 }
2382
2383 static ssize_t
2384 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2385 {
2386   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2387   ssize_t retval;
2388
2389   /* Decode all our data into the runoff, and then attempt to write
2390      it all out to the other end.  Remove whatever chunk we succeeded
2391      in writing. */
2392   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2393   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2394                           Dynarr_length (str->runoff));
2395   if (retval > 0)
2396     Dynarr_delete_many (str->runoff, 0, retval);
2397   /* Do NOT return retval.  The return value indicates how much
2398      of the incoming data was written, not how many bytes were
2399      written. */
2400   return size;
2401 }
2402
2403 static void
2404 reset_decoding_stream (struct decoding_stream *str)
2405 {
2406 #ifdef MULE
2407   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2408     {
2409       Lisp_Object coding_system;
2410       XSETCODING_SYSTEM (coding_system, str->codesys);
2411       reset_iso2022 (coding_system, &str->iso2022);
2412     }
2413   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2414     {
2415       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2416     }
2417   str->counter = 0;
2418 #endif /* MULE */
2419 #ifdef UTF2000
2420   str->combined_char_count = 0;
2421   str->combining_table = Qnil;
2422 #endif
2423   str->flags = str->cpos = 0;
2424 }
2425
2426 static int
2427 decoding_rewinder (Lstream *stream)
2428 {
2429   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2430   reset_decoding_stream (str);
2431   Dynarr_reset (str->runoff);
2432   return Lstream_rewind (str->other_end);
2433 }
2434
2435 static int
2436 decoding_seekable_p (Lstream *stream)
2437 {
2438   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2439   return Lstream_seekable_p (str->other_end);
2440 }
2441
2442 static int
2443 decoding_flusher (Lstream *stream)
2444 {
2445   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2446   return Lstream_flush (str->other_end);
2447 }
2448
2449 static int
2450 decoding_closer (Lstream *stream)
2451 {
2452   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2453   if (stream->flags & LSTREAM_FL_WRITE)
2454     {
2455       str->flags |= CODING_STATE_END;
2456       decoding_writer (stream, 0, 0);
2457     }
2458   Dynarr_free (str->runoff);
2459 #ifdef MULE
2460 #ifdef ENABLE_COMPOSITE_CHARS
2461   if (str->iso2022.composite_chars)
2462     Dynarr_free (str->iso2022.composite_chars);
2463 #endif
2464 #endif
2465   return Lstream_close (str->other_end);
2466 }
2467
2468 Lisp_Object
2469 decoding_stream_coding_system (Lstream *stream)
2470 {
2471   Lisp_Object coding_system;
2472   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2473
2474   XSETCODING_SYSTEM (coding_system, str->codesys);
2475   return subsidiary_coding_system (coding_system, str->eol_type);
2476 }
2477
2478 void
2479 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2480 {
2481   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2482   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2483   str->codesys = cs;
2484   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2485     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2486   reset_decoding_stream (str);
2487 }
2488
2489 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2490    stream for writing, no automatic code detection will be performed.
2491    The reason for this is that automatic code detection requires a
2492    seekable input.  Things will also fail if you open a decoding
2493    stream for reading using a non-fully-specified coding system and
2494    a non-seekable input stream. */
2495
2496 static Lisp_Object
2497 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2498                         const char *mode)
2499 {
2500   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2501   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2502   Lisp_Object obj;
2503
2504   xzero (*str);
2505   str->other_end = stream;
2506   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2507   str->eol_type = EOL_AUTODETECT;
2508   if (!strcmp (mode, "r")
2509       && Lstream_seekable_p (stream))
2510     /* We can determine the coding system now. */
2511     determine_real_coding_system (stream, &codesys, &str->eol_type);
2512   set_decoding_stream_coding_system (lstr, codesys);
2513   str->decst.eol_type = str->eol_type;
2514   str->decst.mask = ~0;
2515   XSETLSTREAM (obj, lstr);
2516   return obj;
2517 }
2518
2519 Lisp_Object
2520 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2521 {
2522   return make_decoding_stream_1 (stream, codesys, "r");
2523 }
2524
2525 Lisp_Object
2526 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2527 {
2528   return make_decoding_stream_1 (stream, codesys, "w");
2529 }
2530
2531 /* Note: the decode_coding_* functions all take the same
2532    arguments as mule_decode(), which is to say some SRC data of
2533    size N, which is to be stored into dynamic array DST.
2534    DECODING is the stream within which the decoding is
2535    taking place, but no data is actually read from or
2536    written to that stream; that is handled in decoding_reader()
2537    or decoding_writer().  This allows the same functions to
2538    be used for both reading and writing. */
2539
2540 static void
2541 mule_decode (Lstream *decoding, const Extbyte *src,
2542              unsigned_char_dynarr *dst, size_t n)
2543 {
2544   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2545
2546   /* If necessary, do encoding-detection now.  We do this when
2547      we're a writing stream or a non-seekable reading stream,
2548      meaning that we can't just process the whole input,
2549      rewind, and start over. */
2550
2551   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2552       str->eol_type == EOL_AUTODETECT)
2553     {
2554       Lisp_Object codesys;
2555
2556       XSETCODING_SYSTEM (codesys, str->codesys);
2557       detect_coding_type (&str->decst, src, n,
2558                           CODING_SYSTEM_TYPE (str->codesys) !=
2559                           CODESYS_AUTODETECT);
2560       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2561           str->decst.mask != ~0)
2562         /* #### This is cheesy.  What we really ought to do is
2563            buffer up a certain amount of data so as to get a
2564            less random result. */
2565         codesys = coding_system_from_mask (str->decst.mask);
2566       str->eol_type = str->decst.eol_type;
2567       if (XCODING_SYSTEM (codesys) != str->codesys)
2568         {
2569           /* Preserve the CODING_STATE_END flag in case it was set.
2570              If we erase it, bad things might happen. */
2571           int was_end = str->flags & CODING_STATE_END;
2572           set_decoding_stream_coding_system (decoding, codesys);
2573           if (was_end)
2574             str->flags |= CODING_STATE_END;
2575         }
2576     }
2577
2578   switch (CODING_SYSTEM_TYPE (str->codesys))
2579     {
2580 #ifdef DEBUG_XEMACS
2581     case CODESYS_INTERNAL:
2582       Dynarr_add_many (dst, src, n);
2583       break;
2584 #endif
2585     case CODESYS_AUTODETECT:
2586       /* If we got this far and still haven't decided on the coding
2587          system, then do no conversion. */
2588     case CODESYS_NO_CONVERSION:
2589       decode_coding_no_conversion (decoding, src, dst, n);
2590       break;
2591 #ifdef MULE
2592     case CODESYS_SHIFT_JIS:
2593       decode_coding_sjis (decoding, src, dst, n);
2594       break;
2595     case CODESYS_BIG5:
2596       decode_coding_big5 (decoding, src, dst, n);
2597       break;
2598     case CODESYS_UCS4:
2599       decode_coding_ucs4 (decoding, src, dst, n);
2600       break;
2601     case CODESYS_UTF8:
2602       decode_coding_utf8 (decoding, src, dst, n);
2603       break;
2604     case CODESYS_CCL:
2605       str->ccl.last_block = str->flags & CODING_STATE_END;
2606       /* When applying ccl program to stream, MUST NOT set NULL
2607          pointer to src.  */
2608       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2609                   dst, n, 0, CCL_MODE_DECODING);
2610       break;
2611     case CODESYS_ISO2022:
2612       decode_coding_iso2022 (decoding, src, dst, n);
2613       break;
2614 #endif /* MULE */
2615     default:
2616       abort ();
2617     }
2618 }
2619
2620 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2621 Decode the text between START and END which is encoded in CODING-SYSTEM.
2622 This is useful if you've read in encoded text from a file without decoding
2623 it (e.g. you read in a JIS-formatted file but used the `binary' or
2624 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2625 Return length of decoded text.
2626 BUFFER defaults to the current buffer if unspecified.
2627 */
2628        (start, end, coding_system, buffer))
2629 {
2630   Bufpos b, e;
2631   struct buffer *buf = decode_buffer (buffer, 0);
2632   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2633   Lstream *istr, *ostr;
2634   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2635
2636   get_buffer_range_char (buf, start, end, &b, &e, 0);
2637
2638   barf_if_buffer_read_only (buf, b, e);
2639
2640   coding_system = Fget_coding_system (coding_system);
2641   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2642   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2643   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2644                                               coding_system);
2645   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2646                                            Fget_coding_system (Qbinary));
2647   istr = XLSTREAM (instream);
2648   ostr = XLSTREAM (outstream);
2649   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2650
2651   /* The chain of streams looks like this:
2652
2653      [BUFFER] <----- send through
2654                      ------> [ENCODE AS BINARY]
2655                              ------> [DECODE AS SPECIFIED]
2656                                      ------> [BUFFER]
2657    */
2658
2659   while (1)
2660     {
2661       char tempbuf[1024]; /* some random amount */
2662       Bufpos newpos, even_newer_pos;
2663       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2664       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2665
2666       if (!size_in_bytes)
2667         break;
2668       newpos = lisp_buffer_stream_startpos (istr);
2669       Lstream_write (ostr, tempbuf, size_in_bytes);
2670       even_newer_pos = lisp_buffer_stream_startpos (istr);
2671       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2672                            even_newer_pos, 0);
2673     }
2674   Lstream_close (istr);
2675   Lstream_close (ostr);
2676   UNGCPRO;
2677   Lstream_delete (istr);
2678   Lstream_delete (ostr);
2679   Lstream_delete (XLSTREAM (de_outstream));
2680   Lstream_delete (XLSTREAM (lb_outstream));
2681   return Qnil;
2682 }
2683
2684 \f
2685 /************************************************************************/
2686 /*           Converting to an external encoding ("encoding")            */
2687 /************************************************************************/
2688
2689 /* An encoding stream is an output stream.  When you create the
2690    stream, you specify the coding system that governs the encoding
2691    and another stream that the resulting encoded data is to be
2692    sent to, and then start sending data to it. */
2693
2694 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2695
2696 struct encoding_stream
2697 {
2698   /* Coding system that governs the conversion. */
2699   Lisp_Coding_System *codesys;
2700
2701   /* Stream that we read the encoded data from or
2702      write the decoded data to. */
2703   Lstream *other_end;
2704
2705   /* If we are reading, then we can return only a fixed amount of
2706      data, so if the conversion resulted in too much data, we store it
2707      here for retrieval the next time around. */
2708   unsigned_char_dynarr *runoff;
2709
2710   /* FLAGS holds flags indicating the current state of the encoding.
2711      Some of these flags are dependent on the coding system. */
2712   unsigned int flags;
2713
2714   /* CH holds a partially built-up character.  Since we only deal
2715      with one- and two-byte characters at the moment, we only use
2716      this to store the first byte of a two-byte character. */
2717   unsigned int ch;
2718 #ifdef MULE
2719   /* Additional information used by the ISO2022 encoder. */
2720   struct
2721     {
2722       /* CHARSET holds the character sets currently assigned to the G0
2723          through G3 registers.  It is initialized from the array
2724          INITIAL_CHARSET in CODESYS. */
2725       Lisp_Object charset[4];
2726
2727       /* Which registers are currently invoked into the left (GL) and
2728          right (GR) halves of the 8-bit encoding space? */
2729       int register_left, register_right;
2730
2731       /* Whether we need to explicitly designate the charset in the
2732          G? register before using it.  It is initialized from the
2733          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2734       unsigned char force_charset_on_output[4];
2735
2736       /* Other state variables that need to be preserved across
2737          invocations. */
2738       Lisp_Object current_charset;
2739       int current_half;
2740       int current_char_boundary;
2741     } iso2022;
2742
2743   void (*encode_char) (struct encoding_stream *str, Emchar c,
2744                        unsigned_char_dynarr *dst, unsigned int *flags);
2745   void (*finish) (struct encoding_stream *str,
2746                   unsigned_char_dynarr *dst, unsigned int *flags);
2747
2748   /* Additional information (the state of the running CCL program)
2749      used by the CCL encoder. */
2750   struct ccl_program ccl;
2751 #endif /* MULE */
2752 };
2753
2754 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2755 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2756                                 size_t size);
2757 static int encoding_rewinder   (Lstream *stream);
2758 static int encoding_seekable_p (Lstream *stream);
2759 static int encoding_flusher    (Lstream *stream);
2760 static int encoding_closer     (Lstream *stream);
2761
2762 static Lisp_Object encoding_marker (Lisp_Object stream);
2763
2764 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2765                                sizeof (struct encoding_stream));
2766
2767 static Lisp_Object
2768 encoding_marker (Lisp_Object stream)
2769 {
2770   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2771   Lisp_Object str_obj;
2772
2773   /* We do not need to mark the coding systems or charsets stored
2774      within the stream because they are stored in a global list
2775      and automatically marked. */
2776
2777   XSETLSTREAM (str_obj, str);
2778   mark_object (str_obj);
2779   if (str->imp->marker)
2780     return (str->imp->marker) (str_obj);
2781   else
2782     return Qnil;
2783 }
2784
2785 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2786    so we read data from the other end, encode it, and store it into DATA. */
2787
2788 static ssize_t
2789 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2790 {
2791   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2792   unsigned char *orig_data = data;
2793   ssize_t read_size;
2794   int error_occurred = 0;
2795
2796   /* We need to interface to mule_encode(), which expects to take some
2797      amount of data and store the result into a Dynarr.  We have
2798      mule_encode() store into str->runoff, and take data from there
2799      as necessary. */
2800
2801   /* We loop until we have enough data, reading chunks from the other
2802      end and encoding it. */
2803   while (1)
2804     {
2805       /* Take data from the runoff if we can.  Make sure to take at
2806          most SIZE bytes, and delete the data from the runoff. */
2807       if (Dynarr_length (str->runoff) > 0)
2808         {
2809           int chunk = min ((int) size, Dynarr_length (str->runoff));
2810           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2811           Dynarr_delete_many (str->runoff, 0, chunk);
2812           data += chunk;
2813           size -= chunk;
2814         }
2815
2816       if (size == 0)
2817         break; /* No more room for data */
2818
2819       if (str->flags & CODING_STATE_END)
2820         /* This means that on the previous iteration, we hit the EOF on
2821            the other end.  We loop once more so that mule_encode() can
2822            output any final stuff it may be holding, or any "go back
2823            to a sane state" escape sequences. (This latter makes sense
2824            during encoding.) */
2825         break;
2826
2827       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2828          left of storage in it, so it's OK to read directly into it.
2829          (We'll be overwriting above, after we've encoded it into the
2830          runoff.) */
2831       read_size = Lstream_read (str->other_end, data, size);
2832       if (read_size < 0)
2833         {
2834           error_occurred = 1;
2835           break;
2836         }
2837       if (read_size == 0)
2838         /* There might be some more end data produced in the translation.
2839            See the comment above. */
2840         str->flags |= CODING_STATE_END;
2841       mule_encode (stream, data, str->runoff, read_size);
2842     }
2843
2844   if (data == orig_data)
2845     return error_occurred ? -1 : 0;
2846   else
2847     return data - orig_data;
2848 }
2849
2850 static ssize_t
2851 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2852 {
2853   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2854   ssize_t retval;
2855
2856   /* Encode all our data into the runoff, and then attempt to write
2857      it all out to the other end.  Remove whatever chunk we succeeded
2858      in writing. */
2859   mule_encode (stream, data, str->runoff, size);
2860   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2861                           Dynarr_length (str->runoff));
2862   if (retval > 0)
2863     Dynarr_delete_many (str->runoff, 0, retval);
2864   /* Do NOT return retval.  The return value indicates how much
2865      of the incoming data was written, not how many bytes were
2866      written. */
2867   return size;
2868 }
2869
2870 static void
2871 reset_encoding_stream (struct encoding_stream *str)
2872 {
2873 #ifdef MULE
2874   switch (CODING_SYSTEM_TYPE (str->codesys))
2875     {
2876     case CODESYS_ISO2022:
2877       {
2878         int i;
2879
2880         str->encode_char = &char_encode_iso2022;
2881         str->finish = &char_finish_iso2022;
2882         for (i = 0; i < 4; i++)
2883           {
2884             str->iso2022.charset[i] =
2885               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2886             str->iso2022.force_charset_on_output[i] =
2887               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2888           }
2889         str->iso2022.register_left = 0;
2890         str->iso2022.register_right = 1;
2891         str->iso2022.current_charset = Qnil;
2892         str->iso2022.current_half = 0;
2893         break;
2894       }
2895     case CODESYS_CCL:
2896       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2897       break;
2898     case CODESYS_UTF8:
2899       str->encode_char = &char_encode_utf8;
2900       str->finish = &char_finish_utf8;
2901       break;
2902     case CODESYS_UCS4:
2903       str->encode_char = &char_encode_ucs4;
2904       str->finish = &char_finish_ucs4;
2905       break;
2906     case CODESYS_SHIFT_JIS:
2907       str->encode_char = &char_encode_shift_jis;
2908       str->finish = &char_finish_shift_jis;
2909       break;
2910     case CODESYS_BIG5:
2911       str->encode_char = &char_encode_big5;
2912       str->finish = &char_finish_big5;
2913       break;
2914     default:
2915       break;
2916     }
2917 #endif /* MULE */
2918   str->iso2022.current_char_boundary = 0;
2919   str->flags = str->ch = 0;
2920 }
2921
2922 static int
2923 encoding_rewinder (Lstream *stream)
2924 {
2925   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2926   reset_encoding_stream (str);
2927   Dynarr_reset (str->runoff);
2928   return Lstream_rewind (str->other_end);
2929 }
2930
2931 static int
2932 encoding_seekable_p (Lstream *stream)
2933 {
2934   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2935   return Lstream_seekable_p (str->other_end);
2936 }
2937
2938 static int
2939 encoding_flusher (Lstream *stream)
2940 {
2941   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2942   return Lstream_flush (str->other_end);
2943 }
2944
2945 static int
2946 encoding_closer (Lstream *stream)
2947 {
2948   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2949   if (stream->flags & LSTREAM_FL_WRITE)
2950     {
2951       str->flags |= CODING_STATE_END;
2952       encoding_writer (stream, 0, 0);
2953     }
2954   Dynarr_free (str->runoff);
2955   return Lstream_close (str->other_end);
2956 }
2957
2958 Lisp_Object
2959 encoding_stream_coding_system (Lstream *stream)
2960 {
2961   Lisp_Object coding_system;
2962   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2963
2964   XSETCODING_SYSTEM (coding_system, str->codesys);
2965   return coding_system;
2966 }
2967
2968 void
2969 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2970 {
2971   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2972   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2973   str->codesys = cs;
2974   reset_encoding_stream (str);
2975 }
2976
2977 static Lisp_Object
2978 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2979                         const char *mode)
2980 {
2981   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2982   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2983   Lisp_Object obj;
2984
2985   xzero (*str);
2986   str->runoff = Dynarr_new (unsigned_char);
2987   str->other_end = stream;
2988   set_encoding_stream_coding_system (lstr, codesys);
2989   XSETLSTREAM (obj, lstr);
2990   return obj;
2991 }
2992
2993 Lisp_Object
2994 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2995 {
2996   return make_encoding_stream_1 (stream, codesys, "r");
2997 }
2998
2999 Lisp_Object
3000 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3001 {
3002   return make_encoding_stream_1 (stream, codesys, "w");
3003 }
3004
3005 /* Convert N bytes of internally-formatted data stored in SRC to an
3006    external format, according to the encoding stream ENCODING.
3007    Store the encoded data into DST. */
3008
3009 static void
3010 mule_encode (Lstream *encoding, const Bufbyte *src,
3011              unsigned_char_dynarr *dst, size_t n)
3012 {
3013   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3014
3015   switch (CODING_SYSTEM_TYPE (str->codesys))
3016     {
3017 #ifdef DEBUG_XEMACS
3018     case CODESYS_INTERNAL:
3019       Dynarr_add_many (dst, src, n);
3020       break;
3021 #endif
3022     case CODESYS_AUTODETECT:
3023       /* If we got this far and still haven't decided on the coding
3024          system, then do no conversion. */
3025     case CODESYS_NO_CONVERSION:
3026       encode_coding_no_conversion (encoding, src, dst, n);
3027       break;
3028 #ifdef MULE
3029     case CODESYS_CCL:
3030       str->ccl.last_block = str->flags & CODING_STATE_END;
3031       /* When applying ccl program to stream, MUST NOT set NULL
3032          pointer to src.  */
3033       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3034                   dst, n, 0, CCL_MODE_ENCODING);
3035       break;
3036 #endif /* MULE */
3037     default:
3038       text_encode_generic (encoding, src, dst, n);
3039     }
3040 }
3041
3042 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3043 Encode the text between START and END using CODING-SYSTEM.
3044 This will, for example, convert Japanese characters into stuff such as
3045 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3046 text.  BUFFER defaults to the current buffer if unspecified.
3047 */
3048        (start, end, coding_system, buffer))
3049 {
3050   Bufpos b, e;
3051   struct buffer *buf = decode_buffer (buffer, 0);
3052   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3053   Lstream *istr, *ostr;
3054   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3055
3056   get_buffer_range_char (buf, start, end, &b, &e, 0);
3057
3058   barf_if_buffer_read_only (buf, b, e);
3059
3060   coding_system = Fget_coding_system (coding_system);
3061   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3062   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3063   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3064                                               Fget_coding_system (Qbinary));
3065   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3066                                            coding_system);
3067   istr = XLSTREAM (instream);
3068   ostr = XLSTREAM (outstream);
3069   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3070   /* The chain of streams looks like this:
3071
3072      [BUFFER] <----- send through
3073                      ------> [ENCODE AS SPECIFIED]
3074                              ------> [DECODE AS BINARY]
3075                                      ------> [BUFFER]
3076    */
3077   while (1)
3078     {
3079       char tempbuf[1024]; /* some random amount */
3080       Bufpos newpos, even_newer_pos;
3081       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3082       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3083
3084       if (!size_in_bytes)
3085         break;
3086       newpos = lisp_buffer_stream_startpos (istr);
3087       Lstream_write (ostr, tempbuf, size_in_bytes);
3088       even_newer_pos = lisp_buffer_stream_startpos (istr);
3089       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3090                            even_newer_pos, 0);
3091     }
3092
3093   {
3094     Charcount retlen =
3095       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3096     Lstream_close (istr);
3097     Lstream_close (ostr);
3098     UNGCPRO;
3099     Lstream_delete (istr);
3100     Lstream_delete (ostr);
3101     Lstream_delete (XLSTREAM (de_outstream));
3102     Lstream_delete (XLSTREAM (lb_outstream));
3103     return make_int (retlen);
3104   }
3105 }
3106
3107 #ifdef MULE
3108 \f
3109 static void
3110 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3111                      unsigned_char_dynarr *dst, size_t n)
3112 {
3113   unsigned char c;
3114   unsigned char char_boundary;
3115   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3116   unsigned int flags          = str->flags;
3117   Emchar ch                   = str->ch;
3118
3119   char_boundary = str->iso2022.current_char_boundary;
3120
3121   while (n--)
3122     {
3123       c = *src++;
3124
3125       if (char_boundary == 0)
3126         {
3127           if (c >= 0xfc)
3128             {
3129               ch = c & 0x01;
3130               char_boundary = 5;
3131             }
3132           else if (c >= 0xf8)
3133             {
3134               ch = c & 0x03;
3135               char_boundary = 4;
3136             }
3137           else if (c >= 0xf0)
3138             {
3139               ch = c & 0x07;
3140               char_boundary = 3;
3141             }
3142           else if (c >= 0xe0)
3143             {
3144               ch = c & 0x0f;
3145               char_boundary = 2;
3146             }
3147           else if (c >= 0xc0)
3148             {
3149               ch = c & 0x1f;
3150               char_boundary = 1;
3151             }
3152           else
3153             (*str->encode_char) (str, c, dst, &flags);
3154         }
3155       else if (char_boundary == 1)
3156         {
3157           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3158           ch =0;
3159           char_boundary = 0;
3160         }
3161       else
3162         {
3163           ch = (ch << 6) | (c & 0x3f);
3164           char_boundary--;
3165         }
3166     }
3167
3168   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3169     {
3170       (*str->finish) (str, dst, &flags);
3171     }
3172
3173   str->flags = flags;
3174   str->ch    = ch;
3175   str->iso2022.current_char_boundary = char_boundary;
3176 }
3177
3178 \f
3179 /************************************************************************/
3180 /*                          Shift-JIS methods                           */
3181 /************************************************************************/
3182
3183 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3184    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3185    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3186    encoded by "position-code + 0x80".  A character of JISX0208
3187    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3188    position-codes are divided and shifted so that it fit in the range
3189    below.
3190
3191    --- CODE RANGE of Shift-JIS ---
3192    (character set)      (range)
3193    ASCII                0x00 .. 0x7F
3194    JISX0201-Kana        0xA0 .. 0xDF
3195    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3196             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3197    -------------------------------
3198
3199 */
3200
3201 /* Is this the first byte of a Shift-JIS two-byte char? */
3202
3203 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3204   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3205
3206 /* Is this the second byte of a Shift-JIS two-byte char? */
3207
3208 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3209   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3210
3211 #define BYTE_SJIS_KATAKANA_P(c) \
3212   ((c) >= 0xA1 && (c) <= 0xDF)
3213
3214 static int
3215 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3216 {
3217   while (n--)
3218     {
3219       unsigned char c = *(unsigned char *)src++;
3220       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3221         return 0;
3222       if (st->shift_jis.in_second_byte)
3223         {
3224           st->shift_jis.in_second_byte = 0;
3225           if (c < 0x40)
3226             return 0;
3227         }
3228       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3229         st->shift_jis.in_second_byte = 1;
3230     }
3231   return CODING_CATEGORY_SHIFT_JIS_MASK;
3232 }
3233
3234 /* Convert Shift-JIS data to internal format. */
3235
3236 static void
3237 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3238                     unsigned_char_dynarr *dst, size_t n)
3239 {
3240   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3241   unsigned int flags  = str->flags;
3242   unsigned int cpos   = str->cpos;
3243   eol_type_t eol_type = str->eol_type;
3244
3245   while (n--)
3246     {
3247       unsigned char c = *(unsigned char *)src++;
3248
3249       if (cpos)
3250         {
3251           /* Previous character was first byte of Shift-JIS Kanji char. */
3252           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3253             {
3254               unsigned char e1, e2;
3255
3256               DECODE_SJIS (cpos, c, e1, e2);
3257 #ifdef UTF2000
3258               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3259                                             e1 & 0x7F,
3260                                             e2 & 0x7F), dst);
3261 #else
3262               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3263               Dynarr_add (dst, e1);
3264               Dynarr_add (dst, e2);
3265 #endif
3266             }
3267           else
3268             {
3269               DECODE_ADD_BINARY_CHAR (cpos, dst);
3270               DECODE_ADD_BINARY_CHAR (c, dst);
3271             }
3272           cpos = 0;
3273         }
3274       else
3275         {
3276           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3277           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3278             cpos = c;
3279           else if (BYTE_SJIS_KATAKANA_P (c))
3280             {
3281 #ifdef UTF2000
3282               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3283                                             c & 0x7F, 0), dst);
3284 #else
3285               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3286               Dynarr_add (dst, c);
3287 #endif
3288             }
3289 #ifdef UTF2000
3290           else if (c > 32)
3291             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3292                                           c, 0), dst);
3293 #endif
3294           else
3295             DECODE_ADD_BINARY_CHAR (c, dst);
3296         }
3297     label_continue_loop:;
3298     }
3299
3300   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3301
3302   str->flags = flags;
3303   str->cpos  = cpos;
3304 }
3305
3306 /* Convert internal character representation to Shift_JIS. */
3307
3308 void
3309 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3310                        unsigned_char_dynarr *dst, unsigned int *flags)
3311 {
3312   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3313
3314   if (ch == '\n')
3315     {
3316       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3317         Dynarr_add (dst, '\r');
3318       if (eol_type != EOL_CR)
3319         Dynarr_add (dst, ch);
3320     }
3321   else
3322     {
3323       unsigned int s1, s2;
3324 #ifdef UTF2000
3325       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3326
3327       if (code_point >= 0)
3328         Dynarr_add (dst, code_point);
3329       else if ((code_point
3330                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3331                >= 0)
3332         {
3333           ENCODE_SJIS ((code_point >> 8) | 0x80,
3334                        (code_point & 0xFF) | 0x80, s1, s2);
3335           Dynarr_add (dst, s1);
3336           Dynarr_add (dst, s2);
3337         }
3338       else if ((code_point
3339                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3340                >= 0)
3341         Dynarr_add (dst, code_point | 0x80);
3342       else if ((code_point
3343                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3344                >= 0)
3345         {
3346           ENCODE_SJIS ((code_point >> 8) | 0x80,
3347                        (code_point & 0xFF) | 0x80, s1, s2);
3348           Dynarr_add (dst, s1);
3349           Dynarr_add (dst, s2);
3350         }
3351       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3352                >= 0)
3353         Dynarr_add (dst, code_point);
3354       else
3355         Dynarr_add (dst, '?');
3356 #else
3357       Lisp_Object charset;
3358       unsigned int c1, c2;
3359
3360       BREAKUP_CHAR (ch, charset, c1, c2);
3361
3362       if (EQ(charset, Vcharset_katakana_jisx0201))
3363         {
3364           Dynarr_add (dst, c1 | 0x80);
3365         }
3366       else if (c2 == 0)
3367         {
3368           Dynarr_add (dst, c1);
3369         }
3370       else if (EQ(charset, Vcharset_japanese_jisx0208))
3371         {
3372           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3373           Dynarr_add (dst, s1);
3374           Dynarr_add (dst, s2);
3375         }
3376       else
3377         Dynarr_add (dst, '?');
3378 #endif
3379     }
3380 }
3381
3382 void
3383 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3384                        unsigned int *flags)
3385 {
3386 }
3387
3388 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3389 Decode a JISX0208 character of Shift-JIS coding-system.
3390 CODE is the character code in Shift-JIS as a cons of type bytes.
3391 Return the corresponding character.
3392 */
3393        (code))
3394 {
3395   unsigned char c1, c2, s1, s2;
3396
3397   CHECK_CONS (code);
3398   CHECK_INT (XCAR (code));
3399   CHECK_INT (XCDR (code));
3400   s1 = XINT (XCAR (code));
3401   s2 = XINT (XCDR (code));
3402   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3403       BYTE_SJIS_TWO_BYTE_2_P (s2))
3404     {
3405       DECODE_SJIS (s1, s2, c1, c2);
3406       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3407                                    c1 & 0x7F, c2 & 0x7F));
3408     }
3409   else
3410     return Qnil;
3411 }
3412
3413 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3414 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3415 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3416 */
3417        (character))
3418 {
3419   Lisp_Object charset;
3420   int c1, c2, s1, s2;
3421
3422   CHECK_CHAR_COERCE_INT (character);
3423   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3424   if (EQ (charset, Vcharset_japanese_jisx0208))
3425     {
3426       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3427       return Fcons (make_int (s1), make_int (s2));
3428     }
3429   else
3430     return Qnil;
3431 }
3432
3433 \f
3434 /************************************************************************/
3435 /*                            Big5 methods                              */
3436 /************************************************************************/
3437
3438 /* BIG5 is a coding system encoding two character sets: ASCII and
3439    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3440    character set and is encoded in two-byte.
3441
3442    --- CODE RANGE of BIG5 ---
3443    (character set)      (range)
3444    ASCII                0x00 .. 0x7F
3445    Big5 (1st byte)      0xA1 .. 0xFE
3446         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3447    --------------------------
3448
3449    Since the number of characters in Big5 is larger than maximum
3450    characters in Emacs' charset (96x96), it can't be handled as one
3451    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3452    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3453    contains frequently used characters and the latter contains less
3454    frequently used characters.  */
3455
3456 #ifdef UTF2000
3457 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3458   ((c) >= 0x81 && (c) <= 0xFE)
3459 #else
3460 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3461   ((c) >= 0xA1 && (c) <= 0xFE)
3462 #endif
3463
3464 /* Is this the second byte of a Shift-JIS two-byte char? */
3465
3466 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3467   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3468
3469 /* Number of Big5 characters which have the same code in 1st byte.  */
3470
3471 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3472
3473 /* Code conversion macros.  These are macros because they are used in
3474    inner loops during code conversion.
3475
3476    Note that temporary variables in macros introduce the classic
3477    dynamic-scoping problems with variable names.  We use capital-
3478    lettered variables in the assumption that XEmacs does not use
3479    capital letters in variables except in a very formalized way
3480    (e.g. Qstring). */
3481
3482 /* Convert Big5 code (b1, b2) into its internal string representation
3483    (lb, c1, c2). */
3484
3485 /* There is a much simpler way to split the Big5 charset into two.
3486    For the moment I'm going to leave the algorithm as-is because it
3487    claims to separate out the most-used characters into a single
3488    charset, which perhaps will lead to optimizations in various
3489    places.
3490
3491    The way the algorithm works is something like this:
3492
3493    Big5 can be viewed as a 94x157 charset, where the row is
3494    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3495    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3496    the split between low and high column numbers is apparently
3497    meaningless; ascending rows produce less and less frequent chars.
3498    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3499    the first charset, and the upper half (0xC9 .. 0xFE) to the
3500    second.  To do the conversion, we convert the character into
3501    a single number where 0 .. 156 is the first row, 157 .. 313
3502    is the second, etc.  That way, the characters are ordered by
3503    decreasing frequency.  Then we just chop the space in two
3504    and coerce the result into a 94x94 space.
3505    */
3506
3507 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3508 {                                                                       \
3509   int B1 = b1, B2 = b2;                                                 \
3510   unsigned int I                                                        \
3511     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3512                                                                         \
3513   if (B1 < 0xC9)                                                        \
3514     {                                                                   \
3515       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3516     }                                                                   \
3517   else                                                                  \
3518     {                                                                   \
3519       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3520       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3521     }                                                                   \
3522   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3523   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3524 } while (0)
3525
3526 /* Convert the internal string representation of a Big5 character
3527    (lb, c1, c2) into Big5 code (b1, b2). */
3528
3529 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3530 {                                                                       \
3531   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3532                                                                         \
3533   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3534     {                                                                   \
3535       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3536     }                                                                   \
3537   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3538   b2 = I % BIG5_SAME_ROW;                                               \
3539   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3540 } while (0)
3541
3542 static int
3543 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3544 {
3545   while (n--)
3546     {
3547       unsigned char c = *(unsigned char *)src++;
3548       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3549 #ifndef UTF2000
3550           || (c >= 0x80 && c <= 0xA0)
3551 #endif
3552           )
3553         return 0;
3554       if (st->big5.in_second_byte)
3555         {
3556           st->big5.in_second_byte = 0;
3557           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3558             return 0;
3559         }
3560       else if (
3561 #ifdef UTF2000
3562                c >= 0x81
3563 #else
3564                c >= 0xA1
3565 #endif
3566                )
3567         st->big5.in_second_byte = 1;
3568     }
3569   return CODING_CATEGORY_BIG5_MASK;
3570 }
3571
3572 /* Convert Big5 data to internal format. */
3573
3574 static void
3575 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3576                     unsigned_char_dynarr *dst, size_t n)
3577 {
3578   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3579   unsigned int flags  = str->flags;
3580   unsigned int cpos   = str->cpos;
3581   eol_type_t eol_type = str->eol_type;
3582
3583   while (n--)
3584     {
3585       unsigned char c = *(unsigned char *)src++;
3586       if (cpos)
3587         {
3588           /* Previous character was first byte of Big5 char. */
3589           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3590             {
3591 #ifdef UTF2000
3592               DECODE_ADD_UCS_CHAR
3593                 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3594                  dst);
3595 #else
3596               unsigned char b1, b2, b3;
3597               DECODE_BIG5 (cpos, c, b1, b2, b3);
3598               Dynarr_add (dst, b1);
3599               Dynarr_add (dst, b2);
3600               Dynarr_add (dst, b3);
3601 #endif
3602             }
3603           else
3604             {
3605               DECODE_ADD_BINARY_CHAR (cpos, dst);
3606               DECODE_ADD_BINARY_CHAR (c, dst);
3607             }
3608           cpos = 0;
3609         }
3610       else
3611         {
3612           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3613           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3614             cpos = c;
3615           else
3616             DECODE_ADD_BINARY_CHAR (c, dst);
3617         }
3618     label_continue_loop:;
3619     }
3620
3621   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3622
3623   str->flags = flags;
3624   str->cpos  = cpos;
3625 }
3626
3627 /* Convert internally-formatted data to Big5. */
3628
3629 void
3630 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3631                   unsigned_char_dynarr *dst, unsigned int *flags)
3632 {
3633   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3634
3635   if (ch == '\n')
3636     {
3637       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3638         Dynarr_add (dst, '\r');
3639       if (eol_type != EOL_CR)
3640         Dynarr_add (dst, ch);
3641     }
3642   else
3643     {
3644 #ifdef UTF2000
3645       int code_point;
3646
3647       if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3648         Dynarr_add (dst, code_point);
3649       else if ((code_point
3650                 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3651         {
3652           Dynarr_add (dst, code_point >> 8);
3653           Dynarr_add (dst, code_point & 0xFF);
3654         }
3655       else if ((code_point
3656                 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3657         {
3658           unsigned int I
3659             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3660             + ((code_point & 0xFF) - 33);
3661           unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3662           unsigned char b2 = I % BIG5_SAME_ROW;
3663
3664           b2 += b2 < 0x3F ? 0x40 : 0x62;
3665           Dynarr_add (dst, b1);
3666           Dynarr_add (dst, b2);
3667         }
3668       else if ((code_point
3669                 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3670         {
3671           unsigned int I
3672             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3673             + ((code_point & 0xFF) - 33);
3674           unsigned char b1, b2;
3675
3676           I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3677           b1 = I / BIG5_SAME_ROW + 0xA1;
3678           b2 = I % BIG5_SAME_ROW;
3679           b2 += b2 < 0x3F ? 0x40 : 0x62;
3680           Dynarr_add (dst, b1);
3681           Dynarr_add (dst, b2);
3682         }
3683       else
3684         Dynarr_add (dst, '?');
3685 #else
3686 #endif
3687     }
3688 }
3689
3690 void
3691 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3692                   unsigned int *flags)
3693 {
3694 }
3695
3696
3697 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3698 Decode a Big5 character CODE of BIG5 coding-system.
3699 CODE is the character code in BIG5, a cons of two integers.
3700 Return the corresponding character.
3701 */
3702        (code))
3703 {
3704   unsigned char c1, c2, b1, b2;
3705
3706   CHECK_CONS (code);
3707   CHECK_INT (XCAR (code));
3708   CHECK_INT (XCDR (code));
3709   b1 = XINT (XCAR (code));
3710   b2 = XINT (XCDR (code));
3711   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3712       BYTE_BIG5_TWO_BYTE_2_P (b2))
3713     {
3714       Charset_ID leading_byte;
3715       Lisp_Object charset;
3716       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3717       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3718       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3719     }
3720   else
3721     return Qnil;
3722 }
3723
3724 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3725 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3726 Return the corresponding character code in Big5.
3727 */
3728        (character))
3729 {
3730   Lisp_Object charset;
3731   int c1, c2, b1, b2;
3732
3733   CHECK_CHAR_COERCE_INT (character);
3734   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3735   if (EQ (charset, Vcharset_chinese_big5_1) ||
3736       EQ (charset, Vcharset_chinese_big5_2))
3737     {
3738       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3739                    b1, b2);
3740       return Fcons (make_int (b1), make_int (b2));
3741     }
3742   else
3743     return Qnil;
3744 }
3745
3746 \f
3747 /************************************************************************/
3748 /*                           UCS-4 methods                              */
3749 /************************************************************************/
3750
3751 static int
3752 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3753 {
3754   while (n--)
3755     {
3756       unsigned char c = *(unsigned char *)src++;
3757       switch (st->ucs4.in_byte)
3758         {
3759         case 0:
3760           if (c >= 128)
3761             return 0;
3762           else
3763             st->ucs4.in_byte++;
3764           break;
3765         case 3:
3766           st->ucs4.in_byte = 0;
3767           break;
3768         default:
3769           st->ucs4.in_byte++;
3770         }
3771     }
3772   return CODING_CATEGORY_UCS4_MASK;
3773 }
3774
3775 static void
3776 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3777                     unsigned_char_dynarr *dst, size_t n)
3778 {
3779   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3780   unsigned int flags = str->flags;
3781   unsigned int cpos  = str->cpos;
3782   unsigned char counter = str->counter;
3783
3784   while (n--)
3785     {
3786       unsigned char c = *(unsigned char *)src++;
3787       switch (counter)
3788         {
3789         case 0:
3790           cpos = c;
3791           counter = 3;
3792           break;
3793         case 1:
3794           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3795           cpos = 0;
3796           counter = 0;
3797           break;
3798         default:
3799           cpos = ( cpos << 8 ) | c;
3800           counter--;
3801         }
3802     }
3803   if (counter & CODING_STATE_END)
3804     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3805
3806   str->flags    = flags;
3807   str->cpos     = cpos;
3808   str->counter  = counter;
3809 }
3810
3811 void
3812 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3813                   unsigned_char_dynarr *dst, unsigned int *flags)
3814 {
3815   Dynarr_add (dst, ch >> 24);
3816   Dynarr_add (dst, ch >> 16);
3817   Dynarr_add (dst, ch >>  8);
3818   Dynarr_add (dst, ch      );
3819 }
3820
3821 void
3822 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3823                   unsigned int *flags)
3824 {
3825 }
3826
3827 \f
3828 /************************************************************************/
3829 /*                           UTF-8 methods                              */
3830 /************************************************************************/
3831
3832 static int
3833 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3834 {
3835   while (n--)
3836     {
3837       unsigned char c = *(unsigned char *)src++;
3838       switch (st->utf8.in_byte)
3839         {
3840         case 0:
3841           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3842             return 0;
3843           else if (c >= 0xfc)
3844             st->utf8.in_byte = 5;
3845           else if (c >= 0xf8)
3846             st->utf8.in_byte = 4;
3847           else if (c >= 0xf0)
3848             st->utf8.in_byte = 3;
3849           else if (c >= 0xe0)
3850             st->utf8.in_byte = 2;
3851           else if (c >= 0xc0)
3852             st->utf8.in_byte = 1;
3853           else if (c >= 0x80)
3854             return 0;
3855           break;
3856         default:
3857           if ((c & 0xc0) != 0x80)
3858             return 0;
3859           else
3860             st->utf8.in_byte--;
3861         }
3862     }
3863   return CODING_CATEGORY_UTF8_MASK;
3864 }
3865
3866 static void
3867 decode_output_utf8_partial_char (unsigned char counter,
3868                                  unsigned int cpos,
3869                                  unsigned_char_dynarr *dst)
3870 {
3871   if (counter == 5)
3872     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3873   else if (counter == 4)
3874     {
3875       if (cpos < (1 << 6))
3876         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3877       else
3878         {
3879           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3880           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3881         }
3882     }
3883   else if (counter == 3)
3884     {
3885       if (cpos < (1 << 6))
3886         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3887       else if (cpos < (1 << 12))
3888         {
3889           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3890           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3891         }
3892       else
3893         {
3894           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3895           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3896           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3897         }
3898     }
3899   else if (counter == 2)
3900     {
3901       if (cpos < (1 << 6))
3902         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3903       else if (cpos < (1 << 12))
3904         {
3905           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3906           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3907         }
3908       else if (cpos < (1 << 18))
3909         {
3910           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3911           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3912           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
3913         }
3914       else
3915         {
3916           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3917           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3918           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3919           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3920         }
3921     }
3922   else
3923     {
3924       if (cpos < (1 << 6))
3925         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3926       else if (cpos < (1 << 12))
3927         {
3928           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3929           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3930         }
3931       else if (cpos < (1 << 18))
3932         {
3933           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3934           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3935           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3936         }
3937       else if (cpos < (1 << 24))
3938         {
3939           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3940           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3941           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3942           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3943         }
3944       else
3945         {
3946           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3947           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3948           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3949           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3950           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3951         }
3952     }
3953 }
3954
3955 static void
3956 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3957                     unsigned_char_dynarr *dst, size_t n)
3958 {
3959   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3960   unsigned int flags    = str->flags;
3961   unsigned int cpos     = str->cpos;
3962   eol_type_t eol_type   = str->eol_type;
3963   unsigned char counter = str->counter;
3964
3965   while (n--)
3966     {
3967       unsigned char c = *(unsigned char *)src++;
3968       if (counter == 0)
3969         {
3970           if ( c < 0xC0 )
3971             {
3972               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3973               DECODE_ADD_UCS_CHAR (c, dst);
3974             }
3975           else if ( c < 0xE0 )
3976             {
3977               cpos = c & 0x1f;
3978               counter = 1;
3979             }
3980           else if ( c < 0xF0 )
3981             {
3982               cpos = c & 0x0f;
3983               counter = 2;
3984             }
3985           else if ( c < 0xF8 )
3986             {
3987               cpos = c & 0x07;
3988               counter = 3;
3989             }
3990           else if ( c < 0xFC )
3991             {
3992               cpos = c & 0x03;
3993               counter = 4;
3994             }
3995           else
3996             {
3997               cpos = c & 0x01;
3998               counter = 5;
3999             }
4000         }
4001       else if ( (c & 0xC0) == 0x80 )
4002         {
4003           cpos = ( cpos << 6 ) | ( c & 0x3f );
4004           if (counter == 1)
4005             {
4006               DECODE_ADD_UCS_CHAR (cpos, dst);
4007               cpos = 0;
4008               counter = 0;
4009             }
4010           else
4011             counter--;
4012         }
4013       else
4014         {
4015           decode_output_utf8_partial_char (counter, cpos, dst);
4016           DECODE_ADD_BINARY_CHAR (c, dst);
4017           cpos = 0;
4018           counter = 0;
4019         }
4020     label_continue_loop:;
4021     }
4022
4023   if (flags & CODING_STATE_END)
4024     if (counter > 0)
4025       {
4026         decode_output_utf8_partial_char (counter, cpos, dst);
4027         cpos = 0;
4028         counter = 0;
4029       }
4030   str->flags    = flags;
4031   str->cpos     = cpos;
4032   str->counter  = counter;
4033 }
4034
4035 void
4036 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4037                   unsigned_char_dynarr *dst, unsigned int *flags)
4038 {
4039   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4040
4041   if (ch == '\n')
4042     {
4043       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4044         Dynarr_add (dst, '\r');
4045       if (eol_type != EOL_CR)
4046         Dynarr_add (dst, ch);
4047     }
4048   else if (ch <= 0x7f)
4049     {
4050       Dynarr_add (dst, ch);
4051     }
4052   else if (ch <= 0x7ff)
4053     {
4054       Dynarr_add (dst, (ch >> 6) | 0xc0);
4055       Dynarr_add (dst, (ch & 0x3f) | 0x80);
4056     }
4057   else if (ch <= 0xffff)
4058     {
4059       Dynarr_add (dst,  (ch >> 12) | 0xe0);
4060       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4061       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4062     }
4063   else if (ch <= 0x1fffff)
4064     {
4065       Dynarr_add (dst,  (ch >> 18) | 0xf0);
4066       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4067       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4068       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4069     }
4070   else if (ch <= 0x3ffffff)
4071     {
4072       Dynarr_add (dst,  (ch >> 24) | 0xf8);
4073       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4074       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4075       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4076       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4077     }
4078   else
4079     {
4080       Dynarr_add (dst,  (ch >> 30) | 0xfc);
4081       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4082       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4083       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4084       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4085       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4086     }
4087 }
4088
4089 void
4090 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4091                   unsigned int *flags)
4092 {
4093 }
4094
4095 \f
4096 /************************************************************************/
4097 /*                           ISO2022 methods                            */
4098 /************************************************************************/
4099
4100 /* The following note describes the coding system ISO2022 briefly.
4101    Since the intention of this note is to help understand the
4102    functions in this file, some parts are NOT ACCURATE or OVERLY
4103    SIMPLIFIED.  For thorough understanding, please refer to the
4104    original document of ISO2022.
4105
4106    ISO2022 provides many mechanisms to encode several character sets
4107    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4108    is encoded using bytes less than 128.  This may make the encoded
4109    text a little bit longer, but the text passes more easily through
4110    several gateways, some of which strip off MSB (Most Signigant Bit).
4111
4112    There are two kinds of character sets: control character set and
4113    graphic character set.  The former contains control characters such
4114    as `newline' and `escape' to provide control functions (control
4115    functions are also provided by escape sequences).  The latter
4116    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4117    two control character sets and many graphic character sets.
4118
4119    Graphic character sets are classified into one of the following
4120    four classes, according to the number of bytes (DIMENSION) and
4121    number of characters in one dimension (CHARS) of the set:
4122    - DIMENSION1_CHARS94
4123    - DIMENSION1_CHARS96
4124    - DIMENSION2_CHARS94
4125    - DIMENSION2_CHARS96
4126
4127    In addition, each character set is assigned an identification tag,
4128    unique for each set, called "final character" (denoted as <F>
4129    hereafter).  The <F> of each character set is decided by ECMA(*)
4130    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4131    (0x30..0x3F are for private use only).
4132
4133    Note (*): ECMA = European Computer Manufacturers Association
4134
4135    Here are examples of graphic character set [NAME(<F>)]:
4136         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4137         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4138         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4139         o DIMENSION2_CHARS96 -- none for the moment
4140
4141    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4142         C0 [0x00..0x1F] -- control character plane 0
4143         GL [0x20..0x7F] -- graphic character plane 0
4144         C1 [0x80..0x9F] -- control character plane 1
4145         GR [0xA0..0xFF] -- graphic character plane 1
4146
4147    A control character set is directly designated and invoked to C0 or
4148    C1 by an escape sequence.  The most common case is that:
4149    - ISO646's  control character set is designated/invoked to C0, and
4150    - ISO6429's control character set is designated/invoked to C1,
4151    and usually these designations/invocations are omitted in encoded
4152    text.  In a 7-bit environment, only C0 can be used, and a control
4153    character for C1 is encoded by an appropriate escape sequence to
4154    fit into the environment.  All control characters for C1 are
4155    defined to have corresponding escape sequences.
4156
4157    A graphic character set is at first designated to one of four
4158    graphic registers (G0 through G3), then these graphic registers are
4159    invoked to GL or GR.  These designations and invocations can be
4160    done independently.  The most common case is that G0 is invoked to
4161    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4162    these invocations and designations are omitted in encoded text.
4163    In a 7-bit environment, only GL can be used.
4164
4165    When a graphic character set of CHARS94 is invoked to GL, codes
4166    0x20 and 0x7F of the GL area work as control characters SPACE and
4167    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4168    be used.
4169
4170    There are two ways of invocation: locking-shift and single-shift.
4171    With locking-shift, the invocation lasts until the next different
4172    invocation, whereas with single-shift, the invocation affects the
4173    following character only and doesn't affect the locking-shift
4174    state.  Invocations are done by the following control characters or
4175    escape sequences:
4176
4177    ----------------------------------------------------------------------
4178    abbrev  function                  cntrl escape seq   description
4179    ----------------------------------------------------------------------
4180    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4181    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4182    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4183    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4184    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4185    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4186    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4187    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4188    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4189    ----------------------------------------------------------------------
4190    (*) These are not used by any known coding system.
4191
4192    Control characters for these functions are defined by macros
4193    ISO_CODE_XXX in `coding.h'.
4194
4195    Designations are done by the following escape sequences:
4196    ----------------------------------------------------------------------
4197    escape sequence      description
4198    ----------------------------------------------------------------------
4199    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4200    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4201    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4202    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4203    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4204    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4205    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4206    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4207    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4208    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4209    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4210    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4211    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4212    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4213    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4214    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4215    ----------------------------------------------------------------------
4216
4217    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4218    of dimension 1, chars 94, and final character <F>, etc...
4219
4220    Note (*): Although these designations are not allowed in ISO2022,
4221    Emacs accepts them on decoding, and produces them on encoding
4222    CHARS96 character sets in a coding system which is characterized as
4223    7-bit environment, non-locking-shift, and non-single-shift.
4224
4225    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4226    '(' can be omitted.  We refer to this as "short-form" hereafter.
4227
4228    Now you may notice that there are a lot of ways for encoding the
4229    same multilingual text in ISO2022.  Actually, there exist many
4230    coding systems such as Compound Text (used in X11's inter client
4231    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4232    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4233    localized platforms), and all of these are variants of ISO2022.
4234
4235    In addition to the above, Emacs handles two more kinds of escape
4236    sequences: ISO6429's direction specification and Emacs' private
4237    sequence for specifying character composition.
4238
4239    ISO6429's direction specification takes the following form:
4240         o CSI ']'      -- end of the current direction
4241         o CSI '0' ']'  -- end of the current direction
4242         o CSI '1' ']'  -- start of left-to-right text
4243         o CSI '2' ']'  -- start of right-to-left text
4244    The control character CSI (0x9B: control sequence introducer) is
4245    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4246
4247    Character composition specification takes the following form:
4248         o ESC '0' -- start character composition
4249         o ESC '1' -- end character composition
4250    Since these are not standard escape sequences of any ISO standard,
4251    their use with these meanings is restricted to Emacs only.  */
4252
4253 static void
4254 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4255 {
4256   int i;
4257
4258   for (i = 0; i < 4; i++)
4259     {
4260       if (!NILP (coding_system))
4261         iso->charset[i] =
4262           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4263       else
4264         iso->charset[i] = Qt;
4265       iso->invalid_designated[i] = 0;
4266     }
4267   iso->esc = ISO_ESC_NOTHING;
4268   iso->esc_bytes_index = 0;
4269   iso->register_left = 0;
4270   iso->register_right = 1;
4271   iso->switched_dir_and_no_valid_charset_yet = 0;
4272   iso->invalid_switch_dir = 0;
4273   iso->output_direction_sequence = 0;
4274   iso->output_literally = 0;
4275 #ifdef ENABLE_COMPOSITE_CHARS
4276   if (iso->composite_chars)
4277     Dynarr_reset (iso->composite_chars);
4278 #endif
4279 }
4280
4281 static int
4282 fit_to_be_escape_quoted (unsigned char c)
4283 {
4284   switch (c)
4285     {
4286     case ISO_CODE_ESC:
4287     case ISO_CODE_CSI:
4288     case ISO_CODE_SS2:
4289     case ISO_CODE_SS3:
4290     case ISO_CODE_SO:
4291     case ISO_CODE_SI:
4292       return 1;
4293
4294     default:
4295       return 0;
4296     }
4297 }
4298
4299 /* Parse one byte of an ISO2022 escape sequence.
4300    If the result is an invalid escape sequence, return 0 and
4301    do not change anything in STR.  Otherwise, if the result is
4302    an incomplete escape sequence, update ISO2022.ESC and
4303    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4304    all the state variables (but not ISO2022.ESC_BYTES) and
4305    return 1.
4306
4307    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4308    or invocation of an invalid character set and treat that as
4309    an unrecognized escape sequence. */
4310
4311 static int
4312 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4313                    unsigned char c, unsigned int *flags,
4314                    int check_invalid_charsets)
4315 {
4316   /* (1) If we're at the end of a designation sequence, CS is the
4317      charset being designated and REG is the register to designate
4318      it to.
4319
4320      (2) If we're at the end of a locking-shift sequence, REG is
4321      the register to invoke and HALF (0 == left, 1 == right) is
4322      the half to invoke it into.
4323
4324      (3) If we're at the end of a single-shift sequence, REG is
4325      the register to invoke. */
4326   Lisp_Object cs = Qnil;
4327   int reg, half;
4328
4329   /* NOTE: This code does goto's all over the fucking place.
4330      The reason for this is that we're basically implementing
4331      a state machine here, and hierarchical languages like C
4332      don't really provide a clean way of doing this. */
4333
4334   if (! (*flags & CODING_STATE_ESCAPE))
4335     /* At beginning of escape sequence; we need to reset our
4336        escape-state variables. */
4337     iso->esc = ISO_ESC_NOTHING;
4338
4339   iso->output_literally = 0;
4340   iso->output_direction_sequence = 0;
4341
4342   switch (iso->esc)
4343     {
4344     case ISO_ESC_NOTHING:
4345       iso->esc_bytes_index = 0;
4346       switch (c)
4347         {
4348         case ISO_CODE_ESC:      /* Start escape sequence */
4349           *flags |= CODING_STATE_ESCAPE;
4350           iso->esc = ISO_ESC;
4351           goto not_done;
4352
4353         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4354           *flags |= CODING_STATE_ESCAPE;
4355           iso->esc = ISO_ESC_5_11;
4356           goto not_done;
4357
4358         case ISO_CODE_SO:       /* locking shift 1 */
4359           reg = 1; half = 0;
4360           goto locking_shift;
4361         case ISO_CODE_SI:       /* locking shift 0 */
4362           reg = 0; half = 0;
4363           goto locking_shift;
4364
4365         case ISO_CODE_SS2:      /* single shift */
4366           reg = 2;
4367           goto single_shift;
4368         case ISO_CODE_SS3:      /* single shift */
4369           reg = 3;
4370           goto single_shift;
4371
4372         default:                        /* Other control characters */
4373           return 0;
4374         }
4375
4376     case ISO_ESC:
4377       switch (c)
4378         {
4379           /**** single shift ****/
4380
4381         case 'N':       /* single shift 2 */
4382           reg = 2;
4383           goto single_shift;
4384         case 'O':       /* single shift 3 */
4385           reg = 3;
4386           goto single_shift;
4387
4388           /**** locking shift ****/
4389
4390         case '~':       /* locking shift 1 right */
4391           reg = 1; half = 1;
4392           goto locking_shift;
4393         case 'n':       /* locking shift 2 */
4394           reg = 2; half = 0;
4395           goto locking_shift;
4396         case '}':       /* locking shift 2 right */
4397           reg = 2; half = 1;
4398           goto locking_shift;
4399         case 'o':       /* locking shift 3 */
4400           reg = 3; half = 0;
4401           goto locking_shift;
4402         case '|':       /* locking shift 3 right */
4403           reg = 3; half = 1;
4404           goto locking_shift;
4405
4406 #ifdef ENABLE_COMPOSITE_CHARS
4407           /**** composite ****/
4408
4409         case '0':
4410           iso->esc = ISO_ESC_START_COMPOSITE;
4411           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4412             CODING_STATE_COMPOSITE;
4413           return 1;
4414
4415         case '1':
4416           iso->esc = ISO_ESC_END_COMPOSITE;
4417           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4418             ~CODING_STATE_COMPOSITE;
4419           return 1;
4420 #endif /* ENABLE_COMPOSITE_CHARS */
4421
4422           /**** directionality ****/
4423
4424         case '[':
4425           iso->esc = ISO_ESC_5_11;
4426           goto not_done;
4427
4428           /**** designation ****/
4429
4430         case '$':       /* multibyte charset prefix */
4431           iso->esc = ISO_ESC_2_4;
4432           goto not_done;
4433
4434         default:
4435           if (0x28 <= c && c <= 0x2F)
4436             {
4437               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4438               goto not_done;
4439             }
4440
4441           /* This function is called with CODESYS equal to nil when
4442              doing coding-system detection. */
4443           if (!NILP (codesys)
4444               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4445               && fit_to_be_escape_quoted (c))
4446             {
4447               iso->esc = ISO_ESC_LITERAL;
4448               *flags &= CODING_STATE_ISO2022_LOCK;
4449               return 1;
4450             }
4451
4452           /* bzzzt! */
4453           return 0;
4454         }
4455
4456
4457
4458       /**** directionality ****/
4459
4460     case ISO_ESC_5_11:          /* ISO6429 direction control */
4461       if (c == ']')
4462         {
4463           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4464           goto directionality;
4465         }
4466       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4467       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4468       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4469       else               return 0;
4470       goto not_done;
4471
4472     case ISO_ESC_5_11_0:
4473       if (c == ']')
4474         {
4475           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4476           goto directionality;
4477         }
4478       return 0;
4479
4480     case ISO_ESC_5_11_1:
4481       if (c == ']')
4482         {
4483           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4484           goto directionality;
4485         }
4486       return 0;
4487
4488     case ISO_ESC_5_11_2:
4489       if (c == ']')
4490         {
4491           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4492           goto directionality;
4493         }
4494       return 0;
4495
4496     directionality:
4497       iso->esc = ISO_ESC_DIRECTIONALITY;
4498       /* Various junk here to attempt to preserve the direction sequences
4499          literally in the text if they would otherwise be swallowed due
4500          to invalid designations that don't show up as actual charset
4501          changes in the text. */
4502       if (iso->invalid_switch_dir)
4503         {
4504           /* We already inserted a direction switch literally into the
4505              text.  We assume (#### this may not be right) that the
4506              next direction switch is the one going the other way,
4507              and we need to output that literally as well. */
4508           iso->output_literally = 1;
4509           iso->invalid_switch_dir = 0;
4510         }
4511       else
4512         {
4513           int jj;
4514
4515           /* If we are in the thrall of an invalid designation,
4516            then stick the directionality sequence literally into the
4517            output stream so it ends up in the original text again. */
4518           for (jj = 0; jj < 4; jj++)
4519             if (iso->invalid_designated[jj])
4520               break;
4521           if (jj < 4)
4522             {
4523               iso->output_literally = 1;
4524               iso->invalid_switch_dir = 1;
4525             }
4526           else
4527             /* Indicate that we haven't yet seen a valid designation,
4528                so that if a switch-dir is directly followed by an
4529                invalid designation, both get inserted literally. */
4530             iso->switched_dir_and_no_valid_charset_yet = 1;
4531         }
4532       return 1;
4533
4534
4535       /**** designation ****/
4536
4537     case ISO_ESC_2_4:
4538       if (0x28 <= c && c <= 0x2F)
4539         {
4540           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4541           goto not_done;
4542         }
4543       if (0x40 <= c && c <= 0x42)
4544         {
4545           /* 94^n-set */
4546           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4547                                       *flags & CODING_STATE_R2L ?
4548                                       CHARSET_RIGHT_TO_LEFT :
4549                                       CHARSET_LEFT_TO_RIGHT);
4550           reg = 0;
4551           goto designated;
4552         }
4553       return 0;
4554
4555     default:
4556       {
4557         int chars = 0;
4558         int single = 0;
4559
4560         if (c < '0' || c > '~')
4561           return 0; /* bad final byte */
4562
4563         if (iso->esc >= ISO_ESC_2_8 &&
4564             iso->esc <= ISO_ESC_2_15)
4565           {
4566             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4567             single = 1; /* single-byte */
4568             reg = (iso->esc - ISO_ESC_2_8) & 3;
4569           }
4570         else if (iso->esc >= ISO_ESC_2_4_8 &&
4571                  iso->esc <= ISO_ESC_2_4_15)
4572           {
4573             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4574             single = -1; /* multi-byte */
4575             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4576           }
4577         else
4578           {
4579             /* Can this ever be reached? -slb */
4580             abort();
4581           }
4582
4583         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4584                                     *flags & CODING_STATE_R2L ?
4585                                     CHARSET_RIGHT_TO_LEFT :
4586                                     CHARSET_LEFT_TO_RIGHT);
4587         goto designated;
4588       }
4589     }
4590
4591  not_done:
4592   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4593   return -1;
4594
4595  single_shift:
4596   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4597     /* can't invoke something that ain't there. */
4598     return 0;
4599   iso->esc = ISO_ESC_SINGLE_SHIFT;
4600   *flags &= CODING_STATE_ISO2022_LOCK;
4601   if (reg == 2)
4602     *flags |= CODING_STATE_SS2;
4603   else
4604     *flags |= CODING_STATE_SS3;
4605   return 1;
4606
4607  locking_shift:
4608   if (check_invalid_charsets &&
4609       !CHARSETP (iso->charset[reg]))
4610     /* can't invoke something that ain't there. */
4611     return 0;
4612   if (half)
4613     iso->register_right = reg;
4614   else
4615     iso->register_left = reg;
4616   *flags &= CODING_STATE_ISO2022_LOCK;
4617   iso->esc = ISO_ESC_LOCKING_SHIFT;
4618   return 1;
4619
4620  designated:
4621   if (NILP (cs) && check_invalid_charsets)
4622     {
4623       iso->invalid_designated[reg] = 1;
4624       iso->charset[reg] = Vcharset_ascii;
4625       iso->esc = ISO_ESC_DESIGNATE;
4626       *flags &= CODING_STATE_ISO2022_LOCK;
4627       iso->output_literally = 1;
4628       if (iso->switched_dir_and_no_valid_charset_yet)
4629         {
4630           /* We encountered a switch-direction followed by an
4631              invalid designation.  Ensure that the switch-direction
4632              gets outputted; otherwise it will probably get eaten
4633              when the text is written out again. */
4634           iso->switched_dir_and_no_valid_charset_yet = 0;
4635           iso->output_direction_sequence = 1;
4636           /* And make sure that the switch-dir going the other
4637              way gets outputted, as well. */
4638           iso->invalid_switch_dir = 1;
4639         }
4640       return 1;
4641     }
4642   /* This function is called with CODESYS equal to nil when
4643      doing coding-system detection. */
4644   if (!NILP (codesys))
4645     {
4646       charset_conversion_spec_dynarr *dyn =
4647         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4648
4649       if (dyn)
4650         {
4651           int i;
4652
4653           for (i = 0; i < Dynarr_length (dyn); i++)
4654             {
4655               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4656               if (EQ (cs, spec->from_charset))
4657                 cs = spec->to_charset;
4658             }
4659         }
4660     }
4661
4662   iso->charset[reg] = cs;
4663   iso->esc = ISO_ESC_DESIGNATE;
4664   *flags &= CODING_STATE_ISO2022_LOCK;
4665   if (iso->invalid_designated[reg])
4666     {
4667       iso->invalid_designated[reg] = 0;
4668       iso->output_literally = 1;
4669     }
4670   if (iso->switched_dir_and_no_valid_charset_yet)
4671     iso->switched_dir_and_no_valid_charset_yet = 0;
4672   return 1;
4673 }
4674
4675 static int
4676 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4677 {
4678   int mask;
4679
4680   /* #### There are serious deficiencies in the recognition mechanism
4681      here.  This needs to be much smarter if it's going to cut it.
4682      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4683      it should be detected as Latin-1.
4684      All the ISO2022 stuff in this file should be synced up with the
4685      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4686      Perhaps we should wait till R2L works in FSF Emacs? */
4687
4688   if (!st->iso2022.initted)
4689     {
4690       reset_iso2022 (Qnil, &st->iso2022.iso);
4691       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4692                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4693                           CODING_CATEGORY_ISO_8_1_MASK |
4694                           CODING_CATEGORY_ISO_8_2_MASK |
4695                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4696       st->iso2022.flags = 0;
4697       st->iso2022.high_byte_count = 0;
4698       st->iso2022.saw_single_shift = 0;
4699       st->iso2022.initted = 1;
4700     }
4701
4702   mask = st->iso2022.mask;
4703
4704   while (n--)
4705     {
4706       unsigned char c = *(unsigned char *)src++;
4707       if (c >= 0xA0)
4708         {
4709           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4710           st->iso2022.high_byte_count++;
4711         }
4712       else
4713         {
4714           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4715             {
4716               if (st->iso2022.high_byte_count & 1)
4717                 /* odd number of high bytes; assume not iso-8-2 */
4718                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4719             }
4720           st->iso2022.high_byte_count = 0;
4721           st->iso2022.saw_single_shift = 0;
4722           if (c > 0x80)
4723             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4724         }
4725       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4726           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4727         { /* control chars */
4728           switch (c)
4729             {
4730               /* Allow and ignore control characters that you might
4731                  reasonably see in a text file */
4732             case '\r':
4733             case '\n':
4734             case '\t':
4735             case  7: /* bell */
4736             case  8: /* backspace */
4737             case 11: /* vertical tab */
4738             case 12: /* form feed */
4739             case 26: /* MS-DOS C-z junk */
4740             case 31: /* '^_' -- for info */
4741               goto label_continue_loop;
4742
4743             default:
4744               break;
4745             }
4746         }
4747
4748       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4749           || BYTE_C1_P (c))
4750         {
4751           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4752                                  &st->iso2022.flags, 0))
4753             {
4754               switch (st->iso2022.iso.esc)
4755                 {
4756                 case ISO_ESC_DESIGNATE:
4757                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4758                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4759                   break;
4760                 case ISO_ESC_LOCKING_SHIFT:
4761                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4762                   goto ran_out_of_chars;
4763                 case ISO_ESC_SINGLE_SHIFT:
4764                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4765                   st->iso2022.saw_single_shift = 1;
4766                   break;
4767                 default:
4768                   break;
4769                 }
4770             }
4771           else
4772             {
4773               mask = 0;
4774               goto ran_out_of_chars;
4775             }
4776         }
4777     label_continue_loop:;
4778     }
4779
4780  ran_out_of_chars:
4781
4782   return mask;
4783 }
4784
4785 static int
4786 postprocess_iso2022_mask (int mask)
4787 {
4788   /* #### kind of cheesy */
4789   /* If seven-bit ISO is allowed, then assume that the encoding is
4790      entirely seven-bit and turn off the eight-bit ones. */
4791   if (mask & CODING_CATEGORY_ISO_7_MASK)
4792     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4793                CODING_CATEGORY_ISO_8_1_MASK |
4794                CODING_CATEGORY_ISO_8_2_MASK);
4795   return mask;
4796 }
4797
4798 /* If FLAGS is a null pointer or specifies right-to-left motion,
4799    output a switch-dir-to-left-to-right sequence to DST.
4800    Also update FLAGS if it is not a null pointer.
4801    If INTERNAL_P is set, we are outputting in internal format and
4802    need to handle the CSI differently. */
4803
4804 static void
4805 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4806                                  unsigned_char_dynarr *dst,
4807                                  unsigned int *flags,
4808                                  int internal_p)
4809 {
4810   if (!flags || (*flags & CODING_STATE_R2L))
4811     {
4812       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4813         {
4814           Dynarr_add (dst, ISO_CODE_ESC);
4815           Dynarr_add (dst, '[');
4816         }
4817       else if (internal_p)
4818         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4819       else
4820         Dynarr_add (dst, ISO_CODE_CSI);
4821       Dynarr_add (dst, '0');
4822       Dynarr_add (dst, ']');
4823       if (flags)
4824         *flags &= ~CODING_STATE_R2L;
4825     }
4826 }
4827
4828 /* If FLAGS is a null pointer or specifies a direction different from
4829    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4830    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4831    sequence to DST.  Also update FLAGS if it is not a null pointer.
4832    If INTERNAL_P is set, we are outputting in internal format and
4833    need to handle the CSI differently. */
4834
4835 static void
4836 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4837                           unsigned_char_dynarr *dst, unsigned int *flags,
4838                           int internal_p)
4839 {
4840   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4841       direction == CHARSET_LEFT_TO_RIGHT)
4842     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4843   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4844            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4845            direction == CHARSET_RIGHT_TO_LEFT)
4846     {
4847       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4848         {
4849           Dynarr_add (dst, ISO_CODE_ESC);
4850           Dynarr_add (dst, '[');
4851         }
4852       else if (internal_p)
4853         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4854       else
4855         Dynarr_add (dst, ISO_CODE_CSI);
4856       Dynarr_add (dst, '2');
4857       Dynarr_add (dst, ']');
4858       if (flags)
4859         *flags |= CODING_STATE_R2L;
4860     }
4861 }
4862
4863 /* Convert ISO2022-format data to internal format. */
4864
4865 static void
4866 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4867                        unsigned_char_dynarr *dst, size_t n)
4868 {
4869   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4870   unsigned int flags    = str->flags;
4871   unsigned int cpos     = str->cpos;
4872   unsigned char counter = str->counter;
4873   eol_type_t eol_type   = str->eol_type;
4874 #ifdef ENABLE_COMPOSITE_CHARS
4875   unsigned_char_dynarr *real_dst = dst;
4876 #endif
4877   Lisp_Object coding_system;
4878
4879   XSETCODING_SYSTEM (coding_system, str->codesys);
4880
4881 #ifdef ENABLE_COMPOSITE_CHARS
4882   if (flags & CODING_STATE_COMPOSITE)
4883     dst = str->iso2022.composite_chars;
4884 #endif /* ENABLE_COMPOSITE_CHARS */
4885
4886   while (n--)
4887     {
4888       unsigned char c = *(unsigned char *)src++;
4889       if (flags & CODING_STATE_ESCAPE)
4890         {       /* Within ESC sequence */
4891           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4892                                           c, &flags, 1);
4893
4894           if (retval)
4895             {
4896               switch (str->iso2022.esc)
4897                 {
4898 #ifdef ENABLE_COMPOSITE_CHARS
4899                 case ISO_ESC_START_COMPOSITE:
4900                   if (str->iso2022.composite_chars)
4901                     Dynarr_reset (str->iso2022.composite_chars);
4902                   else
4903                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4904                   dst = str->iso2022.composite_chars;
4905                   break;
4906                 case ISO_ESC_END_COMPOSITE:
4907                   {
4908                     Bufbyte comstr[MAX_EMCHAR_LEN];
4909                     Bytecount len;
4910                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4911                                                          Dynarr_length (dst));
4912                     dst = real_dst;
4913                     len = set_charptr_emchar (comstr, emch);
4914                     Dynarr_add_many (dst, comstr, len);
4915                     break;
4916                   }
4917 #endif /* ENABLE_COMPOSITE_CHARS */
4918
4919                 case ISO_ESC_LITERAL:
4920                   COMPOSE_FLUSH_CHARS (str, dst);
4921                   DECODE_ADD_BINARY_CHAR (c, dst);
4922                   break;
4923
4924                 default:
4925                   /* Everything else handled already */
4926                   break;
4927                 }
4928             }
4929
4930           /* Attempted error recovery. */
4931           if (str->iso2022.output_direction_sequence)
4932             ensure_correct_direction (flags & CODING_STATE_R2L ?
4933                                       CHARSET_RIGHT_TO_LEFT :
4934                                       CHARSET_LEFT_TO_RIGHT,
4935                                       str->codesys, dst, 0, 1);
4936           /* More error recovery. */
4937           if (!retval || str->iso2022.output_literally)
4938             {
4939               /* Output the (possibly invalid) sequence */
4940               int i;
4941               COMPOSE_FLUSH_CHARS (str, dst);
4942               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4943                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4944               flags &= CODING_STATE_ISO2022_LOCK;
4945               if (!retval)
4946                 n++, src--;/* Repeat the loop with the same character. */
4947               else
4948                 {
4949                   /* No sense in reprocessing the final byte of the
4950                      escape sequence; it could mess things up anyway.
4951                      Just add it now. */
4952                   COMPOSE_FLUSH_CHARS (str, dst);
4953                   DECODE_ADD_BINARY_CHAR (c, dst);
4954                 }
4955             }
4956           cpos = 0;
4957           counter = 0;
4958         }
4959       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4960         { /* Control characters */
4961
4962           /***** Error-handling *****/
4963
4964           /* If we were in the middle of a character, dump out the
4965              partial character. */
4966           if (counter)
4967             {
4968               COMPOSE_FLUSH_CHARS (str, dst);
4969               while (counter > 0)
4970                 {
4971                   counter--;
4972                   DECODE_ADD_BINARY_CHAR
4973                     ((unsigned char)(cpos >> (counter * 8)), dst);
4974                 }
4975               cpos = 0;
4976             }
4977
4978           /* If we just saw a single-shift character, dump it out.
4979              This may dump out the wrong sort of single-shift character,
4980              but least it will give an indication that something went
4981              wrong. */
4982           if (flags & CODING_STATE_SS2)
4983             {
4984               COMPOSE_FLUSH_CHARS (str, dst);
4985               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4986               flags &= ~CODING_STATE_SS2;
4987             }
4988           if (flags & CODING_STATE_SS3)
4989             {
4990               COMPOSE_FLUSH_CHARS (str, dst);
4991               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4992               flags &= ~CODING_STATE_SS3;
4993             }
4994
4995           /***** Now handle the control characters. *****/
4996
4997           /* Handle CR/LF */
4998 #ifdef UTF2000
4999           if (c == '\r')
5000             {
5001               COMPOSE_FLUSH_CHARS (str, dst);
5002               if (eol_type == EOL_CR)
5003                 Dynarr_add (dst, '\n');
5004               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5005                 Dynarr_add (dst, c);
5006               else
5007                 flags |= CODING_STATE_CR;
5008               goto label_continue_loop;
5009             }
5010           else if (flags & CODING_STATE_CR)
5011             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5012               if (c != '\n')
5013                 Dynarr_add (dst, '\r');
5014               flags &= ~CODING_STATE_CR;
5015             }
5016 #else
5017           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5018 #endif
5019
5020           flags &= CODING_STATE_ISO2022_LOCK;
5021
5022           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5023             {
5024               COMPOSE_FLUSH_CHARS (str, dst);
5025               DECODE_ADD_BINARY_CHAR (c, dst);
5026             }
5027         }
5028       else
5029         {                       /* Graphic characters */
5030           Lisp_Object charset;
5031 #ifndef UTF2000
5032           Charset_ID lb;
5033 #endif
5034           int reg;
5035
5036 #ifdef UTF2000
5037           if (c == '\r')
5038             {
5039               COMPOSE_FLUSH_CHARS (str, dst);
5040               if (eol_type == EOL_CR)
5041                 Dynarr_add (dst, '\n');
5042               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5043                 Dynarr_add (dst, c);
5044               else
5045                 flags |= CODING_STATE_CR;
5046               goto label_continue_loop;
5047             }
5048           else if (flags & CODING_STATE_CR)
5049             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5050               if (c != '\n')
5051                 Dynarr_add (dst, '\r');
5052               flags &= ~CODING_STATE_CR;
5053             }
5054 #else
5055           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5056 #endif
5057
5058           /* Now determine the charset. */
5059           reg = ((flags & CODING_STATE_SS2) ? 2
5060                  : (flags & CODING_STATE_SS3) ? 3
5061                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5062                  : str->iso2022.register_left);
5063           charset = str->iso2022.charset[reg];
5064
5065           /* Error checking: */
5066           if (! CHARSETP (charset)
5067               || str->iso2022.invalid_designated[reg]
5068               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5069                   && XCHARSET_CHARS (charset) == 94))
5070             /* Mrmph.  We are trying to invoke a register that has no
5071                or an invalid charset in it, or trying to add a character
5072                outside the range of the charset.  Insert that char literally
5073                to preserve it for the output. */
5074             {
5075               COMPOSE_FLUSH_CHARS (str, dst);
5076               while (counter > 0)
5077                 {
5078                   counter--;
5079                   DECODE_ADD_BINARY_CHAR
5080                     ((unsigned char)(cpos >> (counter * 8)), dst);
5081                 }
5082               cpos = 0;
5083               DECODE_ADD_BINARY_CHAR (c, dst);
5084             }
5085
5086           else
5087             {
5088               /* Things are probably hunky-dorey. */
5089
5090               /* Fetch reverse charset, maybe. */
5091               if (((flags & CODING_STATE_R2L) &&
5092                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5093                   ||
5094                   (!(flags & CODING_STATE_R2L) &&
5095                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5096                 {
5097                   Lisp_Object new_charset =
5098                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5099                   if (!NILP (new_charset))
5100                     charset = new_charset;
5101                 }
5102
5103 #ifdef UTF2000
5104               counter++;
5105               if (XCHARSET_DIMENSION (charset) == counter)
5106                 {
5107                   COMPOSE_ADD_CHAR (str,
5108                                     DECODE_CHAR (charset,
5109                                                  ((cpos & 0x7F7F7F) << 8)
5110                                                  | (c & 0x7F)),
5111                                     dst);
5112                   cpos = 0;
5113                   counter = 0;
5114                 }
5115               else
5116                 cpos = (cpos << 8) | c;
5117 #else
5118               lb = XCHARSET_LEADING_BYTE (charset);
5119               switch (XCHARSET_REP_BYTES (charset))
5120                 {
5121                 case 1: /* ASCII */
5122                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5123                   Dynarr_add (dst, c & 0x7F);
5124                   break;
5125
5126                 case 2: /* one-byte official */
5127                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5128                   Dynarr_add (dst, lb);
5129                   Dynarr_add (dst, c | 0x80);
5130                   break;
5131
5132                 case 3: /* one-byte private or two-byte official */
5133                   if (XCHARSET_PRIVATE_P (charset))
5134                     {
5135                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5136                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5137                       Dynarr_add (dst, lb);
5138                       Dynarr_add (dst, c | 0x80);
5139                     }
5140                   else
5141                     {
5142                       if (ch)
5143                         {
5144                           Dynarr_add (dst, lb);
5145                           Dynarr_add (dst, ch | 0x80);
5146                           Dynarr_add (dst, c | 0x80);
5147                           ch = 0;
5148                         }
5149                       else
5150                         ch = c;
5151                     }
5152                   break;
5153
5154                 default:        /* two-byte private */
5155                   if (ch)
5156                     {
5157                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5158                       Dynarr_add (dst, lb);
5159                       Dynarr_add (dst, ch | 0x80);
5160                       Dynarr_add (dst, c | 0x80);
5161                       ch = 0;
5162                     }
5163                   else
5164                     ch = c;
5165                 }
5166 #endif
5167             }
5168
5169           if (!cpos)
5170             flags &= CODING_STATE_ISO2022_LOCK;
5171         }
5172
5173     label_continue_loop:;
5174     }
5175
5176   if (flags & CODING_STATE_END)
5177     {
5178       COMPOSE_FLUSH_CHARS (str, dst);
5179       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5180     }
5181   str->flags   = flags;
5182   str->cpos    = cpos;
5183   str->counter = counter;
5184 }
5185
5186
5187 /***** ISO2022 encoder *****/
5188
5189 /* Designate CHARSET into register REG. */
5190
5191 static void
5192 iso2022_designate (Lisp_Object charset, unsigned char reg,
5193                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5194 {
5195   static const char inter94[] = "()*+";
5196   static const char inter96[] = ",-./";
5197   unsigned short chars;
5198   unsigned char dimension;
5199   unsigned char final;
5200   Lisp_Object old_charset = str->iso2022.charset[reg];
5201
5202   str->iso2022.charset[reg] = charset;
5203   if (!CHARSETP (charset))
5204     /* charset might be an initial nil or t. */
5205     return;
5206   chars = XCHARSET_CHARS (charset);
5207   dimension = XCHARSET_DIMENSION (charset);
5208   final = XCHARSET_FINAL (charset);
5209   if (!str->iso2022.force_charset_on_output[reg] &&
5210       CHARSETP (old_charset) &&
5211       XCHARSET_CHARS (old_charset) == chars &&
5212       XCHARSET_DIMENSION (old_charset) == dimension &&
5213       XCHARSET_FINAL (old_charset) == final)
5214     return;
5215
5216   str->iso2022.force_charset_on_output[reg] = 0;
5217
5218   {
5219     charset_conversion_spec_dynarr *dyn =
5220       str->codesys->iso2022.output_conv;
5221
5222     if (dyn)
5223       {
5224         int i;
5225
5226         for (i = 0; i < Dynarr_length (dyn); i++)
5227           {
5228             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5229             if (EQ (charset, spec->from_charset))
5230                 charset = spec->to_charset;
5231           }
5232       }
5233   }
5234
5235   Dynarr_add (dst, ISO_CODE_ESC);
5236   switch (chars)
5237     {
5238     case 94:
5239       if (dimension == 1)
5240         Dynarr_add (dst, inter94[reg]);
5241       else
5242         {
5243           Dynarr_add (dst, '$');
5244           if (reg != 0
5245               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5246               || final < '@'
5247               || final > 'B')
5248             Dynarr_add (dst, inter94[reg]);
5249         }
5250       break;
5251     case 96:
5252       if (dimension == 1)
5253         Dynarr_add (dst, inter96[reg]);
5254       else
5255         {
5256           Dynarr_add (dst, '$');
5257           Dynarr_add (dst, inter96[reg]);
5258         }
5259       break;
5260     }
5261   Dynarr_add (dst, final);
5262 }
5263
5264 static void
5265 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5266 {
5267   if (str->iso2022.register_left != 0)
5268     {
5269       Dynarr_add (dst, ISO_CODE_SI);
5270       str->iso2022.register_left = 0;
5271     }
5272 }
5273
5274 static void
5275 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5276 {
5277   if (str->iso2022.register_left != 1)
5278     {
5279       Dynarr_add (dst, ISO_CODE_SO);
5280       str->iso2022.register_left = 1;
5281     }
5282 }
5283
5284 void
5285 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5286                      unsigned_char_dynarr *dst, unsigned int *flags)
5287 {
5288   unsigned char charmask;
5289   Lisp_Coding_System* codesys = str->codesys;
5290   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5291   int i;
5292   Lisp_Object charset = str->iso2022.current_charset;
5293   int half = str->iso2022.current_half;
5294   int code_point = -1;
5295
5296   if (ch <= 0x7F)
5297     {
5298       restore_left_to_right_direction (codesys, dst, flags, 0);
5299
5300       /* Make sure G0 contains ASCII */
5301       if ((ch > ' ' && ch < ISO_CODE_DEL)
5302           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5303         {
5304           ensure_normal_shift (str, dst);
5305           iso2022_designate (Vcharset_ascii, 0, str, dst);
5306         }
5307
5308       /* If necessary, restore everything to the default state
5309          at end-of-line */
5310       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5311         {
5312           restore_left_to_right_direction (codesys, dst, flags, 0);
5313
5314           ensure_normal_shift (str, dst);
5315
5316           for (i = 0; i < 4; i++)
5317             {
5318               Lisp_Object initial_charset =
5319                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5320               iso2022_designate (initial_charset, i, str, dst);
5321             }
5322         }
5323       if (ch == '\n')
5324         {
5325           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5326             Dynarr_add (dst, '\r');
5327           if (eol_type != EOL_CR)
5328             Dynarr_add (dst, ch);
5329         }
5330       else
5331         {
5332           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5333               && fit_to_be_escape_quoted (ch))
5334             Dynarr_add (dst, ISO_CODE_ESC);
5335           Dynarr_add (dst, ch);
5336         }
5337     }
5338   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5339     {
5340       charmask = (half == 0 ? 0x00 : 0x80);
5341
5342       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5343           && fit_to_be_escape_quoted (ch))
5344         Dynarr_add (dst, ISO_CODE_ESC);
5345       /* you asked for it ... */
5346       Dynarr_add (dst, ch);
5347     }
5348   else
5349     {
5350       int reg;
5351
5352       /* Now determine which register to use. */
5353       reg = -1;
5354       for (i = 0; i < 4; i++)
5355         {
5356           if ((CHARSETP (charset = str->iso2022.charset[i])
5357                && ((code_point = charset_code_point (charset, ch)) >= 0))
5358               ||
5359               (CHARSETP
5360                (charset
5361                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5362                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5363             {
5364               reg = i;
5365               break;
5366             }
5367         }
5368       if (reg == -1)
5369         {
5370           Lisp_Object original_default_coded_charset_priority_list
5371             = Vdefault_coded_charset_priority_list;
5372
5373           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5374             {
5375               code_point = ENCODE_CHAR (ch, charset);
5376               if (XCHARSET_FINAL (charset))
5377                 goto found;
5378               Vdefault_coded_charset_priority_list
5379                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5380                                Vdefault_coded_charset_priority_list));
5381             }
5382           code_point = ENCODE_CHAR (ch, charset);
5383           if (!XCHARSET_FINAL (charset))
5384             {
5385               charset = Vcharset_ascii;
5386               code_point = '~';
5387             }
5388         found:
5389           Vdefault_coded_charset_priority_list
5390             = original_default_coded_charset_priority_list;
5391         }
5392       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5393                                 codesys, dst, flags, 0);
5394
5395       if (reg == -1)
5396         {
5397           if (XCHARSET_GRAPHIC (charset) != 0)
5398             {
5399               if (!NILP (str->iso2022.charset[1]) &&
5400                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5401                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5402                 reg = 1;
5403               else if (!NILP (str->iso2022.charset[2]))
5404                 reg = 2;
5405               else if (!NILP (str->iso2022.charset[3]))
5406                 reg = 3;
5407               else
5408                 reg = 0;
5409             }
5410           else
5411             reg = 0;
5412         }
5413
5414       iso2022_designate (charset, reg, str, dst);
5415
5416       /* Now invoke that register. */
5417       switch (reg)
5418         {
5419         case 0:
5420           ensure_normal_shift (str, dst);
5421           half = 0;
5422           break;
5423         case 1:
5424           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5425             {
5426               ensure_shift_out (str, dst);
5427               half = 0;
5428             }
5429           else
5430             half = 1;
5431           break;
5432         case 2:
5433           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5434             {
5435               Dynarr_add (dst, ISO_CODE_ESC);
5436               Dynarr_add (dst, 'N');
5437               half = 0;
5438             }
5439           else
5440             {
5441               Dynarr_add (dst, ISO_CODE_SS2);
5442               half = 1;
5443             }
5444           break;
5445         case 3:
5446           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5447             {
5448               Dynarr_add (dst, ISO_CODE_ESC);
5449               Dynarr_add (dst, 'O');
5450               half = 0;
5451             }
5452           else
5453             {
5454               Dynarr_add (dst, ISO_CODE_SS3);
5455               half = 1;
5456             }
5457           break;
5458         default:
5459           abort ();
5460         }
5461
5462       charmask = (half == 0 ? 0x00 : 0x80);
5463
5464       switch (XCHARSET_DIMENSION (charset))
5465         {
5466         case 1:
5467           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5468           break;
5469         case 2:
5470           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5471           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5472           break;
5473         case 3:
5474           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5475           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5476           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5477           break;
5478         case 4:
5479           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5480           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5481           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5482           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5483           break;
5484         default:
5485           abort ();
5486         }
5487     }
5488   str->iso2022.current_charset = charset;
5489   str->iso2022.current_half = half;
5490 }
5491
5492 void
5493 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5494                      unsigned int *flags)
5495 {
5496   Lisp_Coding_System* codesys = str->codesys;
5497   int i;
5498
5499   restore_left_to_right_direction (codesys, dst, flags, 0);
5500   ensure_normal_shift (str, dst);
5501   for (i = 0; i < 4; i++)
5502     {
5503       Lisp_Object initial_charset
5504         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5505       iso2022_designate (initial_charset, i, str, dst);
5506     }
5507 }
5508 #endif /* MULE */
5509 \f
5510 /************************************************************************/
5511 /*                     No-conversion methods                            */
5512 /************************************************************************/
5513
5514 /* This is used when reading in "binary" files -- i.e. files that may
5515    contain all 256 possible byte values and that are not to be
5516    interpreted as being in any particular decoding. */
5517 static void
5518 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5519                              unsigned_char_dynarr *dst, size_t n)
5520 {
5521   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5522   unsigned int flags  = str->flags;
5523   unsigned int cpos   = str->cpos;
5524   eol_type_t eol_type = str->eol_type;
5525
5526   while (n--)
5527     {
5528       unsigned char c = *(unsigned char *)src++;
5529
5530       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5531       DECODE_ADD_BINARY_CHAR (c, dst);
5532     label_continue_loop:;
5533     }
5534
5535   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5536
5537   str->flags = flags;
5538   str->cpos  = cpos;
5539 }
5540
5541 static void
5542 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5543                              unsigned_char_dynarr *dst, size_t n)
5544 {
5545   unsigned char c;
5546   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5547   unsigned int flags  = str->flags;
5548   unsigned int ch     = str->ch;
5549   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5550 #ifdef UTF2000
5551   unsigned char char_boundary = str->iso2022.current_char_boundary;
5552 #endif
5553
5554   while (n--)
5555     {
5556       c = *src++;
5557 #ifdef UTF2000
5558       if (char_boundary == 0)
5559         if ( c >= 0xfc )
5560           {
5561             ch = c & 0x01;
5562             char_boundary = 5;
5563           }
5564         else if ( c >= 0xf8 )
5565           {
5566             ch = c & 0x03;
5567             char_boundary = 4;
5568           }
5569         else if ( c >= 0xf0 )
5570           {
5571             ch = c & 0x07;
5572             char_boundary = 3;
5573           }
5574         else if ( c >= 0xe0 )
5575           {
5576             ch = c & 0x0f;
5577             char_boundary = 2;
5578           }
5579         else if ( c >= 0xc0 )
5580           {
5581             ch = c & 0x1f;
5582             char_boundary = 1;
5583           }
5584         else
5585           {
5586             ch = 0;
5587             if (c == '\n')
5588               {
5589                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5590                   Dynarr_add (dst, '\r');
5591                 if (eol_type != EOL_CR)
5592                   Dynarr_add (dst, c);
5593               }
5594             else
5595               Dynarr_add (dst, c);
5596             char_boundary = 0;
5597           }
5598       else if (char_boundary == 1)
5599         {
5600           ch = ( ch << 6 ) | ( c & 0x3f );
5601           Dynarr_add (dst, ch & 0xff);
5602           char_boundary = 0;
5603         }
5604       else
5605         {
5606           ch = ( ch << 6 ) | ( c & 0x3f );
5607           char_boundary--;
5608         }
5609 #else /* not UTF2000 */
5610       if (c == '\n')
5611         {
5612           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5613             Dynarr_add (dst, '\r');
5614           if (eol_type != EOL_CR)
5615             Dynarr_add (dst, '\n');
5616           ch = 0;
5617         }
5618       else if (BYTE_ASCII_P (c))
5619         {
5620           assert (ch == 0);
5621           Dynarr_add (dst, c);
5622         }
5623       else if (BUFBYTE_LEADING_BYTE_P (c))
5624         {
5625           assert (ch == 0);
5626           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5627               c == LEADING_BYTE_CONTROL_1)
5628             ch = c;
5629           else
5630             Dynarr_add (dst, '~'); /* untranslatable character */
5631         }
5632       else
5633         {
5634           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5635             Dynarr_add (dst, c);
5636           else if (ch == LEADING_BYTE_CONTROL_1)
5637             {
5638               assert (c < 0xC0);
5639               Dynarr_add (dst, c - 0x20);
5640             }
5641           /* else it should be the second or third byte of an
5642              untranslatable character, so ignore it */
5643           ch = 0;
5644         }
5645 #endif /* not UTF2000 */
5646     }
5647
5648   str->flags = flags;
5649   str->ch    = ch;
5650 #ifdef UTF2000
5651   str->iso2022.current_char_boundary = char_boundary;
5652 #endif
5653 }
5654
5655 \f
5656
5657 /************************************************************************/
5658 /*                             Initialization                           */
5659 /************************************************************************/
5660
5661 void
5662 syms_of_file_coding (void)
5663 {
5664   INIT_LRECORD_IMPLEMENTATION (coding_system);
5665
5666   deferror (&Qcoding_system_error, "coding-system-error",
5667             "Coding-system error", Qio_error);
5668
5669   DEFSUBR (Fcoding_system_p);
5670   DEFSUBR (Ffind_coding_system);
5671   DEFSUBR (Fget_coding_system);
5672   DEFSUBR (Fcoding_system_list);
5673   DEFSUBR (Fcoding_system_name);
5674   DEFSUBR (Fmake_coding_system);
5675   DEFSUBR (Fcopy_coding_system);
5676   DEFSUBR (Fcoding_system_canonical_name_p);
5677   DEFSUBR (Fcoding_system_alias_p);
5678   DEFSUBR (Fcoding_system_aliasee);
5679   DEFSUBR (Fdefine_coding_system_alias);
5680   DEFSUBR (Fsubsidiary_coding_system);
5681
5682   DEFSUBR (Fcoding_system_type);
5683   DEFSUBR (Fcoding_system_doc_string);
5684 #ifdef MULE
5685   DEFSUBR (Fcoding_system_charset);
5686 #endif
5687   DEFSUBR (Fcoding_system_property);
5688
5689   DEFSUBR (Fcoding_category_list);
5690   DEFSUBR (Fset_coding_priority_list);
5691   DEFSUBR (Fcoding_priority_list);
5692   DEFSUBR (Fset_coding_category_system);
5693   DEFSUBR (Fcoding_category_system);
5694
5695   DEFSUBR (Fdetect_coding_region);
5696   DEFSUBR (Fdecode_coding_region);
5697   DEFSUBR (Fencode_coding_region);
5698 #ifdef MULE
5699   DEFSUBR (Fdecode_shift_jis_char);
5700   DEFSUBR (Fencode_shift_jis_char);
5701   DEFSUBR (Fdecode_big5_char);
5702   DEFSUBR (Fencode_big5_char);
5703 #endif /* MULE */
5704   defsymbol (&Qcoding_systemp, "coding-system-p");
5705   defsymbol (&Qno_conversion, "no-conversion");
5706   defsymbol (&Qraw_text, "raw-text");
5707 #ifdef MULE
5708   defsymbol (&Qbig5, "big5");
5709   defsymbol (&Qshift_jis, "shift-jis");
5710   defsymbol (&Qucs4, "ucs-4");
5711   defsymbol (&Qutf8, "utf-8");
5712   defsymbol (&Qccl, "ccl");
5713   defsymbol (&Qiso2022, "iso2022");
5714 #endif /* MULE */
5715   defsymbol (&Qmnemonic, "mnemonic");
5716   defsymbol (&Qeol_type, "eol-type");
5717   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5718   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5719
5720   defsymbol (&Qcr, "cr");
5721   defsymbol (&Qlf, "lf");
5722   defsymbol (&Qcrlf, "crlf");
5723   defsymbol (&Qeol_cr, "eol-cr");
5724   defsymbol (&Qeol_lf, "eol-lf");
5725   defsymbol (&Qeol_crlf, "eol-crlf");
5726 #ifdef MULE
5727   defsymbol (&Qcharset_g0, "charset-g0");
5728   defsymbol (&Qcharset_g1, "charset-g1");
5729   defsymbol (&Qcharset_g2, "charset-g2");
5730   defsymbol (&Qcharset_g3, "charset-g3");
5731   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5732   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5733   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5734   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5735   defsymbol (&Qno_iso6429, "no-iso6429");
5736   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5737   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5738
5739   defsymbol (&Qshort, "short");
5740   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5741   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5742   defsymbol (&Qseven, "seven");
5743   defsymbol (&Qlock_shift, "lock-shift");
5744   defsymbol (&Qescape_quoted, "escape-quoted");
5745 #endif /* MULE */
5746 #ifdef UTF2000
5747   defsymbol (&Qdisable_composition, "disable-composition");
5748 #endif
5749   defsymbol (&Qencode, "encode");
5750   defsymbol (&Qdecode, "decode");
5751
5752 #ifdef MULE
5753   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5754              "shift-jis");
5755   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5756              "big5");
5757   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5758              "ucs-4");
5759   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5760              "utf-8");
5761   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5762              "iso-7");
5763   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5764              "iso-8-designate");
5765   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5766              "iso-8-1");
5767   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5768              "iso-8-2");
5769   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5770              "iso-lock-shift");
5771 #endif /* MULE */
5772   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5773              "no-conversion");
5774 }
5775
5776 void
5777 lstream_type_create_file_coding (void)
5778 {
5779   LSTREAM_HAS_METHOD (decoding, reader);
5780   LSTREAM_HAS_METHOD (decoding, writer);
5781   LSTREAM_HAS_METHOD (decoding, rewinder);
5782   LSTREAM_HAS_METHOD (decoding, seekable_p);
5783   LSTREAM_HAS_METHOD (decoding, flusher);
5784   LSTREAM_HAS_METHOD (decoding, closer);
5785   LSTREAM_HAS_METHOD (decoding, marker);
5786
5787   LSTREAM_HAS_METHOD (encoding, reader);
5788   LSTREAM_HAS_METHOD (encoding, writer);
5789   LSTREAM_HAS_METHOD (encoding, rewinder);
5790   LSTREAM_HAS_METHOD (encoding, seekable_p);
5791   LSTREAM_HAS_METHOD (encoding, flusher);
5792   LSTREAM_HAS_METHOD (encoding, closer);
5793   LSTREAM_HAS_METHOD (encoding, marker);
5794 }
5795
5796 void
5797 vars_of_file_coding (void)
5798 {
5799   int i;
5800
5801   fcd = xnew (struct file_coding_dump);
5802   dump_add_root_struct_ptr (&fcd, &fcd_description);
5803
5804   /* Initialize to something reasonable ... */
5805   for (i = 0; i < CODING_CATEGORY_LAST; i++)
5806     {
5807       fcd->coding_category_system[i] = Qnil;
5808       fcd->coding_category_by_priority[i] = i;
5809     }
5810
5811   Fprovide (intern ("file-coding"));
5812
5813   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5814 Coding system used for TTY keyboard input.
5815 Not used under a windowing system.
5816 */ );
5817   Vkeyboard_coding_system = Qnil;
5818
5819   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5820 Coding system used for TTY display output.
5821 Not used under a windowing system.
5822 */ );
5823   Vterminal_coding_system = Qnil;
5824
5825   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5826 Overriding coding system used when reading from a file or process.
5827 You should bind this variable with `let', but do not set it globally.
5828 If this is non-nil, it specifies the coding system that will be used
5829 to decode input on read operations, such as from a file or process.
5830 It overrides `buffer-file-coding-system-for-read',
5831 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5832 this one for permanent changes to the environment.  */ );
5833   Vcoding_system_for_read = Qnil;
5834
5835   DEFVAR_LISP ("coding-system-for-write",
5836                &Vcoding_system_for_write /*
5837 Overriding coding system used when writing to a file or process.
5838 You should bind this variable with `let', but do not set it globally.
5839 If this is non-nil, it specifies the coding system that will be used
5840 to encode output for write operations, such as to a file or process.
5841 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5842 Use those variables instead of this one for permanent changes to the
5843 environment.  */ );
5844   Vcoding_system_for_write = Qnil;
5845
5846   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5847 Coding system used to convert pathnames when accessing files.
5848 */ );
5849   Vfile_name_coding_system = Qnil;
5850
5851   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5852 Non-nil means the buffer contents are regarded as multi-byte form
5853 of characters, not a binary code.  This affects the display, file I/O,
5854 and behaviors of various editing commands.
5855
5856 Setting this to nil does not do anything.
5857 */ );
5858   enable_multibyte_characters = 1;
5859 }
5860
5861 void
5862 complex_vars_of_file_coding (void)
5863 {
5864   staticpro (&Vcoding_system_hash_table);
5865   Vcoding_system_hash_table =
5866     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5867
5868   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5869   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5870
5871 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5872 {                                               \
5873   struct codesys_prop csp;                      \
5874   csp.sym = (Sym);                              \
5875   csp.prop_type = (Prop_Type);                  \
5876   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5877 } while (0)
5878
5879   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5880   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5881   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5882   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5883   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5884   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5885   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5886 #ifdef MULE
5887   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5888   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5889   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5890   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5891   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5892   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5893   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5894   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5895   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5896   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5897   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5898   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5899   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5900   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5901   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5902   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5903   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5904
5905   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5906   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5907 #endif /* MULE */
5908   /* Need to create this here or we're really screwed. */
5909   Fmake_coding_system
5910     (Qraw_text, Qno_conversion,
5911      build_string ("Raw text, which means it converts only line-break-codes."),
5912      list2 (Qmnemonic, build_string ("Raw")));
5913
5914   Fmake_coding_system
5915     (Qbinary, Qno_conversion,
5916      build_string ("Binary, which means it does not convert anything."),
5917      list4 (Qeol_type, Qlf,
5918             Qmnemonic, build_string ("Binary")));
5919
5920 #ifdef UTF2000
5921   Fmake_coding_system
5922     (Qutf8, Qutf8,
5923      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5924      list2 (Qmnemonic, build_string ("UTF8")));
5925 #endif
5926
5927   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5928
5929   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5930
5931   Fdefine_coding_system_alias (Qterminal, Qbinary);
5932   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5933
5934   /* Need this for bootstrapping */
5935   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5936     Fget_coding_system (Qraw_text);
5937
5938 #ifdef UTF2000
5939   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5940    = Fget_coding_system (Qutf8);
5941 #endif
5942
5943 #if defined(MULE) && !defined(UTF2000)
5944   {
5945     size_t i;
5946
5947     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5948       fcd->ucs_to_mule_table[i] = Qnil;
5949   }
5950   staticpro (&mule_to_ucs_table);
5951   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5952 #endif /* defined(MULE) && !defined(UTF2000) */
5953 }