git.chise.org Git - chise/xemacs-chise.git-/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), 65536 },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const unsigned char *src,
 185                      unsigned_char_dynarr *dst, unsigned int n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const unsigned char *src,
 189                                unsigned int n);
 190 static void decode_coding_sjis (Lstream *decoding,
 191                                 const unsigned char *src,
 192                                 unsigned_char_dynarr *dst,
 193                                 unsigned int n);
 194 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 195                             unsigned_char_dynarr *dst, unsigned int *flags);
 196 void char_finish_shift_jis (struct encoding_stream *str,
 197                             unsigned_char_dynarr *dst, unsigned int *flags);
 198
 199 static int detect_coding_big5 (struct detection_state *st,
 200                                const unsigned char *src,
 201                                unsigned int n);
 202 static void decode_coding_big5 (Lstream *decoding,
 203                                 const unsigned char *src,
 204                                 unsigned_char_dynarr *dst, unsigned int n);
 205 static void encode_coding_big5 (Lstream *encoding,
 206                                 const unsigned char *src,
 207                                 unsigned_char_dynarr *dst, unsigned int n);
 208 static int detect_coding_ucs4 (struct detection_state *st,
 209                                const unsigned char *src,
 210                                unsigned int n);
 211 static void decode_coding_ucs4 (Lstream *decoding,
 212                                 const unsigned char *src,
 213                                 unsigned_char_dynarr *dst, unsigned int n);
 214 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 215                        unsigned_char_dynarr *dst, unsigned int *flags);
 216 void char_finish_ucs4 (struct encoding_stream *str,
 217                        unsigned_char_dynarr *dst, unsigned int *flags);
 218
 219 static int detect_coding_utf8 (struct detection_state *st,
 220                                const unsigned char *src,
 221                                unsigned int n);
 222 static void decode_coding_utf8 (Lstream *decoding,
 223                                 const unsigned char *src,
 224                                 unsigned_char_dynarr *dst, unsigned int n);
 225 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 226                        unsigned_char_dynarr *dst, unsigned int *flags);
 227 void char_finish_utf8 (struct encoding_stream *str,
 228                        unsigned_char_dynarr *dst, unsigned int *flags);
 229
 230 static int postprocess_iso2022_mask (int mask);
 231 static void reset_iso2022 (Lisp_Object coding_system,
 232                            struct iso2022_decoder *iso);
 233 static int detect_coding_iso2022 (struct detection_state *st,
 234                                   const unsigned char *src,
 235                                   unsigned int n);
 236 static void decode_coding_iso2022 (Lstream *decoding,
 237                                    const unsigned char *src,
 238                                    unsigned_char_dynarr *dst, unsigned int n);
 239 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 240                           unsigned_char_dynarr *dst, unsigned int *flags);
 241 void char_finish_iso2022 (struct encoding_stream *str,
 242                           unsigned_char_dynarr *dst, unsigned int *flags);
 243 #endif /* MULE */
 244 static void decode_coding_no_conversion (Lstream *decoding,
 245                                          const unsigned char *src,
 246                                          unsigned_char_dynarr *dst,
 247                                          unsigned int n);
 248 static void encode_coding_no_conversion (Lstream *encoding,
 249                                          const unsigned char *src,
 250                                          unsigned_char_dynarr *dst,
 251                                          unsigned int n);
 252 static void mule_decode (Lstream *decoding, const unsigned char *src,
 253                          unsigned_char_dynarr *dst, unsigned int n);
 254 static void mule_encode (Lstream *encoding, const unsigned char *src,
 255                          unsigned_char_dynarr *dst, unsigned int n);
 256
 257 typedef struct codesys_prop codesys_prop;
 258 struct codesys_prop
 259 {
 260   Lisp_Object sym;
 261   int prop_type;
 262 };
 263
 264 typedef struct
 265 {
 266   Dynarr_declare (codesys_prop);
 267 } codesys_prop_dynarr;
 268
 269 static const struct lrecord_description codesys_prop_description_1[] = {
 270   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 271   { XD_END }
 272 };
 273
 274 static const struct struct_description codesys_prop_description = {
 275   sizeof (codesys_prop),
 276   codesys_prop_description_1
 277 };
 278
 279 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 280   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 281   { XD_END }
 282 };
 283
 284 static const struct struct_description codesys_prop_dynarr_description = {
 285   sizeof (codesys_prop_dynarr),
 286   codesys_prop_dynarr_description_1
 287 };
 288
 289 codesys_prop_dynarr *the_codesys_prop_dynarr;
 290
 291 enum codesys_prop_enum
 292 {
 293   CODESYS_PROP_ALL_OK,
 294   CODESYS_PROP_ISO2022,
 295   CODESYS_PROP_CCL
 296 };
 297
 298 \f
 299 /************************************************************************/
 300 /*                       Coding system functions                        */
 301 /************************************************************************/
 302
 303 static Lisp_Object mark_coding_system (Lisp_Object);
 304 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 305 static void finalize_coding_system (void *header, int for_disksave);
 306
 307 #ifdef MULE
 308 static const struct lrecord_description ccs_description_1[] = {
 309   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 310   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 311   { XD_END }
 312 };
 313
 314 static const struct struct_description ccs_description = {
 315   sizeof (charset_conversion_spec),
 316   ccs_description_1
 317 };
 318
 319 static const struct lrecord_description ccsd_description_1[] = {
 320   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 321   { XD_END }
 322 };
 323
 324 static const struct struct_description ccsd_description = {
 325   sizeof (charset_conversion_spec_dynarr),
 326   ccsd_description_1
 327 };
 328 #endif
 329
 330 static const struct lrecord_description coding_system_description[] = {
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 332   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 333   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 334   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 335   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 338   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 339 #ifdef MULE
 340   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 341   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 342   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 343   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 344   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 345 #endif
 346   { XD_END }
 347 };
 348
 349 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 350                                mark_coding_system, print_coding_system,
 351                                finalize_coding_system,
 352                                0, 0, coding_system_description,
 353                                Lisp_Coding_System);
 354
 355 static Lisp_Object
 356 mark_coding_system (Lisp_Object obj)
 357 {
 358   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 359
 360   mark_object (CODING_SYSTEM_NAME (codesys));
 361   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 362   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 363   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 364   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 365   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 366
 367   switch (CODING_SYSTEM_TYPE (codesys))
 368     {
 369 #ifdef MULE
 370       int i;
 371     case CODESYS_ISO2022:
 372       for (i = 0; i < 4; i++)
 373         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 374       if (codesys->iso2022.input_conv)
 375         {
 376           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 377             {
 378               struct charset_conversion_spec *ccs =
 379                 Dynarr_atp (codesys->iso2022.input_conv, i);
 380               mark_object (ccs->from_charset);
 381               mark_object (ccs->to_charset);
 382             }
 383         }
 384       if (codesys->iso2022.output_conv)
 385         {
 386           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 387             {
 388               struct charset_conversion_spec *ccs =
 389                 Dynarr_atp (codesys->iso2022.output_conv, i);
 390               mark_object (ccs->from_charset);
 391               mark_object (ccs->to_charset);
 392             }
 393         }
 394       break;
 395
 396     case CODESYS_CCL:
 397       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 398       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 399       break;
 400 #endif /* MULE */
 401     default:
 402       break;
 403     }
 404
 405   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 406   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 407 }
 408
 409 static void
 410 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 411                      int escapeflag)
 412 {
 413   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 414   if (print_readably)
 415     error ("printing unreadable object #<coding_system 0x%x>",
 416            c->header.uid);
 417
 418   write_c_string ("#<coding_system ", printcharfun);
 419   print_internal (c->name, printcharfun, 1);
 420   write_c_string (">", printcharfun);
 421 }
 422
 423 static void
 424 finalize_coding_system (void *header, int for_disksave)
 425 {
 426   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 427   /* Since coding systems never go away, this function is not
 428      necessary.  But it would be necessary if we changed things
 429      so that coding systems could go away. */
 430   if (!for_disksave) /* see comment in lstream.c */
 431     {
 432       switch (CODING_SYSTEM_TYPE (c))
 433         {
 434 #ifdef MULE
 435         case CODESYS_ISO2022:
 436           if (c->iso2022.input_conv)
 437             {
 438               Dynarr_free (c->iso2022.input_conv);
 439               c->iso2022.input_conv = 0;
 440             }
 441           if (c->iso2022.output_conv)
 442             {
 443               Dynarr_free (c->iso2022.output_conv);
 444               c->iso2022.output_conv = 0;
 445             }
 446           break;
 447 #endif /* MULE */
 448         default:
 449           break;
 450         }
 451     }
 452 }
 453
 454 static eol_type_t
 455 symbol_to_eol_type (Lisp_Object symbol)
 456 {
 457   CHECK_SYMBOL (symbol);
 458   if (NILP (symbol))      return EOL_AUTODETECT;
 459   if (EQ (symbol, Qlf))   return EOL_LF;
 460   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 461   if (EQ (symbol, Qcr))   return EOL_CR;
 462
 463   signal_simple_error ("Unrecognized eol type", symbol);
 464   return EOL_AUTODETECT; /* not reached */
 465 }
 466
 467 static Lisp_Object
 468 eol_type_to_symbol (eol_type_t type)
 469 {
 470   switch (type)
 471     {
 472     default: abort ();
 473     case EOL_LF:         return Qlf;
 474     case EOL_CRLF:       return Qcrlf;
 475     case EOL_CR:         return Qcr;
 476     case EOL_AUTODETECT: return Qnil;
 477     }
 478 }
 479
 480 static void
 481 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 482 {
 483   Lisp_Object codesys_obj;
 484   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 485   char *codesys_name = (char *) alloca (len + 7);
 486   int mlen = -1;
 487   char *codesys_mnemonic=0;
 488
 489   Lisp_Object codesys_name_sym, sub_codesys_obj;
 490
 491   /* kludge */
 492
 493   XSETCODING_SYSTEM (codesys_obj, codesys);
 494
 495   memcpy (codesys_name,
 496           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 497
 498   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 499     {
 500       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 501       codesys_mnemonic = (char *) alloca (mlen + 7);
 502       memcpy (codesys_mnemonic,
 503               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 504     }
 505
 506 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 507   strcpy (codesys_name + len, "-" op_sys);                                      \
 508   if (mlen != -1)                                                               \
 509     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 510   codesys_name_sym = intern (codesys_name);                                     \
 511   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 512   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 513   if (mlen != -1)                                                               \
 514     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 515       build_string (codesys_mnemonic);                                          \
 516   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 517 } while (0)
 518
 519   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 520   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 521   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 522 }
 523
 524 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 525 Return t if OBJECT is a coding system.
 526 A coding system is an object that defines how text containing multiple
 527 character sets is encoded into a stream of (typically 8-bit) bytes.
 528 The coding system is used to decode the stream into a series of
 529 characters (which may be from multiple charsets) when the text is read
 530 from a file or process, and is used to encode the text back into the
 531 same format when it is written out to a file or process.
 532
 533 For example, many ISO2022-compliant coding systems (such as Compound
 534 Text, which is used for inter-client data under the X Window System)
 535 use escape sequences to switch between different charsets -- Japanese
 536 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 537 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 538 `make-coding-system' for more information.
 539
 540 Coding systems are normally identified using a symbol, and the
 541 symbol is accepted in place of the actual coding system object whenever
 542 a coding system is called for. (This is similar to how faces work.)
 543 */
 544        (object))
 545 {
 546   return CODING_SYSTEMP (object) ? Qt : Qnil;
 547 }
 548
 549 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 550 Retrieve the coding system of the given name.
 551
 552 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 553 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 554 If there is no such coding system, nil is returned.  Otherwise the
 555 associated coding system object is returned.
 556 */
 557        (coding_system_or_name))
 558 {
 559   if (NILP (coding_system_or_name))
 560     coding_system_or_name = Qbinary;
 561   else if (CODING_SYSTEMP (coding_system_or_name))
 562     return coding_system_or_name;
 563   else
 564     CHECK_SYMBOL (coding_system_or_name);
 565
 566   while (1)
 567     {
 568       coding_system_or_name =
 569         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 570
 571       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 572         return coding_system_or_name;
 573     }
 574 }
 575
 576 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 577 Retrieve the coding system of the given name.
 578 Same as `find-coding-system' except that if there is no such
 579 coding system, an error is signaled instead of returning nil.
 580 */
 581        (name))
 582 {
 583   Lisp_Object coding_system = Ffind_coding_system (name);
 584
 585   if (NILP (coding_system))
 586     signal_simple_error ("No such coding system", name);
 587   return coding_system;
 588 }
 589
 590 /* We store the coding systems in hash tables with the names as the key and the
 591    actual coding system object as the value.  Occasionally we need to use them
 592    in a list format.  These routines provide us with that. */
 593 struct coding_system_list_closure
 594 {
 595   Lisp_Object *coding_system_list;
 596 };
 597
 598 static int
 599 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 600                                   void *coding_system_list_closure)
 601 {
 602   /* This function can GC */
 603   struct coding_system_list_closure *cscl =
 604     (struct coding_system_list_closure *) coding_system_list_closure;
 605   Lisp_Object *coding_system_list = cscl->coding_system_list;
 606
 607   *coding_system_list = Fcons (key, *coding_system_list);
 608   return 0;
 609 }
 610
 611 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 612 Return a list of the names of all defined coding systems.
 613 */
 614        ())
 615 {
 616   Lisp_Object coding_system_list = Qnil;
 617   struct gcpro gcpro1;
 618   struct coding_system_list_closure coding_system_list_closure;
 619
 620   GCPRO1 (coding_system_list);
 621   coding_system_list_closure.coding_system_list = &coding_system_list;
 622   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 623                  &coding_system_list_closure);
 624   UNGCPRO;
 625
 626   return coding_system_list;
 627 }
 628
 629 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 630 Return the name of the given coding system.
 631 */
 632        (coding_system))
 633 {
 634   coding_system = Fget_coding_system (coding_system);
 635   return XCODING_SYSTEM_NAME (coding_system);
 636 }
 637
 638 static Lisp_Coding_System *
 639 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 640 {
 641   Lisp_Coding_System *codesys =
 642     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 643
 644   zero_lcrecord (codesys);
 645   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 646   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 647   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 648   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 649   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 650   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 651   CODING_SYSTEM_TYPE     (codesys) = type;
 652   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 653 #ifdef MULE
 654   if (type == CODESYS_ISO2022)
 655     {
 656       int i;
 657       for (i = 0; i < 4; i++)
 658         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 659     }
 660   else if (type == CODESYS_CCL)
 661     {
 662       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 663       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 664     }
 665 #endif /* MULE */
 666   CODING_SYSTEM_NAME (codesys) = name;
 667
 668   return codesys;
 669 }
 670
 671 #ifdef MULE
 672 /* Given a list of charset conversion specs as specified in a Lisp
 673    program, parse it into STORE_HERE. */
 674
 675 static void
 676 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 677                                 Lisp_Object spec_list)
 678 {
 679   Lisp_Object rest;
 680
 681   EXTERNAL_LIST_LOOP (rest, spec_list)
 682     {
 683       Lisp_Object car = XCAR (rest);
 684       Lisp_Object from, to;
 685       struct charset_conversion_spec spec;
 686
 687       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 688         signal_simple_error ("Invalid charset conversion spec", car);
 689       from = Fget_charset (XCAR (car));
 690       to = Fget_charset (XCAR (XCDR (car)));
 691       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 692            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 693         signal_simple_error_2
 694           ("Attempted conversion between different charset types",
 695            from, to);
 696       spec.from_charset = from;
 697       spec.to_charset = to;
 698
 699       Dynarr_add (store_here, spec);
 700     }
 701 }
 702
 703 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 704    specs, return the equivalent as the Lisp programmer would see it.
 705
 706    If LOAD_HERE is 0, return Qnil. */
 707
 708 static Lisp_Object
 709 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 710 {
 711   int i;
 712   Lisp_Object result;
 713
 714   if (!load_here)
 715     return Qnil;
 716   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 717     {
 718       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 719       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 720     }
 721
 722   return Fnreverse (result);
 723 }
 724
 725 #endif /* MULE */
 726
 727 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 728 Register symbol NAME as a coding system.
 729
 730 TYPE describes the conversion method used and should be one of
 731
 732 nil or 'undecided
 733      Automatic conversion.  XEmacs attempts to detect the coding system
 734      used in the file.
 735 'no-conversion
 736      No conversion.  Use this for binary files and such.  On output,
 737      graphic characters that are not in ASCII or Latin-1 will be
 738      replaced by a ?. (For a no-conversion-encoded buffer, these
 739      characters will only be present if you explicitly insert them.)
 740 'shift-jis
 741      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 742 'ucs-4
 743      ISO 10646 UCS-4 encoding.
 744 'utf-8
 745      ISO 10646 UTF-8 encoding.
 746 'iso2022
 747      Any ISO2022-compliant encoding.  Among other things, this includes
 748      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 749      standard Unix encoding for Japanese and other languages), and
 750      Compound Text (the encoding used in X11).  You can specify more
 751      specific information about the conversion with the FLAGS argument.
 752 'big5
 753      Big5 (the encoding commonly used for Taiwanese).
 754 'ccl
 755      The conversion is performed using a user-written pseudo-code
 756      program.  CCL (Code Conversion Language) is the name of this
 757      pseudo-code.
 758 'internal
 759      Write out or read in the raw contents of the memory representing
 760      the buffer's text.  This is primarily useful for debugging
 761      purposes, and is only enabled when XEmacs has been compiled with
 762      DEBUG_XEMACS defined (via the --debug configure option).
 763      WARNING: Reading in a file using 'internal conversion can result
 764      in an internal inconsistency in the memory representing a
 765      buffer's text, which will produce unpredictable results and may
 766      cause XEmacs to crash.  Under normal circumstances you should
 767      never use 'internal conversion.
 768
 769 DOC-STRING is a string describing the coding system.
 770
 771 PROPS is a property list, describing the specific nature of the
 772 character set.  Recognized properties are:
 773
 774 'mnemonic
 775      String to be displayed in the modeline when this coding system is
 776      active.
 777
 778 'eol-type
 779      End-of-line conversion to be used.  It should be one of
 780
 781         nil
 782                 Automatically detect the end-of-line type (LF, CRLF,
 783                 or CR).  Also generate subsidiary coding systems named
 784                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 785                 identical to this coding system but have an EOL-TYPE
 786                 value of 'lf, 'crlf, and 'cr, respectively.
 787         'lf
 788                 The end of a line is marked externally using ASCII LF.
 789                 Since this is also the way that XEmacs represents an
 790                 end-of-line internally, specifying this option results
 791                 in no end-of-line conversion.  This is the standard
 792                 format for Unix text files.
 793         'crlf
 794                 The end of a line is marked externally using ASCII
 795                 CRLF.  This is the standard format for MS-DOS text
 796                 files.
 797         'cr
 798                 The end of a line is marked externally using ASCII CR.
 799                 This is the standard format for Macintosh text files.
 800         t
 801                 Automatically detect the end-of-line type but do not
 802                 generate subsidiary coding systems.  (This value is
 803                 converted to nil when stored internally, and
 804                 `coding-system-property' will return nil.)
 805
 806 'disable-composition
 807      If non-nil, composition/decomposition for combining characters
 808      are disabled.
 809
 810 'post-read-conversion
 811      Function called after a file has been read in, to perform the
 812      decoding.  Called with two arguments, BEG and END, denoting
 813      a region of the current buffer to be decoded.
 814
 815 'pre-write-conversion
 816      Function called before a file is written out, to perform the
 817      encoding.  Called with two arguments, BEG and END, denoting
 818      a region of the current buffer to be encoded.
 819
 820
 821 The following additional properties are recognized if TYPE is 'iso2022:
 822
 823 'charset-g0
 824 'charset-g1
 825 'charset-g2
 826 'charset-g3
 827      The character set initially designated to the G0 - G3 registers.
 828      The value should be one of
 829
 830           -- A charset object (designate that character set)
 831           -- nil (do not ever use this register)
 832           -- t (no character set is initially designated to
 833                 the register, but may be later on; this automatically
 834                 sets the corresponding `force-g*-on-output' property)
 835
 836 'force-g0-on-output
 837 'force-g1-on-output
 838 'force-g2-on-output
 839 'force-g2-on-output
 840      If non-nil, send an explicit designation sequence on output before
 841      using the specified register.
 842
 843 'short
 844      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 845      "ESC $ B" on output in place of the full designation sequences
 846      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 847
 848 'no-ascii-eol
 849      If non-nil, don't designate ASCII to G0 at each end of line on output.
 850      Setting this to non-nil also suppresses other state-resetting that
 851      normally happens at the end of a line.
 852
 853 'no-ascii-cntl
 854      If non-nil, don't designate ASCII to G0 before control chars on output.
 855
 856 'seven
 857      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 858      environment.
 859
 860 'lock-shift
 861      If non-nil, use locking-shift (SO/SI) instead of single-shift
 862      or designation by escape sequence.
 863
 864 'no-iso6429
 865      If non-nil, don't use ISO6429's direction specification.
 866
 867 'escape-quoted
 868      If non-nil, literal control characters that are the same as
 869      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 870      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 871      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 872      so that they can be properly distinguished from an escape sequence.
 873      (Note that doing this results in a non-portable encoding.) This
 874      encoding flag is used for byte-compiled files.  Note that ESC
 875      is a good choice for a quoting character because there are no
 876      escape sequences whose second byte is a character from the Control-0
 877      or Control-1 character sets; this is explicitly disallowed by the
 878      ISO2022 standard.
 879
 880 'input-charset-conversion
 881      A list of conversion specifications, specifying conversion of
 882      characters in one charset to another when decoding is performed.
 883      Each specification is a list of two elements: the source charset,
 884      and the destination charset.
 885
 886 'output-charset-conversion
 887      A list of conversion specifications, specifying conversion of
 888      characters in one charset to another when encoding is performed.
 889      The form of each specification is the same as for
 890      'input-charset-conversion.
 891
 892
 893 The following additional properties are recognized (and required)
 894 if TYPE is 'ccl:
 895
 896 'decode
 897      CCL program used for decoding (converting to internal format).
 898
 899 'encode
 900      CCL program used for encoding (converting to external format).
 901 */
 902        (name, type, doc_string, props))
 903 {
 904   Lisp_Coding_System *codesys;
 905   Lisp_Object rest, key, value;
 906   enum coding_system_type ty;
 907   int need_to_setup_eol_systems = 1;
 908
 909   /* Convert type to constant */
 910   if (NILP (type) || EQ (type, Qundecided))
 911                                       { ty = CODESYS_AUTODETECT; }
 912 #ifdef MULE
 913   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 914   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 915   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 916   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 917   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 918   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 919 #endif
 920   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 921 #ifdef DEBUG_XEMACS
 922   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 923 #endif
 924   else
 925     signal_simple_error ("Invalid coding system type", type);
 926
 927   CHECK_SYMBOL (name);
 928
 929   codesys = allocate_coding_system (ty, name);
 930
 931   if (NILP (doc_string))
 932     doc_string = build_string ("");
 933   else
 934     CHECK_STRING (doc_string);
 935   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 936
 937   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 938     {
 939       if (EQ (key, Qmnemonic))
 940         {
 941           if (!NILP (value))
 942             CHECK_STRING (value);
 943           CODING_SYSTEM_MNEMONIC (codesys) = value;
 944         }
 945
 946       else if (EQ (key, Qeol_type))
 947         {
 948           need_to_setup_eol_systems = NILP (value);
 949           if (EQ (value, Qt))
 950             value = Qnil;
 951           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 952         }
 953
 954       else if (EQ (key, Qpost_read_conversion))
 955         CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 956       else if (EQ (key, Qpre_write_conversion))
 957         CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 958 #ifdef UTF2000
 959       else if (EQ (key, Qdisable_composition))
 960         CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 961 #endif
 962 #ifdef MULE
 963       else if (ty == CODESYS_ISO2022)
 964         {
 965 #define FROB_INITIAL_CHARSET(charset_num) \
 966   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 967     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 968
 969           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 970           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 971           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 972           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 973
 974 #define FROB_FORCE_CHARSET(charset_num) \
 975   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 976
 977           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 978           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 979           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 980           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 981
 982 #define FROB_BOOLEAN_PROPERTY(prop) \
 983   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 984
 985           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 986           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 987           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 988           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 989           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 990           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 991           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 992
 993           else if (EQ (key, Qinput_charset_conversion))
 994             {
 995               codesys->iso2022.input_conv =
 996                 Dynarr_new (charset_conversion_spec);
 997               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 998                                               value);
 999             }
1000           else if (EQ (key, Qoutput_charset_conversion))
1001             {
1002               codesys->iso2022.output_conv =
1003                 Dynarr_new (charset_conversion_spec);
1004               parse_charset_conversion_specs (codesys->iso2022.output_conv,
1005                                               value);
1006             }
1007           else
1008             signal_simple_error ("Unrecognized property", key);
1009         }
1010       else if (EQ (type, Qccl))
1011         {
1012           if (EQ (key, Qdecode))
1013             {
1014               CHECK_VECTOR (value);
1015               CODING_SYSTEM_CCL_DECODE (codesys) = value;
1016             }
1017           else if (EQ (key, Qencode))
1018             {
1019               CHECK_VECTOR (value);
1020               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1021             }
1022           else
1023             signal_simple_error ("Unrecognized property", key);
1024         }
1025 #endif /* MULE */
1026       else
1027         signal_simple_error ("Unrecognized property", key);
1028     }
1029
1030   if (need_to_setup_eol_systems)
1031     setup_eol_coding_systems (codesys);
1032
1033   {
1034     Lisp_Object codesys_obj;
1035     XSETCODING_SYSTEM (codesys_obj, codesys);
1036     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1037     return codesys_obj;
1038   }
1039 }
1040
1041 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1042 Copy OLD-CODING-SYSTEM to NEW-NAME.
1043 If NEW-NAME does not name an existing coding system, a new one will
1044 be created.
1045 */
1046        (old_coding_system, new_name))
1047 {
1048   Lisp_Object new_coding_system;
1049   old_coding_system = Fget_coding_system (old_coding_system);
1050   new_coding_system = Ffind_coding_system (new_name);
1051   if (NILP (new_coding_system))
1052     {
1053       XSETCODING_SYSTEM (new_coding_system,
1054                          allocate_coding_system
1055                          (XCODING_SYSTEM_TYPE (old_coding_system),
1056                           new_name));
1057       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1058     }
1059
1060   {
1061     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1062     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1063     memcpy (((char *) to  ) + sizeof (to->header),
1064             ((char *) from) + sizeof (from->header),
1065             sizeof (*from) - sizeof (from->header));
1066     to->name = new_name;
1067   }
1068   return new_coding_system;
1069 }
1070
1071 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1072 Return t if OBJECT names a coding system, and is not a coding system alias.
1073 */
1074        (object))
1075 {
1076   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1077     ? Qt : Qnil;
1078 }
1079
1080 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1081 Return t if OBJECT is a coding system alias.
1082 All coding system aliases are created by `define-coding-system-alias'.
1083 */
1084        (object))
1085 {
1086   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1087     ? Qt : Qnil;
1088 }
1089
1090 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1091 Return the coding-system symbol for which symbol ALIAS is an alias.
1092 */
1093        (alias))
1094 {
1095   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1096   if (SYMBOLP (aliasee))
1097     return aliasee;
1098   else
1099     signal_simple_error ("Symbol is not a coding system alias", alias);
1100 }
1101
1102 static Lisp_Object
1103 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1104 {
1105   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1106                   Qnil);
1107 }
1108
1109 /* A maphash function, for removing dangling coding system aliases. */
1110 static int
1111 dangling_coding_system_alias_p (Lisp_Object alias,
1112                                 Lisp_Object aliasee,
1113                                 void *dangling_aliases)
1114 {
1115   if (SYMBOLP (aliasee)
1116       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1117     {
1118       (*(int *) dangling_aliases)++;
1119       return 1;
1120     }
1121   else
1122     return 0;
1123 }
1124
1125 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1126 Define symbol ALIAS as an alias for coding system ALIASEE.
1127
1128 You can use this function to redefine an alias that has already been defined,
1129 but you cannot redefine a name which is the canonical name for a coding system.
1130 \(a canonical name of a coding system is what is returned when you call
1131 `coding-system-name' on a coding system).
1132
1133 ALIASEE itself can be an alias, which allows you to define nested aliases.
1134
1135 You are forbidden, however, from creating alias loops or `dangling' aliases.
1136 These will be detected, and an error will be signaled if you attempt to do so.
1137
1138 If ALIASEE is nil, then ALIAS will simply be undefined.
1139
1140 See also `coding-system-alias-p', `coding-system-aliasee',
1141 and `coding-system-canonical-name-p'.
1142 */
1143        (alias, aliasee))
1144 {
1145   Lisp_Object real_coding_system, probe;
1146
1147   CHECK_SYMBOL (alias);
1148
1149   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1150     signal_simple_error
1151       ("Symbol is the canonical name of a coding system and cannot be redefined",
1152        alias);
1153
1154   if (NILP (aliasee))
1155     {
1156       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1157       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1158       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1159
1160       Fremhash (alias, Vcoding_system_hash_table);
1161
1162       /* Undefine subsidiary aliases,
1163          presumably created by a previous call to this function */
1164       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1165           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1166           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1167         {
1168           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1169           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1170           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1171         }
1172
1173       /* Undefine dangling coding system aliases. */
1174       {
1175         int dangling_aliases;
1176
1177         do {
1178           dangling_aliases = 0;
1179           elisp_map_remhash (dangling_coding_system_alias_p,
1180                              Vcoding_system_hash_table,
1181                              &dangling_aliases);
1182         } while (dangling_aliases > 0);
1183       }
1184
1185       return Qnil;
1186     }
1187
1188   if (CODING_SYSTEMP (aliasee))
1189     aliasee = XCODING_SYSTEM_NAME (aliasee);
1190
1191   /* Checks that aliasee names a coding-system */
1192   real_coding_system = Fget_coding_system (aliasee);
1193
1194   /* Check for coding system alias loops */
1195   if (EQ (alias, aliasee))
1196     alias_loop: signal_simple_error_2
1197       ("Attempt to create a coding system alias loop", alias, aliasee);
1198
1199   for (probe = aliasee;
1200        SYMBOLP (probe);
1201        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1202     {
1203       if (EQ (probe, alias))
1204         goto alias_loop;
1205     }
1206
1207   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1208
1209   /* Set up aliases for subsidiaries.
1210      #### There must be a better way to handle subsidiary coding systems. */
1211   {
1212     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1213     int i;
1214     for (i = 0; i < countof (suffixes); i++)
1215       {
1216         Lisp_Object alias_subsidiary =
1217           append_suffix_to_symbol (alias, suffixes[i]);
1218         Lisp_Object aliasee_subsidiary =
1219           append_suffix_to_symbol (aliasee, suffixes[i]);
1220
1221         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1222           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1223       }
1224   }
1225   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1226      but it doesn't look intentional, so I'd rather return something
1227      meaningful or nothing at all. */
1228   return Qnil;
1229 }
1230
1231 static Lisp_Object
1232 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1233 {
1234   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1235   Lisp_Object new_coding_system;
1236
1237   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1238     return coding_system;
1239
1240   switch (type)
1241     {
1242     case EOL_AUTODETECT: return coding_system;
1243     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1244     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1245     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1246     default:       abort ();
1247     }
1248
1249   return NILP (new_coding_system) ? coding_system : new_coding_system;
1250 }
1251
1252 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1253 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1254 */
1255        (coding_system, eol_type))
1256 {
1257   coding_system = Fget_coding_system (coding_system);
1258
1259   return subsidiary_coding_system (coding_system,
1260                                    symbol_to_eol_type (eol_type));
1261 }
1262
1263 \f
1264 /************************************************************************/
1265 /*                         Coding system accessors                      */
1266 /************************************************************************/
1267
1268 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1269 Return the doc string for CODING-SYSTEM.
1270 */
1271        (coding_system))
1272 {
1273   coding_system = Fget_coding_system (coding_system);
1274   return XCODING_SYSTEM_DOC_STRING (coding_system);
1275 }
1276
1277 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1278 Return the type of CODING-SYSTEM.
1279 */
1280        (coding_system))
1281 {
1282   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1283     {
1284     default: abort ();
1285     case CODESYS_AUTODETECT:    return Qundecided;
1286 #ifdef MULE
1287     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1288     case CODESYS_ISO2022:       return Qiso2022;
1289     case CODESYS_BIG5:          return Qbig5;
1290     case CODESYS_UCS4:          return Qucs4;
1291     case CODESYS_UTF8:          return Qutf8;
1292     case CODESYS_CCL:           return Qccl;
1293 #endif
1294     case CODESYS_NO_CONVERSION: return Qno_conversion;
1295 #ifdef DEBUG_XEMACS
1296     case CODESYS_INTERNAL:      return Qinternal;
1297 #endif
1298     }
1299 }
1300
1301 #ifdef MULE
1302 static
1303 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1304 {
1305   Lisp_Object cs
1306     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1307
1308   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1309 }
1310
1311 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1312 Return initial charset of CODING-SYSTEM designated to GNUM.
1313 GNUM allows 0 .. 3.
1314 */
1315        (coding_system, gnum))
1316 {
1317   coding_system = Fget_coding_system (coding_system);
1318   CHECK_INT (gnum);
1319
1320   return coding_system_charset (coding_system, XINT (gnum));
1321 }
1322 #endif /* MULE */
1323
1324 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1325 Return the PROP property of CODING-SYSTEM.
1326 */
1327        (coding_system, prop))
1328 {
1329   int i, ok = 0;
1330   enum coding_system_type type;
1331
1332   coding_system = Fget_coding_system (coding_system);
1333   CHECK_SYMBOL (prop);
1334   type = XCODING_SYSTEM_TYPE (coding_system);
1335
1336   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1337     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1338       {
1339         ok = 1;
1340         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1341           {
1342           case CODESYS_PROP_ALL_OK:
1343             break;
1344 #ifdef MULE
1345           case CODESYS_PROP_ISO2022:
1346             if (type != CODESYS_ISO2022)
1347               signal_simple_error
1348                 ("Property only valid in ISO2022 coding systems",
1349                  prop);
1350             break;
1351
1352           case CODESYS_PROP_CCL:
1353             if (type != CODESYS_CCL)
1354               signal_simple_error
1355                 ("Property only valid in CCL coding systems",
1356                  prop);
1357             break;
1358 #endif /* MULE */
1359           default:
1360             abort ();
1361           }
1362       }
1363
1364   if (!ok)
1365     signal_simple_error ("Unrecognized property", prop);
1366
1367   if (EQ (prop, Qname))
1368     return XCODING_SYSTEM_NAME (coding_system);
1369   else if (EQ (prop, Qtype))
1370     return Fcoding_system_type (coding_system);
1371   else if (EQ (prop, Qdoc_string))
1372     return XCODING_SYSTEM_DOC_STRING (coding_system);
1373   else if (EQ (prop, Qmnemonic))
1374     return XCODING_SYSTEM_MNEMONIC (coding_system);
1375   else if (EQ (prop, Qeol_type))
1376     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1377   else if (EQ (prop, Qeol_lf))
1378     return XCODING_SYSTEM_EOL_LF (coding_system);
1379   else if (EQ (prop, Qeol_crlf))
1380     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1381   else if (EQ (prop, Qeol_cr))
1382     return XCODING_SYSTEM_EOL_CR (coding_system);
1383   else if (EQ (prop, Qpost_read_conversion))
1384     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1385   else if (EQ (prop, Qpre_write_conversion))
1386     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1387 #ifdef MULE
1388   else if (type == CODESYS_ISO2022)
1389     {
1390       if (EQ (prop, Qcharset_g0))
1391         return coding_system_charset (coding_system, 0);
1392       else if (EQ (prop, Qcharset_g1))
1393         return coding_system_charset (coding_system, 1);
1394       else if (EQ (prop, Qcharset_g2))
1395         return coding_system_charset (coding_system, 2);
1396       else if (EQ (prop, Qcharset_g3))
1397         return coding_system_charset (coding_system, 3);
1398
1399 #define FORCE_CHARSET(charset_num) \
1400   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1401    (coding_system, charset_num) ? Qt : Qnil)
1402
1403       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1404       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1405       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1406       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1407
1408 #define LISP_BOOLEAN(prop) \
1409   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1410
1411       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1412       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1413       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1414       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1415       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1416       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1417       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1418
1419       else if (EQ (prop, Qinput_charset_conversion))
1420         return
1421           unparse_charset_conversion_specs
1422             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1423       else if (EQ (prop, Qoutput_charset_conversion))
1424         return
1425           unparse_charset_conversion_specs
1426             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1427       else
1428         abort ();
1429     }
1430   else if (type == CODESYS_CCL)
1431     {
1432       if (EQ (prop, Qdecode))
1433         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1434       else if (EQ (prop, Qencode))
1435         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1436       else
1437         abort ();
1438     }
1439 #endif /* MULE */
1440   else
1441     abort ();
1442
1443   return Qnil; /* not reached */
1444 }
1445
1446 \f
1447 /************************************************************************/
1448 /*                       Coding category functions                      */
1449 /************************************************************************/
1450
1451 static int
1452 decode_coding_category (Lisp_Object symbol)
1453 {
1454   int i;
1455
1456   CHECK_SYMBOL (symbol);
1457   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1458     if (EQ (coding_category_symbol[i], symbol))
1459       return i;
1460
1461   signal_simple_error ("Unrecognized coding category", symbol);
1462   return 0; /* not reached */
1463 }
1464
1465 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1466 Return a list of all recognized coding categories.
1467 */
1468        ())
1469 {
1470   int i;
1471   Lisp_Object list = Qnil;
1472
1473   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1474     list = Fcons (coding_category_symbol[i], list);
1475   return list;
1476 }
1477
1478 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1479 Change the priority order of the coding categories.
1480 LIST should be list of coding categories, in descending order of
1481 priority.  Unspecified coding categories will be lower in priority
1482 than all specified ones, in the same relative order they were in
1483 previously.
1484 */
1485        (list))
1486 {
1487   int category_to_priority[CODING_CATEGORY_LAST + 1];
1488   int i, j;
1489   Lisp_Object rest;
1490
1491   /* First generate a list that maps coding categories to priorities. */
1492
1493   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1494     category_to_priority[i] = -1;
1495
1496   /* Highest priority comes from the specified list. */
1497   i = 0;
1498   EXTERNAL_LIST_LOOP (rest, list)
1499     {
1500       int cat = decode_coding_category (XCAR (rest));
1501
1502       if (category_to_priority[cat] >= 0)
1503         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1504       category_to_priority[cat] = i++;
1505     }
1506
1507   /* Now go through the existing categories by priority to retrieve
1508      the categories not yet specified and preserve their priority
1509      order. */
1510   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1511     {
1512       int cat = fcd->coding_category_by_priority[j];
1513       if (category_to_priority[cat] < 0)
1514         category_to_priority[cat] = i++;
1515     }
1516
1517   /* Now we need to construct the inverse of the mapping we just
1518      constructed. */
1519
1520   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1521     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1522
1523   /* Phew!  That was confusing. */
1524   return Qnil;
1525 }
1526
1527 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1528 Return a list of coding categories in descending order of priority.
1529 */
1530        ())
1531 {
1532   int i;
1533   Lisp_Object list = Qnil;
1534
1535   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1536     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1537                   list);
1538   return list;
1539 }
1540
1541 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1542 Change the coding system associated with a coding category.
1543 */
1544        (coding_category, coding_system))
1545 {
1546   int cat = decode_coding_category (coding_category);
1547
1548   coding_system = Fget_coding_system (coding_system);
1549   fcd->coding_category_system[cat] = coding_system;
1550   return Qnil;
1551 }
1552
1553 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1554 Return the coding system associated with a coding category.
1555 */
1556        (coding_category))
1557 {
1558   int cat = decode_coding_category (coding_category);
1559   Lisp_Object sys = fcd->coding_category_system[cat];
1560
1561   if (!NILP (sys))
1562     return XCODING_SYSTEM_NAME (sys);
1563   return Qnil;
1564 }
1565
1566 \f
1567 /************************************************************************/
1568 /*                     Detecting the encoding of data                   */
1569 /************************************************************************/
1570
1571 struct detection_state
1572 {
1573   eol_type_t eol_type;
1574   int seen_non_ascii;
1575   int mask;
1576 #ifdef MULE
1577   struct
1578     {
1579       int mask;
1580       int in_second_byte;
1581     }
1582   big5;
1583
1584   struct
1585     {
1586       int mask;
1587       int in_second_byte;
1588     }
1589   shift_jis;
1590
1591   struct
1592     {
1593       int mask;
1594       int in_byte;
1595   }
1596   ucs4;
1597
1598   struct
1599     {
1600       int mask;
1601       int in_byte;
1602     }
1603   utf8;
1604
1605   struct
1606     {
1607       int mask;
1608       int initted;
1609       struct iso2022_decoder iso;
1610       unsigned int flags;
1611       int high_byte_count;
1612       unsigned int saw_single_shift:1;
1613     }
1614   iso2022;
1615 #endif
1616   struct
1617     {
1618       int seen_anything;
1619       int just_saw_cr;
1620     }
1621   eol;
1622 };
1623
1624 static int
1625 acceptable_control_char_p (int c)
1626 {
1627   switch (c)
1628     {
1629       /* Allow and ignore control characters that you might
1630          reasonably see in a text file */
1631     case '\r':
1632     case '\n':
1633     case '\t':
1634     case  7: /* bell */
1635     case  8: /* backspace */
1636     case 11: /* vertical tab */
1637     case 12: /* form feed */
1638     case 26: /* MS-DOS C-z junk */
1639     case 31: /* '^_' -- for info */
1640       return 1;
1641     default:
1642       return 0;
1643     }
1644 }
1645
1646 static int
1647 mask_has_at_most_one_bit_p (int mask)
1648 {
1649   /* Perhaps the only thing useful you learn from intensive Microsoft
1650      technical interviews */
1651   return (mask & (mask - 1)) == 0;
1652 }
1653
1654 static eol_type_t
1655 detect_eol_type (struct detection_state *st, const unsigned char *src,
1656                  unsigned int n)
1657 {
1658   int c;
1659
1660   while (n--)
1661     {
1662       c = *src++;
1663       if (c == '\n')
1664         {
1665           if (st->eol.just_saw_cr)
1666             return EOL_CRLF;
1667           else if (st->eol.seen_anything)
1668             return EOL_LF;
1669         }
1670       else if (st->eol.just_saw_cr)
1671         return EOL_CR;
1672       else if (c == '\r')
1673         st->eol.just_saw_cr = 1;
1674       else
1675         st->eol.just_saw_cr = 0;
1676       st->eol.seen_anything = 1;
1677     }
1678
1679   return EOL_AUTODETECT;
1680 }
1681
1682 /* Attempt to determine the encoding and EOL type of the given text.
1683    Before calling this function for the first type, you must initialize
1684    st->eol_type as appropriate and initialize st->mask to ~0.
1685
1686    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1687    not yet known.
1688
1689    st->mask holds the determined coding category mask, or ~0 if only
1690    ASCII has been seen so far.
1691
1692    Returns:
1693
1694    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1695         is present in st->mask
1696    1 == definitive answers are here for both st->eol_type and st->mask
1697 */
1698
1699 static int
1700 detect_coding_type (struct detection_state *st, const Extbyte *src,
1701                     unsigned int n, int just_do_eol)
1702 {
1703   int c;
1704
1705   if (st->eol_type == EOL_AUTODETECT)
1706     st->eol_type = detect_eol_type (st, src, n);
1707
1708   if (just_do_eol)
1709     return st->eol_type != EOL_AUTODETECT;
1710
1711   if (!st->seen_non_ascii)
1712     {
1713       for (; n; n--, src++)
1714         {
1715           c = *src;
1716           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1717             {
1718               st->seen_non_ascii = 1;
1719 #ifdef MULE
1720               st->shift_jis.mask = ~0;
1721               st->big5.mask = ~0;
1722               st->ucs4.mask = ~0;
1723               st->utf8.mask = ~0;
1724               st->iso2022.mask = ~0;
1725 #endif
1726               break;
1727             }
1728         }
1729     }
1730
1731   if (!n)
1732     return 0;
1733 #ifdef MULE
1734   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1735     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1736   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1737     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1738   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1739     st->big5.mask = detect_coding_big5 (st, src, n);
1740   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1741     st->utf8.mask = detect_coding_utf8 (st, src, n);
1742   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1743     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1744
1745   st->mask
1746     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1747     | st->utf8.mask | st->ucs4.mask;
1748 #endif
1749   {
1750     int retval = mask_has_at_most_one_bit_p (st->mask);
1751     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1752     return retval && st->eol_type != EOL_AUTODETECT;
1753   }
1754 }
1755
1756 static Lisp_Object
1757 coding_system_from_mask (int mask)
1758 {
1759   if (mask == ~0)
1760     {
1761       /* If the file was entirely or basically ASCII, use the
1762          default value of `buffer-file-coding-system'. */
1763       Lisp_Object retval =
1764         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1765       if (!NILP (retval))
1766         {
1767           retval = Ffind_coding_system (retval);
1768           if (NILP (retval))
1769             {
1770               warn_when_safe
1771                 (Qbad_variable, Qwarning,
1772                  "Invalid `default-buffer-file-coding-system', set to nil");
1773               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1774             }
1775         }
1776       if (NILP (retval))
1777         retval = Fget_coding_system (Qraw_text);
1778       return retval;
1779     }
1780   else
1781     {
1782       int i;
1783       int cat = -1;
1784 #ifdef MULE
1785       mask = postprocess_iso2022_mask (mask);
1786 #endif
1787       /* Look through the coding categories by priority and find
1788          the first one that is allowed. */
1789       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1790         {
1791           cat = fcd->coding_category_by_priority[i];
1792           if ((mask & (1 << cat)) &&
1793               !NILP (fcd->coding_category_system[cat]))
1794             break;
1795         }
1796       if (cat >= 0)
1797         return fcd->coding_category_system[cat];
1798       else
1799         return Fget_coding_system (Qraw_text);
1800     }
1801 }
1802
1803 /* Given a seekable read stream and potential coding system and EOL type
1804    as specified, do any autodetection that is called for.  If the
1805    coding system and/or EOL type are not `autodetect', they will be left
1806    alone; but this function will never return an autodetect coding system
1807    or EOL type.
1808
1809    This function does not automatically fetch subsidiary coding systems;
1810    that should be unnecessary with the explicit eol-type argument. */
1811
1812 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1813
1814 void
1815 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1816                               eol_type_t *eol_type_in_out)
1817 {
1818   struct detection_state decst;
1819
1820   if (*eol_type_in_out == EOL_AUTODETECT)
1821     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1822
1823   xzero (decst);
1824   decst.eol_type = *eol_type_in_out;
1825   decst.mask = ~0;
1826
1827   /* If autodetection is called for, do it now. */
1828   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1829       || *eol_type_in_out == EOL_AUTODETECT)
1830     {
1831       Extbyte buf[4096];
1832       Lisp_Object coding_system = Qnil;
1833       Extbyte *p;
1834       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1835       Extbyte *scan_end;
1836
1837       /* Look for initial "-*-"; mode line prefix */
1838       for (p = buf,
1839              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1840            p <= scan_end
1841              && *p != '\n'
1842              && *p != '\r';
1843            p++)
1844         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1845           {
1846             Extbyte *local_vars_beg = p + 3;
1847             /* Look for final "-*-"; mode line suffix */
1848             for (p = local_vars_beg,
1849                    scan_end = buf + nread - LENGTH ("-*-");
1850                  p <= scan_end
1851                    && *p != '\n'
1852                    && *p != '\r';
1853                  p++)
1854               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1855                 {
1856                   Extbyte *suffix = p;
1857                   /* Look for "coding:" */
1858                   for (p = local_vars_beg,
1859                          scan_end = suffix - LENGTH ("coding:?");
1860                        p <= scan_end;
1861                        p++)
1862                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1863                         && (p == local_vars_beg
1864                             || (*(p-1) == ' '  ||
1865                                 *(p-1) == '\t' ||
1866                                 *(p-1) == ';')))
1867                       {
1868                         Extbyte save;
1869                         int n;
1870                         p += LENGTH ("coding:");
1871                         while (*p == ' ' || *p == '\t') p++;
1872
1873                         /* Get coding system name */
1874                         save = *suffix; *suffix = '\0';
1875                         /* Characters valid in a MIME charset name (rfc 1521),
1876                            and in a Lisp symbol name. */
1877                         n = strspn ( (char *) p,
1878                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1879                                     "abcdefghijklmnopqrstuvwxyz"
1880                                     "0123456789"
1881                                     "!$%&*+-.^_{|}~");
1882                         *suffix = save;
1883                         if (n > 0)
1884                           {
1885                             save = p[n]; p[n] = '\0';
1886                             coding_system =
1887                               Ffind_coding_system (intern ((char *) p));
1888                             p[n] = save;
1889                           }
1890                         break;
1891                       }
1892                   break;
1893                 }
1894             break;
1895           }
1896
1897       if (NILP (coding_system))
1898         do
1899           {
1900             if (detect_coding_type (&decst, buf, nread,
1901                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1902                                     != CODESYS_AUTODETECT))
1903               break;
1904             nread = Lstream_read (stream, buf, sizeof (buf));
1905             if (nread == 0)
1906               break;
1907           }
1908         while (1);
1909
1910       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1911                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1912         do
1913           {
1914             if (detect_coding_type (&decst, buf, nread, 1))
1915               break;
1916             nread = Lstream_read (stream, buf, sizeof (buf));
1917             if (!nread)
1918               break;
1919           }
1920         while (1);
1921
1922       *eol_type_in_out = decst.eol_type;
1923       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1924         {
1925           if (NILP (coding_system))
1926             *codesys_in_out = coding_system_from_mask (decst.mask);
1927           else
1928             *codesys_in_out = coding_system;
1929         }
1930     }
1931
1932   /* If we absolutely can't determine the EOL type, just assume LF. */
1933   if (*eol_type_in_out == EOL_AUTODETECT)
1934     *eol_type_in_out = EOL_LF;
1935
1936   Lstream_rewind (stream);
1937 }
1938
1939 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1940 Detect coding system of the text in the region between START and END.
1941 Returned a list of possible coding systems ordered by priority.
1942 If only ASCII characters are found, it returns 'undecided or one of
1943 its subsidiary coding systems according to a detected end-of-line
1944 type.  Optional arg BUFFER defaults to the current buffer.
1945 */
1946        (start, end, buffer))
1947 {
1948   Lisp_Object val = Qnil;
1949   struct buffer *buf = decode_buffer (buffer, 0);
1950   Bufpos b, e;
1951   Lisp_Object instream, lb_instream;
1952   Lstream *istr, *lb_istr;
1953   struct detection_state decst;
1954   struct gcpro gcpro1, gcpro2;
1955
1956   get_buffer_range_char (buf, start, end, &b, &e, 0);
1957   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1958   lb_istr = XLSTREAM (lb_instream);
1959   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1960   istr = XLSTREAM (instream);
1961   GCPRO2 (instream, lb_instream);
1962   xzero (decst);
1963   decst.eol_type = EOL_AUTODETECT;
1964   decst.mask = ~0;
1965   while (1)
1966     {
1967       unsigned char random_buffer[4096];
1968       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1969
1970       if (!nread)
1971         break;
1972       if (detect_coding_type (&decst, random_buffer, nread, 0))
1973         break;
1974     }
1975
1976   if (decst.mask == ~0)
1977     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1978                                     decst.eol_type);
1979   else
1980     {
1981       int i;
1982
1983       val = Qnil;
1984 #ifdef MULE
1985       decst.mask = postprocess_iso2022_mask (decst.mask);
1986 #endif
1987       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1988         {
1989           int sys = fcd->coding_category_by_priority[i];
1990           if (decst.mask & (1 << sys))
1991             {
1992               Lisp_Object codesys = fcd->coding_category_system[sys];
1993               if (!NILP (codesys))
1994                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1995               val = Fcons (codesys, val);
1996             }
1997         }
1998     }
1999   Lstream_close (istr);
2000   UNGCPRO;
2001   Lstream_delete (istr);
2002   Lstream_delete (lb_istr);
2003   return val;
2004 }
2005
2006 \f
2007 /************************************************************************/
2008 /*           Converting to internal Mule format ("decoding")            */
2009 /************************************************************************/
2010
2011 /* A decoding stream is a stream used for decoding text (i.e.
2012    converting from some external format to internal format).
2013    The decoding-stream object keeps track of the actual coding
2014    stream, the stream that is at the other end, and data that
2015    needs to be persistent across the lifetime of the stream. */
2016
2017 /* Handle the EOL stuff related to just-read-in character C.
2018    EOL_TYPE is the EOL type of the coding stream.
2019    FLAGS is the current value of FLAGS in the coding stream, and may
2020    be modified by this macro.  (The macro only looks at the
2021    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2022    bytes are to be written.  You need to also define a local goto
2023    label "label_continue_loop" that is at the end of the main
2024    character-reading loop.
2025
2026    If C is a CR character, then this macro handles it entirely and
2027    jumps to label_continue_loop.  Otherwise, this macro does not add
2028    anything to DST, and continues normally.  You should continue
2029    processing C normally after this macro. */
2030
2031 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2032 do {                                                            \
2033   if (c == '\r')                                                \
2034     {                                                           \
2035       if (eol_type == EOL_CR)                                   \
2036         Dynarr_add (dst, '\n');                                 \
2037       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2038         Dynarr_add (dst, c);                                    \
2039       else                                                      \
2040         flags |= CODING_STATE_CR;                               \
2041       goto label_continue_loop;                                 \
2042     }                                                           \
2043   else if (flags & CODING_STATE_CR)                             \
2044     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2045       if (c != '\n')                                            \
2046         Dynarr_add (dst, '\r');                                 \
2047       flags &= ~CODING_STATE_CR;                                \
2048     }                                                           \
2049 } while (0)
2050
2051 /* C should be a binary character in the range 0 - 255; convert
2052    to internal format and add to Dynarr DST. */
2053
2054 #ifdef UTF2000
2055 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2056 do {                                            \
2057   if (BYTE_ASCII_P (c))                         \
2058     Dynarr_add (dst, c);                        \
2059   else                                          \
2060     {                                           \
2061       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2062       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2063     }                                           \
2064 } while (0)
2065
2066 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2067 INLINE_HEADER void
2068 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2069 {
2070   if ( c <= 0x7f )
2071     {
2072       Dynarr_add (dst, c);
2073     }
2074   else if ( c <= 0x7ff )
2075     {
2076       Dynarr_add (dst, (c >> 6) | 0xc0);
2077       Dynarr_add (dst, (c & 0x3f) | 0x80);
2078     }
2079   else if ( c <= 0xffff )
2080     {
2081       Dynarr_add (dst,  (c >> 12) | 0xe0);
2082       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2083       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2084     }
2085   else if ( c <= 0x1fffff )
2086     {
2087       Dynarr_add (dst,  (c >> 18) | 0xf0);
2088       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2089       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2090       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2091     }
2092   else if ( c <= 0x3ffffff )
2093     {
2094       Dynarr_add (dst,  (c >> 24) | 0xf8);
2095       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2098       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2099     }
2100   else
2101     {
2102       Dynarr_add (dst,  (c >> 30) | 0xfc);
2103       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2104       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2107       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2108     }
2109 }
2110 #else
2111 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2112 do {                                            \
2113   if (BYTE_ASCII_P (c))                         \
2114     Dynarr_add (dst, c);                        \
2115   else if (BYTE_C1_P (c))                       \
2116     {                                           \
2117       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2118       Dynarr_add (dst, c + 0x20);               \
2119     }                                           \
2120   else                                          \
2121     {                                           \
2122       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2123       Dynarr_add (dst, c);                      \
2124     }                                           \
2125 } while (0)
2126 #endif
2127
2128 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2129 do {                                    \
2130   if (ch)                               \
2131     {                                   \
2132       DECODE_ADD_BINARY_CHAR (ch, dst); \
2133       ch = 0;                           \
2134     }                                   \
2135 } while (0)
2136
2137 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2138 do {                                    \
2139   if (flags & CODING_STATE_END)         \
2140     {                                   \
2141       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2142       if (flags & CODING_STATE_CR)      \
2143         Dynarr_add (dst, '\r');         \
2144     }                                   \
2145 } while (0)
2146
2147 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2148
2149 struct decoding_stream
2150 {
2151   /* Coding system that governs the conversion. */
2152   Lisp_Coding_System *codesys;
2153
2154   /* Stream that we read the encoded data from or
2155      write the decoded data to. */
2156   Lstream *other_end;
2157
2158   /* If we are reading, then we can return only a fixed amount of
2159      data, so if the conversion resulted in too much data, we store it
2160      here for retrieval the next time around. */
2161   unsigned_char_dynarr *runoff;
2162
2163   /* FLAGS holds flags indicating the current state of the decoding.
2164      Some of these flags are dependent on the coding system. */
2165   unsigned int flags;
2166
2167   /* CH holds a partially built-up character.  Since we only deal
2168      with one- and two-byte characters at the moment, we only use
2169      this to store the first byte of a two-byte character. */
2170   unsigned int ch;
2171
2172   /* EOL_TYPE specifies the type of end-of-line conversion that
2173      currently applies.  We need to keep this separate from the
2174      EOL type stored in CODESYS because the latter might indicate
2175      automatic EOL-type detection while the former will always
2176      indicate a particular EOL type. */
2177   eol_type_t eol_type;
2178 #ifdef MULE
2179   /* Additional ISO2022 information.  We define the structure above
2180      because it's also needed by the detection routines. */
2181   struct iso2022_decoder iso2022;
2182
2183   /* Additional information (the state of the running CCL program)
2184      used by the CCL decoder. */
2185   struct ccl_program ccl;
2186
2187   /* counter for UTF-8 or UCS-4 */
2188   unsigned char counter;
2189 #endif
2190 #ifdef UTF2000
2191   unsigned combined_char_count;
2192   Emchar combined_chars[16];
2193   Lisp_Object combining_table;
2194 #endif
2195   struct detection_state decst;
2196 };
2197
2198 #ifdef UTF2000
2199 extern Lisp_Object Vcharacter_composition_table;
2200
2201 INLINE_HEADER void
2202 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2203 INLINE_HEADER void
2204 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2205 {
2206   unsigned i;
2207
2208   for (i = 0; i < str->combined_char_count; i++)
2209     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2210   str->combined_char_count = 0;
2211   str->combining_table = Qnil;
2212 }
2213
2214 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2215                       unsigned_char_dynarr* dst);
2216 void
2217 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2218                  Emchar character, unsigned_char_dynarr* dst)
2219 {
2220   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2221     DECODE_ADD_UCS_CHAR (character, dst);
2222   else if (!CHAR_ID_TABLE_P (str->combining_table))
2223     {
2224       Lisp_Object ret
2225         = get_char_id_table (character, Vcharacter_composition_table);
2226
2227       if (NILP (ret))
2228         DECODE_ADD_UCS_CHAR (character, dst);
2229       else
2230         {
2231           str->combined_chars[0] = character;
2232           str->combined_char_count = 1;
2233           str->combining_table = ret;
2234         }
2235     }
2236   else
2237     {
2238       Lisp_Object ret
2239         = get_char_id_table (character, str->combining_table);
2240
2241       if (CHARP (ret))
2242         {
2243           Emchar char2 = XCHARVAL (ret);
2244           ret = get_char_id_table (char2, Vcharacter_composition_table);
2245           if (NILP (ret))
2246             {
2247               DECODE_ADD_UCS_CHAR (char2, dst);
2248               str->combined_char_count = 0;
2249               str->combining_table = Qnil;
2250             }
2251           else
2252             {
2253               str->combined_chars[0] = char2;
2254               str->combined_char_count = 1;
2255               str->combining_table = ret;
2256             }
2257         }
2258       else if (CHAR_ID_TABLE_P (ret))
2259         {
2260           str->combined_chars[str->combined_char_count++] = character;
2261           str->combining_table = ret;
2262         }
2263       else
2264         {
2265           COMPOSE_FLUSH_CHARS (str, dst);
2266           DECODE_ADD_UCS_CHAR (character, dst);
2267         }
2268     }
2269 }
2270 #else /* not UTF2000 */
2271 #define COMPOSE_FLUSH_CHARS(str, dst)
2272 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2273 #endif /* UTF2000 */
2274
2275 static ssize_t decoding_reader (Lstream *stream,
2276                                 unsigned char *data, size_t size);
2277 static ssize_t decoding_writer (Lstream *stream,
2278                                 const unsigned char *data, size_t size);
2279 static int decoding_rewinder   (Lstream *stream);
2280 static int decoding_seekable_p (Lstream *stream);
2281 static int decoding_flusher    (Lstream *stream);
2282 static int decoding_closer     (Lstream *stream);
2283
2284 static Lisp_Object decoding_marker (Lisp_Object stream);
2285
2286 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2287                                sizeof (struct decoding_stream));
2288
2289 static Lisp_Object
2290 decoding_marker (Lisp_Object stream)
2291 {
2292   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2293   Lisp_Object str_obj;
2294
2295   /* We do not need to mark the coding systems or charsets stored
2296      within the stream because they are stored in a global list
2297      and automatically marked. */
2298
2299   XSETLSTREAM (str_obj, str);
2300   mark_object (str_obj);
2301   if (str->imp->marker)
2302     return (str->imp->marker) (str_obj);
2303   else
2304     return Qnil;
2305 }
2306
2307 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2308    so we read data from the other end, decode it, and store it into DATA. */
2309
2310 static ssize_t
2311 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2312 {
2313   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2314   unsigned char *orig_data = data;
2315   ssize_t read_size;
2316   int error_occurred = 0;
2317
2318   /* We need to interface to mule_decode(), which expects to take some
2319      amount of data and store the result into a Dynarr.  We have
2320      mule_decode() store into str->runoff, and take data from there
2321      as necessary. */
2322
2323   /* We loop until we have enough data, reading chunks from the other
2324      end and decoding it. */
2325   while (1)
2326     {
2327       /* Take data from the runoff if we can.  Make sure to take at
2328          most SIZE bytes, and delete the data from the runoff. */
2329       if (Dynarr_length (str->runoff) > 0)
2330         {
2331           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2332           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2333           Dynarr_delete_many (str->runoff, 0, chunk);
2334           data += chunk;
2335           size -= chunk;
2336         }
2337
2338       if (size == 0)
2339         break; /* No more room for data */
2340
2341       if (str->flags & CODING_STATE_END)
2342         /* This means that on the previous iteration, we hit the EOF on
2343            the other end.  We loop once more so that mule_decode() can
2344            output any final stuff it may be holding, or any "go back
2345            to a sane state" escape sequences. (This latter makes sense
2346            during encoding.) */
2347         break;
2348
2349       /* Exhausted the runoff, so get some more.  DATA has at least
2350          SIZE bytes left of storage in it, so it's OK to read directly
2351          into it.  (We'll be overwriting above, after we've decoded it
2352          into the runoff.) */
2353       read_size = Lstream_read (str->other_end, data, size);
2354       if (read_size < 0)
2355         {
2356           error_occurred = 1;
2357           break;
2358         }
2359       if (read_size == 0)
2360         /* There might be some more end data produced in the translation.
2361            See the comment above. */
2362         str->flags |= CODING_STATE_END;
2363       mule_decode (stream, data, str->runoff, read_size);
2364     }
2365
2366   if (data - orig_data == 0)
2367     return error_occurred ? -1 : 0;
2368   else
2369     return data - orig_data;
2370 }
2371
2372 static ssize_t
2373 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2374 {
2375   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2376   ssize_t retval;
2377
2378   /* Decode all our data into the runoff, and then attempt to write
2379      it all out to the other end.  Remove whatever chunk we succeeded
2380      in writing. */
2381   mule_decode (stream, data, str->runoff, size);
2382   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2383                           Dynarr_length (str->runoff));
2384   if (retval > 0)
2385     Dynarr_delete_many (str->runoff, 0, retval);
2386   /* Do NOT return retval.  The return value indicates how much
2387      of the incoming data was written, not how many bytes were
2388      written. */
2389   return size;
2390 }
2391
2392 static void
2393 reset_decoding_stream (struct decoding_stream *str)
2394 {
2395 #ifdef MULE
2396   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2397     {
2398       Lisp_Object coding_system;
2399       XSETCODING_SYSTEM (coding_system, str->codesys);
2400       reset_iso2022 (coding_system, &str->iso2022);
2401     }
2402   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2403     {
2404       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2405     }
2406   str->counter = 0;
2407 #endif /* MULE */
2408 #ifdef UTF2000
2409   str->combined_char_count = 0;
2410   str->combining_table = Qnil;
2411 #endif
2412   str->flags = str->ch = 0;
2413 }
2414
2415 static int
2416 decoding_rewinder (Lstream *stream)
2417 {
2418   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2419   reset_decoding_stream (str);
2420   Dynarr_reset (str->runoff);
2421   return Lstream_rewind (str->other_end);
2422 }
2423
2424 static int
2425 decoding_seekable_p (Lstream *stream)
2426 {
2427   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2428   return Lstream_seekable_p (str->other_end);
2429 }
2430
2431 static int
2432 decoding_flusher (Lstream *stream)
2433 {
2434   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2435   return Lstream_flush (str->other_end);
2436 }
2437
2438 static int
2439 decoding_closer (Lstream *stream)
2440 {
2441   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2442   if (stream->flags & LSTREAM_FL_WRITE)
2443     {
2444       str->flags |= CODING_STATE_END;
2445       decoding_writer (stream, 0, 0);
2446     }
2447   Dynarr_free (str->runoff);
2448 #ifdef MULE
2449 #ifdef ENABLE_COMPOSITE_CHARS
2450   if (str->iso2022.composite_chars)
2451     Dynarr_free (str->iso2022.composite_chars);
2452 #endif
2453 #endif
2454   return Lstream_close (str->other_end);
2455 }
2456
2457 Lisp_Object
2458 decoding_stream_coding_system (Lstream *stream)
2459 {
2460   Lisp_Object coding_system;
2461   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2462
2463   XSETCODING_SYSTEM (coding_system, str->codesys);
2464   return subsidiary_coding_system (coding_system, str->eol_type);
2465 }
2466
2467 void
2468 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2469 {
2470   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2471   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2472   str->codesys = cs;
2473   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2474     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2475   reset_decoding_stream (str);
2476 }
2477
2478 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2479    stream for writing, no automatic code detection will be performed.
2480    The reason for this is that automatic code detection requires a
2481    seekable input.  Things will also fail if you open a decoding
2482    stream for reading using a non-fully-specified coding system and
2483    a non-seekable input stream. */
2484
2485 static Lisp_Object
2486 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2487                         const char *mode)
2488 {
2489   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2490   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2491   Lisp_Object obj;
2492
2493   xzero (*str);
2494   str->other_end = stream;
2495   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2496   str->eol_type = EOL_AUTODETECT;
2497   if (!strcmp (mode, "r")
2498       && Lstream_seekable_p (stream))
2499     /* We can determine the coding system now. */
2500     determine_real_coding_system (stream, &codesys, &str->eol_type);
2501   set_decoding_stream_coding_system (lstr, codesys);
2502   str->decst.eol_type = str->eol_type;
2503   str->decst.mask = ~0;
2504   XSETLSTREAM (obj, lstr);
2505   return obj;
2506 }
2507
2508 Lisp_Object
2509 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2510 {
2511   return make_decoding_stream_1 (stream, codesys, "r");
2512 }
2513
2514 Lisp_Object
2515 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2516 {
2517   return make_decoding_stream_1 (stream, codesys, "w");
2518 }
2519
2520 /* Note: the decode_coding_* functions all take the same
2521    arguments as mule_decode(), which is to say some SRC data of
2522    size N, which is to be stored into dynamic array DST.
2523    DECODING is the stream within which the decoding is
2524    taking place, but no data is actually read from or
2525    written to that stream; that is handled in decoding_reader()
2526    or decoding_writer().  This allows the same functions to
2527    be used for both reading and writing. */
2528
2529 static void
2530 mule_decode (Lstream *decoding, const unsigned char *src,
2531              unsigned_char_dynarr *dst, unsigned int n)
2532 {
2533   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2534
2535   /* If necessary, do encoding-detection now.  We do this when
2536      we're a writing stream or a non-seekable reading stream,
2537      meaning that we can't just process the whole input,
2538      rewind, and start over. */
2539
2540   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2541       str->eol_type == EOL_AUTODETECT)
2542     {
2543       Lisp_Object codesys;
2544
2545       XSETCODING_SYSTEM (codesys, str->codesys);
2546       detect_coding_type (&str->decst, src, n,
2547                           CODING_SYSTEM_TYPE (str->codesys) !=
2548                           CODESYS_AUTODETECT);
2549       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2550           str->decst.mask != ~0)
2551         /* #### This is cheesy.  What we really ought to do is
2552            buffer up a certain amount of data so as to get a
2553            less random result. */
2554         codesys = coding_system_from_mask (str->decst.mask);
2555       str->eol_type = str->decst.eol_type;
2556       if (XCODING_SYSTEM (codesys) != str->codesys)
2557         {
2558           /* Preserve the CODING_STATE_END flag in case it was set.
2559              If we erase it, bad things might happen. */
2560           int was_end = str->flags & CODING_STATE_END;
2561           set_decoding_stream_coding_system (decoding, codesys);
2562           if (was_end)
2563             str->flags |= CODING_STATE_END;
2564         }
2565     }
2566
2567   switch (CODING_SYSTEM_TYPE (str->codesys))
2568     {
2569 #ifdef DEBUG_XEMACS
2570     case CODESYS_INTERNAL:
2571       Dynarr_add_many (dst, src, n);
2572       break;
2573 #endif
2574     case CODESYS_AUTODETECT:
2575       /* If we got this far and still haven't decided on the coding
2576          system, then do no conversion. */
2577     case CODESYS_NO_CONVERSION:
2578       decode_coding_no_conversion (decoding, src, dst, n);
2579       break;
2580 #ifdef MULE
2581     case CODESYS_SHIFT_JIS:
2582       decode_coding_sjis (decoding, src, dst, n);
2583       break;
2584     case CODESYS_BIG5:
2585       decode_coding_big5 (decoding, src, dst, n);
2586       break;
2587     case CODESYS_UCS4:
2588       decode_coding_ucs4 (decoding, src, dst, n);
2589       break;
2590     case CODESYS_UTF8:
2591       decode_coding_utf8 (decoding, src, dst, n);
2592       break;
2593     case CODESYS_CCL:
2594       str->ccl.last_block = str->flags & CODING_STATE_END;
2595       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2596       break;
2597     case CODESYS_ISO2022:
2598       decode_coding_iso2022 (decoding, src, dst, n);
2599       break;
2600 #endif /* MULE */
2601     default:
2602       abort ();
2603     }
2604 }
2605
2606 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2607 Decode the text between START and END which is encoded in CODING-SYSTEM.
2608 This is useful if you've read in encoded text from a file without decoding
2609 it (e.g. you read in a JIS-formatted file but used the `binary' or
2610 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2611 Return length of decoded text.
2612 BUFFER defaults to the current buffer if unspecified.
2613 */
2614        (start, end, coding_system, buffer))
2615 {
2616   Bufpos b, e;
2617   struct buffer *buf = decode_buffer (buffer, 0);
2618   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2619   Lstream *istr, *ostr;
2620   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2621
2622   get_buffer_range_char (buf, start, end, &b, &e, 0);
2623
2624   barf_if_buffer_read_only (buf, b, e);
2625
2626   coding_system = Fget_coding_system (coding_system);
2627   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2628   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2629   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2630                                               coding_system);
2631   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2632                                            Fget_coding_system (Qbinary));
2633   istr = XLSTREAM (instream);
2634   ostr = XLSTREAM (outstream);
2635   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2636
2637   /* The chain of streams looks like this:
2638
2639      [BUFFER] <----- send through
2640                      ------> [ENCODE AS BINARY]
2641                              ------> [DECODE AS SPECIFIED]
2642                                      ------> [BUFFER]
2643    */
2644
2645   while (1)
2646     {
2647       char tempbuf[1024]; /* some random amount */
2648       Bufpos newpos, even_newer_pos;
2649       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2650       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2651
2652       if (!size_in_bytes)
2653         break;
2654       newpos = lisp_buffer_stream_startpos (istr);
2655       Lstream_write (ostr, tempbuf, size_in_bytes);
2656       even_newer_pos = lisp_buffer_stream_startpos (istr);
2657       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2658                            even_newer_pos, 0);
2659     }
2660   Lstream_close (istr);
2661   Lstream_close (ostr);
2662   UNGCPRO;
2663   Lstream_delete (istr);
2664   Lstream_delete (ostr);
2665   Lstream_delete (XLSTREAM (de_outstream));
2666   Lstream_delete (XLSTREAM (lb_outstream));
2667   return Qnil;
2668 }
2669
2670 \f
2671 /************************************************************************/
2672 /*           Converting to an external encoding ("encoding")            */
2673 /************************************************************************/
2674
2675 /* An encoding stream is an output stream.  When you create the
2676    stream, you specify the coding system that governs the encoding
2677    and another stream that the resulting encoded data is to be
2678    sent to, and then start sending data to it. */
2679
2680 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2681
2682 struct encoding_stream
2683 {
2684   /* Coding system that governs the conversion. */
2685   Lisp_Coding_System *codesys;
2686
2687   /* Stream that we read the encoded data from or
2688      write the decoded data to. */
2689   Lstream *other_end;
2690
2691   /* If we are reading, then we can return only a fixed amount of
2692      data, so if the conversion resulted in too much data, we store it
2693      here for retrieval the next time around. */
2694   unsigned_char_dynarr *runoff;
2695
2696   /* FLAGS holds flags indicating the current state of the encoding.
2697      Some of these flags are dependent on the coding system. */
2698   unsigned int flags;
2699
2700   /* CH holds a partially built-up character.  Since we only deal
2701      with one- and two-byte characters at the moment, we only use
2702      this to store the first byte of a two-byte character. */
2703   unsigned int ch;
2704 #ifdef MULE
2705   /* Additional information used by the ISO2022 encoder. */
2706   struct
2707     {
2708       /* CHARSET holds the character sets currently assigned to the G0
2709          through G3 registers.  It is initialized from the array
2710          INITIAL_CHARSET in CODESYS. */
2711       Lisp_Object charset[4];
2712
2713       /* Which registers are currently invoked into the left (GL) and
2714          right (GR) halves of the 8-bit encoding space? */
2715       int register_left, register_right;
2716
2717       /* Whether we need to explicitly designate the charset in the
2718          G? register before using it.  It is initialized from the
2719          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2720       unsigned char force_charset_on_output[4];
2721
2722       /* Other state variables that need to be preserved across
2723          invocations. */
2724       Lisp_Object current_charset;
2725       int current_half;
2726       int current_char_boundary;
2727     } iso2022;
2728
2729   void (*encode_char) (struct encoding_stream *str, Emchar c,
2730                        unsigned_char_dynarr *dst, unsigned int *flags);
2731   void (*finish) (struct encoding_stream *str,
2732                   unsigned_char_dynarr *dst, unsigned int *flags);
2733
2734   /* Additional information (the state of the running CCL program)
2735      used by the CCL encoder. */
2736   struct ccl_program ccl;
2737 #endif /* MULE */
2738 };
2739
2740 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2741 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2742                                 size_t size);
2743 static int encoding_rewinder   (Lstream *stream);
2744 static int encoding_seekable_p (Lstream *stream);
2745 static int encoding_flusher    (Lstream *stream);
2746 static int encoding_closer     (Lstream *stream);
2747
2748 static Lisp_Object encoding_marker (Lisp_Object stream);
2749
2750 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2751                                sizeof (struct encoding_stream));
2752
2753 static Lisp_Object
2754 encoding_marker (Lisp_Object stream)
2755 {
2756   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2757   Lisp_Object str_obj;
2758
2759   /* We do not need to mark the coding systems or charsets stored
2760      within the stream because they are stored in a global list
2761      and automatically marked. */
2762
2763   XSETLSTREAM (str_obj, str);
2764   mark_object (str_obj);
2765   if (str->imp->marker)
2766     return (str->imp->marker) (str_obj);
2767   else
2768     return Qnil;
2769 }
2770
2771 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2772    so we read data from the other end, encode it, and store it into DATA. */
2773
2774 static ssize_t
2775 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2776 {
2777   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2778   unsigned char *orig_data = data;
2779   ssize_t read_size;
2780   int error_occurred = 0;
2781
2782   /* We need to interface to mule_encode(), which expects to take some
2783      amount of data and store the result into a Dynarr.  We have
2784      mule_encode() store into str->runoff, and take data from there
2785      as necessary. */
2786
2787   /* We loop until we have enough data, reading chunks from the other
2788      end and encoding it. */
2789   while (1)
2790     {
2791       /* Take data from the runoff if we can.  Make sure to take at
2792          most SIZE bytes, and delete the data from the runoff. */
2793       if (Dynarr_length (str->runoff) > 0)
2794         {
2795           int chunk = min ((int) size, Dynarr_length (str->runoff));
2796           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2797           Dynarr_delete_many (str->runoff, 0, chunk);
2798           data += chunk;
2799           size -= chunk;
2800         }
2801
2802       if (size == 0)
2803         break; /* No more room for data */
2804
2805       if (str->flags & CODING_STATE_END)
2806         /* This means that on the previous iteration, we hit the EOF on
2807            the other end.  We loop once more so that mule_encode() can
2808            output any final stuff it may be holding, or any "go back
2809            to a sane state" escape sequences. (This latter makes sense
2810            during encoding.) */
2811         break;
2812
2813       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2814          left of storage in it, so it's OK to read directly into it.
2815          (We'll be overwriting above, after we've encoded it into the
2816          runoff.) */
2817       read_size = Lstream_read (str->other_end, data, size);
2818       if (read_size < 0)
2819         {
2820           error_occurred = 1;
2821           break;
2822         }
2823       if (read_size == 0)
2824         /* There might be some more end data produced in the translation.
2825            See the comment above. */
2826         str->flags |= CODING_STATE_END;
2827       mule_encode (stream, data, str->runoff, read_size);
2828     }
2829
2830   if (data == orig_data)
2831     return error_occurred ? -1 : 0;
2832   else
2833     return data - orig_data;
2834 }
2835
2836 static ssize_t
2837 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2838 {
2839   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2840   ssize_t retval;
2841
2842   /* Encode all our data into the runoff, and then attempt to write
2843      it all out to the other end.  Remove whatever chunk we succeeded
2844      in writing. */
2845   mule_encode (stream, data, str->runoff, size);
2846   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2847                           Dynarr_length (str->runoff));
2848   if (retval > 0)
2849     Dynarr_delete_many (str->runoff, 0, retval);
2850   /* Do NOT return retval.  The return value indicates how much
2851      of the incoming data was written, not how many bytes were
2852      written. */
2853   return size;
2854 }
2855
2856 static void
2857 reset_encoding_stream (struct encoding_stream *str)
2858 {
2859 #ifdef MULE
2860   switch (CODING_SYSTEM_TYPE (str->codesys))
2861     {
2862     case CODESYS_ISO2022:
2863       {
2864         int i;
2865
2866         str->encode_char = &char_encode_iso2022;
2867         str->finish = &char_finish_iso2022;
2868         for (i = 0; i < 4; i++)
2869           {
2870             str->iso2022.charset[i] =
2871               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2872             str->iso2022.force_charset_on_output[i] =
2873               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2874           }
2875         str->iso2022.register_left = 0;
2876         str->iso2022.register_right = 1;
2877         str->iso2022.current_charset = Qnil;
2878         str->iso2022.current_half = 0;
2879         break;
2880       }
2881     case CODESYS_CCL:
2882       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2883       break;
2884     case CODESYS_UTF8:
2885       str->encode_char = &char_encode_utf8;
2886       str->finish = &char_finish_utf8;
2887       break;
2888     case CODESYS_UCS4:
2889       str->encode_char = &char_encode_ucs4;
2890       str->finish = &char_finish_ucs4;
2891       break;
2892     case CODESYS_SHIFT_JIS:
2893       str->encode_char = &char_encode_shift_jis;
2894       str->finish = &char_finish_shift_jis;
2895       break;
2896     default:
2897       break;
2898     }
2899 #endif /* MULE */
2900   str->iso2022.current_char_boundary = 0;
2901   str->flags = str->ch = 0;
2902 }
2903
2904 static int
2905 encoding_rewinder (Lstream *stream)
2906 {
2907   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2908   reset_encoding_stream (str);
2909   Dynarr_reset (str->runoff);
2910   return Lstream_rewind (str->other_end);
2911 }
2912
2913 static int
2914 encoding_seekable_p (Lstream *stream)
2915 {
2916   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2917   return Lstream_seekable_p (str->other_end);
2918 }
2919
2920 static int
2921 encoding_flusher (Lstream *stream)
2922 {
2923   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2924   return Lstream_flush (str->other_end);
2925 }
2926
2927 static int
2928 encoding_closer (Lstream *stream)
2929 {
2930   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2931   if (stream->flags & LSTREAM_FL_WRITE)
2932     {
2933       str->flags |= CODING_STATE_END;
2934       encoding_writer (stream, 0, 0);
2935     }
2936   Dynarr_free (str->runoff);
2937   return Lstream_close (str->other_end);
2938 }
2939
2940 Lisp_Object
2941 encoding_stream_coding_system (Lstream *stream)
2942 {
2943   Lisp_Object coding_system;
2944   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2945
2946   XSETCODING_SYSTEM (coding_system, str->codesys);
2947   return coding_system;
2948 }
2949
2950 void
2951 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2952 {
2953   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2954   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2955   str->codesys = cs;
2956   reset_encoding_stream (str);
2957 }
2958
2959 static Lisp_Object
2960 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2961                         const char *mode)
2962 {
2963   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2964   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2965   Lisp_Object obj;
2966
2967   xzero (*str);
2968   str->runoff = Dynarr_new (unsigned_char);
2969   str->other_end = stream;
2970   set_encoding_stream_coding_system (lstr, codesys);
2971   XSETLSTREAM (obj, lstr);
2972   return obj;
2973 }
2974
2975 Lisp_Object
2976 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2977 {
2978   return make_encoding_stream_1 (stream, codesys, "r");
2979 }
2980
2981 Lisp_Object
2982 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2983 {
2984   return make_encoding_stream_1 (stream, codesys, "w");
2985 }
2986
2987 /* Convert N bytes of internally-formatted data stored in SRC to an
2988    external format, according to the encoding stream ENCODING.
2989    Store the encoded data into DST. */
2990
2991 static void
2992 mule_encode (Lstream *encoding, const unsigned char *src,
2993              unsigned_char_dynarr *dst, unsigned int n)
2994 {
2995   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2996
2997   switch (CODING_SYSTEM_TYPE (str->codesys))
2998     {
2999 #ifdef DEBUG_XEMACS
3000     case CODESYS_INTERNAL:
3001       Dynarr_add_many (dst, src, n);
3002       break;
3003 #endif
3004     case CODESYS_AUTODETECT:
3005       /* If we got this far and still haven't decided on the coding
3006          system, then do no conversion. */
3007     case CODESYS_NO_CONVERSION:
3008       encode_coding_no_conversion (encoding, src, dst, n);
3009       break;
3010 #ifdef MULE
3011     case CODESYS_BIG5:
3012       encode_coding_big5 (encoding, src, dst, n);
3013       break;
3014     case CODESYS_CCL:
3015       str->ccl.last_block = str->flags & CODING_STATE_END;
3016       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
3017       break;
3018 #endif /* MULE */
3019     default:
3020       text_encode_generic (encoding, src, dst, n);
3021     }
3022 }
3023
3024 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3025 Encode the text between START and END using CODING-SYSTEM.
3026 This will, for example, convert Japanese characters into stuff such as
3027 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3028 text.  BUFFER defaults to the current buffer if unspecified.
3029 */
3030        (start, end, coding_system, buffer))
3031 {
3032   Bufpos b, e;
3033   struct buffer *buf = decode_buffer (buffer, 0);
3034   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3035   Lstream *istr, *ostr;
3036   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3037
3038   get_buffer_range_char (buf, start, end, &b, &e, 0);
3039
3040   barf_if_buffer_read_only (buf, b, e);
3041
3042   coding_system = Fget_coding_system (coding_system);
3043   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3044   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3045   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3046                                               Fget_coding_system (Qbinary));
3047   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3048                                            coding_system);
3049   istr = XLSTREAM (instream);
3050   ostr = XLSTREAM (outstream);
3051   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3052   /* The chain of streams looks like this:
3053
3054      [BUFFER] <----- send through
3055                      ------> [ENCODE AS SPECIFIED]
3056                              ------> [DECODE AS BINARY]
3057                                      ------> [BUFFER]
3058    */
3059   while (1)
3060     {
3061       char tempbuf[1024]; /* some random amount */
3062       Bufpos newpos, even_newer_pos;
3063       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3064       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3065
3066       if (!size_in_bytes)
3067         break;
3068       newpos = lisp_buffer_stream_startpos (istr);
3069       Lstream_write (ostr, tempbuf, size_in_bytes);
3070       even_newer_pos = lisp_buffer_stream_startpos (istr);
3071       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3072                            even_newer_pos, 0);
3073     }
3074
3075   {
3076     Charcount retlen =
3077       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3078     Lstream_close (istr);
3079     Lstream_close (ostr);
3080     UNGCPRO;
3081     Lstream_delete (istr);
3082     Lstream_delete (ostr);
3083     Lstream_delete (XLSTREAM (de_outstream));
3084     Lstream_delete (XLSTREAM (lb_outstream));
3085     return make_int (retlen);
3086   }
3087 }
3088
3089 #ifdef MULE
3090 \f
3091 static void
3092 text_encode_generic (Lstream *encoding, const unsigned char *src,
3093                      unsigned_char_dynarr *dst, unsigned int n)
3094 {
3095   unsigned char c;
3096   unsigned char char_boundary;
3097   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3098   unsigned int flags          = str->flags;
3099   Emchar ch                   = str->ch;
3100
3101   char_boundary = str->iso2022.current_char_boundary;
3102
3103   while (n--)
3104     {
3105       c = *src++;
3106
3107       if (char_boundary == 0)
3108         {
3109           if (c >= 0xfc)
3110             {
3111               ch = c & 0x01;
3112               char_boundary = 5;
3113             }
3114           else if (c >= 0xf8)
3115             {
3116               ch = c & 0x03;
3117               char_boundary = 4;
3118             }
3119           else if (c >= 0xf0)
3120             {
3121               ch = c & 0x07;
3122               char_boundary = 3;
3123             }
3124           else if (c >= 0xe0)
3125             {
3126               ch = c & 0x0f;
3127               char_boundary = 2;
3128             }
3129           else if (c >= 0xc0)
3130             {
3131               ch = c & 0x1f;
3132               char_boundary = 1;
3133             }
3134           else
3135             (*str->encode_char) (str, c, dst, &flags);
3136         }
3137       else if (char_boundary == 1)
3138         {
3139           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3140           ch =0;
3141           char_boundary = 0;
3142         }
3143       else
3144         {
3145           ch = (ch << 6) | (c & 0x3f);
3146           char_boundary--;
3147         }
3148     }
3149
3150   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3151     {
3152       (*str->finish) (str, dst, &flags);
3153     }
3154
3155   str->flags = flags;
3156   str->ch    = ch;
3157   str->iso2022.current_char_boundary = char_boundary;
3158 }
3159
3160 \f
3161 /************************************************************************/
3162 /*                          Shift-JIS methods                           */
3163 /************************************************************************/
3164
3165 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3166    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3167    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3168    encoded by "position-code + 0x80".  A character of JISX0208
3169    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3170    position-codes are divided and shifted so that it fit in the range
3171    below.
3172
3173    --- CODE RANGE of Shift-JIS ---
3174    (character set)      (range)
3175    ASCII                0x00 .. 0x7F
3176    JISX0201-Kana        0xA0 .. 0xDF
3177    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3178             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3179    -------------------------------
3180
3181 */
3182
3183 /* Is this the first byte of a Shift-JIS two-byte char? */
3184
3185 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3186   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3187
3188 /* Is this the second byte of a Shift-JIS two-byte char? */
3189
3190 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3191   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3192
3193 #define BYTE_SJIS_KATAKANA_P(c) \
3194   ((c) >= 0xA1 && (c) <= 0xDF)
3195
3196 static int
3197 detect_coding_sjis (struct detection_state *st, const unsigned char *src,
3198                     unsigned int n)
3199 {
3200   int c;
3201
3202   while (n--)
3203     {
3204       c = *src++;
3205       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3206         return 0;
3207       if (st->shift_jis.in_second_byte)
3208         {
3209           st->shift_jis.in_second_byte = 0;
3210           if (c < 0x40)
3211             return 0;
3212         }
3213       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3214         st->shift_jis.in_second_byte = 1;
3215     }
3216   return CODING_CATEGORY_SHIFT_JIS_MASK;
3217 }
3218
3219 /* Convert Shift-JIS data to internal format. */
3220
3221 static void
3222 decode_coding_sjis (Lstream *decoding, const unsigned char *src,
3223                     unsigned_char_dynarr *dst, unsigned int n)
3224 {
3225   unsigned char c;
3226   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3227   unsigned int flags  = str->flags;
3228   unsigned int ch     = str->ch;
3229   eol_type_t eol_type = str->eol_type;
3230
3231   while (n--)
3232     {
3233       c = *src++;
3234
3235       if (ch)
3236         {
3237           /* Previous character was first byte of Shift-JIS Kanji char. */
3238           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3239             {
3240               unsigned char e1, e2;
3241
3242               DECODE_SJIS (ch, c, e1, e2);
3243 #ifdef UTF2000
3244               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3245                                             e1 & 0x7F,
3246                                             e2 & 0x7F), dst);
3247 #else
3248               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3249               Dynarr_add (dst, e1);
3250               Dynarr_add (dst, e2);
3251 #endif
3252             }
3253           else
3254             {
3255               DECODE_ADD_BINARY_CHAR (ch, dst);
3256               DECODE_ADD_BINARY_CHAR (c, dst);
3257             }
3258           ch = 0;
3259         }
3260       else
3261         {
3262           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3263           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3264             ch = c;
3265           else if (BYTE_SJIS_KATAKANA_P (c))
3266             {
3267 #ifdef UTF2000
3268               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3269                                             c & 0x7F, 0), dst);
3270 #else
3271               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3272               Dynarr_add (dst, c);
3273 #endif
3274             }
3275 #ifdef UTF2000
3276           else if (c > 32)
3277             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3278                                           c, 0), dst);
3279 #endif
3280           else
3281             DECODE_ADD_BINARY_CHAR (c, dst);
3282         }
3283     label_continue_loop:;
3284     }
3285
3286   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3287
3288   str->flags = flags;
3289   str->ch    = ch;
3290 }
3291
3292 /* Convert internal character representation to Shift_JIS. */
3293
3294 void
3295 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3296                        unsigned_char_dynarr *dst, unsigned int *flags)
3297 {
3298   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3299
3300   if (ch == '\n')
3301     {
3302       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3303         Dynarr_add (dst, '\r');
3304       if (eol_type != EOL_CR)
3305         Dynarr_add (dst, ch);
3306     }
3307   else
3308     {
3309       Lisp_Object charset;
3310       unsigned int c1, c2, s1, s2;
3311 #ifdef UTF2000
3312       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3313
3314       if (code_point >= 0)
3315         {
3316           charset = Vcharset_latin_jisx0201;
3317           c1 = code_point;
3318           c2 = 0;
3319         }
3320       else
3321 #endif
3322         BREAKUP_CHAR (ch, charset, c1, c2);
3323
3324       if (EQ(charset, Vcharset_katakana_jisx0201))
3325         {
3326           Dynarr_add (dst, c1 | 0x80);
3327         }
3328       else if (c2 == 0)
3329         {
3330           Dynarr_add (dst, c1);
3331         }
3332       else if (EQ(charset, Vcharset_japanese_jisx0208))
3333         {
3334           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3335           Dynarr_add (dst, s1);
3336           Dynarr_add (dst, s2);
3337         }
3338       else
3339         Dynarr_add (dst, '?');
3340     }
3341 }
3342
3343 void
3344 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3345                        unsigned int *flags)
3346 {
3347 }
3348
3349 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3350 Decode a JISX0208 character of Shift-JIS coding-system.
3351 CODE is the character code in Shift-JIS as a cons of type bytes.
3352 Return the corresponding character.
3353 */
3354        (code))
3355 {
3356   unsigned char c1, c2, s1, s2;
3357
3358   CHECK_CONS (code);
3359   CHECK_INT (XCAR (code));
3360   CHECK_INT (XCDR (code));
3361   s1 = XINT (XCAR (code));
3362   s2 = XINT (XCDR (code));
3363   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3364       BYTE_SJIS_TWO_BYTE_2_P (s2))
3365     {
3366       DECODE_SJIS (s1, s2, c1, c2);
3367       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3368                                    c1 & 0x7F, c2 & 0x7F));
3369     }
3370   else
3371     return Qnil;
3372 }
3373
3374 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3375 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3376 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3377 */
3378        (ch))
3379 {
3380   Lisp_Object charset;
3381   int c1, c2, s1, s2;
3382
3383   CHECK_CHAR_COERCE_INT (ch);
3384   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3385   if (EQ (charset, Vcharset_japanese_jisx0208))
3386     {
3387       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3388       return Fcons (make_int (s1), make_int (s2));
3389     }
3390   else
3391     return Qnil;
3392 }
3393
3394 \f
3395 /************************************************************************/
3396 /*                            Big5 methods                              */
3397 /************************************************************************/
3398
3399 /* BIG5 is a coding system encoding two character sets: ASCII and
3400    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3401    character set and is encoded in two-byte.
3402
3403    --- CODE RANGE of BIG5 ---
3404    (character set)      (range)
3405    ASCII                0x00 .. 0x7F
3406    Big5 (1st byte)      0xA1 .. 0xFE
3407         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3408    --------------------------
3409
3410    Since the number of characters in Big5 is larger than maximum
3411    characters in Emacs' charset (96x96), it can't be handled as one
3412    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3413    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3414    contains frequently used characters and the latter contains less
3415    frequently used characters.  */
3416
3417 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3418   ((c) >= 0xA1 && (c) <= 0xFE)
3419
3420 /* Is this the second byte of a Shift-JIS two-byte char? */
3421
3422 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3423   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3424
3425 /* Number of Big5 characters which have the same code in 1st byte.  */
3426
3427 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3428
3429 /* Code conversion macros.  These are macros because they are used in
3430    inner loops during code conversion.
3431
3432    Note that temporary variables in macros introduce the classic
3433    dynamic-scoping problems with variable names.  We use capital-
3434    lettered variables in the assumption that XEmacs does not use
3435    capital letters in variables except in a very formalized way
3436    (e.g. Qstring). */
3437
3438 /* Convert Big5 code (b1, b2) into its internal string representation
3439    (lb, c1, c2). */
3440
3441 /* There is a much simpler way to split the Big5 charset into two.
3442    For the moment I'm going to leave the algorithm as-is because it
3443    claims to separate out the most-used characters into a single
3444    charset, which perhaps will lead to optimizations in various
3445    places.
3446
3447    The way the algorithm works is something like this:
3448
3449    Big5 can be viewed as a 94x157 charset, where the row is
3450    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3451    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3452    the split between low and high column numbers is apparently
3453    meaningless; ascending rows produce less and less frequent chars.
3454    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3455    the first charset, and the upper half (0xC9 .. 0xFE) to the
3456    second.  To do the conversion, we convert the character into
3457    a single number where 0 .. 156 is the first row, 157 .. 313
3458    is the second, etc.  That way, the characters are ordered by
3459    decreasing frequency.  Then we just chop the space in two
3460    and coerce the result into a 94x94 space.
3461    */
3462
3463 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3464 {                                                                       \
3465   int B1 = b1, B2 = b2;                                                 \
3466   unsigned int I                                                        \
3467     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3468                                                                         \
3469   if (B1 < 0xC9)                                                        \
3470     {                                                                   \
3471       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3472     }                                                                   \
3473   else                                                                  \
3474     {                                                                   \
3475       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3476       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3477     }                                                                   \
3478   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3479   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3480 } while (0)
3481
3482 /* Convert the internal string representation of a Big5 character
3483    (lb, c1, c2) into Big5 code (b1, b2). */
3484
3485 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3486 {                                                                       \
3487   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3488                                                                         \
3489   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3490     {                                                                   \
3491       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3492     }                                                                   \
3493   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3494   b2 = I % BIG5_SAME_ROW;                                               \
3495   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3496 } while (0)
3497
3498 static int
3499 detect_coding_big5 (struct detection_state *st, const unsigned char *src,
3500                     unsigned int n)
3501 {
3502   int c;
3503
3504   while (n--)
3505     {
3506       c = *src++;
3507       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3508           (c >= 0x80 && c <= 0xA0))
3509         return 0;
3510       if (st->big5.in_second_byte)
3511         {
3512           st->big5.in_second_byte = 0;
3513           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3514             return 0;
3515         }
3516       else if (c >= 0xA1)
3517         st->big5.in_second_byte = 1;
3518     }
3519   return CODING_CATEGORY_BIG5_MASK;
3520 }
3521
3522 /* Convert Big5 data to internal format. */
3523
3524 static void
3525 decode_coding_big5 (Lstream *decoding, const unsigned char *src,
3526                     unsigned_char_dynarr *dst, unsigned int n)
3527 {
3528   unsigned char c;
3529   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3530   unsigned int flags  = str->flags;
3531   unsigned int ch     = str->ch;
3532   eol_type_t eol_type = str->eol_type;
3533
3534   while (n--)
3535     {
3536       c = *src++;
3537       if (ch)
3538         {
3539           /* Previous character was first byte of Big5 char. */
3540           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3541             {
3542               unsigned char b1, b2, b3;
3543               DECODE_BIG5 (ch, c, b1, b2, b3);
3544               Dynarr_add (dst, b1);
3545               Dynarr_add (dst, b2);
3546               Dynarr_add (dst, b3);
3547             }
3548           else
3549             {
3550               DECODE_ADD_BINARY_CHAR (ch, dst);
3551               DECODE_ADD_BINARY_CHAR (c, dst);
3552             }
3553           ch = 0;
3554         }
3555       else
3556         {
3557           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3558           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3559             ch = c;
3560           else
3561             DECODE_ADD_BINARY_CHAR (c, dst);
3562         }
3563     label_continue_loop:;
3564     }
3565
3566   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3567
3568   str->flags = flags;
3569   str->ch    = ch;
3570 }
3571
3572 /* Convert internally-formatted data to Big5. */
3573
3574 static void
3575 encode_coding_big5 (Lstream *encoding, const unsigned char *src,
3576                     unsigned_char_dynarr *dst, unsigned int n)
3577 {
3578 #ifndef UTF2000
3579   unsigned char c;
3580   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3581   unsigned int flags  = str->flags;
3582   unsigned int ch     = str->ch;
3583   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3584
3585   while (n--)
3586     {
3587       c = *src++;
3588       if (c == '\n')
3589         {
3590           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3591             Dynarr_add (dst, '\r');
3592           if (eol_type != EOL_CR)
3593             Dynarr_add (dst, '\n');
3594         }
3595       else if (BYTE_ASCII_P (c))
3596         {
3597           /* ASCII. */
3598           Dynarr_add (dst, c);
3599         }
3600       else if (BUFBYTE_LEADING_BYTE_P (c))
3601         {
3602           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3603               c == LEADING_BYTE_CHINESE_BIG5_2)
3604             {
3605               /* A recognized leading byte. */
3606               ch = c;
3607               continue; /* not done with this character. */
3608             }
3609           /* otherwise just ignore this character. */
3610         }
3611       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3612                ch == LEADING_BYTE_CHINESE_BIG5_2)
3613         {
3614           /* Previous char was a recognized leading byte. */
3615           ch = (ch << 8) | c;
3616           continue; /* not done with this character. */
3617         }
3618       else if (ch)
3619         {
3620           /* Encountering second byte of a Big5 character. */
3621           unsigned char b1, b2;
3622
3623           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3624           Dynarr_add (dst, b1);
3625           Dynarr_add (dst, b2);
3626         }
3627
3628       ch = 0;
3629     }
3630
3631   str->flags = flags;
3632   str->ch    = ch;
3633 #endif
3634 }
3635
3636
3637 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3638 Decode a Big5 character CODE of BIG5 coding-system.
3639 CODE is the character code in BIG5, a cons of two integers.
3640 Return the corresponding character.
3641 */
3642        (code))
3643 {
3644   unsigned char c1, c2, b1, b2;
3645
3646   CHECK_CONS (code);
3647   CHECK_INT (XCAR (code));
3648   CHECK_INT (XCDR (code));
3649   b1 = XINT (XCAR (code));
3650   b2 = XINT (XCDR (code));
3651   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3652       BYTE_BIG5_TWO_BYTE_2_P (b2))
3653     {
3654       Charset_ID leading_byte;
3655       Lisp_Object charset;
3656       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3657       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3658       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3659     }
3660   else
3661     return Qnil;
3662 }
3663
3664 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3665 Encode the Big5 character CH to BIG5 coding-system.
3666 Return the corresponding character code in Big5.
3667 */
3668        (ch))
3669 {
3670   Lisp_Object charset;
3671   int c1, c2, b1, b2;
3672
3673   CHECK_CHAR_COERCE_INT (ch);
3674   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3675   if (EQ (charset, Vcharset_chinese_big5_1) ||
3676       EQ (charset, Vcharset_chinese_big5_2))
3677     {
3678       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3679                    b1, b2);
3680       return Fcons (make_int (b1), make_int (b2));
3681     }
3682   else
3683     return Qnil;
3684 }
3685
3686 \f
3687 /************************************************************************/
3688 /*                           UCS-4 methods                              */
3689 /************************************************************************/
3690
3691 static int
3692 detect_coding_ucs4 (struct detection_state *st, const unsigned char *src,
3693                     unsigned int n)
3694 {
3695   while (n--)
3696     {
3697       int c = *src++;
3698       switch (st->ucs4.in_byte)
3699         {
3700         case 0:
3701           if (c >= 128)
3702             return 0;
3703           else
3704             st->ucs4.in_byte++;
3705           break;
3706         case 3:
3707           st->ucs4.in_byte = 0;
3708           break;
3709         default:
3710           st->ucs4.in_byte++;
3711         }
3712     }
3713   return CODING_CATEGORY_UCS4_MASK;
3714 }
3715
3716 static void
3717 decode_coding_ucs4 (Lstream *decoding, const unsigned char *src,
3718                     unsigned_char_dynarr *dst, unsigned int n)
3719 {
3720   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3721   unsigned int flags = str->flags;
3722   unsigned int ch    = str->ch;
3723   unsigned char counter = str->counter;
3724
3725   while (n--)
3726     {
3727       unsigned char c = *src++;
3728       switch (counter)
3729         {
3730         case 0:
3731           ch = c;
3732           counter = 3;
3733           break;
3734         case 1:
3735           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3736           ch = 0;
3737           counter = 0;
3738           break;
3739         default:
3740           ch = ( ch << 8 ) | c;
3741           counter--;
3742         }
3743     }
3744   if (counter & CODING_STATE_END)
3745     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3746
3747   str->flags = flags;
3748   str->ch    = ch;
3749   str->counter = counter;
3750 }
3751
3752 void
3753 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3754                   unsigned_char_dynarr *dst, unsigned int *flags)
3755 {
3756   Dynarr_add (dst, ch >> 24);
3757   Dynarr_add (dst, ch >> 16);
3758   Dynarr_add (dst, ch >>  8);
3759   Dynarr_add (dst, ch      );
3760 }
3761
3762 void
3763 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3764                   unsigned int *flags)
3765 {
3766 }
3767
3768 \f
3769 /************************************************************************/
3770 /*                           UTF-8 methods                              */
3771 /************************************************************************/
3772
3773 static int
3774 detect_coding_utf8 (struct detection_state *st, const unsigned char *src,
3775                     unsigned int n)
3776 {
3777   while (n--)
3778     {
3779       unsigned char c = *src++;
3780       switch (st->utf8.in_byte)
3781         {
3782         case 0:
3783           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3784             return 0;
3785           else if (c >= 0xfc)
3786             st->utf8.in_byte = 5;
3787           else if (c >= 0xf8)
3788             st->utf8.in_byte = 4;
3789           else if (c >= 0xf0)
3790             st->utf8.in_byte = 3;
3791           else if (c >= 0xe0)
3792             st->utf8.in_byte = 2;
3793           else if (c >= 0xc0)
3794             st->utf8.in_byte = 1;
3795           else if (c >= 0x80)
3796             return 0;
3797           break;
3798         default:
3799           if ((c & 0xc0) != 0x80)
3800             return 0;
3801           else
3802             st->utf8.in_byte--;
3803         }
3804     }
3805   return CODING_CATEGORY_UTF8_MASK;
3806 }
3807
3808 static void
3809 decode_coding_utf8 (Lstream *decoding, const unsigned char *src,
3810                     unsigned_char_dynarr *dst, unsigned int n)
3811 {
3812   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3813   unsigned int flags  = str->flags;
3814   unsigned int ch     = str->ch;
3815   eol_type_t eol_type = str->eol_type;
3816   unsigned char counter = str->counter;
3817
3818   while (n--)
3819     {
3820       unsigned char c = *src++;
3821       switch (counter)
3822         {
3823         case 0:
3824           if ( c >= 0xfc )
3825             {
3826               ch = c & 0x01;
3827               counter = 5;
3828             }
3829           else if ( c >= 0xf8 )
3830             {
3831               ch = c & 0x03;
3832               counter = 4;
3833             }
3834           else if ( c >= 0xf0 )
3835             {
3836               ch = c & 0x07;
3837               counter = 3;
3838             }
3839           else if ( c >= 0xe0 )
3840             {
3841               ch = c & 0x0f;
3842               counter = 2;
3843             }
3844           else if ( c >= 0xc0 )
3845             {
3846               ch = c & 0x1f;
3847               counter = 1;
3848             }
3849           else
3850             {
3851               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3852               DECODE_ADD_UCS_CHAR (c, dst);
3853             }
3854           break;
3855         case 1:
3856           ch = ( ch << 6 ) | ( c & 0x3f );
3857           DECODE_ADD_UCS_CHAR (ch, dst);
3858           ch = 0;
3859           counter = 0;
3860           break;
3861         default:
3862           ch = ( ch << 6 ) | ( c & 0x3f );
3863           counter--;
3864         }
3865     label_continue_loop:;
3866     }
3867
3868   if (flags & CODING_STATE_END)
3869     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3870
3871   str->flags = flags;
3872   str->ch    = ch;
3873   str->counter = counter;
3874 }
3875
3876 void
3877 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3878                   unsigned_char_dynarr *dst, unsigned int *flags)
3879 {
3880   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3881
3882   if (ch == '\n')
3883     {
3884       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3885         Dynarr_add (dst, '\r');
3886       if (eol_type != EOL_CR)
3887         Dynarr_add (dst, ch);
3888     }
3889   else if (ch <= 0x7f)
3890     {
3891       Dynarr_add (dst, ch);
3892     }
3893   else if (ch <= 0x7ff)
3894     {
3895       Dynarr_add (dst, (ch >> 6) | 0xc0);
3896       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3897     }
3898   else if (ch <= 0xffff)
3899     {
3900       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3901       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3902       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3903     }
3904   else if (ch <= 0x1fffff)
3905     {
3906       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3907       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3908       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3909       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3910     }
3911   else if (ch <= 0x3ffffff)
3912     {
3913       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3914       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3915       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3916       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3917       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3918     }
3919   else
3920     {
3921       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3922       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3923       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3924       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3925       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3926       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3927     }
3928 }
3929
3930 void
3931 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3932                   unsigned int *flags)
3933 {
3934 }
3935
3936 \f
3937 /************************************************************************/
3938 /*                           ISO2022 methods                            */
3939 /************************************************************************/
3940
3941 /* The following note describes the coding system ISO2022 briefly.
3942    Since the intention of this note is to help understand the
3943    functions in this file, some parts are NOT ACCURATE or OVERLY
3944    SIMPLIFIED.  For thorough understanding, please refer to the
3945    original document of ISO2022.
3946
3947    ISO2022 provides many mechanisms to encode several character sets
3948    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3949    is encoded using bytes less than 128.  This may make the encoded
3950    text a little bit longer, but the text passes more easily through
3951    several gateways, some of which strip off MSB (Most Signigant Bit).
3952
3953    There are two kinds of character sets: control character set and
3954    graphic character set.  The former contains control characters such
3955    as `newline' and `escape' to provide control functions (control
3956    functions are also provided by escape sequences).  The latter
3957    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3958    two control character sets and many graphic character sets.
3959
3960    Graphic character sets are classified into one of the following
3961    four classes, according to the number of bytes (DIMENSION) and
3962    number of characters in one dimension (CHARS) of the set:
3963    - DIMENSION1_CHARS94
3964    - DIMENSION1_CHARS96
3965    - DIMENSION2_CHARS94
3966    - DIMENSION2_CHARS96
3967
3968    In addition, each character set is assigned an identification tag,
3969    unique for each set, called "final character" (denoted as <F>
3970    hereafter).  The <F> of each character set is decided by ECMA(*)
3971    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3972    (0x30..0x3F are for private use only).
3973
3974    Note (*): ECMA = European Computer Manufacturers Association
3975
3976    Here are examples of graphic character set [NAME(<F>)]:
3977         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3978         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3979         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3980         o DIMENSION2_CHARS96 -- none for the moment
3981
3982    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3983         C0 [0x00..0x1F] -- control character plane 0
3984         GL [0x20..0x7F] -- graphic character plane 0
3985         C1 [0x80..0x9F] -- control character plane 1
3986         GR [0xA0..0xFF] -- graphic character plane 1
3987
3988    A control character set is directly designated and invoked to C0 or
3989    C1 by an escape sequence.  The most common case is that:
3990    - ISO646's  control character set is designated/invoked to C0, and
3991    - ISO6429's control character set is designated/invoked to C1,
3992    and usually these designations/invocations are omitted in encoded
3993    text.  In a 7-bit environment, only C0 can be used, and a control
3994    character for C1 is encoded by an appropriate escape sequence to
3995    fit into the environment.  All control characters for C1 are
3996    defined to have corresponding escape sequences.
3997
3998    A graphic character set is at first designated to one of four
3999    graphic registers (G0 through G3), then these graphic registers are
4000    invoked to GL or GR.  These designations and invocations can be
4001    done independently.  The most common case is that G0 is invoked to
4002    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4003    these invocations and designations are omitted in encoded text.
4004    In a 7-bit environment, only GL can be used.
4005
4006    When a graphic character set of CHARS94 is invoked to GL, codes
4007    0x20 and 0x7F of the GL area work as control characters SPACE and
4008    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4009    be used.
4010
4011    There are two ways of invocation: locking-shift and single-shift.
4012    With locking-shift, the invocation lasts until the next different
4013    invocation, whereas with single-shift, the invocation affects the
4014    following character only and doesn't affect the locking-shift
4015    state.  Invocations are done by the following control characters or
4016    escape sequences:
4017
4018    ----------------------------------------------------------------------
4019    abbrev  function                  cntrl escape seq   description
4020    ----------------------------------------------------------------------
4021    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4022    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4023    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4024    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4025    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4026    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4027    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4028    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4029    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4030    ----------------------------------------------------------------------
4031    (*) These are not used by any known coding system.
4032
4033    Control characters for these functions are defined by macros
4034    ISO_CODE_XXX in `coding.h'.
4035
4036    Designations are done by the following escape sequences:
4037    ----------------------------------------------------------------------
4038    escape sequence      description
4039    ----------------------------------------------------------------------
4040    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4041    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4042    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4043    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4044    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4045    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4046    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4047    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4048    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4049    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4050    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4051    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4052    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4053    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4054    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4055    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4056    ----------------------------------------------------------------------
4057
4058    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4059    of dimension 1, chars 94, and final character <F>, etc...
4060
4061    Note (*): Although these designations are not allowed in ISO2022,
4062    Emacs accepts them on decoding, and produces them on encoding
4063    CHARS96 character sets in a coding system which is characterized as
4064    7-bit environment, non-locking-shift, and non-single-shift.
4065
4066    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4067    '(' can be omitted.  We refer to this as "short-form" hereafter.
4068
4069    Now you may notice that there are a lot of ways for encoding the
4070    same multilingual text in ISO2022.  Actually, there exist many
4071    coding systems such as Compound Text (used in X11's inter client
4072    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4073    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4074    localized platforms), and all of these are variants of ISO2022.
4075
4076    In addition to the above, Emacs handles two more kinds of escape
4077    sequences: ISO6429's direction specification and Emacs' private
4078    sequence for specifying character composition.
4079
4080    ISO6429's direction specification takes the following form:
4081         o CSI ']'      -- end of the current direction
4082         o CSI '0' ']'  -- end of the current direction
4083         o CSI '1' ']'  -- start of left-to-right text
4084         o CSI '2' ']'  -- start of right-to-left text
4085    The control character CSI (0x9B: control sequence introducer) is
4086    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4087
4088    Character composition specification takes the following form:
4089         o ESC '0' -- start character composition
4090         o ESC '1' -- end character composition
4091    Since these are not standard escape sequences of any ISO standard,
4092    their use with these meanings is restricted to Emacs only.  */
4093
4094 static void
4095 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4096 {
4097   int i;
4098
4099   for (i = 0; i < 4; i++)
4100     {
4101       if (!NILP (coding_system))
4102         iso->charset[i] =
4103           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4104       else
4105         iso->charset[i] = Qt;
4106       iso->invalid_designated[i] = 0;
4107     }
4108   iso->esc = ISO_ESC_NOTHING;
4109   iso->esc_bytes_index = 0;
4110   iso->register_left = 0;
4111   iso->register_right = 1;
4112   iso->switched_dir_and_no_valid_charset_yet = 0;
4113   iso->invalid_switch_dir = 0;
4114   iso->output_direction_sequence = 0;
4115   iso->output_literally = 0;
4116 #ifdef ENABLE_COMPOSITE_CHARS
4117   if (iso->composite_chars)
4118     Dynarr_reset (iso->composite_chars);
4119 #endif
4120 }
4121
4122 static int
4123 fit_to_be_escape_quoted (unsigned char c)
4124 {
4125   switch (c)
4126     {
4127     case ISO_CODE_ESC:
4128     case ISO_CODE_CSI:
4129     case ISO_CODE_SS2:
4130     case ISO_CODE_SS3:
4131     case ISO_CODE_SO:
4132     case ISO_CODE_SI:
4133       return 1;
4134
4135     default:
4136       return 0;
4137     }
4138 }
4139
4140 /* Parse one byte of an ISO2022 escape sequence.
4141    If the result is an invalid escape sequence, return 0 and
4142    do not change anything in STR.  Otherwise, if the result is
4143    an incomplete escape sequence, update ISO2022.ESC and
4144    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4145    all the state variables (but not ISO2022.ESC_BYTES) and
4146    return 1.
4147
4148    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4149    or invocation of an invalid character set and treat that as
4150    an unrecognized escape sequence. */
4151
4152 static int
4153 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4154                    unsigned char c, unsigned int *flags,
4155                    int check_invalid_charsets)
4156 {
4157   /* (1) If we're at the end of a designation sequence, CS is the
4158      charset being designated and REG is the register to designate
4159      it to.
4160
4161      (2) If we're at the end of a locking-shift sequence, REG is
4162      the register to invoke and HALF (0 == left, 1 == right) is
4163      the half to invoke it into.
4164
4165      (3) If we're at the end of a single-shift sequence, REG is
4166      the register to invoke. */
4167   Lisp_Object cs = Qnil;
4168   int reg, half;
4169
4170   /* NOTE: This code does goto's all over the fucking place.
4171      The reason for this is that we're basically implementing
4172      a state machine here, and hierarchical languages like C
4173      don't really provide a clean way of doing this. */
4174
4175   if (! (*flags & CODING_STATE_ESCAPE))
4176     /* At beginning of escape sequence; we need to reset our
4177        escape-state variables. */
4178     iso->esc = ISO_ESC_NOTHING;
4179
4180   iso->output_literally = 0;
4181   iso->output_direction_sequence = 0;
4182
4183   switch (iso->esc)
4184     {
4185     case ISO_ESC_NOTHING:
4186       iso->esc_bytes_index = 0;
4187       switch (c)
4188         {
4189         case ISO_CODE_ESC:      /* Start escape sequence */
4190           *flags |= CODING_STATE_ESCAPE;
4191           iso->esc = ISO_ESC;
4192           goto not_done;
4193
4194         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4195           *flags |= CODING_STATE_ESCAPE;
4196           iso->esc = ISO_ESC_5_11;
4197           goto not_done;
4198
4199         case ISO_CODE_SO:       /* locking shift 1 */
4200           reg = 1; half = 0;
4201           goto locking_shift;
4202         case ISO_CODE_SI:       /* locking shift 0 */
4203           reg = 0; half = 0;
4204           goto locking_shift;
4205
4206         case ISO_CODE_SS2:      /* single shift */
4207           reg = 2;
4208           goto single_shift;
4209         case ISO_CODE_SS3:      /* single shift */
4210           reg = 3;
4211           goto single_shift;
4212
4213         default:                        /* Other control characters */
4214           return 0;
4215         }
4216
4217     case ISO_ESC:
4218       switch (c)
4219         {
4220           /**** single shift ****/
4221
4222         case 'N':       /* single shift 2 */
4223           reg = 2;
4224           goto single_shift;
4225         case 'O':       /* single shift 3 */
4226           reg = 3;
4227           goto single_shift;
4228
4229           /**** locking shift ****/
4230
4231         case '~':       /* locking shift 1 right */
4232           reg = 1; half = 1;
4233           goto locking_shift;
4234         case 'n':       /* locking shift 2 */
4235           reg = 2; half = 0;
4236           goto locking_shift;
4237         case '}':       /* locking shift 2 right */
4238           reg = 2; half = 1;
4239           goto locking_shift;
4240         case 'o':       /* locking shift 3 */
4241           reg = 3; half = 0;
4242           goto locking_shift;
4243         case '|':       /* locking shift 3 right */
4244           reg = 3; half = 1;
4245           goto locking_shift;
4246
4247 #ifdef ENABLE_COMPOSITE_CHARS
4248           /**** composite ****/
4249
4250         case '0':
4251           iso->esc = ISO_ESC_START_COMPOSITE;
4252           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4253             CODING_STATE_COMPOSITE;
4254           return 1;
4255
4256         case '1':
4257           iso->esc = ISO_ESC_END_COMPOSITE;
4258           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4259             ~CODING_STATE_COMPOSITE;
4260           return 1;
4261 #endif /* ENABLE_COMPOSITE_CHARS */
4262
4263           /**** directionality ****/
4264
4265         case '[':
4266           iso->esc = ISO_ESC_5_11;
4267           goto not_done;
4268
4269           /**** designation ****/
4270
4271         case '$':       /* multibyte charset prefix */
4272           iso->esc = ISO_ESC_2_4;
4273           goto not_done;
4274
4275         default:
4276           if (0x28 <= c && c <= 0x2F)
4277             {
4278               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4279               goto not_done;
4280             }
4281
4282           /* This function is called with CODESYS equal to nil when
4283              doing coding-system detection. */
4284           if (!NILP (codesys)
4285               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4286               && fit_to_be_escape_quoted (c))
4287             {
4288               iso->esc = ISO_ESC_LITERAL;
4289               *flags &= CODING_STATE_ISO2022_LOCK;
4290               return 1;
4291             }
4292
4293           /* bzzzt! */
4294           return 0;
4295         }
4296
4297
4298
4299       /**** directionality ****/
4300
4301     case ISO_ESC_5_11:          /* ISO6429 direction control */
4302       if (c == ']')
4303         {
4304           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4305           goto directionality;
4306         }
4307       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4308       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4309       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4310       else               return 0;
4311       goto not_done;
4312
4313     case ISO_ESC_5_11_0:
4314       if (c == ']')
4315         {
4316           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4317           goto directionality;
4318         }
4319       return 0;
4320
4321     case ISO_ESC_5_11_1:
4322       if (c == ']')
4323         {
4324           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4325           goto directionality;
4326         }
4327       return 0;
4328
4329     case ISO_ESC_5_11_2:
4330       if (c == ']')
4331         {
4332           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4333           goto directionality;
4334         }
4335       return 0;
4336
4337     directionality:
4338       iso->esc = ISO_ESC_DIRECTIONALITY;
4339       /* Various junk here to attempt to preserve the direction sequences
4340          literally in the text if they would otherwise be swallowed due
4341          to invalid designations that don't show up as actual charset
4342          changes in the text. */
4343       if (iso->invalid_switch_dir)
4344         {
4345           /* We already inserted a direction switch literally into the
4346              text.  We assume (#### this may not be right) that the
4347              next direction switch is the one going the other way,
4348              and we need to output that literally as well. */
4349           iso->output_literally = 1;
4350           iso->invalid_switch_dir = 0;
4351         }
4352       else
4353         {
4354           int jj;
4355
4356           /* If we are in the thrall of an invalid designation,
4357            then stick the directionality sequence literally into the
4358            output stream so it ends up in the original text again. */
4359           for (jj = 0; jj < 4; jj++)
4360             if (iso->invalid_designated[jj])
4361               break;
4362           if (jj < 4)
4363             {
4364               iso->output_literally = 1;
4365               iso->invalid_switch_dir = 1;
4366             }
4367           else
4368             /* Indicate that we haven't yet seen a valid designation,
4369                so that if a switch-dir is directly followed by an
4370                invalid designation, both get inserted literally. */
4371             iso->switched_dir_and_no_valid_charset_yet = 1;
4372         }
4373       return 1;
4374
4375
4376       /**** designation ****/
4377
4378     case ISO_ESC_2_4:
4379       if (0x28 <= c && c <= 0x2F)
4380         {
4381           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4382           goto not_done;
4383         }
4384       if (0x40 <= c && c <= 0x42)
4385         {
4386           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4387                                       *flags & CODING_STATE_R2L ?
4388                                       CHARSET_RIGHT_TO_LEFT :
4389                                       CHARSET_LEFT_TO_RIGHT);
4390           reg = 0;
4391           goto designated;
4392         }
4393       return 0;
4394
4395     default:
4396       {
4397         int type =-1;
4398
4399         if (c < '0' || c > '~')
4400           return 0; /* bad final byte */
4401
4402         if (iso->esc >= ISO_ESC_2_8 &&
4403             iso->esc <= ISO_ESC_2_15)
4404           {
4405             type = ((iso->esc >= ISO_ESC_2_12) ?
4406                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4407             reg = (iso->esc - ISO_ESC_2_8) & 3;
4408           }
4409         else if (iso->esc >= ISO_ESC_2_4_8 &&
4410                  iso->esc <= ISO_ESC_2_4_15)
4411           {
4412             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4413                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4414             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4415           }
4416         else
4417           {
4418             /* Can this ever be reached? -slb */
4419             abort();
4420           }
4421
4422         cs = CHARSET_BY_ATTRIBUTES (type, c,
4423                                     *flags & CODING_STATE_R2L ?
4424                                     CHARSET_RIGHT_TO_LEFT :
4425                                     CHARSET_LEFT_TO_RIGHT);
4426         goto designated;
4427       }
4428     }
4429
4430  not_done:
4431   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4432   return -1;
4433
4434  single_shift:
4435   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4436     /* can't invoke something that ain't there. */
4437     return 0;
4438   iso->esc = ISO_ESC_SINGLE_SHIFT;
4439   *flags &= CODING_STATE_ISO2022_LOCK;
4440   if (reg == 2)
4441     *flags |= CODING_STATE_SS2;
4442   else
4443     *flags |= CODING_STATE_SS3;
4444   return 1;
4445
4446  locking_shift:
4447   if (check_invalid_charsets &&
4448       !CHARSETP (iso->charset[reg]))
4449     /* can't invoke something that ain't there. */
4450     return 0;
4451   if (half)
4452     iso->register_right = reg;
4453   else
4454     iso->register_left = reg;
4455   *flags &= CODING_STATE_ISO2022_LOCK;
4456   iso->esc = ISO_ESC_LOCKING_SHIFT;
4457   return 1;
4458
4459  designated:
4460   if (NILP (cs) && check_invalid_charsets)
4461     {
4462       iso->invalid_designated[reg] = 1;
4463       iso->charset[reg] = Vcharset_ascii;
4464       iso->esc = ISO_ESC_DESIGNATE;
4465       *flags &= CODING_STATE_ISO2022_LOCK;
4466       iso->output_literally = 1;
4467       if (iso->switched_dir_and_no_valid_charset_yet)
4468         {
4469           /* We encountered a switch-direction followed by an
4470              invalid designation.  Ensure that the switch-direction
4471              gets outputted; otherwise it will probably get eaten
4472              when the text is written out again. */
4473           iso->switched_dir_and_no_valid_charset_yet = 0;
4474           iso->output_direction_sequence = 1;
4475           /* And make sure that the switch-dir going the other
4476              way gets outputted, as well. */
4477           iso->invalid_switch_dir = 1;
4478         }
4479       return 1;
4480     }
4481   /* This function is called with CODESYS equal to nil when
4482      doing coding-system detection. */
4483   if (!NILP (codesys))
4484     {
4485       charset_conversion_spec_dynarr *dyn =
4486         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4487
4488       if (dyn)
4489         {
4490           int i;
4491
4492           for (i = 0; i < Dynarr_length (dyn); i++)
4493             {
4494               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4495               if (EQ (cs, spec->from_charset))
4496                 cs = spec->to_charset;
4497             }
4498         }
4499     }
4500
4501   iso->charset[reg] = cs;
4502   iso->esc = ISO_ESC_DESIGNATE;
4503   *flags &= CODING_STATE_ISO2022_LOCK;
4504   if (iso->invalid_designated[reg])
4505     {
4506       iso->invalid_designated[reg] = 0;
4507       iso->output_literally = 1;
4508     }
4509   if (iso->switched_dir_and_no_valid_charset_yet)
4510     iso->switched_dir_and_no_valid_charset_yet = 0;
4511   return 1;
4512 }
4513
4514 static int
4515 detect_coding_iso2022 (struct detection_state *st, const unsigned char *src,
4516                        unsigned int n)
4517 {
4518   int mask;
4519
4520   /* #### There are serious deficiencies in the recognition mechanism
4521      here.  This needs to be much smarter if it's going to cut it.
4522      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4523      it should be detected as Latin-1.
4524      All the ISO2022 stuff in this file should be synced up with the
4525      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4526      Perhaps we should wait till R2L works in FSF Emacs? */
4527
4528   if (!st->iso2022.initted)
4529     {
4530       reset_iso2022 (Qnil, &st->iso2022.iso);
4531       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4532                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4533                           CODING_CATEGORY_ISO_8_1_MASK |
4534                           CODING_CATEGORY_ISO_8_2_MASK |
4535                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4536       st->iso2022.flags = 0;
4537       st->iso2022.high_byte_count = 0;
4538       st->iso2022.saw_single_shift = 0;
4539       st->iso2022.initted = 1;
4540     }
4541
4542   mask = st->iso2022.mask;
4543
4544   while (n--)
4545     {
4546       int c = *src++;
4547       if (c >= 0xA0)
4548         {
4549           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4550           st->iso2022.high_byte_count++;
4551         }
4552       else
4553         {
4554           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4555             {
4556               if (st->iso2022.high_byte_count & 1)
4557                 /* odd number of high bytes; assume not iso-8-2 */
4558                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4559             }
4560           st->iso2022.high_byte_count = 0;
4561           st->iso2022.saw_single_shift = 0;
4562           if (c > 0x80)
4563             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4564         }
4565       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4566           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4567         { /* control chars */
4568           switch (c)
4569             {
4570               /* Allow and ignore control characters that you might
4571                  reasonably see in a text file */
4572             case '\r':
4573             case '\n':
4574             case '\t':
4575             case  7: /* bell */
4576             case  8: /* backspace */
4577             case 11: /* vertical tab */
4578             case 12: /* form feed */
4579             case 26: /* MS-DOS C-z junk */
4580             case 31: /* '^_' -- for info */
4581               goto label_continue_loop;
4582
4583             default:
4584               break;
4585             }
4586         }
4587
4588       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4589           || BYTE_C1_P (c))
4590         {
4591           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4592                                  &st->iso2022.flags, 0))
4593             {
4594               switch (st->iso2022.iso.esc)
4595                 {
4596                 case ISO_ESC_DESIGNATE:
4597                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4598                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4599                   break;
4600                 case ISO_ESC_LOCKING_SHIFT:
4601                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4602                   goto ran_out_of_chars;
4603                 case ISO_ESC_SINGLE_SHIFT:
4604                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4605                   st->iso2022.saw_single_shift = 1;
4606                   break;
4607                 default:
4608                   break;
4609                 }
4610             }
4611           else
4612             {
4613               mask = 0;
4614               goto ran_out_of_chars;
4615             }
4616         }
4617     label_continue_loop:;
4618     }
4619
4620  ran_out_of_chars:
4621
4622   return mask;
4623 }
4624
4625 static int
4626 postprocess_iso2022_mask (int mask)
4627 {
4628   /* #### kind of cheesy */
4629   /* If seven-bit ISO is allowed, then assume that the encoding is
4630      entirely seven-bit and turn off the eight-bit ones. */
4631   if (mask & CODING_CATEGORY_ISO_7_MASK)
4632     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4633                CODING_CATEGORY_ISO_8_1_MASK |
4634                CODING_CATEGORY_ISO_8_2_MASK);
4635   return mask;
4636 }
4637
4638 /* If FLAGS is a null pointer or specifies right-to-left motion,
4639    output a switch-dir-to-left-to-right sequence to DST.
4640    Also update FLAGS if it is not a null pointer.
4641    If INTERNAL_P is set, we are outputting in internal format and
4642    need to handle the CSI differently. */
4643
4644 static void
4645 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4646                                  unsigned_char_dynarr *dst,
4647                                  unsigned int *flags,
4648                                  int internal_p)
4649 {
4650   if (!flags || (*flags & CODING_STATE_R2L))
4651     {
4652       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4653         {
4654           Dynarr_add (dst, ISO_CODE_ESC);
4655           Dynarr_add (dst, '[');
4656         }
4657       else if (internal_p)
4658         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4659       else
4660         Dynarr_add (dst, ISO_CODE_CSI);
4661       Dynarr_add (dst, '0');
4662       Dynarr_add (dst, ']');
4663       if (flags)
4664         *flags &= ~CODING_STATE_R2L;
4665     }
4666 }
4667
4668 /* If FLAGS is a null pointer or specifies a direction different from
4669    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4670    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4671    sequence to DST.  Also update FLAGS if it is not a null pointer.
4672    If INTERNAL_P is set, we are outputting in internal format and
4673    need to handle the CSI differently. */
4674
4675 static void
4676 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4677                           unsigned_char_dynarr *dst, unsigned int *flags,
4678                           int internal_p)
4679 {
4680   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4681       direction == CHARSET_LEFT_TO_RIGHT)
4682     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4683   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4684            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4685            direction == CHARSET_RIGHT_TO_LEFT)
4686     {
4687       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4688         {
4689           Dynarr_add (dst, ISO_CODE_ESC);
4690           Dynarr_add (dst, '[');
4691         }
4692       else if (internal_p)
4693         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4694       else
4695         Dynarr_add (dst, ISO_CODE_CSI);
4696       Dynarr_add (dst, '2');
4697       Dynarr_add (dst, ']');
4698       if (flags)
4699         *flags |= CODING_STATE_R2L;
4700     }
4701 }
4702
4703 /* Convert ISO2022-format data to internal format. */
4704
4705 static void
4706 decode_coding_iso2022 (Lstream *decoding, const unsigned char *src,
4707                        unsigned_char_dynarr *dst, unsigned int n)
4708 {
4709   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4710   unsigned int flags  = str->flags;
4711   unsigned int ch     = str->ch;
4712   eol_type_t eol_type = str->eol_type;
4713 #ifdef ENABLE_COMPOSITE_CHARS
4714   unsigned_char_dynarr *real_dst = dst;
4715 #endif
4716   Lisp_Object coding_system;
4717
4718   XSETCODING_SYSTEM (coding_system, str->codesys);
4719
4720 #ifdef ENABLE_COMPOSITE_CHARS
4721   if (flags & CODING_STATE_COMPOSITE)
4722     dst = str->iso2022.composite_chars;
4723 #endif /* ENABLE_COMPOSITE_CHARS */
4724
4725   while (n--)
4726     {
4727       unsigned char c = *src++;
4728       if (flags & CODING_STATE_ESCAPE)
4729         {       /* Within ESC sequence */
4730           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4731                                           c, &flags, 1);
4732
4733           if (retval)
4734             {
4735               switch (str->iso2022.esc)
4736                 {
4737 #ifdef ENABLE_COMPOSITE_CHARS
4738                 case ISO_ESC_START_COMPOSITE:
4739                   if (str->iso2022.composite_chars)
4740                     Dynarr_reset (str->iso2022.composite_chars);
4741                   else
4742                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4743                   dst = str->iso2022.composite_chars;
4744                   break;
4745                 case ISO_ESC_END_COMPOSITE:
4746                   {
4747                     Bufbyte comstr[MAX_EMCHAR_LEN];
4748                     Bytecount len;
4749                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4750                                                          Dynarr_length (dst));
4751                     dst = real_dst;
4752                     len = set_charptr_emchar (comstr, emch);
4753                     Dynarr_add_many (dst, comstr, len);
4754                     break;
4755                   }
4756 #endif /* ENABLE_COMPOSITE_CHARS */
4757
4758                 case ISO_ESC_LITERAL:
4759                   COMPOSE_FLUSH_CHARS (str, dst);
4760                   DECODE_ADD_BINARY_CHAR (c, dst);
4761                   break;
4762
4763                 default:
4764                   /* Everything else handled already */
4765                   break;
4766                 }
4767             }
4768
4769           /* Attempted error recovery. */
4770           if (str->iso2022.output_direction_sequence)
4771             ensure_correct_direction (flags & CODING_STATE_R2L ?
4772                                       CHARSET_RIGHT_TO_LEFT :
4773                                       CHARSET_LEFT_TO_RIGHT,
4774                                       str->codesys, dst, 0, 1);
4775           /* More error recovery. */
4776           if (!retval || str->iso2022.output_literally)
4777             {
4778               /* Output the (possibly invalid) sequence */
4779               int i;
4780               COMPOSE_FLUSH_CHARS (str, dst);
4781               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4782                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4783               flags &= CODING_STATE_ISO2022_LOCK;
4784               if (!retval)
4785                 n++, src--;/* Repeat the loop with the same character. */
4786               else
4787                 {
4788                   /* No sense in reprocessing the final byte of the
4789                      escape sequence; it could mess things up anyway.
4790                      Just add it now. */
4791                   COMPOSE_FLUSH_CHARS (str, dst);
4792                   DECODE_ADD_BINARY_CHAR (c, dst);
4793                 }
4794             }
4795           ch = 0;
4796         }
4797       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4798         { /* Control characters */
4799
4800           /***** Error-handling *****/
4801
4802           /* If we were in the middle of a character, dump out the
4803              partial character. */
4804           if (ch)
4805             {
4806               COMPOSE_FLUSH_CHARS (str, dst);
4807               DECODE_ADD_BINARY_CHAR (ch, dst);
4808               ch = 0;
4809             }
4810
4811           /* If we just saw a single-shift character, dump it out.
4812              This may dump out the wrong sort of single-shift character,
4813              but least it will give an indication that something went
4814              wrong. */
4815           if (flags & CODING_STATE_SS2)
4816             {
4817               COMPOSE_FLUSH_CHARS (str, dst);
4818               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4819               flags &= ~CODING_STATE_SS2;
4820             }
4821           if (flags & CODING_STATE_SS3)
4822             {
4823               COMPOSE_FLUSH_CHARS (str, dst);
4824               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4825               flags &= ~CODING_STATE_SS3;
4826             }
4827
4828           /***** Now handle the control characters. *****/
4829
4830           /* Handle CR/LF */
4831 #ifdef UTF2000
4832           if (c == '\r')
4833             {
4834               COMPOSE_FLUSH_CHARS (str, dst);
4835               if (eol_type == EOL_CR)
4836                 Dynarr_add (dst, '\n');
4837               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4838                 Dynarr_add (dst, c);
4839               else
4840                 flags |= CODING_STATE_CR;
4841               goto label_continue_loop;
4842             }
4843           else if (flags & CODING_STATE_CR)
4844             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4845               if (c != '\n')
4846                 Dynarr_add (dst, '\r');
4847               flags &= ~CODING_STATE_CR;
4848             }
4849 #else
4850           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4851 #endif
4852
4853           flags &= CODING_STATE_ISO2022_LOCK;
4854
4855           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4856             {
4857               COMPOSE_FLUSH_CHARS (str, dst);
4858               DECODE_ADD_BINARY_CHAR (c, dst);
4859             }
4860         }
4861       else
4862         {                       /* Graphic characters */
4863           Lisp_Object charset;
4864 #ifndef UTF2000
4865           Charset_ID lb;
4866 #endif
4867           int reg;
4868
4869 #ifdef UTF2000
4870           if (c == '\r')
4871             {
4872               COMPOSE_FLUSH_CHARS (str, dst);
4873               if (eol_type == EOL_CR)
4874                 Dynarr_add (dst, '\n');
4875               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4876                 Dynarr_add (dst, c);
4877               else
4878                 flags |= CODING_STATE_CR;
4879               goto label_continue_loop;
4880             }
4881           else if (flags & CODING_STATE_CR)
4882             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4883               if (c != '\n')
4884                 Dynarr_add (dst, '\r');
4885               flags &= ~CODING_STATE_CR;
4886             }
4887 #else
4888           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4889 #endif
4890
4891           /* Now determine the charset. */
4892           reg = ((flags & CODING_STATE_SS2) ? 2
4893                  : (flags & CODING_STATE_SS3) ? 3
4894                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4895                  : str->iso2022.register_left);
4896           charset = str->iso2022.charset[reg];
4897
4898           /* Error checking: */
4899           if (! CHARSETP (charset)
4900               || str->iso2022.invalid_designated[reg]
4901               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4902                   && XCHARSET_CHARS (charset) == 94))
4903             /* Mrmph.  We are trying to invoke a register that has no
4904                or an invalid charset in it, or trying to add a character
4905                outside the range of the charset.  Insert that char literally
4906                to preserve it for the output. */
4907             {
4908               COMPOSE_FLUSH_CHARS (str, dst);
4909               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4910               DECODE_ADD_BINARY_CHAR (c, dst);
4911             }
4912
4913           else
4914             {
4915               /* Things are probably hunky-dorey. */
4916
4917               /* Fetch reverse charset, maybe. */
4918               if (((flags & CODING_STATE_R2L) &&
4919                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4920                   ||
4921                   (!(flags & CODING_STATE_R2L) &&
4922                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4923                 {
4924                   Lisp_Object new_charset =
4925                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4926                   if (!NILP (new_charset))
4927                     charset = new_charset;
4928                 }
4929
4930 #ifdef UTF2000
4931               if (XCHARSET_DIMENSION (charset) == 1)
4932                 {
4933                   if (ch)
4934                     {
4935                       COMPOSE_FLUSH_CHARS (str, dst);
4936                       DECODE_ADD_BINARY_CHAR (ch, dst);
4937                       ch = 0;
4938                     }
4939                   COMPOSE_ADD_CHAR (str,
4940                                     MAKE_CHAR (charset, c & 0x7F, 0), dst);
4941                 }
4942               else if (ch)
4943                 {
4944                   COMPOSE_ADD_CHAR (str,
4945                                     MAKE_CHAR (charset, ch & 0x7F, c & 0x7F),
4946                                     dst);
4947                   ch = 0;
4948                 }
4949               else
4950                 ch = c;
4951 #else
4952               lb = XCHARSET_LEADING_BYTE (charset);
4953               switch (XCHARSET_REP_BYTES (charset))
4954                 {
4955                 case 1: /* ASCII */
4956                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4957                   Dynarr_add (dst, c & 0x7F);
4958                   break;
4959
4960                 case 2: /* one-byte official */
4961                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4962                   Dynarr_add (dst, lb);
4963                   Dynarr_add (dst, c | 0x80);
4964                   break;
4965
4966                 case 3: /* one-byte private or two-byte official */
4967                   if (XCHARSET_PRIVATE_P (charset))
4968                     {
4969                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4970                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4971                       Dynarr_add (dst, lb);
4972                       Dynarr_add (dst, c | 0x80);
4973                     }
4974                   else
4975                     {
4976                       if (ch)
4977                         {
4978                           Dynarr_add (dst, lb);
4979                           Dynarr_add (dst, ch | 0x80);
4980                           Dynarr_add (dst, c | 0x80);
4981                           ch = 0;
4982                         }
4983                       else
4984                         ch = c;
4985                     }
4986                   break;
4987
4988                 default:        /* two-byte private */
4989                   if (ch)
4990                     {
4991                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4992                       Dynarr_add (dst, lb);
4993                       Dynarr_add (dst, ch | 0x80);
4994                       Dynarr_add (dst, c | 0x80);
4995                       ch = 0;
4996                     }
4997                   else
4998                     ch = c;
4999                 }
5000 #endif
5001             }
5002
5003           if (!ch)
5004             flags &= CODING_STATE_ISO2022_LOCK;
5005         }
5006
5007     label_continue_loop:;
5008     }
5009
5010   if (flags & CODING_STATE_END)
5011     {
5012       COMPOSE_FLUSH_CHARS (str, dst);
5013       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5014     }
5015   str->flags = flags;
5016   str->ch    = ch;
5017 }
5018
5019
5020 /***** ISO2022 encoder *****/
5021
5022 /* Designate CHARSET into register REG. */
5023
5024 static void
5025 iso2022_designate (Lisp_Object charset, unsigned char reg,
5026                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5027 {
5028   static const char inter94[] = "()*+";
5029   static const char inter96[] = ",-./";
5030   unsigned short chars;
5031   unsigned char dimension;
5032   unsigned char final;
5033   Lisp_Object old_charset = str->iso2022.charset[reg];
5034
5035   str->iso2022.charset[reg] = charset;
5036   if (!CHARSETP (charset))
5037     /* charset might be an initial nil or t. */
5038     return;
5039   chars = XCHARSET_CHARS (charset);
5040   dimension = XCHARSET_DIMENSION (charset);
5041   final = XCHARSET_FINAL (charset);
5042   if (!str->iso2022.force_charset_on_output[reg] &&
5043       CHARSETP (old_charset) &&
5044       XCHARSET_CHARS (old_charset) == chars &&
5045       XCHARSET_DIMENSION (old_charset) == dimension &&
5046       XCHARSET_FINAL (old_charset) == final)
5047     return;
5048
5049   str->iso2022.force_charset_on_output[reg] = 0;
5050
5051   {
5052     charset_conversion_spec_dynarr *dyn =
5053       str->codesys->iso2022.output_conv;
5054
5055     if (dyn)
5056       {
5057         int i;
5058
5059         for (i = 0; i < Dynarr_length (dyn); i++)
5060           {
5061             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5062             if (EQ (charset, spec->from_charset))
5063                 charset = spec->to_charset;
5064           }
5065       }
5066   }
5067
5068   Dynarr_add (dst, ISO_CODE_ESC);
5069   switch (chars)
5070     {
5071     case 94:
5072       if (dimension == 1)
5073         Dynarr_add (dst, inter94[reg]);
5074       else
5075         {
5076           Dynarr_add (dst, '$');
5077           if (reg != 0
5078               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5079               || final < '@'
5080               || final > 'B')
5081             Dynarr_add (dst, inter94[reg]);
5082         }
5083       break;
5084     case 96:
5085       if (dimension == 1)
5086         Dynarr_add (dst, inter96[reg]);
5087       else
5088         {
5089           Dynarr_add (dst, '$');
5090           Dynarr_add (dst, inter96[reg]);
5091         }
5092       break;
5093     }
5094   Dynarr_add (dst, final);
5095 }
5096
5097 static void
5098 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5099 {
5100   if (str->iso2022.register_left != 0)
5101     {
5102       Dynarr_add (dst, ISO_CODE_SI);
5103       str->iso2022.register_left = 0;
5104     }
5105 }
5106
5107 static void
5108 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5109 {
5110   if (str->iso2022.register_left != 1)
5111     {
5112       Dynarr_add (dst, ISO_CODE_SO);
5113       str->iso2022.register_left = 1;
5114     }
5115 }
5116
5117 void
5118 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5119                      unsigned_char_dynarr *dst, unsigned int *flags)
5120 {
5121   unsigned char charmask;
5122   Lisp_Coding_System* codesys = str->codesys;
5123   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5124   int i;
5125   Lisp_Object charset = str->iso2022.current_charset;
5126   int half = str->iso2022.current_half;
5127   unsigned int byte1, byte2;
5128
5129   if (ch <= 0x7F)
5130     {
5131       restore_left_to_right_direction (codesys, dst, flags, 0);
5132
5133       /* Make sure G0 contains ASCII */
5134       if ((ch > ' ' && ch < ISO_CODE_DEL)
5135           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5136         {
5137           ensure_normal_shift (str, dst);
5138           iso2022_designate (Vcharset_ascii, 0, str, dst);
5139         }
5140
5141       /* If necessary, restore everything to the default state
5142          at end-of-line */
5143       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5144         {
5145           restore_left_to_right_direction (codesys, dst, flags, 0);
5146
5147           ensure_normal_shift (str, dst);
5148
5149           for (i = 0; i < 4; i++)
5150             {
5151               Lisp_Object initial_charset =
5152                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5153               iso2022_designate (initial_charset, i, str, dst);
5154             }
5155         }
5156       if (ch == '\n')
5157         {
5158           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5159             Dynarr_add (dst, '\r');
5160           if (eol_type != EOL_CR)
5161             Dynarr_add (dst, ch);
5162         }
5163       else
5164         {
5165           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5166               && fit_to_be_escape_quoted (ch))
5167             Dynarr_add (dst, ISO_CODE_ESC);
5168           Dynarr_add (dst, ch);
5169         }
5170     }
5171   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5172     {
5173       charmask = (half == 0 ? 0x00 : 0x80);
5174
5175       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5176           && fit_to_be_escape_quoted (ch))
5177         Dynarr_add (dst, ISO_CODE_ESC);
5178       /* you asked for it ... */
5179       Dynarr_add (dst, ch);
5180     }
5181   else
5182     {
5183       int reg;
5184
5185       /* Now determine which register to use. */
5186       reg = -1;
5187       for (i = 0; i < 4; i++)
5188         {
5189           int code_point;
5190
5191           if ((CHARSETP (charset = str->iso2022.charset[i])
5192                && ((code_point = charset_code_point (charset, ch)) >= 0))
5193               ||
5194               (CHARSETP
5195                (charset
5196                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5197                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5198             {
5199               if (XCHARSET_DIMENSION (charset) == 1)
5200                 {
5201                   byte1 = code_point;
5202                   byte2 = 0;
5203                 }
5204               else /* if (XCHARSET_DIMENSION (charset) == 2) */
5205                 {
5206                   byte1 = code_point >> 8;
5207                   byte2 = code_point & 255;
5208                 }
5209               reg = i;
5210               break;
5211             }
5212         }
5213       if (reg == -1)
5214         {
5215           Lisp_Object original_default_coded_charset_priority_list
5216             = Vdefault_coded_charset_priority_list;
5217
5218           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5219             {
5220               BREAKUP_CHAR (ch, charset, byte1, byte2);
5221               if (XCHARSET_FINAL (charset))
5222                 goto found;
5223               Vdefault_coded_charset_priority_list
5224                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5225                                Vdefault_coded_charset_priority_list));
5226             }
5227           BREAKUP_CHAR (ch, charset, byte1, byte2);
5228           if (!XCHARSET_FINAL (charset))
5229             {
5230               charset = Vcharset_ascii;
5231               byte1 = '~';
5232             }
5233         found:
5234           Vdefault_coded_charset_priority_list
5235             = original_default_coded_charset_priority_list;
5236         }
5237       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5238                                 codesys, dst, flags, 0);
5239
5240       if (reg == -1)
5241         {
5242           if (XCHARSET_GRAPHIC (charset) != 0)
5243             {
5244               if (!NILP (str->iso2022.charset[1]) &&
5245                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5246                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5247                 reg = 1;
5248               else if (!NILP (str->iso2022.charset[2]))
5249                 reg = 2;
5250               else if (!NILP (str->iso2022.charset[3]))
5251                 reg = 3;
5252               else
5253                 reg = 0;
5254             }
5255           else
5256             reg = 0;
5257         }
5258
5259       iso2022_designate (charset, reg, str, dst);
5260
5261       /* Now invoke that register. */
5262       switch (reg)
5263         {
5264         case 0:
5265           ensure_normal_shift (str, dst);
5266           half = 0;
5267           break;
5268         case 1:
5269           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5270             {
5271               ensure_shift_out (str, dst);
5272               half = 0;
5273             }
5274           else
5275             half = 1;
5276           break;
5277         case 2:
5278           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5279             {
5280               Dynarr_add (dst, ISO_CODE_ESC);
5281               Dynarr_add (dst, 'N');
5282               half = 0;
5283             }
5284           else
5285             {
5286               Dynarr_add (dst, ISO_CODE_SS2);
5287               half = 1;
5288             }
5289           break;
5290         case 3:
5291           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5292             {
5293               Dynarr_add (dst, ISO_CODE_ESC);
5294               Dynarr_add (dst, 'O');
5295               half = 0;
5296             }
5297           else
5298             {
5299               Dynarr_add (dst, ISO_CODE_SS3);
5300               half = 1;
5301             }
5302           break;
5303         default:
5304           abort ();
5305         }
5306
5307       charmask = (half == 0 ? 0x00 : 0x80);
5308
5309       switch (XCHARSET_DIMENSION (charset))
5310         {
5311         case 1:
5312           Dynarr_add (dst, byte1 | charmask);
5313           break;
5314         case 2:
5315           Dynarr_add (dst, byte1 | charmask);
5316           Dynarr_add (dst, byte2 | charmask);
5317           break;
5318         default:
5319           abort ();
5320         }
5321     }
5322   str->iso2022.current_charset = charset;
5323   str->iso2022.current_half = half;
5324 }
5325
5326 void
5327 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5328                      unsigned int *flags)
5329 {
5330   Lisp_Coding_System* codesys = str->codesys;
5331   int i;
5332
5333   restore_left_to_right_direction (codesys, dst, flags, 0);
5334   ensure_normal_shift (str, dst);
5335   for (i = 0; i < 4; i++)
5336     {
5337       Lisp_Object initial_charset
5338         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5339       iso2022_designate (initial_charset, i, str, dst);
5340     }
5341 }
5342 #endif /* MULE */
5343 \f
5344 /************************************************************************/
5345 /*                     No-conversion methods                            */
5346 /************************************************************************/
5347
5348 /* This is used when reading in "binary" files -- i.e. files that may
5349    contain all 256 possible byte values and that are not to be
5350    interpreted as being in any particular decoding. */
5351 static void
5352 decode_coding_no_conversion (Lstream *decoding, const unsigned char *src,
5353                              unsigned_char_dynarr *dst, unsigned int n)
5354 {
5355   unsigned char c;
5356   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5357   unsigned int flags  = str->flags;
5358   unsigned int ch     = str->ch;
5359   eol_type_t eol_type = str->eol_type;
5360
5361   while (n--)
5362     {
5363       c = *src++;
5364
5365       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5366       DECODE_ADD_BINARY_CHAR (c, dst);
5367     label_continue_loop:;
5368     }
5369
5370   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5371
5372   str->flags = flags;
5373   str->ch    = ch;
5374 }
5375
5376 static void
5377 encode_coding_no_conversion (Lstream *encoding, const unsigned char *src,
5378                              unsigned_char_dynarr *dst, unsigned int n)
5379 {
5380   unsigned char c;
5381   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5382   unsigned int flags  = str->flags;
5383   unsigned int ch     = str->ch;
5384   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5385 #ifdef UTF2000
5386   unsigned char char_boundary = str->iso2022.current_char_boundary;
5387 #endif
5388
5389   while (n--)
5390     {
5391       c = *src++;
5392 #ifdef UTF2000
5393       if (char_boundary == 0)
5394         if ( c >= 0xfc )
5395           {
5396             ch = c & 0x01;
5397             char_boundary = 5;
5398           }
5399         else if ( c >= 0xf8 )
5400           {
5401             ch = c & 0x03;
5402             char_boundary = 4;
5403           }
5404         else if ( c >= 0xf0 )
5405           {
5406             ch = c & 0x07;
5407             char_boundary = 3;
5408           }
5409         else if ( c >= 0xe0 )
5410           {
5411             ch = c & 0x0f;
5412             char_boundary = 2;
5413           }
5414         else if ( c >= 0xc0 )
5415           {
5416             ch = c & 0x1f;
5417             char_boundary = 1;
5418           }
5419         else
5420           {
5421             ch = 0;
5422             if (c == '\n')
5423               {
5424                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5425                   Dynarr_add (dst, '\r');
5426                 if (eol_type != EOL_CR)
5427                   Dynarr_add (dst, c);
5428               }
5429             else
5430               Dynarr_add (dst, c);
5431             char_boundary = 0;
5432           }
5433       else if (char_boundary == 1)
5434         {
5435           ch = ( ch << 6 ) | ( c & 0x3f );
5436           Dynarr_add (dst, ch & 0xff);
5437           char_boundary = 0;
5438         }
5439       else
5440         {
5441           ch = ( ch << 6 ) | ( c & 0x3f );
5442           char_boundary--;
5443         }
5444 #else /* not UTF2000 */
5445       if (c == '\n')
5446         {
5447           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5448             Dynarr_add (dst, '\r');
5449           if (eol_type != EOL_CR)
5450             Dynarr_add (dst, '\n');
5451           ch = 0;
5452         }
5453       else if (BYTE_ASCII_P (c))
5454         {
5455           assert (ch == 0);
5456           Dynarr_add (dst, c);
5457         }
5458       else if (BUFBYTE_LEADING_BYTE_P (c))
5459         {
5460           assert (ch == 0);
5461           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5462               c == LEADING_BYTE_CONTROL_1)
5463             ch = c;
5464           else
5465             Dynarr_add (dst, '~'); /* untranslatable character */
5466         }
5467       else
5468         {
5469           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5470             Dynarr_add (dst, c);
5471           else if (ch == LEADING_BYTE_CONTROL_1)
5472             {
5473               assert (c < 0xC0);
5474               Dynarr_add (dst, c - 0x20);
5475             }
5476           /* else it should be the second or third byte of an
5477              untranslatable character, so ignore it */
5478           ch = 0;
5479         }
5480 #endif /* not UTF2000 */
5481     }
5482
5483   str->flags = flags;
5484   str->ch    = ch;
5485 #ifdef UTF2000
5486   str->iso2022.current_char_boundary = char_boundary;
5487 #endif
5488 }
5489
5490 \f
5491
5492 /************************************************************************/
5493 /*                             Initialization                           */
5494 /************************************************************************/
5495
5496 void
5497 syms_of_file_coding (void)
5498 {
5499   INIT_LRECORD_IMPLEMENTATION (coding_system);
5500
5501   deferror (&Qcoding_system_error, "coding-system-error",
5502             "Coding-system error", Qio_error);
5503
5504   DEFSUBR (Fcoding_system_p);
5505   DEFSUBR (Ffind_coding_system);
5506   DEFSUBR (Fget_coding_system);
5507   DEFSUBR (Fcoding_system_list);
5508   DEFSUBR (Fcoding_system_name);
5509   DEFSUBR (Fmake_coding_system);
5510   DEFSUBR (Fcopy_coding_system);
5511   DEFSUBR (Fcoding_system_canonical_name_p);
5512   DEFSUBR (Fcoding_system_alias_p);
5513   DEFSUBR (Fcoding_system_aliasee);
5514   DEFSUBR (Fdefine_coding_system_alias);
5515   DEFSUBR (Fsubsidiary_coding_system);
5516
5517   DEFSUBR (Fcoding_system_type);
5518   DEFSUBR (Fcoding_system_doc_string);
5519 #ifdef MULE
5520   DEFSUBR (Fcoding_system_charset);
5521 #endif
5522   DEFSUBR (Fcoding_system_property);
5523
5524   DEFSUBR (Fcoding_category_list);
5525   DEFSUBR (Fset_coding_priority_list);
5526   DEFSUBR (Fcoding_priority_list);
5527   DEFSUBR (Fset_coding_category_system);
5528   DEFSUBR (Fcoding_category_system);
5529
5530   DEFSUBR (Fdetect_coding_region);
5531   DEFSUBR (Fdecode_coding_region);
5532   DEFSUBR (Fencode_coding_region);
5533 #ifdef MULE
5534   DEFSUBR (Fdecode_shift_jis_char);
5535   DEFSUBR (Fencode_shift_jis_char);
5536   DEFSUBR (Fdecode_big5_char);
5537   DEFSUBR (Fencode_big5_char);
5538 #endif /* MULE */
5539   defsymbol (&Qcoding_systemp, "coding-system-p");
5540   defsymbol (&Qno_conversion, "no-conversion");
5541   defsymbol (&Qraw_text, "raw-text");
5542 #ifdef MULE
5543   defsymbol (&Qbig5, "big5");
5544   defsymbol (&Qshift_jis, "shift-jis");
5545   defsymbol (&Qucs4, "ucs-4");
5546   defsymbol (&Qutf8, "utf-8");
5547   defsymbol (&Qccl, "ccl");
5548   defsymbol (&Qiso2022, "iso2022");
5549 #endif /* MULE */
5550   defsymbol (&Qmnemonic, "mnemonic");
5551   defsymbol (&Qeol_type, "eol-type");
5552   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5553   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5554
5555   defsymbol (&Qcr, "cr");
5556   defsymbol (&Qlf, "lf");
5557   defsymbol (&Qcrlf, "crlf");
5558   defsymbol (&Qeol_cr, "eol-cr");
5559   defsymbol (&Qeol_lf, "eol-lf");
5560   defsymbol (&Qeol_crlf, "eol-crlf");
5561 #ifdef MULE
5562   defsymbol (&Qcharset_g0, "charset-g0");
5563   defsymbol (&Qcharset_g1, "charset-g1");
5564   defsymbol (&Qcharset_g2, "charset-g2");
5565   defsymbol (&Qcharset_g3, "charset-g3");
5566   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5567   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5568   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5569   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5570   defsymbol (&Qno_iso6429, "no-iso6429");
5571   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5572   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5573
5574   defsymbol (&Qshort, "short");
5575   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5576   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5577   defsymbol (&Qseven, "seven");
5578   defsymbol (&Qlock_shift, "lock-shift");
5579   defsymbol (&Qescape_quoted, "escape-quoted");
5580 #endif /* MULE */
5581 #ifdef UTF2000
5582   defsymbol (&Qdisable_composition, "disable-composition");
5583 #endif
5584   defsymbol (&Qencode, "encode");
5585   defsymbol (&Qdecode, "decode");
5586
5587 #ifdef MULE
5588   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5589              "shift-jis");
5590   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5591              "big5");
5592   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5593              "ucs-4");
5594   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5595              "utf-8");
5596   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5597              "iso-7");
5598   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5599              "iso-8-designate");
5600   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5601              "iso-8-1");
5602   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5603              "iso-8-2");
5604   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5605              "iso-lock-shift");
5606 #endif /* MULE */
5607   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5608              "no-conversion");
5609 }
5610
5611 void
5612 lstream_type_create_file_coding (void)
5613 {
5614   LSTREAM_HAS_METHOD (decoding, reader);
5615   LSTREAM_HAS_METHOD (decoding, writer);
5616   LSTREAM_HAS_METHOD (decoding, rewinder);
5617   LSTREAM_HAS_METHOD (decoding, seekable_p);
5618   LSTREAM_HAS_METHOD (decoding, flusher);
5619   LSTREAM_HAS_METHOD (decoding, closer);
5620   LSTREAM_HAS_METHOD (decoding, marker);
5621
5622   LSTREAM_HAS_METHOD (encoding, reader);
5623   LSTREAM_HAS_METHOD (encoding, writer);
5624   LSTREAM_HAS_METHOD (encoding, rewinder);
5625   LSTREAM_HAS_METHOD (encoding, seekable_p);
5626   LSTREAM_HAS_METHOD (encoding, flusher);
5627   LSTREAM_HAS_METHOD (encoding, closer);
5628   LSTREAM_HAS_METHOD (encoding, marker);
5629 }
5630
5631 void
5632 vars_of_file_coding (void)
5633 {
5634   int i;
5635
5636   fcd = xnew (struct file_coding_dump);
5637   dumpstruct (&fcd, &fcd_description);
5638
5639   /* Initialize to something reasonable ... */
5640   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5641     {
5642       fcd->coding_category_system[i] = Qnil;
5643       fcd->coding_category_by_priority[i] = i;
5644     }
5645
5646   Fprovide (intern ("file-coding"));
5647
5648   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5649 Coding system used for TTY keyboard input.
5650 Not used under a windowing system.
5651 */ );
5652   Vkeyboard_coding_system = Qnil;
5653
5654   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5655 Coding system used for TTY display output.
5656 Not used under a windowing system.
5657 */ );
5658   Vterminal_coding_system = Qnil;
5659
5660   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5661 Overriding coding system used when reading from a file or process.
5662 You should bind this variable with `let', but do not set it globally.
5663 If this is non-nil, it specifies the coding system that will be used
5664 to decode input on read operations, such as from a file or process.
5665 It overrides `buffer-file-coding-system-for-read',
5666 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5667 this one for permanent changes to the environment.  */ );
5668   Vcoding_system_for_read = Qnil;
5669
5670   DEFVAR_LISP ("coding-system-for-write",
5671                &Vcoding_system_for_write /*
5672 Overriding coding system used when writing to a file or process.
5673 You should bind this variable with `let', but do not set it globally.
5674 If this is non-nil, it specifies the coding system that will be used
5675 to encode output for write operations, such as to a file or process.
5676 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5677 Use those variables instead of this one for permanent changes to the
5678 environment.  */ );
5679   Vcoding_system_for_write = Qnil;
5680
5681   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5682 Coding system used to convert pathnames when accessing files.
5683 */ );
5684   Vfile_name_coding_system = Qnil;
5685
5686   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5687 Non-nil means the buffer contents are regarded as multi-byte form
5688 of characters, not a binary code.  This affects the display, file I/O,
5689 and behaviors of various editing commands.
5690
5691 Setting this to nil does not do anything.
5692 */ );
5693   enable_multibyte_characters = 1;
5694 }
5695
5696 void
5697 complex_vars_of_file_coding (void)
5698 {
5699   staticpro (&Vcoding_system_hash_table);
5700   Vcoding_system_hash_table =
5701     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5702
5703   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5704   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5705
5706 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5707 {                                               \
5708   struct codesys_prop csp;                      \
5709   csp.sym = (Sym);                              \
5710   csp.prop_type = (Prop_Type);                  \
5711   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5712 } while (0)
5713
5714   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5715   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5716   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5717   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5718   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5719   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5720   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5721 #ifdef MULE
5722   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5723   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5724   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5725   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5726   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5727   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5728   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5729   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5730   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5731   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5732   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5733   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5734   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5735   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5736   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5737   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5738   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5739
5740   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5741   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5742 #endif /* MULE */
5743   /* Need to create this here or we're really screwed. */
5744   Fmake_coding_system
5745     (Qraw_text, Qno_conversion,
5746      build_string ("Raw text, which means it converts only line-break-codes."),
5747      list2 (Qmnemonic, build_string ("Raw")));
5748
5749   Fmake_coding_system
5750     (Qbinary, Qno_conversion,
5751      build_string ("Binary, which means it does not convert anything."),
5752      list4 (Qeol_type, Qlf,
5753             Qmnemonic, build_string ("Binary")));
5754
5755 #ifdef UTF2000
5756   Fmake_coding_system
5757     (Qutf8, Qutf8,
5758      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5759      list2 (Qmnemonic, build_string ("UTF8")));
5760 #endif
5761
5762   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5763
5764   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5765
5766   Fdefine_coding_system_alias (Qterminal, Qbinary);
5767   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5768
5769   /* Need this for bootstrapping */
5770   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5771     Fget_coding_system (Qraw_text);
5772
5773 #ifdef UTF2000
5774   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5775    = Fget_coding_system (Qutf8);
5776 #endif
5777
5778 #if defined(MULE) && !defined(UTF2000)
5779   {
5780     unsigned int i;
5781
5782     for (i = 0; i < 65536; i++)
5783       fcd->ucs_to_mule_table[i] = Qnil;
5784   }
5785   staticpro (&mule_to_ucs_table);
5786   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5787 #endif /* defined(MULE) && !defined(UTF2000) */
5788 }