git.chise.org Git - chise/xemacs-chise.git.1/blob - src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #include "opaque.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  50
  51
  52
  53 struct file_coding_dump {
  54   /* Coding system currently associated with each coding category. */
  55   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  56
  57   /* Table of all coding categories in decreasing order of priority.
  58      This describes a permutation of the possible coding categories. */
  59   int coding_category_by_priority[CODING_CATEGORY_LAST];
  60
  61 #if defined(MULE) && !defined(UTF2000)
  62   Lisp_Object ucs_to_mule_table[65536];
  63 #endif
  64 } *fcd;
  65
  66 static const struct lrecord_description fcd_description_1[] = {
  67   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  68 #if defined(MULE) && !defined(UTF2000)
  69   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  70 #endif
  71   { XD_END }
  72 };
  73
  74 static const struct struct_description fcd_description = {
  75   sizeof (struct file_coding_dump),
  76   fcd_description_1
  77 };
  78
  79 Lisp_Object mule_to_ucs_table;
  80
  81 Lisp_Object Qcoding_systemp;
  82
  83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  84 /* Qinternal in general.c */
  85
  86 Lisp_Object Qmnemonic, Qeol_type;
  87 Lisp_Object Qcr, Qcrlf, Qlf;
  88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  89 Lisp_Object Qpost_read_conversion;
  90 Lisp_Object Qpre_write_conversion;
  91
  92 #ifdef MULE
  93 Lisp_Object Qucs4, Qutf8;
  94 Lisp_Object Qbig5, Qshift_jis;
  95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  98 Lisp_Object Qno_iso6429;
  99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 100 Lisp_Object Qescape_quoted;
 101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 102 #endif
 103 Lisp_Object Qencode, Qdecode;
 104
 105 Lisp_Object Vcoding_system_hash_table;
 106
 107 int enable_multibyte_characters;
 108
 109 #ifdef MULE
 110 /* Additional information used by the ISO2022 decoder and detector. */
 111 struct iso2022_decoder
 112 {
 113   /* CHARSET holds the character sets currently assigned to the G0
 114      through G3 variables.  It is initialized from the array
 115      INITIAL_CHARSET in CODESYS. */
 116   Lisp_Object charset[4];
 117
 118   /* Which registers are currently invoked into the left (GL) and
 119      right (GR) halves of the 8-bit encoding space? */
 120   int register_left, register_right;
 121
 122   /* ISO_ESC holds a value indicating part of an escape sequence
 123      that has already been seen. */
 124   enum iso_esc_flag esc;
 125
 126   /* This records the bytes we've seen so far in an escape sequence,
 127      in case the sequence is invalid (we spit out the bytes unchanged). */
 128   unsigned char esc_bytes[8];
 129
 130   /* Index for next byte to store in ISO escape sequence. */
 131   int esc_bytes_index;
 132
 133 #ifdef ENABLE_COMPOSITE_CHARS
 134   /* Stuff seen so far when composing a string. */
 135   unsigned_char_dynarr *composite_chars;
 136 #endif
 137
 138   /* If we saw an invalid designation sequence for a particular
 139      register, we flag it here and switch to ASCII.  The next time we
 140      see a valid designation for this register, we turn off the flag
 141      and do the designation normally, but pretend the sequence was
 142      invalid.  The effect of all this is that (most of the time) the
 143      escape sequences for both the switch to the unknown charset, and
 144      the switch back to the known charset, get inserted literally into
 145      the buffer and saved out as such.  The hope is that we can
 146      preserve the escape sequences so that the resulting written out
 147      file makes sense.  If we don't do any of this, the designation
 148      to the invalid charset will be preserved but that switch back
 149      to the known charset will probably get eaten because it was
 150      the same charset that was already present in the register. */
 151   unsigned char invalid_designated[4];
 152
 153   /* We try to do similar things as above for direction-switching
 154      sequences.  If we encountered a direction switch while an
 155      invalid designation was present, or an invalid designation
 156      just after a direction switch (i.e. no valid designation
 157      encountered yet), we insert the direction-switch escape
 158      sequence literally into the output stream, and later on
 159      insert the corresponding direction-restoring escape sequence
 160      literally also. */
 161   unsigned int switched_dir_and_no_valid_charset_yet :1;
 162   unsigned int invalid_switch_dir :1;
 163
 164   /* Tells the decoder to output the escape sequence literally
 165      even though it was valid.  Used in the games we play to
 166      avoid lossage when we encounter invalid designations. */
 167   unsigned int output_literally :1;
 168   /* We encountered a direction switch followed by an invalid
 169      designation.  We didn't output the direction switch
 170      literally because we didn't know about the invalid designation;
 171      but we have to do so now. */
 172   unsigned int output_direction_sequence :1;
 173 };
 174 #endif /* MULE */
 175 EXFUN (Fcopy_coding_system, 2);
 176 #ifdef MULE
 177 struct detection_state;
 178 static int detect_coding_sjis (struct detection_state *st,
 179                                const Extbyte *src, Lstream_data_count n);
 180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 181                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
 183                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 184 static int detect_coding_big5 (struct detection_state *st,
 185                                const Extbyte *src, Lstream_data_count n);
 186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 187                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
 189                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 190 static int detect_coding_ucs4 (struct detection_state *st,
 191                                const Extbyte *src, Lstream_data_count n);
 192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 193                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
 195                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 196 static int detect_coding_utf8 (struct detection_state *st,
 197                                const Extbyte *src, Lstream_data_count n);
 198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
 201                                 unsigned_char_dynarr *dst, Lstream_data_count n);
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   const Extbyte *src, Lstream_data_count n);
 207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 208                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
 210                                    unsigned_char_dynarr *dst, Lstream_data_count n);
 211 #endif /* MULE */
 212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 213                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 215                                          unsigned_char_dynarr *dst, Lstream_data_count n);
 216 static void mule_decode (Lstream *decoding, const Extbyte *src,
 217                          unsigned_char_dynarr *dst, Lstream_data_count n);
 218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 219                          unsigned_char_dynarr *dst, Lstream_data_count n);
 220
 221 typedef struct codesys_prop codesys_prop;
 222 struct codesys_prop
 223 {
 224   Lisp_Object sym;
 225   int prop_type;
 226 };
 227
 228 typedef struct
 229 {
 230   Dynarr_declare (codesys_prop);
 231 } codesys_prop_dynarr;
 232
 233 static const struct lrecord_description codesys_prop_description_1[] = {
 234   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 235   { XD_END }
 236 };
 237
 238 static const struct struct_description codesys_prop_description = {
 239   sizeof (codesys_prop),
 240   codesys_prop_description_1
 241 };
 242
 243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 244   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 245   { XD_END }
 246 };
 247
 248 static const struct struct_description codesys_prop_dynarr_description = {
 249   sizeof (codesys_prop_dynarr),
 250   codesys_prop_dynarr_description_1
 251 };
 252
 253 codesys_prop_dynarr *the_codesys_prop_dynarr;
 254
 255 enum codesys_prop_enum
 256 {
 257   CODESYS_PROP_ALL_OK,
 258   CODESYS_PROP_ISO2022,
 259   CODESYS_PROP_CCL
 260 };
 261
 262 \f
 263 /************************************************************************/
 264 /*                       Coding system functions                        */
 265 /************************************************************************/
 266
 267 static Lisp_Object mark_coding_system (Lisp_Object);
 268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 269 static void finalize_coding_system (void *header, int for_disksave);
 270
 271 #ifdef MULE
 272 static const struct lrecord_description ccs_description_1[] = {
 273   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 274   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 275   { XD_END }
 276 };
 277
 278 static const struct struct_description ccs_description = {
 279   sizeof (charset_conversion_spec),
 280   ccs_description_1
 281 };
 282
 283 static const struct lrecord_description ccsd_description_1[] = {
 284   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 285   { XD_END }
 286 };
 287
 288 static const struct struct_description ccsd_description = {
 289   sizeof (charset_conversion_spec_dynarr),
 290   ccsd_description_1
 291 };
 292 #endif
 293
 294 static const struct lrecord_description coding_system_description[] = {
 295   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 296   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 297   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 298   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 299   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 300   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 301   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 302   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 303 #ifdef MULE
 304   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 305   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 306   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 307   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 308   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 309 #endif
 310   { XD_END }
 311 };
 312
 313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 314                                mark_coding_system, print_coding_system,
 315                                finalize_coding_system,
 316                                0, 0, coding_system_description,
 317                                Lisp_Coding_System);
 318
 319 static Lisp_Object
 320 mark_coding_system (Lisp_Object obj)
 321 {
 322   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 323
 324   mark_object (CODING_SYSTEM_NAME (codesys));
 325   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 326   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 327   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 328   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 329   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 330
 331   switch (CODING_SYSTEM_TYPE (codesys))
 332     {
 333 #ifdef MULE
 334       int i;
 335     case CODESYS_ISO2022:
 336       for (i = 0; i < 4; i++)
 337         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 338       if (codesys->iso2022.input_conv)
 339         {
 340           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 341             {
 342               struct charset_conversion_spec *ccs =
 343                 Dynarr_atp (codesys->iso2022.input_conv, i);
 344               mark_object (ccs->from_charset);
 345               mark_object (ccs->to_charset);
 346             }
 347         }
 348       if (codesys->iso2022.output_conv)
 349         {
 350           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 351             {
 352               struct charset_conversion_spec *ccs =
 353                 Dynarr_atp (codesys->iso2022.output_conv, i);
 354               mark_object (ccs->from_charset);
 355               mark_object (ccs->to_charset);
 356             }
 357         }
 358       break;
 359
 360     case CODESYS_CCL:
 361       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 362       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 363       break;
 364 #endif /* MULE */
 365     default:
 366       break;
 367     }
 368
 369   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 370   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 371 }
 372
 373 static void
 374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 375                      int escapeflag)
 376 {
 377   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 378   if (print_readably)
 379     error ("printing unreadable object #<coding_system 0x%x>",
 380            c->header.uid);
 381
 382   write_c_string ("#<coding_system ", printcharfun);
 383   print_internal (c->name, printcharfun, 1);
 384   write_c_string (">", printcharfun);
 385 }
 386
 387 static void
 388 finalize_coding_system (void *header, int for_disksave)
 389 {
 390   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 391   /* Since coding systems never go away, this function is not
 392      necessary.  But it would be necessary if we changed things
 393      so that coding systems could go away. */
 394   if (!for_disksave) /* see comment in lstream.c */
 395     {
 396       switch (CODING_SYSTEM_TYPE (c))
 397         {
 398 #ifdef MULE
 399         case CODESYS_ISO2022:
 400           if (c->iso2022.input_conv)
 401             {
 402               Dynarr_free (c->iso2022.input_conv);
 403               c->iso2022.input_conv = 0;
 404             }
 405           if (c->iso2022.output_conv)
 406             {
 407               Dynarr_free (c->iso2022.output_conv);
 408               c->iso2022.output_conv = 0;
 409             }
 410           break;
 411 #endif /* MULE */
 412         default:
 413           break;
 414         }
 415     }
 416 }
 417
 418 static eol_type_t
 419 symbol_to_eol_type (Lisp_Object symbol)
 420 {
 421   CHECK_SYMBOL (symbol);
 422   if (NILP (symbol))      return EOL_AUTODETECT;
 423   if (EQ (symbol, Qlf))   return EOL_LF;
 424   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 425   if (EQ (symbol, Qcr))   return EOL_CR;
 426
 427   signal_simple_error ("Unrecognized eol type", symbol);
 428   return EOL_AUTODETECT; /* not reached */
 429 }
 430
 431 static Lisp_Object
 432 eol_type_to_symbol (eol_type_t type)
 433 {
 434   switch (type)
 435     {
 436     default: ABORT ();
 437     case EOL_LF:         return Qlf;
 438     case EOL_CRLF:       return Qcrlf;
 439     case EOL_CR:         return Qcr;
 440     case EOL_AUTODETECT: return Qnil;
 441     }
 442 }
 443
 444 static void
 445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 446 {
 447   Lisp_Object codesys_obj;
 448   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 449   char *codesys_name = (char *) alloca (len + 7);
 450   int mlen = -1;
 451   char *codesys_mnemonic=0;
 452
 453   Lisp_Object codesys_name_sym, sub_codesys_obj;
 454
 455   /* kludge */
 456
 457   XSETCODING_SYSTEM (codesys_obj, codesys);
 458
 459   memcpy (codesys_name,
 460           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 461
 462   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 463     {
 464       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 465       codesys_mnemonic = (char *) alloca (mlen + 7);
 466       memcpy (codesys_mnemonic,
 467               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 468     }
 469
 470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 471   strcpy (codesys_name + len, "-" op_sys);                                      \
 472   if (mlen != -1)                                                               \
 473     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 474   codesys_name_sym = intern (codesys_name);                                     \
 475   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 476   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 477   if (mlen != -1)                                                               \
 478     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 479       build_string (codesys_mnemonic);                                          \
 480   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 481 } while (0)
 482
 483   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 484   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 485   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 486 }
 487
 488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 489 Return t if OBJECT is a coding system.
 490 A coding system is an object that defines how text containing multiple
 491 character sets is encoded into a stream of (typically 8-bit) bytes.
 492 The coding system is used to decode the stream into a series of
 493 characters (which may be from multiple charsets) when the text is read
 494 from a file or process, and is used to encode the text back into the
 495 same format when it is written out to a file or process.
 496
 497 For example, many ISO2022-compliant coding systems (such as Compound
 498 Text, which is used for inter-client data under the X Window System)
 499 use escape sequences to switch between different charsets -- Japanese
 500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 502 `make-coding-system' for more information.
 503
 504 Coding systems are normally identified using a symbol, and the
 505 symbol is accepted in place of the actual coding system object whenever
 506 a coding system is called for. (This is similar to how faces work.)
 507 */
 508        (object))
 509 {
 510   return CODING_SYSTEMP (object) ? Qt : Qnil;
 511 }
 512
 513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 514 Retrieve the coding system of the given name.
 515
 516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 517 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 518 If there is no such coding system, nil is returned.  Otherwise the
 519 associated coding system object is returned.
 520 */
 521        (coding_system_or_name))
 522 {
 523   if (NILP (coding_system_or_name))
 524     coding_system_or_name = Qbinary;
 525   else if (CODING_SYSTEMP (coding_system_or_name))
 526     return coding_system_or_name;
 527   else
 528     CHECK_SYMBOL (coding_system_or_name);
 529
 530   while (1)
 531     {
 532       coding_system_or_name =
 533         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 534
 535       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 536         return coding_system_or_name;
 537     }
 538 }
 539
 540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 541 Retrieve the coding system of the given name.
 542 Same as `find-coding-system' except that if there is no such
 543 coding system, an error is signaled instead of returning nil.
 544 */
 545        (name))
 546 {
 547   Lisp_Object coding_system = Ffind_coding_system (name);
 548
 549   if (NILP (coding_system))
 550     signal_simple_error ("No such coding system", name);
 551   return coding_system;
 552 }
 553
 554 /* We store the coding systems in hash tables with the names as the key and the
 555    actual coding system object as the value.  Occasionally we need to use them
 556    in a list format.  These routines provide us with that. */
 557 struct coding_system_list_closure
 558 {
 559   Lisp_Object *coding_system_list;
 560 };
 561
 562 static int
 563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 564                                   void *coding_system_list_closure)
 565 {
 566   /* This function can GC */
 567   struct coding_system_list_closure *cscl =
 568     (struct coding_system_list_closure *) coding_system_list_closure;
 569   Lisp_Object *coding_system_list = cscl->coding_system_list;
 570
 571   *coding_system_list = Fcons (key, *coding_system_list);
 572   return 0;
 573 }
 574
 575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 576 Return a list of the names of all defined coding systems.
 577 */
 578        ())
 579 {
 580   Lisp_Object coding_system_list = Qnil;
 581   struct gcpro gcpro1;
 582   struct coding_system_list_closure coding_system_list_closure;
 583
 584   GCPRO1 (coding_system_list);
 585   coding_system_list_closure.coding_system_list = &coding_system_list;
 586   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 587                  &coding_system_list_closure);
 588   UNGCPRO;
 589
 590   return coding_system_list;
 591 }
 592
 593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 594 Return the name of the given coding system.
 595 */
 596        (coding_system))
 597 {
 598   coding_system = Fget_coding_system (coding_system);
 599   return XCODING_SYSTEM_NAME (coding_system);
 600 }
 601
 602 static Lisp_Coding_System *
 603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 604 {
 605   Lisp_Coding_System *codesys =
 606     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 607
 608   zero_lcrecord (codesys);
 609   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 610   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 611   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 612   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 613   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 614   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 615   CODING_SYSTEM_TYPE     (codesys) = type;
 616   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 617 #ifdef MULE
 618   if (type == CODESYS_ISO2022)
 619     {
 620       int i;
 621       for (i = 0; i < 4; i++)
 622         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 623     }
 624   else if (type == CODESYS_CCL)
 625     {
 626       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 627       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 628     }
 629 #endif /* MULE */
 630   CODING_SYSTEM_NAME (codesys) = name;
 631
 632   return codesys;
 633 }
 634
 635 #ifdef MULE
 636 /* Given a list of charset conversion specs as specified in a Lisp
 637    program, parse it into STORE_HERE. */
 638
 639 static void
 640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 641                                 Lisp_Object spec_list)
 642 {
 643   Lisp_Object rest;
 644
 645   EXTERNAL_LIST_LOOP (rest, spec_list)
 646     {
 647       Lisp_Object car = XCAR (rest);
 648       Lisp_Object from, to;
 649       struct charset_conversion_spec spec;
 650
 651       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 652         signal_simple_error ("Invalid charset conversion spec", car);
 653       from = Fget_charset (XCAR (car));
 654       to = Fget_charset (XCAR (XCDR (car)));
 655       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 656         signal_simple_error_2
 657           ("Attempted conversion between different charset types",
 658            from, to);
 659       spec.from_charset = from;
 660       spec.to_charset = to;
 661
 662       Dynarr_add (store_here, spec);
 663     }
 664 }
 665
 666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 667    specs, return the equivalent as the Lisp programmer would see it.
 668
 669    If LOAD_HERE is 0, return Qnil. */
 670
 671 static Lisp_Object
 672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 673 {
 674   int i;
 675   Lisp_Object result;
 676
 677   if (!load_here)
 678     return Qnil;
 679   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 680     {
 681       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 682       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 683     }
 684
 685   return Fnreverse (result);
 686 }
 687
 688 #endif /* MULE */
 689
 690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 691 Register symbol NAME as a coding system.
 692
 693 TYPE describes the conversion method used and should be one of
 694
 695 nil or 'undecided
 696      Automatic conversion.  XEmacs attempts to detect the coding system
 697      used in the file.
 698 'no-conversion
 699      No conversion.  Use this for binary files and such.  On output,
 700      graphic characters that are not in ASCII or Latin-1 will be
 701      replaced by a ?. (For a no-conversion-encoded buffer, these
 702      characters will only be present if you explicitly insert them.)
 703 'shift-jis
 704      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 705 'ucs-4
 706      ISO 10646 UCS-4 encoding.
 707 'utf-8
 708      ISO 10646 UTF-8 encoding.
 709 'iso2022
 710      Any ISO2022-compliant encoding.  Among other things, this includes
 711      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 712      standard Unix encoding for Japanese and other languages), and
 713      Compound Text (the encoding used in X11).  You can specify more
 714      specific information about the conversion with the PROPS argument.
 715 'big5
 716      Big5 (the encoding commonly used for Taiwanese).
 717 'ccl
 718      The conversion is performed using a user-written pseudo-code
 719      program.  CCL (Code Conversion Language) is the name of this
 720      pseudo-code.
 721 'internal
 722      Write out or read in the raw contents of the memory representing
 723      the buffer's text.  This is primarily useful for debugging
 724      purposes, and is only enabled when XEmacs has been compiled with
 725      DEBUG_XEMACS defined (via the --debug configure option).
 726      WARNING: Reading in a file using 'internal conversion can result
 727      in an internal inconsistency in the memory representing a
 728      buffer's text, which will produce unpredictable results and may
 729      cause XEmacs to crash.  Under normal circumstances you should
 730      never use 'internal conversion.
 731
 732 DOC-STRING is a string describing the coding system.
 733
 734 PROPS is a property list, describing the specific nature of the
 735 character set.  Recognized properties are:
 736
 737 'mnemonic
 738      String to be displayed in the modeline when this coding system is
 739      active.
 740
 741 'eol-type
 742      End-of-line conversion to be used.  It should be one of
 743
 744         nil
 745                 Automatically detect the end-of-line type (LF, CRLF,
 746                 or CR).  Also generate subsidiary coding systems named
 747                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 748                 identical to this coding system but have an EOL-TYPE
 749                 value of 'lf, 'crlf, and 'cr, respectively.
 750         'lf
 751                 The end of a line is marked externally using ASCII LF.
 752                 Since this is also the way that XEmacs represents an
 753                 end-of-line internally, specifying this option results
 754                 in no end-of-line conversion.  This is the standard
 755                 format for Unix text files.
 756         'crlf
 757                 The end of a line is marked externally using ASCII
 758                 CRLF.  This is the standard format for MS-DOS text
 759                 files.
 760         'cr
 761                 The end of a line is marked externally using ASCII CR.
 762                 This is the standard format for Macintosh text files.
 763         t
 764                 Automatically detect the end-of-line type but do not
 765                 generate subsidiary coding systems.  (This value is
 766                 converted to nil when stored internally, and
 767                 `coding-system-property' will return nil.)
 768
 769 'post-read-conversion
 770      Function called after a file has been read in, to perform the
 771      decoding.  Called with two arguments, START and END, denoting
 772      a region of the current buffer to be decoded.
 773
 774 'pre-write-conversion
 775      Function called before a file is written out, to perform the
 776      encoding.  Called with two arguments, START and END, denoting
 777      a region of the current buffer to be encoded.
 778
 779
 780 The following additional properties are recognized if TYPE is 'iso2022:
 781
 782 'charset-g0
 783 'charset-g1
 784 'charset-g2
 785 'charset-g3
 786      The character set initially designated to the G0 - G3 registers.
 787      The value should be one of
 788
 789           -- A charset object (designate that character set)
 790           -- nil (do not ever use this register)
 791           -- t (no character set is initially designated to
 792                 the register, but may be later on; this automatically
 793                 sets the corresponding `force-g*-on-output' property)
 794
 795 'force-g0-on-output
 796 'force-g1-on-output
 797 'force-g2-on-output
 798 'force-g2-on-output
 799      If non-nil, send an explicit designation sequence on output before
 800      using the specified register.
 801
 802 'short
 803      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 804      "ESC $ B" on output in place of the full designation sequences
 805      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 806
 807 'no-ascii-eol
 808      If non-nil, don't designate ASCII to G0 at each end of line on output.
 809      Setting this to non-nil also suppresses other state-resetting that
 810      normally happens at the end of a line.
 811
 812 'no-ascii-cntl
 813      If non-nil, don't designate ASCII to G0 before control chars on output.
 814
 815 'seven
 816      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 817      environment.
 818
 819 'lock-shift
 820      If non-nil, use locking-shift (SO/SI) instead of single-shift
 821      or designation by escape sequence.
 822
 823 'no-iso6429
 824      If non-nil, don't use ISO6429's direction specification.
 825
 826 'escape-quoted
 827      If non-nil, literal control characters that are the same as
 828      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 829      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 830      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 831      so that they can be properly distinguished from an escape sequence.
 832      (Note that doing this results in a non-portable encoding.) This
 833      encoding flag is used for byte-compiled files.  Note that ESC
 834      is a good choice for a quoting character because there are no
 835      escape sequences whose second byte is a character from the Control-0
 836      or Control-1 character sets; this is explicitly disallowed by the
 837      ISO2022 standard.
 838
 839 'input-charset-conversion
 840      A list of conversion specifications, specifying conversion of
 841      characters in one charset to another when decoding is performed.
 842      Each specification is a list of two elements: the source charset,
 843      and the destination charset.
 844
 845 'output-charset-conversion
 846      A list of conversion specifications, specifying conversion of
 847      characters in one charset to another when encoding is performed.
 848      The form of each specification is the same as for
 849      'input-charset-conversion.
 850
 851
 852 The following additional properties are recognized (and required)
 853 if TYPE is 'ccl:
 854
 855 'decode
 856      CCL program used for decoding (converting to internal format).
 857
 858 'encode
 859      CCL program used for encoding (converting to external format).
 860 */
 861        (name, type, doc_string, props))
 862 {
 863   Lisp_Coding_System *codesys;
 864   enum coding_system_type ty;
 865   int need_to_setup_eol_systems = 1;
 866
 867   /* Convert type to constant */
 868   if (NILP (type) || EQ (type, Qundecided))
 869                                       { ty = CODESYS_AUTODETECT; }
 870 #ifdef MULE
 871   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 872   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 873   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 874   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 875   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 876   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 877 #endif
 878   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 879 #ifdef DEBUG_XEMACS
 880   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 881 #endif
 882   else
 883     signal_simple_error ("Invalid coding system type", type);
 884
 885   CHECK_SYMBOL (name);
 886
 887   codesys = allocate_coding_system (ty, name);
 888
 889   if (NILP (doc_string))
 890     doc_string = build_string ("");
 891   else
 892     CHECK_STRING (doc_string);
 893   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 894
 895   {
 896     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 897       {
 898         if (EQ (key, Qmnemonic))
 899           {
 900             if (!NILP (value))
 901               CHECK_STRING (value);
 902             CODING_SYSTEM_MNEMONIC (codesys) = value;
 903           }
 904
 905         else if (EQ (key, Qeol_type))
 906           {
 907             need_to_setup_eol_systems = NILP (value);
 908             if (EQ (value, Qt))
 909               value = Qnil;
 910             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 911           }
 912
 913         else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 914         else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 915 #ifdef MULE
 916         else if (ty == CODESYS_ISO2022)
 917           {
 918 #define FROB_INITIAL_CHARSET(charset_num) \
 919   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 920     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 921
 922             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 923             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 924             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 925             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 926
 927 #define FROB_FORCE_CHARSET(charset_num) \
 928   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 929
 930             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 931             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 932             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 933             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 934
 935 #define FROB_BOOLEAN_PROPERTY(prop) \
 936   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 937
 938             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 939             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 940             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 941             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 942             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 943             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 944             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 945
 946             else if (EQ (key, Qinput_charset_conversion))
 947               {
 948                 codesys->iso2022.input_conv =
 949                   Dynarr_new (charset_conversion_spec);
 950                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 951                                                 value);
 952               }
 953             else if (EQ (key, Qoutput_charset_conversion))
 954               {
 955                 codesys->iso2022.output_conv =
 956                   Dynarr_new (charset_conversion_spec);
 957                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 958                                                 value);
 959               }
 960             else
 961               signal_simple_error ("Unrecognized property", key);
 962           }
 963         else if (EQ (type, Qccl))
 964           {
 965             Lisp_Object sym;
 966             struct ccl_program test_ccl;
 967             Extbyte *suffix;
 968
 969             /* Check key first.  */
 970             if (EQ (key, Qdecode))
 971               suffix = "-ccl-decode";
 972             else if (EQ (key, Qencode))
 973               suffix = "-ccl-encode";
 974             else
 975               signal_simple_error ("Unrecognized property", key);
 976
 977             /* If value is vector, register it as a ccl program
 978                associated with an newly created symbol for
 979                backward compatibility.  */
 980             if (VECTORP (value))
 981               {
 982                 sym = Fintern (concat2 (Fsymbol_name (name),
 983                                         build_string (suffix)),
 984                                Qnil);
 985                 Fregister_ccl_program (sym, value);
 986               }
 987             else
 988               {
 989                 CHECK_SYMBOL (value);
 990                 sym = value;
 991               }
 992             /* check if the given ccl programs are valid.  */
 993             if (setup_ccl_program (&test_ccl, sym) < 0)
 994               signal_simple_error ("Invalid CCL program", value);
 995
 996             if (EQ (key, Qdecode))
 997               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
 998             else if (EQ (key, Qencode))
 999               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1000
1001           }
1002 #endif /* MULE */
1003         else
1004           signal_simple_error ("Unrecognized property", key);
1005       }
1006   }
1007
1008   if (need_to_setup_eol_systems)
1009     setup_eol_coding_systems (codesys);
1010
1011   {
1012     Lisp_Object codesys_obj;
1013     XSETCODING_SYSTEM (codesys_obj, codesys);
1014     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015     return codesys_obj;
1016   }
1017 }
1018
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1022 be created.
1023 */
1024        (old_coding_system, new_name))
1025 {
1026   Lisp_Object new_coding_system;
1027   old_coding_system = Fget_coding_system (old_coding_system);
1028   new_coding_system = Ffind_coding_system (new_name);
1029   if (NILP (new_coding_system))
1030     {
1031       XSETCODING_SYSTEM (new_coding_system,
1032                          allocate_coding_system
1033                          (XCODING_SYSTEM_TYPE (old_coding_system),
1034                           new_name));
1035       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1036     }
1037
1038   {
1039     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041     memcpy (((char *) to  ) + sizeof (to->header),
1042             ((char *) from) + sizeof (from->header),
1043             sizeof (*from) - sizeof (from->header));
1044     to->name = new_name;
1045   }
1046   return new_coding_system;
1047 }
1048
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1051 */
1052        (object))
1053 {
1054   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1055     ? Qt : Qnil;
1056 }
1057
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1061 */
1062        (object))
1063 {
1064   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1065     ? Qt : Qnil;
1066 }
1067
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1070 */
1071        (alias))
1072 {
1073   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074   if (SYMBOLP (aliasee))
1075     return aliasee;
1076   else
1077     signal_simple_error ("Symbol is not a coding system alias", alias);
1078   return Qnil;          /* To keep the compiler happy */
1079 }
1080
1081 static Lisp_Object
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1083 {
1084   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1085                   Qnil);
1086 }
1087
1088 /* A maphash function, for removing dangling coding system aliases. */
1089 static int
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091                                 Lisp_Object aliasee,
1092                                 void *dangling_aliases)
1093 {
1094   if (SYMBOLP (aliasee)
1095       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1096     {
1097       (*(int *) dangling_aliases)++;
1098       return 1;
1099     }
1100   else
1101     return 0;
1102 }
1103
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1106
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1111
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1113
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1116
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1118
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1121 */
1122        (alias, aliasee))
1123 {
1124   Lisp_Object real_coding_system, probe;
1125
1126   CHECK_SYMBOL (alias);
1127
1128   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1129     signal_simple_error
1130       ("Symbol is the canonical name of a coding system and cannot be redefined",
1131        alias);
1132
1133   if (NILP (aliasee))
1134     {
1135       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1137       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1138
1139       Fremhash (alias, Vcoding_system_hash_table);
1140
1141       /* Undefine subsidiary aliases,
1142          presumably created by a previous call to this function */
1143       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1145           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1146         {
1147           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1149           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1150         }
1151
1152       /* Undefine dangling coding system aliases. */
1153       {
1154         int dangling_aliases;
1155
1156         do {
1157           dangling_aliases = 0;
1158           elisp_map_remhash (dangling_coding_system_alias_p,
1159                              Vcoding_system_hash_table,
1160                              &dangling_aliases);
1161         } while (dangling_aliases > 0);
1162       }
1163
1164       return Qnil;
1165     }
1166
1167   if (CODING_SYSTEMP (aliasee))
1168     aliasee = XCODING_SYSTEM_NAME (aliasee);
1169
1170   /* Checks that aliasee names a coding-system */
1171   real_coding_system = Fget_coding_system (aliasee);
1172
1173   /* Check for coding system alias loops */
1174   if (EQ (alias, aliasee))
1175     alias_loop: signal_simple_error_2
1176       ("Attempt to create a coding system alias loop", alias, aliasee);
1177
1178   for (probe = aliasee;
1179        SYMBOLP (probe);
1180        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1181     {
1182       if (EQ (probe, alias))
1183         goto alias_loop;
1184     }
1185
1186   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1187
1188   /* Set up aliases for subsidiaries.
1189      #### There must be a better way to handle subsidiary coding systems. */
1190   {
1191     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1192     int i;
1193     for (i = 0; i < countof (suffixes); i++)
1194       {
1195         Lisp_Object alias_subsidiary =
1196           append_suffix_to_symbol (alias, suffixes[i]);
1197         Lisp_Object aliasee_subsidiary =
1198           append_suffix_to_symbol (aliasee, suffixes[i]);
1199
1200         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1202       }
1203   }
1204   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205      but it doesn't look intentional, so I'd rather return something
1206      meaningful or nothing at all. */
1207   return Qnil;
1208 }
1209
1210 static Lisp_Object
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1212 {
1213   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214   Lisp_Object new_coding_system;
1215
1216   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217     return coding_system;
1218
1219   switch (type)
1220     {
1221     case EOL_AUTODETECT: return coding_system;
1222     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1223     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1224     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225     default:       ABORT (); return Qnil;
1226     }
1227
1228   return NILP (new_coding_system) ? coding_system : new_coding_system;
1229 }
1230
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1233 */
1234        (coding_system, eol_type))
1235 {
1236   coding_system = Fget_coding_system (coding_system);
1237
1238   return subsidiary_coding_system (coding_system,
1239                                    symbol_to_eol_type (eol_type));
1240 }
1241
1242 \f
1243 /************************************************************************/
1244 /*                         Coding system accessors                      */
1245 /************************************************************************/
1246
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1249 */
1250        (coding_system))
1251 {
1252   coding_system = Fget_coding_system (coding_system);
1253   return XCODING_SYSTEM_DOC_STRING (coding_system);
1254 }
1255
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1258 */
1259        (coding_system))
1260 {
1261   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1262     {
1263     default: ABORT ();
1264     case CODESYS_AUTODETECT:    return Qundecided;
1265 #ifdef MULE
1266     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1267     case CODESYS_ISO2022:       return Qiso2022;
1268     case CODESYS_BIG5:          return Qbig5;
1269     case CODESYS_UCS4:          return Qucs4;
1270     case CODESYS_UTF8:          return Qutf8;
1271     case CODESYS_CCL:           return Qccl;
1272 #endif
1273     case CODESYS_NO_CONVERSION: return Qno_conversion;
1274 #ifdef DEBUG_XEMACS
1275     case CODESYS_INTERNAL:      return Qinternal;
1276 #endif
1277     }
1278 }
1279
1280 #ifdef MULE
1281 static
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1283 {
1284   Lisp_Object cs
1285     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1286
1287   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1288 }
1289
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1292 GNUM allows 0 .. 3.
1293 */
1294        (coding_system, gnum))
1295 {
1296   coding_system = Fget_coding_system (coding_system);
1297   CHECK_INT (gnum);
1298
1299   return coding_system_charset (coding_system, XINT (gnum));
1300 }
1301 #endif /* MULE */
1302
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1305 */
1306        (coding_system, prop))
1307 {
1308   int i, ok = 0;
1309   enum coding_system_type type;
1310
1311   coding_system = Fget_coding_system (coding_system);
1312   CHECK_SYMBOL (prop);
1313   type = XCODING_SYSTEM_TYPE (coding_system);
1314
1315   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1317       {
1318         ok = 1;
1319         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1320           {
1321           case CODESYS_PROP_ALL_OK:
1322             break;
1323 #ifdef MULE
1324           case CODESYS_PROP_ISO2022:
1325             if (type != CODESYS_ISO2022)
1326               signal_simple_error
1327                 ("Property only valid in ISO2022 coding systems",
1328                  prop);
1329             break;
1330
1331           case CODESYS_PROP_CCL:
1332             if (type != CODESYS_CCL)
1333               signal_simple_error
1334                 ("Property only valid in CCL coding systems",
1335                  prop);
1336             break;
1337 #endif /* MULE */
1338           default:
1339             ABORT ();
1340           }
1341       }
1342
1343   if (!ok)
1344     signal_simple_error ("Unrecognized property", prop);
1345
1346   if (EQ (prop, Qname))
1347     return XCODING_SYSTEM_NAME (coding_system);
1348   else if (EQ (prop, Qtype))
1349     return Fcoding_system_type (coding_system);
1350   else if (EQ (prop, Qdoc_string))
1351     return XCODING_SYSTEM_DOC_STRING (coding_system);
1352   else if (EQ (prop, Qmnemonic))
1353     return XCODING_SYSTEM_MNEMONIC (coding_system);
1354   else if (EQ (prop, Qeol_type))
1355     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356   else if (EQ (prop, Qeol_lf))
1357     return XCODING_SYSTEM_EOL_LF (coding_system);
1358   else if (EQ (prop, Qeol_crlf))
1359     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360   else if (EQ (prop, Qeol_cr))
1361     return XCODING_SYSTEM_EOL_CR (coding_system);
1362   else if (EQ (prop, Qpost_read_conversion))
1363     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364   else if (EQ (prop, Qpre_write_conversion))
1365     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1366 #ifdef MULE
1367   else if (type == CODESYS_ISO2022)
1368     {
1369       if (EQ (prop, Qcharset_g0))
1370         return coding_system_charset (coding_system, 0);
1371       else if (EQ (prop, Qcharset_g1))
1372         return coding_system_charset (coding_system, 1);
1373       else if (EQ (prop, Qcharset_g2))
1374         return coding_system_charset (coding_system, 2);
1375       else if (EQ (prop, Qcharset_g3))
1376         return coding_system_charset (coding_system, 3);
1377
1378 #define FORCE_CHARSET(charset_num) \
1379   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380    (coding_system, charset_num) ? Qt : Qnil)
1381
1382       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1386
1387 #define LISP_BOOLEAN(prop) \
1388   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1389
1390       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1391       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1392       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1394       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1395       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1396       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1397
1398       else if (EQ (prop, Qinput_charset_conversion))
1399         return
1400           unparse_charset_conversion_specs
1401             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402       else if (EQ (prop, Qoutput_charset_conversion))
1403         return
1404           unparse_charset_conversion_specs
1405             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1406       else
1407         ABORT ();
1408     }
1409   else if (type == CODESYS_CCL)
1410     {
1411       if (EQ (prop, Qdecode))
1412         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413       else if (EQ (prop, Qencode))
1414         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1415       else
1416         ABORT ();
1417     }
1418 #endif /* MULE */
1419   else
1420     ABORT ();
1421
1422   return Qnil; /* not reached */
1423 }
1424
1425 \f
1426 /************************************************************************/
1427 /*                       Coding category functions                      */
1428 /************************************************************************/
1429
1430 static int
1431 decode_coding_category (Lisp_Object symbol)
1432 {
1433   int i;
1434
1435   CHECK_SYMBOL (symbol);
1436   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437     if (EQ (coding_category_symbol[i], symbol))
1438       return i;
1439
1440   signal_simple_error ("Unrecognized coding category", symbol);
1441   return 0; /* not reached */
1442 }
1443
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1446 */
1447        ())
1448 {
1449   int i;
1450   Lisp_Object list = Qnil;
1451
1452   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453     list = Fcons (coding_category_symbol[i], list);
1454   return list;
1455 }
1456
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority.  Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1462 previously.
1463 */
1464        (list))
1465 {
1466   int category_to_priority[CODING_CATEGORY_LAST];
1467   int i, j;
1468   Lisp_Object rest;
1469
1470   /* First generate a list that maps coding categories to priorities. */
1471
1472   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473     category_to_priority[i] = -1;
1474
1475   /* Highest priority comes from the specified list. */
1476   i = 0;
1477   EXTERNAL_LIST_LOOP (rest, list)
1478     {
1479       int cat = decode_coding_category (XCAR (rest));
1480
1481       if (category_to_priority[cat] >= 0)
1482         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483       category_to_priority[cat] = i++;
1484     }
1485
1486   /* Now go through the existing categories by priority to retrieve
1487      the categories not yet specified and preserve their priority
1488      order. */
1489   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1490     {
1491       int cat = fcd->coding_category_by_priority[j];
1492       if (category_to_priority[cat] < 0)
1493         category_to_priority[cat] = i++;
1494     }
1495
1496   /* Now we need to construct the inverse of the mapping we just
1497      constructed. */
1498
1499   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1501
1502   /* Phew!  That was confusing. */
1503   return Qnil;
1504 }
1505
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1508 */
1509        ())
1510 {
1511   int i;
1512   Lisp_Object list = Qnil;
1513
1514   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1516                   list);
1517   return list;
1518 }
1519
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1522 */
1523        (coding_category, coding_system))
1524 {
1525   int cat = decode_coding_category (coding_category);
1526
1527   coding_system = Fget_coding_system (coding_system);
1528   fcd->coding_category_system[cat] = coding_system;
1529   return Qnil;
1530 }
1531
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1534 */
1535        (coding_category))
1536 {
1537   int cat = decode_coding_category (coding_category);
1538   Lisp_Object sys = fcd->coding_category_system[cat];
1539
1540   if (!NILP (sys))
1541     return XCODING_SYSTEM_NAME (sys);
1542   return Qnil;
1543 }
1544
1545 \f
1546 /************************************************************************/
1547 /*                     Detecting the encoding of data                   */
1548 /************************************************************************/
1549
1550 struct detection_state
1551 {
1552   eol_type_t eol_type;
1553   int seen_non_ascii;
1554   int mask;
1555 #ifdef MULE
1556   struct
1557     {
1558       int mask;
1559       int in_second_byte;
1560     }
1561   big5;
1562
1563   struct
1564     {
1565       int mask;
1566       int in_second_byte;
1567     }
1568   shift_jis;
1569
1570   struct
1571     {
1572       int mask;
1573       int in_byte;
1574   }
1575   ucs4;
1576
1577   struct
1578     {
1579       int mask;
1580       int in_byte;
1581     }
1582   utf8;
1583
1584   struct
1585     {
1586       int mask;
1587       int initted;
1588       struct iso2022_decoder iso;
1589       unsigned int flags;
1590       int high_byte_count;
1591       unsigned int saw_single_shift:1;
1592     }
1593   iso2022;
1594 #endif
1595   struct
1596     {
1597       int seen_anything;
1598       int just_saw_cr;
1599     }
1600   eol;
1601 };
1602
1603 static int
1604 acceptable_control_char_p (int c)
1605 {
1606   switch (c)
1607     {
1608       /* Allow and ignore control characters that you might
1609          reasonably see in a text file */
1610     case '\r':
1611     case '\n':
1612     case '\t':
1613     case  7: /* bell */
1614     case  8: /* backspace */
1615     case 11: /* vertical tab */
1616     case 12: /* form feed */
1617     case 26: /* MS-DOS C-z junk */
1618     case 31: /* '^_' -- for info */
1619       return 1;
1620     default:
1621       return 0;
1622     }
1623 }
1624
1625 static int
1626 mask_has_at_most_one_bit_p (int mask)
1627 {
1628   /* Perhaps the only thing useful you learn from intensive Microsoft
1629      technical interviews */
1630   return (mask & (mask - 1)) == 0;
1631 }
1632
1633 static eol_type_t
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1635                  Lstream_data_count n)
1636 {
1637   while (n--)
1638     {
1639       unsigned char c = *(unsigned char *)src++;
1640       if (c == '\n')
1641         {
1642           if (st->eol.just_saw_cr)
1643             return EOL_CRLF;
1644           else if (st->eol.seen_anything)
1645             return EOL_LF;
1646         }
1647       else if (st->eol.just_saw_cr)
1648         return EOL_CR;
1649       else if (c == '\r')
1650         st->eol.just_saw_cr = 1;
1651       else
1652         st->eol.just_saw_cr = 0;
1653       st->eol.seen_anything = 1;
1654     }
1655
1656   return EOL_AUTODETECT;
1657 }
1658
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660    Before calling this function for the first type, you must initialize
1661    st->eol_type as appropriate and initialize st->mask to ~0.
1662
1663    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1664    not yet known.
1665
1666    st->mask holds the determined coding category mask, or ~0 if only
1667    ASCII has been seen so far.
1668
1669    Returns:
1670
1671    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1672         is present in st->mask
1673    1 == definitive answers are here for both st->eol_type and st->mask
1674 */
1675
1676 static int
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678                     Lstream_data_count n, int just_do_eol)
1679 {
1680   if (st->eol_type == EOL_AUTODETECT)
1681     st->eol_type = detect_eol_type (st, src, n);
1682
1683   if (just_do_eol)
1684     return st->eol_type != EOL_AUTODETECT;
1685
1686   if (!st->seen_non_ascii)
1687     {
1688       for (; n; n--, src++)
1689         {
1690           unsigned char c = *(unsigned char *) src;
1691           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1692             {
1693               st->seen_non_ascii = 1;
1694 #ifdef MULE
1695               st->shift_jis.mask = ~0;
1696               st->big5.mask = ~0;
1697               st->ucs4.mask = ~0;
1698               st->utf8.mask = ~0;
1699               st->iso2022.mask = ~0;
1700 #endif
1701               break;
1702             }
1703         }
1704     }
1705
1706   if (!n)
1707     return 0;
1708 #ifdef MULE
1709   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714     st->big5.mask = detect_coding_big5 (st, src, n);
1715   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716     st->utf8.mask = detect_coding_utf8 (st, src, n);
1717   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1719
1720   st->mask
1721     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722     | st->utf8.mask | st->ucs4.mask;
1723 #endif
1724   {
1725     int retval = mask_has_at_most_one_bit_p (st->mask);
1726     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727     return retval && st->eol_type != EOL_AUTODETECT;
1728   }
1729 }
1730
1731 static Lisp_Object
1732 coding_system_from_mask (int mask)
1733 {
1734   if (mask == ~0)
1735     {
1736       /* If the file was entirely or basically ASCII, use the
1737          default value of `buffer-file-coding-system'. */
1738       Lisp_Object retval =
1739         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1740       if (!NILP (retval))
1741         {
1742           retval = Ffind_coding_system (retval);
1743           if (NILP (retval))
1744             {
1745               warn_when_safe
1746                 (Qbad_variable, Qwarning,
1747                  "Invalid `default-buffer-file-coding-system', set to nil");
1748               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1749             }
1750         }
1751       if (NILP (retval))
1752         retval = Fget_coding_system (Qraw_text);
1753       return retval;
1754     }
1755   else
1756     {
1757       int i;
1758       int cat = -1;
1759 #ifdef MULE
1760       mask = postprocess_iso2022_mask (mask);
1761 #endif
1762       /* Look through the coding categories by priority and find
1763          the first one that is allowed. */
1764       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1765         {
1766           cat = fcd->coding_category_by_priority[i];
1767           if ((mask & (1 << cat)) &&
1768               !NILP (fcd->coding_category_system[cat]))
1769             break;
1770         }
1771       if (cat >= 0)
1772         return fcd->coding_category_system[cat];
1773       else
1774         return Fget_coding_system (Qraw_text);
1775     }
1776 }
1777
1778 /* Given a seekable read stream and potential coding system and EOL type
1779    as specified, do any autodetection that is called for.  If the
1780    coding system and/or EOL type are not `autodetect', they will be left
1781    alone; but this function will never return an autodetect coding system
1782    or EOL type.
1783
1784    This function does not automatically fetch subsidiary coding systems;
1785    that should be unnecessary with the explicit eol-type argument. */
1786
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1788 /* number of leading lines to check for a coding cookie */
1789 #define LINES_TO_CHECK 2
1790
1791 void
1792 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1793                               eol_type_t *eol_type_in_out)
1794 {
1795   struct detection_state decst;
1796
1797   if (*eol_type_in_out == EOL_AUTODETECT)
1798     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1799
1800   xzero (decst);
1801   decst.eol_type = *eol_type_in_out;
1802   decst.mask = ~0;
1803
1804   /* If autodetection is called for, do it now. */
1805   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1806       || *eol_type_in_out == EOL_AUTODETECT)
1807     {
1808       Extbyte buf[4096];
1809       Lisp_Object coding_system = Qnil;
1810       Extbyte *p;
1811       Lstream_data_count nread = Lstream_read (stream, buf, sizeof (buf));
1812       Extbyte *scan_end;
1813       int lines_checked = 0;
1814
1815       /* Look for initial "-*-"; mode line prefix */
1816       for (p = buf,
1817              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1818            p <= scan_end
1819              && lines_checked < LINES_TO_CHECK;
1820            p++)
1821         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1822           {
1823             Extbyte *local_vars_beg = p + 3;
1824             /* Look for final "-*-"; mode line suffix */
1825             for (p = local_vars_beg,
1826                    scan_end = buf + nread - LENGTH ("-*-");
1827                  p <= scan_end
1828                    && lines_checked < LINES_TO_CHECK;
1829                  p++)
1830               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1831                 {
1832                   Extbyte *suffix = p;
1833                   /* Look for "coding:" */
1834                   for (p = local_vars_beg,
1835                          scan_end = suffix - LENGTH ("coding:?");
1836                        p <= scan_end;
1837                        p++)
1838                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1839                         && (p == local_vars_beg
1840                             || (*(p-1) == ' '  ||
1841                                 *(p-1) == '\t' ||
1842                                 *(p-1) == ';')))
1843                       {
1844                         Extbyte save;
1845                         int n;
1846                         p += LENGTH ("coding:");
1847                         while (*p == ' ' || *p == '\t') p++;
1848
1849                         /* Get coding system name */
1850                         save = *suffix; *suffix = '\0';
1851                         /* Characters valid in a MIME charset name (rfc 1521),
1852                            and in a Lisp symbol name. */
1853                         n = strspn ( (char *) p,
1854                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1855                                     "abcdefghijklmnopqrstuvwxyz"
1856                                     "0123456789"
1857                                     "!$%&*+-.^_{|}~");
1858                         *suffix = save;
1859                         if (n > 0)
1860                           {
1861                             save = p[n]; p[n] = '\0';
1862                             coding_system =
1863                               Ffind_coding_system (intern ((char *) p));
1864                             p[n] = save;
1865                           }
1866                         break;
1867                       }
1868                   break;
1869                 }
1870               /* #### file must use standard EOLs or we miss 2d line */
1871               /* #### not to mention this is broken for UTF-16 DOS files */
1872               else if (*p == '\n' || *p == '\r')
1873                 {
1874                   lines_checked++;
1875                   /* skip past multibyte (DOS) newline */
1876                   if (*p == '\r' && *(p+1) == '\n') p++;
1877                 }
1878             break;
1879           }
1880         /* #### file must use standard EOLs or we miss 2d line */
1881         /* #### not to mention this is broken for UTF-16 DOS files */
1882         else if (*p == '\n' || *p == '\r')
1883           {
1884             lines_checked++;
1885             /* skip past multibyte (DOS) newline */
1886             if (*p == '\r' && *(p+1) == '\n') p++;
1887           }
1888
1889       if (NILP (coding_system))
1890         do
1891           {
1892             if (detect_coding_type (&decst, buf, nread,
1893                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1894                                     != CODESYS_AUTODETECT))
1895               break;
1896             nread = Lstream_read (stream, buf, sizeof (buf));
1897             if (nread == 0)
1898               break;
1899           }
1900         while (1);
1901
1902       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1903                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1904         do
1905           {
1906             if (detect_coding_type (&decst, buf, nread, 1))
1907               break;
1908             nread = Lstream_read (stream, buf, sizeof (buf));
1909             if (!nread)
1910               break;
1911           }
1912         while (1);
1913
1914       *eol_type_in_out = decst.eol_type;
1915       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1916         {
1917           if (NILP (coding_system))
1918             *codesys_in_out = coding_system_from_mask (decst.mask);
1919           else
1920             *codesys_in_out = coding_system;
1921         }
1922     }
1923
1924   /* If we absolutely can't determine the EOL type, just assume LF. */
1925   if (*eol_type_in_out == EOL_AUTODETECT)
1926     *eol_type_in_out = EOL_LF;
1927
1928   Lstream_rewind (stream);
1929 }
1930
1931 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1932 Detect coding system of the text in the region between START and END.
1933 Return a list of possible coding systems ordered by priority.
1934 If only ASCII characters are found, return 'undecided or one of
1935 its subsidiary coding systems according to a detected end-of-line
1936 type.  Optional arg BUFFER defaults to the current buffer.
1937 */
1938        (start, end, buffer))
1939 {
1940   Lisp_Object val = Qnil;
1941   struct buffer *buf = decode_buffer (buffer, 0);
1942   Bufpos b, e;
1943   Lisp_Object instream, lb_instream;
1944   Lstream *istr, *lb_istr;
1945   struct detection_state decst;
1946   struct gcpro gcpro1, gcpro2;
1947
1948   get_buffer_range_char (buf, start, end, &b, &e, 0);
1949   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1950   lb_istr = XLSTREAM (lb_instream);
1951   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1952   istr = XLSTREAM (instream);
1953   GCPRO2 (instream, lb_instream);
1954   xzero (decst);
1955   decst.eol_type = EOL_AUTODETECT;
1956   decst.mask = ~0;
1957   while (1)
1958     {
1959       Extbyte random_buffer[4096];
1960       Lstream_data_count nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1961
1962       if (!nread)
1963         break;
1964       if (detect_coding_type (&decst, random_buffer, nread, 0))
1965         break;
1966     }
1967
1968   if (decst.mask == ~0)
1969     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1970                                     decst.eol_type);
1971   else
1972     {
1973       int i;
1974
1975       val = Qnil;
1976 #ifdef MULE
1977       decst.mask = postprocess_iso2022_mask (decst.mask);
1978 #endif
1979       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1980         {
1981           int sys = fcd->coding_category_by_priority[i];
1982           if (decst.mask & (1 << sys))
1983             {
1984               Lisp_Object codesys = fcd->coding_category_system[sys];
1985               if (!NILP (codesys))
1986                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1987               val = Fcons (codesys, val);
1988             }
1989         }
1990     }
1991   Lstream_close (istr);
1992   UNGCPRO;
1993   Lstream_delete (istr);
1994   Lstream_delete (lb_istr);
1995   return val;
1996 }
1997
1998 \f
1999 /************************************************************************/
2000 /*           Converting to internal Mule format ("decoding")            */
2001 /************************************************************************/
2002
2003 /* A decoding stream is a stream used for decoding text (i.e.
2004    converting from some external format to internal format).
2005    The decoding-stream object keeps track of the actual coding
2006    stream, the stream that is at the other end, and data that
2007    needs to be persistent across the lifetime of the stream. */
2008
2009 /* Handle the EOL stuff related to just-read-in character C.
2010    EOL_TYPE is the EOL type of the coding stream.
2011    FLAGS is the current value of FLAGS in the coding stream, and may
2012    be modified by this macro.  (The macro only looks at the
2013    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2014    bytes are to be written.  You need to also define a local goto
2015    label "label_continue_loop" that is at the end of the main
2016    character-reading loop.
2017
2018    If C is a CR character, then this macro handles it entirely and
2019    jumps to label_continue_loop.  Otherwise, this macro does not add
2020    anything to DST, and continues normally.  You should continue
2021    processing C normally after this macro. */
2022
2023 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2024 do {                                                            \
2025   if (c == '\r')                                                \
2026     {                                                           \
2027       if (eol_type == EOL_CR)                                   \
2028         Dynarr_add (dst, '\n');                                 \
2029       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2030         Dynarr_add (dst, c);                                    \
2031       else                                                      \
2032         flags |= CODING_STATE_CR;                               \
2033       goto label_continue_loop;                                 \
2034     }                                                           \
2035   else if (flags & CODING_STATE_CR)                             \
2036     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2037       if (c != '\n')                                            \
2038         Dynarr_add (dst, '\r');                                 \
2039       flags &= ~CODING_STATE_CR;                                \
2040     }                                                           \
2041 } while (0)
2042
2043 /* C should be a binary character in the range 0 - 255; convert
2044    to internal format and add to Dynarr DST. */
2045
2046 #ifdef UTF2000
2047 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2048 do {                                            \
2049   if (BYTE_ASCII_P (c))                         \
2050     Dynarr_add (dst, c);                        \
2051   else                                          \
2052     {                                           \
2053       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2054       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2055     }                                           \
2056 } while (0)
2057
2058 INLINE void
2059 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2060 {
2061   if ( c <= 0x7f )
2062     {
2063       Dynarr_add (dst, c);
2064     }
2065   else if ( c <= 0x7ff )
2066     {
2067       Dynarr_add (dst, (c >> 6) | 0xc0);
2068       Dynarr_add (dst, (c & 0x3f) | 0x80);
2069     }
2070   else if ( c <= 0xffff )
2071     {
2072       Dynarr_add (dst,  (c >> 12) | 0xe0);
2073       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2074       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2075     }
2076   else if ( c <= 0x1fffff )
2077     {
2078       Dynarr_add (dst,  (c >> 18) | 0xf0);
2079       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2080       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2081       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2082     }
2083   else if ( c <= 0x3ffffff )
2084     {
2085       Dynarr_add (dst,  (c >> 24) | 0xf8);
2086       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2087       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2088       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2089       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2090     }
2091   else
2092     {
2093       Dynarr_add (dst,  (c >> 30) | 0xfc);
2094       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2095       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2096       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2097       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2098       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2099     }
2100 }
2101 #else
2102 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2103 do {                                            \
2104   if (BYTE_ASCII_P (c))                         \
2105     Dynarr_add (dst, c);                        \
2106   else if (BYTE_C1_P (c))                       \
2107     {                                           \
2108       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2109       Dynarr_add (dst, c + 0x20);               \
2110     }                                           \
2111   else                                          \
2112     {                                           \
2113       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2114       Dynarr_add (dst, c);                      \
2115     }                                           \
2116 } while (0)
2117 #endif
2118
2119 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2120 do {                                    \
2121   if (ch)                               \
2122     {                                   \
2123       DECODE_ADD_BINARY_CHAR (ch, dst); \
2124       ch = 0;                           \
2125     }                                   \
2126 } while (0)
2127
2128 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2129 do {                                    \
2130   if (flags & CODING_STATE_END)         \
2131     {                                   \
2132       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2133       if (flags & CODING_STATE_CR)      \
2134         Dynarr_add (dst, '\r');         \
2135     }                                   \
2136 } while (0)
2137
2138 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2139
2140 struct decoding_stream
2141 {
2142   /* Coding system that governs the conversion. */
2143   Lisp_Coding_System *codesys;
2144
2145   /* Stream that we read the encoded data from or
2146      write the decoded data to. */
2147   Lstream *other_end;
2148
2149   /* If we are reading, then we can return only a fixed amount of
2150      data, so if the conversion resulted in too much data, we store it
2151      here for retrieval the next time around. */
2152   unsigned_char_dynarr *runoff;
2153
2154   /* FLAGS holds flags indicating the current state of the decoding.
2155      Some of these flags are dependent on the coding system. */
2156   unsigned int flags;
2157
2158   /* CH holds a partially built-up character.  Since we only deal
2159      with one- and two-byte characters at the moment, we only use
2160      this to store the first byte of a two-byte character. */
2161   unsigned int ch;
2162
2163   /* EOL_TYPE specifies the type of end-of-line conversion that
2164      currently applies.  We need to keep this separate from the
2165      EOL type stored in CODESYS because the latter might indicate
2166      automatic EOL-type detection while the former will always
2167      indicate a particular EOL type. */
2168   eol_type_t eol_type;
2169 #ifdef MULE
2170   /* Additional ISO2022 information.  We define the structure above
2171      because it's also needed by the detection routines. */
2172   struct iso2022_decoder iso2022;
2173
2174   /* Additional information (the state of the running CCL program)
2175      used by the CCL decoder. */
2176   struct ccl_program ccl;
2177
2178   /* counter for UTF-8 or UCS-4 */
2179   unsigned char counter;
2180 #endif
2181   struct detection_state decst;
2182 };
2183
2184 static Lstream_data_count decoding_reader (Lstream *stream,
2185                                 unsigned char *data, Lstream_data_count size);
2186 static Lstream_data_count decoding_writer (Lstream *stream,
2187                                 const unsigned char *data, Lstream_data_count size);
2188 static int decoding_rewinder   (Lstream *stream);
2189 static int decoding_seekable_p (Lstream *stream);
2190 static int decoding_flusher    (Lstream *stream);
2191 static int decoding_closer     (Lstream *stream);
2192
2193 static Lisp_Object decoding_marker (Lisp_Object stream);
2194
2195 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2196                                sizeof (struct decoding_stream));
2197
2198 static Lisp_Object
2199 decoding_marker (Lisp_Object stream)
2200 {
2201   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2202   Lisp_Object str_obj;
2203
2204   /* We do not need to mark the coding systems or charsets stored
2205      within the stream because they are stored in a global list
2206      and automatically marked. */
2207
2208   XSETLSTREAM (str_obj, str);
2209   mark_object (str_obj);
2210   if (str->imp->marker)
2211     return (str->imp->marker) (str_obj);
2212   else
2213     return Qnil;
2214 }
2215
2216 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2217    so we read data from the other end, decode it, and store it into DATA. */
2218
2219 static Lstream_data_count
2220 decoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2221 {
2222   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2223   unsigned char *orig_data = data;
2224   Lstream_data_count read_size;
2225   int error_occurred = 0;
2226
2227   /* We need to interface to mule_decode(), which expects to take some
2228      amount of data and store the result into a Dynarr.  We have
2229      mule_decode() store into str->runoff, and take data from there
2230      as necessary. */
2231
2232   /* We loop until we have enough data, reading chunks from the other
2233      end and decoding it. */
2234   while (1)
2235     {
2236       /* Take data from the runoff if we can.  Make sure to take at
2237          most SIZE bytes, and delete the data from the runoff. */
2238       if (Dynarr_length (str->runoff) > 0)
2239         {
2240           Lstream_data_count chunk = min (size, (Lstream_data_count) Dynarr_length (str->runoff));
2241           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2242           Dynarr_delete_many (str->runoff, 0, chunk);
2243           data += chunk;
2244           size -= chunk;
2245         }
2246
2247       if (size == 0)
2248         break; /* No more room for data */
2249
2250       if (str->flags & CODING_STATE_END)
2251         /* This means that on the previous iteration, we hit the EOF on
2252            the other end.  We loop once more so that mule_decode() can
2253            output any final stuff it may be holding, or any "go back
2254            to a sane state" escape sequences. (This latter makes sense
2255            during encoding.) */
2256         break;
2257
2258       /* Exhausted the runoff, so get some more.  DATA has at least
2259          SIZE bytes left of storage in it, so it's OK to read directly
2260          into it.  (We'll be overwriting above, after we've decoded it
2261          into the runoff.) */
2262       read_size = Lstream_read (str->other_end, data, size);
2263       if (read_size < 0)
2264         {
2265           error_occurred = 1;
2266           break;
2267         }
2268       if (read_size == 0)
2269         /* There might be some more end data produced in the translation.
2270            See the comment above. */
2271         str->flags |= CODING_STATE_END;
2272       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2273     }
2274
2275   if (data - orig_data == 0)
2276     return error_occurred ? -1 : 0;
2277   else
2278     return data - orig_data;
2279 }
2280
2281 static Lstream_data_count
2282 decoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2283 {
2284   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2285   Lstream_data_count retval;
2286
2287   /* Decode all our data into the runoff, and then attempt to write
2288      it all out to the other end.  Remove whatever chunk we succeeded
2289      in writing. */
2290   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2291   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2292                           Dynarr_length (str->runoff));
2293   if (retval > 0)
2294     Dynarr_delete_many (str->runoff, 0, retval);
2295   /* Do NOT return retval.  The return value indicates how much
2296      of the incoming data was written, not how many bytes were
2297      written. */
2298   return size;
2299 }
2300
2301 static void
2302 reset_decoding_stream (struct decoding_stream *str)
2303 {
2304 #ifdef MULE
2305   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2306     {
2307       Lisp_Object coding_system;
2308       XSETCODING_SYSTEM (coding_system, str->codesys);
2309       reset_iso2022 (coding_system, &str->iso2022);
2310     }
2311   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2312     {
2313       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2314     }
2315   str->counter = 0;
2316 #endif /* MULE */
2317   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2318       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2319     {
2320       xzero (str->decst);
2321       str->decst.eol_type = EOL_AUTODETECT;
2322       str->decst.mask = ~0;
2323     }
2324   str->flags = str->ch = 0;
2325 }
2326
2327 static int
2328 decoding_rewinder (Lstream *stream)
2329 {
2330   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2331   reset_decoding_stream (str);
2332   Dynarr_reset (str->runoff);
2333   return Lstream_rewind (str->other_end);
2334 }
2335
2336 static int
2337 decoding_seekable_p (Lstream *stream)
2338 {
2339   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2340   return Lstream_seekable_p (str->other_end);
2341 }
2342
2343 static int
2344 decoding_flusher (Lstream *stream)
2345 {
2346   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2347   return Lstream_flush (str->other_end);
2348 }
2349
2350 static int
2351 decoding_closer (Lstream *stream)
2352 {
2353   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2354   if (stream->flags & LSTREAM_FL_WRITE)
2355     {
2356       str->flags |= CODING_STATE_END;
2357       decoding_writer (stream, 0, 0);
2358     }
2359   Dynarr_free (str->runoff);
2360 #ifdef MULE
2361 #ifdef ENABLE_COMPOSITE_CHARS
2362   if (str->iso2022.composite_chars)
2363     Dynarr_free (str->iso2022.composite_chars);
2364 #endif
2365 #endif
2366   return Lstream_close (str->other_end);
2367 }
2368
2369 Lisp_Object
2370 decoding_stream_coding_system (Lstream *stream)
2371 {
2372   Lisp_Object coding_system;
2373   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2374
2375   XSETCODING_SYSTEM (coding_system, str->codesys);
2376   return subsidiary_coding_system (coding_system, str->eol_type);
2377 }
2378
2379 void
2380 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2381 {
2382   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2383   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2384   str->codesys = cs;
2385   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2386     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2387   reset_decoding_stream (str);
2388 }
2389
2390 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2391    stream for writing, no automatic code detection will be performed.
2392    The reason for this is that automatic code detection requires a
2393    seekable input.  Things will also fail if you open a decoding
2394    stream for reading using a non-fully-specified coding system and
2395    a non-seekable input stream. */
2396
2397 static Lisp_Object
2398 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2399                         const char *mode)
2400 {
2401   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2402   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2403   Lisp_Object obj;
2404
2405   xzero (*str);
2406   str->other_end = stream;
2407   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2408   str->eol_type = EOL_AUTODETECT;
2409   if (!strcmp (mode, "r")
2410       && Lstream_seekable_p (stream))
2411     /* We can determine the coding system now. */
2412     determine_real_coding_system (stream, &codesys, &str->eol_type);
2413   set_decoding_stream_coding_system (lstr, codesys);
2414   str->decst.eol_type = str->eol_type;
2415   str->decst.mask = ~0;
2416   XSETLSTREAM (obj, lstr);
2417   return obj;
2418 }
2419
2420 Lisp_Object
2421 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2422 {
2423   return make_decoding_stream_1 (stream, codesys, "r");
2424 }
2425
2426 Lisp_Object
2427 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2428 {
2429   return make_decoding_stream_1 (stream, codesys, "w");
2430 }
2431
2432 /* Note: the decode_coding_* functions all take the same
2433    arguments as mule_decode(), which is to say some SRC data of
2434    size N, which is to be stored into dynamic array DST.
2435    DECODING is the stream within which the decoding is
2436    taking place, but no data is actually read from or
2437    written to that stream; that is handled in decoding_reader()
2438    or decoding_writer().  This allows the same functions to
2439    be used for both reading and writing. */
2440
2441 static void
2442 mule_decode (Lstream *decoding, const Extbyte *src,
2443              unsigned_char_dynarr *dst, Lstream_data_count n)
2444 {
2445   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2446
2447   /* If necessary, do encoding-detection now.  We do this when
2448      we're a writing stream or a non-seekable reading stream,
2449      meaning that we can't just process the whole input,
2450      rewind, and start over. */
2451
2452   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2453       str->eol_type == EOL_AUTODETECT)
2454     {
2455       Lisp_Object codesys;
2456
2457       XSETCODING_SYSTEM (codesys, str->codesys);
2458       detect_coding_type (&str->decst, src, n,
2459                           CODING_SYSTEM_TYPE (str->codesys) !=
2460                           CODESYS_AUTODETECT);
2461       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2462           str->decst.mask != ~0)
2463         /* #### This is cheesy.  What we really ought to do is
2464            buffer up a certain amount of data so as to get a
2465            less random result. */
2466         codesys = coding_system_from_mask (str->decst.mask);
2467       str->eol_type = str->decst.eol_type;
2468       if (XCODING_SYSTEM (codesys) != str->codesys)
2469         {
2470           /* Preserve the CODING_STATE_END flag in case it was set.
2471              If we erase it, bad things might happen. */
2472           int was_end = str->flags & CODING_STATE_END;
2473           set_decoding_stream_coding_system (decoding, codesys);
2474           if (was_end)
2475             str->flags |= CODING_STATE_END;
2476         }
2477     }
2478
2479   switch (CODING_SYSTEM_TYPE (str->codesys))
2480     {
2481 #ifdef DEBUG_XEMACS
2482     case CODESYS_INTERNAL:
2483       Dynarr_add_many (dst, src, n);
2484       break;
2485 #endif
2486     case CODESYS_AUTODETECT:
2487       /* If we got this far and still haven't decided on the coding
2488          system, then do no conversion. */
2489     case CODESYS_NO_CONVERSION:
2490       decode_coding_no_conversion (decoding, src, dst, n);
2491       break;
2492 #ifdef MULE
2493     case CODESYS_SHIFT_JIS:
2494       decode_coding_sjis (decoding, src, dst, n);
2495       break;
2496     case CODESYS_BIG5:
2497       decode_coding_big5 (decoding, src, dst, n);
2498       break;
2499     case CODESYS_UCS4:
2500       decode_coding_ucs4 (decoding, src, dst, n);
2501       break;
2502     case CODESYS_UTF8:
2503       decode_coding_utf8 (decoding, src, dst, n);
2504       break;
2505     case CODESYS_CCL:
2506       str->ccl.last_block = str->flags & CODING_STATE_END;
2507       /* When applying ccl program to stream, MUST NOT set NULL
2508          pointer to src.  */
2509       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2510                   dst, n, 0, CCL_MODE_DECODING);
2511       break;
2512     case CODESYS_ISO2022:
2513       decode_coding_iso2022 (decoding, src, dst, n);
2514       break;
2515 #endif /* MULE */
2516     default:
2517       ABORT ();
2518     }
2519 }
2520
2521 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2522 Decode the text between START and END which is encoded in CODING-SYSTEM.
2523 This is useful if you've read in encoded text from a file without decoding
2524 it (e.g. you read in a JIS-formatted file but used the `binary' or
2525 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2526 Return length of decoded text.
2527 BUFFER defaults to the current buffer if unspecified.
2528 */
2529        (start, end, coding_system, buffer))
2530 {
2531   Bufpos b, e;
2532   struct buffer *buf = decode_buffer (buffer, 0);
2533   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2534   Lstream *istr, *ostr;
2535   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2536
2537   get_buffer_range_char (buf, start, end, &b, &e, 0);
2538
2539   barf_if_buffer_read_only (buf, b, e);
2540
2541   coding_system = Fget_coding_system (coding_system);
2542   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2543   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2544   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2545                                               coding_system);
2546   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2547                                            Fget_coding_system (Qbinary));
2548   istr = XLSTREAM (instream);
2549   ostr = XLSTREAM (outstream);
2550   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2551
2552   /* The chain of streams looks like this:
2553
2554      [BUFFER] <----- send through
2555                      ------> [ENCODE AS BINARY]
2556                              ------> [DECODE AS SPECIFIED]
2557                                      ------> [BUFFER]
2558    */
2559
2560   while (1)
2561     {
2562       char tempbuf[1024]; /* some random amount */
2563       Bufpos newpos, even_newer_pos;
2564       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2565       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2566
2567       if (!size_in_bytes)
2568         break;
2569       newpos = lisp_buffer_stream_startpos (istr);
2570       Lstream_write (ostr, tempbuf, size_in_bytes);
2571       even_newer_pos = lisp_buffer_stream_startpos (istr);
2572       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2573                            even_newer_pos, 0);
2574     }
2575   Lstream_close (istr);
2576   Lstream_close (ostr);
2577   UNGCPRO;
2578   Lstream_delete (istr);
2579   Lstream_delete (ostr);
2580   Lstream_delete (XLSTREAM (de_outstream));
2581   Lstream_delete (XLSTREAM (lb_outstream));
2582   return Qnil;
2583 }
2584
2585 \f
2586 /************************************************************************/
2587 /*           Converting to an external encoding ("encoding")            */
2588 /************************************************************************/
2589
2590 /* An encoding stream is an output stream.  When you create the
2591    stream, you specify the coding system that governs the encoding
2592    and another stream that the resulting encoded data is to be
2593    sent to, and then start sending data to it. */
2594
2595 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2596
2597 struct encoding_stream
2598 {
2599   /* Coding system that governs the conversion. */
2600   Lisp_Coding_System *codesys;
2601
2602   /* Stream that we read the encoded data from or
2603      write the decoded data to. */
2604   Lstream *other_end;
2605
2606   /* If we are reading, then we can return only a fixed amount of
2607      data, so if the conversion resulted in too much data, we store it
2608      here for retrieval the next time around. */
2609   unsigned_char_dynarr *runoff;
2610
2611   /* FLAGS holds flags indicating the current state of the encoding.
2612      Some of these flags are dependent on the coding system. */
2613   unsigned int flags;
2614
2615   /* CH holds a partially built-up character.  Since we only deal
2616      with one- and two-byte characters at the moment, we only use
2617      this to store the first byte of a two-byte character. */
2618   unsigned int ch;
2619 #ifdef MULE
2620   /* Additional information used by the ISO2022 encoder. */
2621   struct
2622     {
2623       /* CHARSET holds the character sets currently assigned to the G0
2624          through G3 registers.  It is initialized from the array
2625          INITIAL_CHARSET in CODESYS. */
2626       Lisp_Object charset[4];
2627
2628       /* Which registers are currently invoked into the left (GL) and
2629          right (GR) halves of the 8-bit encoding space? */
2630       int register_left, register_right;
2631
2632       /* Whether we need to explicitly designate the charset in the
2633          G? register before using it.  It is initialized from the
2634          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2635       unsigned char force_charset_on_output[4];
2636
2637       /* Other state variables that need to be preserved across
2638          invocations. */
2639       Lisp_Object current_charset;
2640       int current_half;
2641       int current_char_boundary;
2642     } iso2022;
2643
2644   /* Additional information (the state of the running CCL program)
2645      used by the CCL encoder. */
2646   struct ccl_program ccl;
2647 #endif /* MULE */
2648 };
2649
2650 static Lstream_data_count encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size);
2651 static Lstream_data_count encoding_writer (Lstream *stream, const unsigned char *data,
2652                                 Lstream_data_count size);
2653 static int encoding_rewinder   (Lstream *stream);
2654 static int encoding_seekable_p (Lstream *stream);
2655 static int encoding_flusher    (Lstream *stream);
2656 static int encoding_closer     (Lstream *stream);
2657
2658 static Lisp_Object encoding_marker (Lisp_Object stream);
2659
2660 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2661                                sizeof (struct encoding_stream));
2662
2663 static Lisp_Object
2664 encoding_marker (Lisp_Object stream)
2665 {
2666   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2667   Lisp_Object str_obj;
2668
2669   /* We do not need to mark the coding systems or charsets stored
2670      within the stream because they are stored in a global list
2671      and automatically marked. */
2672
2673   XSETLSTREAM (str_obj, str);
2674   mark_object (str_obj);
2675   if (str->imp->marker)
2676     return (str->imp->marker) (str_obj);
2677   else
2678     return Qnil;
2679 }
2680
2681 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2682    so we read data from the other end, encode it, and store it into DATA. */
2683
2684 static Lstream_data_count
2685 encoding_reader (Lstream *stream, unsigned char *data, Lstream_data_count size)
2686 {
2687   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2688   unsigned char *orig_data = data;
2689   Lstream_data_count read_size;
2690   int error_occurred = 0;
2691
2692   /* We need to interface to mule_encode(), which expects to take some
2693      amount of data and store the result into a Dynarr.  We have
2694      mule_encode() store into str->runoff, and take data from there
2695      as necessary. */
2696
2697   /* We loop until we have enough data, reading chunks from the other
2698      end and encoding it. */
2699   while (1)
2700     {
2701       /* Take data from the runoff if we can.  Make sure to take at
2702          most SIZE bytes, and delete the data from the runoff. */
2703       if (Dynarr_length (str->runoff) > 0)
2704         {
2705           int chunk = min ((int) size, Dynarr_length (str->runoff));
2706           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2707           Dynarr_delete_many (str->runoff, 0, chunk);
2708           data += chunk;
2709           size -= chunk;
2710         }
2711
2712       if (size == 0)
2713         break; /* No more room for data */
2714
2715       if (str->flags & CODING_STATE_END)
2716         /* This means that on the previous iteration, we hit the EOF on
2717            the other end.  We loop once more so that mule_encode() can
2718            output any final stuff it may be holding, or any "go back
2719            to a sane state" escape sequences. (This latter makes sense
2720            during encoding.) */
2721         break;
2722
2723       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2724          left of storage in it, so it's OK to read directly into it.
2725          (We'll be overwriting above, after we've encoded it into the
2726          runoff.) */
2727       read_size = Lstream_read (str->other_end, data, size);
2728       if (read_size < 0)
2729         {
2730           error_occurred = 1;
2731           break;
2732         }
2733       if (read_size == 0)
2734         /* There might be some more end data produced in the translation.
2735            See the comment above. */
2736         str->flags |= CODING_STATE_END;
2737       mule_encode (stream, data, str->runoff, read_size);
2738     }
2739
2740   if (data == orig_data)
2741     return error_occurred ? -1 : 0;
2742   else
2743     return data - orig_data;
2744 }
2745
2746 static Lstream_data_count
2747 encoding_writer (Lstream *stream, const unsigned char *data, Lstream_data_count size)
2748 {
2749   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2750   Lstream_data_count retval;
2751
2752   /* Encode all our data into the runoff, and then attempt to write
2753      it all out to the other end.  Remove whatever chunk we succeeded
2754      in writing. */
2755   mule_encode (stream, data, str->runoff, size);
2756   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2757                           Dynarr_length (str->runoff));
2758   if (retval > 0)
2759     Dynarr_delete_many (str->runoff, 0, retval);
2760   /* Do NOT return retval.  The return value indicates how much
2761      of the incoming data was written, not how many bytes were
2762      written. */
2763   return size;
2764 }
2765
2766 static void
2767 reset_encoding_stream (struct encoding_stream *str)
2768 {
2769 #ifdef MULE
2770   switch (CODING_SYSTEM_TYPE (str->codesys))
2771     {
2772     case CODESYS_ISO2022:
2773       {
2774         int i;
2775
2776         for (i = 0; i < 4; i++)
2777           {
2778             str->iso2022.charset[i] =
2779               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2780             str->iso2022.force_charset_on_output[i] =
2781               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2782           }
2783         str->iso2022.register_left = 0;
2784         str->iso2022.register_right = 1;
2785         str->iso2022.current_charset = Qnil;
2786         str->iso2022.current_half = 0;
2787 #ifdef UTF2000
2788         str->iso2022.current_char_boundary = 0;
2789 #else
2790         str->iso2022.current_char_boundary = 1;
2791 #endif
2792         break;
2793       }
2794     case CODESYS_CCL:
2795       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2796       break;
2797     default:
2798       break;
2799     }
2800 #endif /* MULE */
2801
2802   str->flags = str->ch = 0;
2803 }
2804
2805 static int
2806 encoding_rewinder (Lstream *stream)
2807 {
2808   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2809   reset_encoding_stream (str);
2810   Dynarr_reset (str->runoff);
2811   return Lstream_rewind (str->other_end);
2812 }
2813
2814 static int
2815 encoding_seekable_p (Lstream *stream)
2816 {
2817   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2818   return Lstream_seekable_p (str->other_end);
2819 }
2820
2821 static int
2822 encoding_flusher (Lstream *stream)
2823 {
2824   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2825   return Lstream_flush (str->other_end);
2826 }
2827
2828 static int
2829 encoding_closer (Lstream *stream)
2830 {
2831   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2832   if (stream->flags & LSTREAM_FL_WRITE)
2833     {
2834       str->flags |= CODING_STATE_END;
2835       encoding_writer (stream, 0, 0);
2836     }
2837   Dynarr_free (str->runoff);
2838   return Lstream_close (str->other_end);
2839 }
2840
2841 Lisp_Object
2842 encoding_stream_coding_system (Lstream *stream)
2843 {
2844   Lisp_Object coding_system;
2845   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2846
2847   XSETCODING_SYSTEM (coding_system, str->codesys);
2848   return coding_system;
2849 }
2850
2851 void
2852 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2853 {
2854   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2855   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2856   str->codesys = cs;
2857   reset_encoding_stream (str);
2858 }
2859
2860 static Lisp_Object
2861 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2862                         const char *mode)
2863 {
2864   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2865   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2866   Lisp_Object obj;
2867
2868   xzero (*str);
2869   str->runoff = Dynarr_new (unsigned_char);
2870   str->other_end = stream;
2871   set_encoding_stream_coding_system (lstr, codesys);
2872   XSETLSTREAM (obj, lstr);
2873   return obj;
2874 }
2875
2876 Lisp_Object
2877 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2878 {
2879   return make_encoding_stream_1 (stream, codesys, "r");
2880 }
2881
2882 Lisp_Object
2883 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2884 {
2885   return make_encoding_stream_1 (stream, codesys, "w");
2886 }
2887
2888 /* Convert N bytes of internally-formatted data stored in SRC to an
2889    external format, according to the encoding stream ENCODING.
2890    Store the encoded data into DST. */
2891
2892 static void
2893 mule_encode (Lstream *encoding, const Bufbyte *src,
2894              unsigned_char_dynarr *dst, Lstream_data_count n)
2895 {
2896   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2897
2898   switch (CODING_SYSTEM_TYPE (str->codesys))
2899     {
2900 #ifdef DEBUG_XEMACS
2901     case CODESYS_INTERNAL:
2902       Dynarr_add_many (dst, src, n);
2903       break;
2904 #endif
2905     case CODESYS_AUTODETECT:
2906       /* If we got this far and still haven't decided on the coding
2907          system, then do no conversion. */
2908     case CODESYS_NO_CONVERSION:
2909       encode_coding_no_conversion (encoding, src, dst, n);
2910       break;
2911 #ifdef MULE
2912     case CODESYS_SHIFT_JIS:
2913       encode_coding_sjis (encoding, src, dst, n);
2914       break;
2915     case CODESYS_BIG5:
2916       encode_coding_big5 (encoding, src, dst, n);
2917       break;
2918     case CODESYS_UCS4:
2919       encode_coding_ucs4 (encoding, src, dst, n);
2920       break;
2921     case CODESYS_UTF8:
2922       encode_coding_utf8 (encoding, src, dst, n);
2923       break;
2924     case CODESYS_CCL:
2925       str->ccl.last_block = str->flags & CODING_STATE_END;
2926       /* When applying ccl program to stream, MUST NOT set NULL
2927          pointer to src.  */
2928       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2929                   dst, n, 0, CCL_MODE_ENCODING);
2930       break;
2931     case CODESYS_ISO2022:
2932       encode_coding_iso2022 (encoding, src, dst, n);
2933       break;
2934 #endif /* MULE */
2935     default:
2936       ABORT ();
2937     }
2938 }
2939
2940 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2941 Encode the text between START and END using CODING-SYSTEM.
2942 This will, for example, convert Japanese characters into stuff such as
2943 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2944 text.  BUFFER defaults to the current buffer if unspecified.
2945 */
2946        (start, end, coding_system, buffer))
2947 {
2948   Bufpos b, e;
2949   struct buffer *buf = decode_buffer (buffer, 0);
2950   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2951   Lstream *istr, *ostr;
2952   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2953
2954   get_buffer_range_char (buf, start, end, &b, &e, 0);
2955
2956   barf_if_buffer_read_only (buf, b, e);
2957
2958   coding_system = Fget_coding_system (coding_system);
2959   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2960   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2961   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2962                                               Fget_coding_system (Qbinary));
2963   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2964                                            coding_system);
2965   istr = XLSTREAM (instream);
2966   ostr = XLSTREAM (outstream);
2967   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2968   /* The chain of streams looks like this:
2969
2970      [BUFFER] <----- send through
2971                      ------> [ENCODE AS SPECIFIED]
2972                              ------> [DECODE AS BINARY]
2973                                      ------> [BUFFER]
2974    */
2975   while (1)
2976     {
2977       char tempbuf[1024]; /* some random amount */
2978       Bufpos newpos, even_newer_pos;
2979       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2980       Lstream_data_count size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2981
2982       if (!size_in_bytes)
2983         break;
2984       newpos = lisp_buffer_stream_startpos (istr);
2985       Lstream_write (ostr, tempbuf, size_in_bytes);
2986       even_newer_pos = lisp_buffer_stream_startpos (istr);
2987       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2988                            even_newer_pos, 0);
2989     }
2990
2991   {
2992     Charcount retlen =
2993       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2994     Lstream_close (istr);
2995     Lstream_close (ostr);
2996     UNGCPRO;
2997     Lstream_delete (istr);
2998     Lstream_delete (ostr);
2999     Lstream_delete (XLSTREAM (de_outstream));
3000     Lstream_delete (XLSTREAM (lb_outstream));
3001     return make_int (retlen);
3002   }
3003 }
3004
3005 #ifdef MULE
3006 \f
3007 /************************************************************************/
3008 /*                          Shift-JIS methods                           */
3009 /************************************************************************/
3010
3011 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3012    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3013    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3014    encoded by "position-code + 0x80".  A character of JISX0208
3015    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3016    position-codes are divided and shifted so that it fit in the range
3017    below.
3018
3019    --- CODE RANGE of Shift-JIS ---
3020    (character set)      (range)
3021    ASCII                0x00 .. 0x7F
3022    JISX0201-Kana        0xA0 .. 0xDF
3023    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3024             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3025    -------------------------------
3026
3027 */
3028
3029 /* Is this the first byte of a Shift-JIS two-byte char? */
3030
3031 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3032   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3033
3034 /* Is this the second byte of a Shift-JIS two-byte char? */
3035
3036 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3037   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3038
3039 #define BYTE_SJIS_KATAKANA_P(c) \
3040   ((c) >= 0xA1 && (c) <= 0xDF)
3041
3042 static int
3043 detect_coding_sjis (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3044 {
3045   while (n--)
3046     {
3047       unsigned char c = *(unsigned char *)src++;
3048       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3049         return 0;
3050       if (st->shift_jis.in_second_byte)
3051         {
3052           st->shift_jis.in_second_byte = 0;
3053           if (c < 0x40)
3054             return 0;
3055         }
3056       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3057         st->shift_jis.in_second_byte = 1;
3058     }
3059   return CODING_CATEGORY_SHIFT_JIS_MASK;
3060 }
3061
3062 /* Convert Shift-JIS data to internal format. */
3063
3064 static void
3065 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3066                     unsigned_char_dynarr *dst, Lstream_data_count n)
3067 {
3068   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3069   unsigned int flags  = str->flags;
3070   unsigned int ch     = str->ch;
3071   eol_type_t eol_type = str->eol_type;
3072
3073   while (n--)
3074     {
3075       unsigned char c = *(unsigned char *)src++;
3076
3077       if (ch)
3078         {
3079           /* Previous character was first byte of Shift-JIS Kanji char. */
3080           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3081             {
3082               unsigned char e1, e2;
3083
3084               DECODE_SJIS (ch, c, e1, e2);
3085 #ifdef UTF2000
3086               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3087                                             e1 & 0x7F,
3088                                             e2 & 0x7F), dst);
3089 #else
3090               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3091               Dynarr_add (dst, e1);
3092               Dynarr_add (dst, e2);
3093 #endif
3094             }
3095           else
3096             {
3097               DECODE_ADD_BINARY_CHAR (ch, dst);
3098               DECODE_ADD_BINARY_CHAR (c, dst);
3099             }
3100           ch = 0;
3101         }
3102       else
3103         {
3104           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3105           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3106             ch = c;
3107           else if (BYTE_SJIS_KATAKANA_P (c))
3108             {
3109 #ifdef UTF2000
3110               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3111                                             c & 0x7F, 0), dst);
3112 #else
3113               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3114               Dynarr_add (dst, c);
3115 #endif
3116             }
3117           else
3118             DECODE_ADD_BINARY_CHAR (c, dst);
3119         }
3120     label_continue_loop:;
3121     }
3122
3123   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3124
3125   str->flags = flags;
3126   str->ch    = ch;
3127 }
3128
3129 /* Convert internally-formatted data to Shift-JIS. */
3130
3131 static void
3132 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3133                     unsigned_char_dynarr *dst, Lstream_data_count n)
3134 {
3135   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3136   unsigned int flags  = str->flags;
3137   unsigned int ch     = str->ch;
3138   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3139 #ifdef UTF2000
3140   unsigned char char_boundary = str->iso2022.current_char_boundary;
3141 #endif
3142
3143   while (n--)
3144     {
3145       Bufbyte c = *src++;
3146 #ifdef UTF2000
3147       switch (char_boundary)
3148         {
3149         case 0:
3150           if ( c >= 0xfc )
3151             {
3152               ch = c & 0x01;
3153               char_boundary = 5;
3154             }
3155           else if ( c >= 0xf8 )
3156             {
3157               ch = c & 0x03;
3158               char_boundary = 4;
3159             }
3160           else if ( c >= 0xf0 )
3161             {
3162               ch = c & 0x07;
3163               char_boundary = 3;
3164             }
3165           else if ( c >= 0xe0 )
3166             {
3167               ch = c & 0x0f;
3168               char_boundary = 2;
3169             }
3170           else if ( c >= 0xc0 )
3171             {
3172               ch = c & 0x1f;
3173               char_boundary = 1;
3174             }
3175           else
3176             {
3177               ch = 0;
3178               if (c == '\n')
3179                 {
3180                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3181                     Dynarr_add (dst, '\r');
3182                   if (eol_type != EOL_CR)
3183                     Dynarr_add (dst, c);
3184                 }
3185               else
3186                 Dynarr_add (dst, c);
3187               char_boundary = 0;
3188             }
3189           break;
3190         case 1:
3191           ch = ( ch << 6 ) | ( c & 0x3f );
3192           {
3193             Lisp_Object charset;
3194             unsigned int c1, c2, s1, s2;
3195
3196             BREAKUP_CHAR (ch, charset, c1, c2);
3197             if (EQ(charset, Vcharset_katakana_jisx0201))
3198               {
3199                 Dynarr_add (dst, c1 | 0x80);
3200               }
3201             else if (EQ(charset, Vcharset_japanese_jisx0208))
3202               {
3203                 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3204                 Dynarr_add (dst, s1);
3205                 Dynarr_add (dst, s2);
3206               }
3207           }
3208           char_boundary = 0;
3209           break;
3210         default:
3211           ch = ( ch << 6 ) | ( c & 0x3f );
3212           char_boundary--;
3213         }
3214 #else
3215       if (c == '\n')
3216         {
3217           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3218             Dynarr_add (dst, '\r');
3219           if (eol_type != EOL_CR)
3220             Dynarr_add (dst, '\n');
3221           ch = 0;
3222         }
3223       else if (BYTE_ASCII_P (c))
3224         {
3225           Dynarr_add (dst, c);
3226           ch = 0;
3227         }
3228       else if (BUFBYTE_LEADING_BYTE_P (c))
3229         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3230               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3231               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3232       else if (ch)
3233         {
3234           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3235             {
3236               Dynarr_add (dst, c);
3237               ch = 0;
3238             }
3239           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3240                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3241             ch = c;
3242           else
3243             {
3244               unsigned char j1, j2;
3245               ENCODE_SJIS (ch, c, j1, j2);
3246               Dynarr_add (dst, j1);
3247               Dynarr_add (dst, j2);
3248               ch = 0;
3249             }
3250         }
3251 #endif
3252     }
3253
3254   str->flags = flags;
3255   str->ch    = ch;
3256 #ifdef UTF2000
3257   str->iso2022.current_char_boundary = char_boundary;
3258 #endif
3259 }
3260
3261 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3262 Decode a JISX0208 character of Shift-JIS coding-system.
3263 CODE is the character code in Shift-JIS as a cons of type bytes.
3264 Return the corresponding character.
3265 */
3266        (code))
3267 {
3268   unsigned char c1, c2, s1, s2;
3269
3270   CHECK_CONS (code);
3271   CHECK_INT (XCAR (code));
3272   CHECK_INT (XCDR (code));
3273   s1 = XINT (XCAR (code));
3274   s2 = XINT (XCDR (code));
3275   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3276       BYTE_SJIS_TWO_BYTE_2_P (s2))
3277     {
3278       DECODE_SJIS (s1, s2, c1, c2);
3279       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3280                                    c1 & 0x7F, c2 & 0x7F));
3281     }
3282   else
3283     return Qnil;
3284 }
3285
3286 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3287 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3288 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3289 */
3290        (character))
3291 {
3292   Lisp_Object charset;
3293   int c1, c2, s1, s2;
3294
3295   CHECK_CHAR_COERCE_INT (character);
3296   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3297   if (EQ (charset, Vcharset_japanese_jisx0208))
3298     {
3299       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3300       return Fcons (make_int (s1), make_int (s2));
3301     }
3302   else
3303     return Qnil;
3304 }
3305
3306 \f
3307 /************************************************************************/
3308 /*                            Big5 methods                              */
3309 /************************************************************************/
3310
3311 /* BIG5 is a coding system encoding two character sets: ASCII and
3312    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3313    character set and is encoded in two-byte.
3314
3315    --- CODE RANGE of BIG5 ---
3316    (character set)      (range)
3317    ASCII                0x00 .. 0x7F
3318    Big5 (1st byte)      0xA1 .. 0xFE
3319         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3320    --------------------------
3321
3322    Since the number of characters in Big5 is larger than maximum
3323    characters in Emacs' charset (96x96), it can't be handled as one
3324    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3325    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3326    contains frequently used characters and the latter contains less
3327    frequently used characters.  */
3328
3329 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3330   ((c) >= 0xA1 && (c) <= 0xFE)
3331
3332 /* Is this the second byte of a Shift-JIS two-byte char? */
3333
3334 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3335   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3336
3337 /* Number of Big5 characters which have the same code in 1st byte.  */
3338
3339 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3340
3341 /* Code conversion macros.  These are macros because they are used in
3342    inner loops during code conversion.
3343
3344    Note that temporary variables in macros introduce the classic
3345    dynamic-scoping problems with variable names.  We use capital-
3346    lettered variables in the assumption that XEmacs does not use
3347    capital letters in variables except in a very formalized way
3348    (e.g. Qstring). */
3349
3350 /* Convert Big5 code (b1, b2) into its internal string representation
3351    (lb, c1, c2). */
3352
3353 /* There is a much simpler way to split the Big5 charset into two.
3354    For the moment I'm going to leave the algorithm as-is because it
3355    claims to separate out the most-used characters into a single
3356    charset, which perhaps will lead to optimizations in various
3357    places.
3358
3359    The way the algorithm works is something like this:
3360
3361    Big5 can be viewed as a 94x157 charset, where the row is
3362    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3363    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3364    the split between low and high column numbers is apparently
3365    meaningless; ascending rows produce less and less frequent chars.
3366    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3367    the first charset, and the upper half (0xC9 .. 0xFE) to the
3368    second.  To do the conversion, we convert the character into
3369    a single number where 0 .. 156 is the first row, 157 .. 313
3370    is the second, etc.  That way, the characters are ordered by
3371    decreasing frequency.  Then we just chop the space in two
3372    and coerce the result into a 94x94 space.
3373    */
3374
3375 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3376 {                                                                       \
3377   int B1 = b1, B2 = b2;                                                 \
3378   unsigned int I                                                        \
3379     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3380                                                                         \
3381   if (B1 < 0xC9)                                                        \
3382     {                                                                   \
3383       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3384     }                                                                   \
3385   else                                                                  \
3386     {                                                                   \
3387       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3388       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3389     }                                                                   \
3390   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3391   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3392 } while (0)
3393
3394 /* Convert the internal string representation of a Big5 character
3395    (lb, c1, c2) into Big5 code (b1, b2). */
3396
3397 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3398 {                                                                       \
3399   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3400                                                                         \
3401   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3402     {                                                                   \
3403       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3404     }                                                                   \
3405   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3406   b2 = I % BIG5_SAME_ROW;                                               \
3407   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3408 } while (0)
3409
3410 static int
3411 detect_coding_big5 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3412 {
3413   while (n--)
3414     {
3415       unsigned char c = *(unsigned char *)src++;
3416       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3417           (c >= 0x80 && c <= 0xA0))
3418         return 0;
3419       if (st->big5.in_second_byte)
3420         {
3421           st->big5.in_second_byte = 0;
3422           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3423             return 0;
3424         }
3425       else if (c >= 0xA1)
3426         st->big5.in_second_byte = 1;
3427     }
3428   return CODING_CATEGORY_BIG5_MASK;
3429 }
3430
3431 /* Convert Big5 data to internal format. */
3432
3433 static void
3434 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3435                     unsigned_char_dynarr *dst, Lstream_data_count n)
3436 {
3437   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3438   unsigned int flags  = str->flags;
3439   unsigned int ch     = str->ch;
3440   eol_type_t eol_type = str->eol_type;
3441
3442   while (n--)
3443     {
3444       unsigned char c = *(unsigned char *)src++;
3445       if (ch)
3446         {
3447           /* Previous character was first byte of Big5 char. */
3448           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3449             {
3450               unsigned char b1, b2, b3;
3451               DECODE_BIG5 (ch, c, b1, b2, b3);
3452               Dynarr_add (dst, b1);
3453               Dynarr_add (dst, b2);
3454               Dynarr_add (dst, b3);
3455             }
3456           else
3457             {
3458               DECODE_ADD_BINARY_CHAR (ch, dst);
3459               DECODE_ADD_BINARY_CHAR (c, dst);
3460             }
3461           ch = 0;
3462         }
3463       else
3464         {
3465           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3466           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3467             ch = c;
3468           else
3469             DECODE_ADD_BINARY_CHAR (c, dst);
3470         }
3471     label_continue_loop:;
3472     }
3473
3474   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3475
3476   str->flags = flags;
3477   str->ch    = ch;
3478 }
3479
3480 /* Convert internally-formatted data to Big5. */
3481
3482 static void
3483 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3484                     unsigned_char_dynarr *dst, Lstream_data_count n)
3485 {
3486 #ifndef UTF2000
3487   unsigned char c;
3488   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3489   unsigned int flags  = str->flags;
3490   unsigned int ch     = str->ch;
3491   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3492
3493   while (n--)
3494     {
3495       c = *src++;
3496       if (c == '\n')
3497         {
3498           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3499             Dynarr_add (dst, '\r');
3500           if (eol_type != EOL_CR)
3501             Dynarr_add (dst, '\n');
3502         }
3503       else if (BYTE_ASCII_P (c))
3504         {
3505           /* ASCII. */
3506           Dynarr_add (dst, c);
3507         }
3508       else if (BUFBYTE_LEADING_BYTE_P (c))
3509         {
3510           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3511               c == LEADING_BYTE_CHINESE_BIG5_2)
3512             {
3513               /* A recognized leading byte. */
3514               ch = c;
3515               continue; /* not done with this character. */
3516             }
3517           /* otherwise just ignore this character. */
3518         }
3519       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3520                ch == LEADING_BYTE_CHINESE_BIG5_2)
3521         {
3522           /* Previous char was a recognized leading byte. */
3523           ch = (ch << 8) | c;
3524           continue; /* not done with this character. */
3525         }
3526       else if (ch)
3527         {
3528           /* Encountering second byte of a Big5 character. */
3529           unsigned char b1, b2;
3530
3531           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3532           Dynarr_add (dst, b1);
3533           Dynarr_add (dst, b2);
3534         }
3535
3536       ch = 0;
3537     }
3538
3539   str->flags = flags;
3540   str->ch    = ch;
3541 #endif
3542 }
3543
3544
3545 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3546 Decode a Big5 character CODE of BIG5 coding-system.
3547 CODE is the character code in BIG5, a cons of two integers.
3548 Return the corresponding character.
3549 */
3550        (code))
3551 {
3552   unsigned char c1, c2, b1, b2;
3553
3554   CHECK_CONS (code);
3555   CHECK_INT (XCAR (code));
3556   CHECK_INT (XCDR (code));
3557   b1 = XINT (XCAR (code));
3558   b2 = XINT (XCDR (code));
3559   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3560       BYTE_BIG5_TWO_BYTE_2_P (b2))
3561     {
3562       Charset_ID leading_byte;
3563       Lisp_Object charset;
3564       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3565       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3566       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3567     }
3568   else
3569     return Qnil;
3570 }
3571
3572 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3573 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3574 Return the corresponding character code in Big5.
3575 */
3576        (character))
3577 {
3578   Lisp_Object charset;
3579   int c1, c2, b1, b2;
3580
3581   CHECK_CHAR_COERCE_INT (character);
3582   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3583   if (EQ (charset, Vcharset_chinese_big5_1) ||
3584       EQ (charset, Vcharset_chinese_big5_2))
3585     {
3586       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3587                    b1, b2);
3588       return Fcons (make_int (b1), make_int (b2));
3589     }
3590   else
3591     return Qnil;
3592 }
3593
3594 \f
3595 /************************************************************************/
3596 /*                           UCS-4 methods                              */
3597 /*                                                                      */
3598 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3599 /*                                                                      */
3600 /************************************************************************/
3601
3602 #ifndef UTF2000
3603 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3604 Map UCS-4 code CODE to Mule character CHARACTER.
3605
3606 Return T on success, NIL on failure.
3607 */
3608        (code, character))
3609 {
3610   size_t c;
3611
3612   CHECK_CHAR (character);
3613   CHECK_NATNUM (code);
3614   c = XINT (code);
3615
3616   if (c < countof (fcd->ucs_to_mule_table))
3617     {
3618       fcd->ucs_to_mule_table[c] = character;
3619       return Qt;
3620     }
3621   else
3622     return Qnil;
3623 }
3624
3625 static Lisp_Object
3626 ucs_to_char (unsigned long code)
3627 {
3628   if (code < countof (fcd->ucs_to_mule_table))
3629     {
3630       return fcd->ucs_to_mule_table[code];
3631     }
3632   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3633     {
3634       unsigned int c;
3635
3636       code -= 0xe00000;
3637       c = code % (94 * 94);
3638       return make_char
3639         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3640                     (94, 2, code / (94 * 94) + '@',
3641                      CHARSET_LEFT_TO_RIGHT),
3642                     c / 94 + 33, c % 94 + 33));
3643     }
3644   else
3645     return Qnil;
3646 }
3647
3648 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3649 Return Mule character corresponding to UCS code CODE (a positive integer).
3650 */
3651        (code))
3652 {
3653   CHECK_NATNUM (code);
3654   return ucs_to_char (XINT (code));
3655 }
3656
3657 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3658 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3659 */
3660        (character, code))
3661 {
3662   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3663           Fset_char_ucs is more restrictive on index arg, but should
3664           check code arg in a char_table method. */
3665   CHECK_CHAR (character);
3666   CHECK_NATNUM (code);
3667   return Fput_char_table (character, code, mule_to_ucs_table);
3668 }
3669
3670 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3671 Return the UCS code (a positive integer) corresponding to CHARACTER.
3672 */
3673        (character))
3674 {
3675   return Fget_char_table (character, mule_to_ucs_table);
3676 }
3677 #endif
3678
3679 #ifdef UTF2000
3680 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3681 #else
3682 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3683    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3684    is not found, instead.
3685    #### do something more appropriate (use blob?)
3686         Danger, Will Robinson!  Data loss.  Should we signal user? */
3687 static void
3688 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3689 {
3690   Lisp_Object chr = ucs_to_char (ch);
3691
3692   if (! NILP (chr))
3693     {
3694       Bufbyte work[MAX_EMCHAR_LEN];
3695       int len;
3696
3697       ch = XCHAR (chr);
3698       len = (ch < 128) ?
3699         simple_set_charptr_emchar (work, ch) :
3700         non_ascii_set_charptr_emchar (work, ch);
3701       Dynarr_add_many (dst, work, len);
3702     }
3703   else
3704     {
3705       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3706       Dynarr_add (dst, 34 + 128);
3707       Dynarr_add (dst, 46 + 128);
3708     }
3709 }
3710 #endif
3711
3712 #ifndef UTF2000
3713 static unsigned long
3714 mule_char_to_ucs4 (Lisp_Object charset,
3715                    unsigned char h, unsigned char l)
3716 {
3717   Lisp_Object code
3718     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3719                        mule_to_ucs_table);
3720
3721   if (INTP (code))
3722     {
3723       return XINT (code);
3724     }
3725   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3726             (XCHARSET_CHARS (charset) == 94) )
3727     {
3728       unsigned char final = XCHARSET_FINAL (charset);
3729
3730       if ( ('@' <= final) && (final < 0x7f) )
3731         {
3732           return 0xe00000 + (final - '@') * 94 * 94
3733             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3734         }
3735       else
3736         {
3737           return '?';
3738         }
3739     }
3740   else
3741     {
3742       return '?';
3743     }
3744 }
3745
3746 static void
3747 encode_ucs4 (Lisp_Object charset,
3748              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3749 {
3750   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3751   Dynarr_add (dst,  code >> 24);
3752   Dynarr_add (dst, (code >> 16) & 255);
3753   Dynarr_add (dst, (code >>  8) & 255);
3754   Dynarr_add (dst,  code        & 255);
3755 }
3756 #endif
3757
3758 static int
3759 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3760 {
3761   while (n--)
3762     {
3763       unsigned char c = *(unsigned char *)src++;
3764       switch (st->ucs4.in_byte)
3765         {
3766         case 0:
3767           if (c >= 128)
3768             return 0;
3769           else
3770             st->ucs4.in_byte++;
3771           break;
3772         case 3:
3773           st->ucs4.in_byte = 0;
3774           break;
3775         default:
3776           st->ucs4.in_byte++;
3777         }
3778     }
3779   return CODING_CATEGORY_UCS4_MASK;
3780 }
3781
3782 static void
3783 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3784                     unsigned_char_dynarr *dst, Lstream_data_count n)
3785 {
3786   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3787   unsigned int flags = str->flags;
3788   unsigned int ch    = str->ch;
3789   unsigned char counter = str->counter;
3790
3791   while (n--)
3792     {
3793       unsigned char c = *(unsigned char *)src++;
3794       switch (counter)
3795         {
3796         case 0:
3797           ch = c;
3798           counter = 3;
3799           break;
3800         case 1:
3801           decode_ucs4 ( ( ch << 8 ) | c, dst);
3802           ch = 0;
3803           counter = 0;
3804           break;
3805         default:
3806           ch = ( ch << 8 ) | c;
3807           counter--;
3808         }
3809     }
3810   if (counter & CODING_STATE_END)
3811     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3812
3813   str->flags = flags;
3814   str->ch    = ch;
3815   str->counter = counter;
3816 }
3817
3818 static void
3819 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3820                     unsigned_char_dynarr *dst, Lstream_data_count n)
3821 {
3822 #ifndef UTF2000
3823   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3824   unsigned int flags = str->flags;
3825   unsigned int ch = str->ch;
3826   unsigned char char_boundary = str->iso2022.current_char_boundary;
3827   Lisp_Object charset = str->iso2022.current_charset;
3828
3829 #ifdef ENABLE_COMPOSITE_CHARS
3830   /* flags for handling composite chars.  We do a little switcharoo
3831      on the source while we're outputting the composite char. */
3832   unsigned int saved_n = 0;
3833   const unsigned char *saved_src = NULL;
3834   int in_composite = 0;
3835
3836  back_to_square_n:
3837 #endif
3838
3839   while (n--)
3840     {
3841       unsigned char c = *src++;
3842
3843       if (BYTE_ASCII_P (c))
3844         {               /* Processing ASCII character */
3845           ch = 0;
3846           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3847           char_boundary = 1;
3848         }
3849       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3850         { /* Processing Leading Byte */
3851           ch = 0;
3852           charset = CHARSET_BY_LEADING_BYTE (c);
3853           if (LEADING_BYTE_PREFIX_P(c))
3854             ch = c;
3855           char_boundary = 0;
3856         }
3857       else
3858         {                       /* Processing Non-ASCII character */
3859           char_boundary = 1;
3860           if (EQ (charset, Vcharset_control_1))
3861             {
3862               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3863             }
3864           else
3865             {
3866               switch (XCHARSET_REP_BYTES (charset))
3867                 {
3868                 case 2:
3869                   encode_ucs4 (charset, c, 0, dst);
3870                   break;
3871                 case 3:
3872                   if (XCHARSET_PRIVATE_P (charset))
3873                     {
3874                       encode_ucs4 (charset, c, 0, dst);
3875                       ch = 0;
3876                     }
3877                   else if (ch)
3878                     {
3879 #ifdef ENABLE_COMPOSITE_CHARS
3880                       if (EQ (charset, Vcharset_composite))
3881                         {
3882                           if (in_composite)
3883                             {
3884                               /* #### Bother! We don't know how to
3885                                  handle this yet. */
3886                               Dynarr_add (dst, '\0');
3887                               Dynarr_add (dst, '\0');
3888                               Dynarr_add (dst, '\0');
3889                               Dynarr_add (dst, '~');
3890                             }
3891                           else
3892                             {
3893                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3894                                                        ch & 0x7F, c & 0x7F);
3895                               Lisp_Object lstr = composite_char_string (emch);
3896                               saved_n = n;
3897                               saved_src = src;
3898                               in_composite = 1;
3899                               src = XSTRING_DATA   (lstr);
3900                               n   = XSTRING_LENGTH (lstr);
3901                             }
3902                         }
3903                       else
3904 #endif /* ENABLE_COMPOSITE_CHARS */
3905                         {
3906                           encode_ucs4(charset, ch, c, dst);
3907                         }
3908                       ch = 0;
3909                     }
3910                   else
3911                     {
3912                       ch = c;
3913                       char_boundary = 0;
3914                     }
3915                   break;
3916                 case 4:
3917                   if (ch)
3918                     {
3919                       encode_ucs4 (charset, ch, c, dst);
3920                       ch = 0;
3921                     }
3922                   else
3923                     {
3924                       ch = c;
3925                       char_boundary = 0;
3926                     }
3927                   break;
3928                 default:
3929                   ABORT ();
3930                 }
3931             }
3932         }
3933     }
3934
3935 #ifdef ENABLE_COMPOSITE_CHARS
3936   if (in_composite)
3937     {
3938       n = saved_n;
3939       src = saved_src;
3940       in_composite = 0;
3941       goto back_to_square_n; /* Wheeeeeeeee ..... */
3942     }
3943 #endif /* ENABLE_COMPOSITE_CHARS */
3944
3945   str->flags = flags;
3946   str->ch = ch;
3947   str->iso2022.current_char_boundary = char_boundary;
3948   str->iso2022.current_charset = charset;
3949
3950   /* Verbum caro factum est! */
3951 #endif
3952 }
3953
3954 \f
3955 /************************************************************************/
3956 /*                           UTF-8 methods                              */
3957 /************************************************************************/
3958
3959 static int
3960 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
3961 {
3962   while (n--)
3963     {
3964       unsigned char c = *(unsigned char *)src++;
3965       switch (st->utf8.in_byte)
3966         {
3967         case 0:
3968           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3969             return 0;
3970           else if (c >= 0xfc)
3971             st->utf8.in_byte = 5;
3972           else if (c >= 0xf8)
3973             st->utf8.in_byte = 4;
3974           else if (c >= 0xf0)
3975             st->utf8.in_byte = 3;
3976           else if (c >= 0xe0)
3977             st->utf8.in_byte = 2;
3978           else if (c >= 0xc0)
3979             st->utf8.in_byte = 1;
3980           else if (c >= 0x80)
3981             return 0;
3982           break;
3983         default:
3984           if ((c & 0xc0) != 0x80)
3985             return 0;
3986           else
3987             st->utf8.in_byte--;
3988         }
3989     }
3990   return CODING_CATEGORY_UTF8_MASK;
3991 }
3992
3993 static void
3994 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3995                     unsigned_char_dynarr *dst, Lstream_data_count n)
3996 {
3997   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3998   unsigned int flags  = str->flags;
3999   unsigned int ch     = str->ch;
4000   eol_type_t eol_type = str->eol_type;
4001   unsigned char counter = str->counter;
4002
4003   while (n--)
4004     {
4005       unsigned char c = *(unsigned char *)src++;
4006       switch (counter)
4007         {
4008         case 0:
4009           if ( c >= 0xfc )
4010             {
4011               ch = c & 0x01;
4012               counter = 5;
4013             }
4014           else if ( c >= 0xf8 )
4015             {
4016               ch = c & 0x03;
4017               counter = 4;
4018             }
4019           else if ( c >= 0xf0 )
4020             {
4021               ch = c & 0x07;
4022               counter = 3;
4023             }
4024           else if ( c >= 0xe0 )
4025             {
4026               ch = c & 0x0f;
4027               counter = 2;
4028             }
4029           else if ( c >= 0xc0 )
4030             {
4031               ch = c & 0x1f;
4032               counter = 1;
4033             }
4034           else
4035             {
4036               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4037               decode_ucs4 (c, dst);
4038             }
4039           break;
4040         case 1:
4041           ch = ( ch << 6 ) | ( c & 0x3f );
4042           decode_ucs4 (ch, dst);
4043           ch = 0;
4044           counter = 0;
4045           break;
4046         default:
4047           ch = ( ch << 6 ) | ( c & 0x3f );
4048           counter--;
4049         }
4050     label_continue_loop:;
4051     }
4052
4053   if (flags & CODING_STATE_END)
4054     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4055
4056   str->flags = flags;
4057   str->ch    = ch;
4058   str->counter = counter;
4059 }
4060
4061 #ifndef UTF2000
4062 static void
4063 encode_utf8 (Lisp_Object charset,
4064              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4065 {
4066   unsigned long code = mule_char_to_ucs4 (charset, h, l);
4067   if ( code <= 0x7f )
4068     {
4069       Dynarr_add (dst, code);
4070     }
4071   else if ( code <= 0x7ff )
4072     {
4073       Dynarr_add (dst, (code >> 6) | 0xc0);
4074       Dynarr_add (dst, (code & 0x3f) | 0x80);
4075     }
4076   else if ( code <= 0xffff )
4077     {
4078       Dynarr_add (dst,  (code >> 12) | 0xe0);
4079       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4080       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4081     }
4082   else if ( code <= 0x1fffff )
4083     {
4084       Dynarr_add (dst,  (code >> 18) | 0xf0);
4085       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4086       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4087       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4088     }
4089   else if ( code <= 0x3ffffff )
4090     {
4091       Dynarr_add (dst,  (code >> 24) | 0xf8);
4092       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4093       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4094       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4095       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4096     }
4097   else
4098     {
4099       Dynarr_add (dst,  (code >> 30) | 0xfc);
4100       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4101       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4102       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4103       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4104       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4105     }
4106 }
4107 #endif
4108
4109 static void
4110 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
4111                     unsigned_char_dynarr *dst, Lstream_data_count n)
4112 {
4113   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4114   unsigned int flags  = str->flags;
4115   unsigned int ch     = str->ch;
4116   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4117   unsigned char char_boundary = str->iso2022.current_char_boundary;
4118 #ifdef UTF2000
4119
4120   while (n--)
4121     {
4122       unsigned char c = *src++;
4123       switch (char_boundary)
4124         {
4125         case 0:
4126           if ( c >= 0xfc )
4127             {
4128               Dynarr_add (dst, c);
4129               char_boundary = 5;
4130             }
4131           else if ( c >= 0xf8 )
4132             {
4133               Dynarr_add (dst, c);
4134               char_boundary = 4;
4135             }
4136           else if ( c >= 0xf0 )
4137             {
4138               Dynarr_add (dst, c);
4139               char_boundary = 3;
4140             }
4141           else if ( c >= 0xe0 )
4142             {
4143               Dynarr_add (dst, c);
4144               char_boundary = 2;
4145             }
4146           else if ( c >= 0xc0 )
4147             {
4148               Dynarr_add (dst, c);
4149               char_boundary = 1;
4150             }
4151           else
4152             {
4153               if (c == '\n')
4154                 {
4155                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4156                     Dynarr_add (dst, '\r');
4157                   if (eol_type != EOL_CR)
4158                     Dynarr_add (dst, c);
4159                 }
4160               else
4161                 Dynarr_add (dst, c);
4162               char_boundary = 0;
4163             }
4164           break;
4165         case 1:
4166           Dynarr_add (dst, c);
4167           char_boundary = 0;
4168           break;
4169         default:
4170           Dynarr_add (dst, c);
4171           char_boundary--;
4172         }
4173     }
4174 #else /* not UTF2000 */
4175   Lisp_Object charset = str->iso2022.current_charset;
4176
4177 #ifdef ENABLE_COMPOSITE_CHARS
4178   /* flags for handling composite chars.  We do a little switcharoo
4179      on the source while we're outputting the composite char. */
4180   unsigned int saved_n = 0;
4181   const unsigned char *saved_src = NULL;
4182   int in_composite = 0;
4183
4184  back_to_square_n:
4185 #endif /* ENABLE_COMPOSITE_CHARS */
4186
4187   while (n--)
4188     {
4189       unsigned char c = *src++;
4190
4191       if (BYTE_ASCII_P (c))
4192         {               /* Processing ASCII character */
4193           ch = 0;
4194           if (c == '\n')
4195             {
4196               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4197                 Dynarr_add (dst, '\r');
4198               if (eol_type != EOL_CR)
4199                 Dynarr_add (dst, c);
4200             }
4201           else
4202             encode_utf8 (Vcharset_ascii, c, 0, dst);
4203           char_boundary = 1;
4204         }
4205       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4206         { /* Processing Leading Byte */
4207           ch = 0;
4208           charset = CHARSET_BY_LEADING_BYTE (c);
4209           if (LEADING_BYTE_PREFIX_P(c))
4210             ch = c;
4211           char_boundary = 0;
4212         }
4213       else
4214         {                       /* Processing Non-ASCII character */
4215           char_boundary = 1;
4216           if (EQ (charset, Vcharset_control_1))
4217             {
4218               encode_utf8 (Vcharset_control_1, c, 0, dst);
4219             }
4220           else
4221             {
4222               switch (XCHARSET_REP_BYTES (charset))
4223                 {
4224                 case 2:
4225                   encode_utf8 (charset, c, 0, dst);
4226                   break;
4227                 case 3:
4228                   if (XCHARSET_PRIVATE_P (charset))
4229                     {
4230                       encode_utf8 (charset, c, 0, dst);
4231                       ch = 0;
4232                     }
4233                   else if (ch)
4234                     {
4235 #ifdef ENABLE_COMPOSITE_CHARS
4236                       if (EQ (charset, Vcharset_composite))
4237                         {
4238                           if (in_composite)
4239                             {
4240                               /* #### Bother! We don't know how to
4241                                  handle this yet. */
4242                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
4243                             }
4244                           else
4245                             {
4246                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4247                                                        ch & 0x7F, c & 0x7F);
4248                               Lisp_Object lstr = composite_char_string (emch);
4249                               saved_n = n;
4250                               saved_src = src;
4251                               in_composite = 1;
4252                               src = XSTRING_DATA   (lstr);
4253                               n   = XSTRING_LENGTH (lstr);
4254                             }
4255                         }
4256                       else
4257 #endif /* ENABLE_COMPOSITE_CHARS */
4258                         {
4259                           encode_utf8 (charset, ch, c, dst);
4260                         }
4261                       ch = 0;
4262                     }
4263                   else
4264                     {
4265                       ch = c;
4266                       char_boundary = 0;
4267                     }
4268                   break;
4269                 case 4:
4270                   if (ch)
4271                     {
4272                       encode_utf8 (charset, ch, c, dst);
4273                       ch = 0;
4274                     }
4275                   else
4276                     {
4277                       ch = c;
4278                       char_boundary = 0;
4279                     }
4280                   break;
4281                 default:
4282                   ABORT ();
4283                 }
4284             }
4285         }
4286     }
4287
4288 #ifdef ENABLE_COMPOSITE_CHARS
4289   if (in_composite)
4290     {
4291       n = saved_n;
4292       src = saved_src;
4293       in_composite = 0;
4294       goto back_to_square_n; /* Wheeeeeeeee ..... */
4295     }
4296 #endif
4297
4298 #endif /* not UTF2000 */
4299   str->flags = flags;
4300   str->ch    = ch;
4301   str->iso2022.current_char_boundary = char_boundary;
4302 #ifndef UTF2000
4303   str->iso2022.current_charset = charset;
4304 #endif
4305
4306   /* Verbum caro factum est! */
4307 }
4308
4309 \f
4310 /************************************************************************/
4311 /*                           ISO2022 methods                            */
4312 /************************************************************************/
4313
4314 /* The following note describes the coding system ISO2022 briefly.
4315    Since the intention of this note is to help understand the
4316    functions in this file, some parts are NOT ACCURATE or OVERLY
4317    SIMPLIFIED.  For thorough understanding, please refer to the
4318    original document of ISO2022.
4319
4320    ISO2022 provides many mechanisms to encode several character sets
4321    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4322    is encoded using bytes less than 128.  This may make the encoded
4323    text a little bit longer, but the text passes more easily through
4324    several gateways, some of which strip off MSB (Most Signigant Bit).
4325
4326    There are two kinds of character sets: control character set and
4327    graphic character set.  The former contains control characters such
4328    as `newline' and `escape' to provide control functions (control
4329    functions are also provided by escape sequences).  The latter
4330    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4331    two control character sets and many graphic character sets.
4332
4333    Graphic character sets are classified into one of the following
4334    four classes, according to the number of bytes (DIMENSION) and
4335    number of characters in one dimension (CHARS) of the set:
4336    - DIMENSION1_CHARS94
4337    - DIMENSION1_CHARS96
4338    - DIMENSION2_CHARS94
4339    - DIMENSION2_CHARS96
4340
4341    In addition, each character set is assigned an identification tag,
4342    unique for each set, called "final character" (denoted as <F>
4343    hereafter).  The <F> of each character set is decided by ECMA(*)
4344    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4345    (0x30..0x3F are for private use only).
4346
4347    Note (*): ECMA = European Computer Manufacturers Association
4348
4349    Here are examples of graphic character set [NAME(<F>)]:
4350         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4351         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4352         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4353         o DIMENSION2_CHARS96 -- none for the moment
4354
4355    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4356         C0 [0x00..0x1F] -- control character plane 0
4357         GL [0x20..0x7F] -- graphic character plane 0
4358         C1 [0x80..0x9F] -- control character plane 1
4359         GR [0xA0..0xFF] -- graphic character plane 1
4360
4361    A control character set is directly designated and invoked to C0 or
4362    C1 by an escape sequence.  The most common case is that:
4363    - ISO646's  control character set is designated/invoked to C0, and
4364    - ISO6429's control character set is designated/invoked to C1,
4365    and usually these designations/invocations are omitted in encoded
4366    text.  In a 7-bit environment, only C0 can be used, and a control
4367    character for C1 is encoded by an appropriate escape sequence to
4368    fit into the environment.  All control characters for C1 are
4369    defined to have corresponding escape sequences.
4370
4371    A graphic character set is at first designated to one of four
4372    graphic registers (G0 through G3), then these graphic registers are
4373    invoked to GL or GR.  These designations and invocations can be
4374    done independently.  The most common case is that G0 is invoked to
4375    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4376    these invocations and designations are omitted in encoded text.
4377    In a 7-bit environment, only GL can be used.
4378
4379    When a graphic character set of CHARS94 is invoked to GL, codes
4380    0x20 and 0x7F of the GL area work as control characters SPACE and
4381    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4382    be used.
4383
4384    There are two ways of invocation: locking-shift and single-shift.
4385    With locking-shift, the invocation lasts until the next different
4386    invocation, whereas with single-shift, the invocation affects the
4387    following character only and doesn't affect the locking-shift
4388    state.  Invocations are done by the following control characters or
4389    escape sequences:
4390
4391    ----------------------------------------------------------------------
4392    abbrev  function                  cntrl escape seq   description
4393    ----------------------------------------------------------------------
4394    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4395    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4396    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4397    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4398    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4399    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4400    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4401    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4402    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4403    ----------------------------------------------------------------------
4404    (*) These are not used by any known coding system.
4405
4406    Control characters for these functions are defined by macros
4407    ISO_CODE_XXX in `coding.h'.
4408
4409    Designations are done by the following escape sequences:
4410    ----------------------------------------------------------------------
4411    escape sequence      description
4412    ----------------------------------------------------------------------
4413    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4414    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4415    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4416    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4417    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4418    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4419    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4420    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4421    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4422    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4423    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4424    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4425    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4426    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4427    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4428    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4429    ----------------------------------------------------------------------
4430
4431    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4432    of dimension 1, chars 94, and final character <F>, etc...
4433
4434    Note (*): Although these designations are not allowed in ISO2022,
4435    Emacs accepts them on decoding, and produces them on encoding
4436    CHARS96 character sets in a coding system which is characterized as
4437    7-bit environment, non-locking-shift, and non-single-shift.
4438
4439    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4440    '(' can be omitted.  We refer to this as "short-form" hereafter.
4441
4442    Now you may notice that there are a lot of ways for encoding the
4443    same multilingual text in ISO2022.  Actually, there exist many
4444    coding systems such as Compound Text (used in X11's inter client
4445    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4446    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4447    localized platforms), and all of these are variants of ISO2022.
4448
4449    In addition to the above, Emacs handles two more kinds of escape
4450    sequences: ISO6429's direction specification and Emacs' private
4451    sequence for specifying character composition.
4452
4453    ISO6429's direction specification takes the following form:
4454         o CSI ']'      -- end of the current direction
4455         o CSI '0' ']'  -- end of the current direction
4456         o CSI '1' ']'  -- start of left-to-right text
4457         o CSI '2' ']'  -- start of right-to-left text
4458    The control character CSI (0x9B: control sequence introducer) is
4459    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4460
4461    Character composition specification takes the following form:
4462         o ESC '0' -- start character composition
4463         o ESC '1' -- end character composition
4464    Since these are not standard escape sequences of any ISO standard,
4465    their use with these meanings is restricted to Emacs only.  */
4466
4467 static void
4468 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4469 {
4470   int i;
4471
4472   for (i = 0; i < 4; i++)
4473     {
4474       if (!NILP (coding_system))
4475         iso->charset[i] =
4476           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4477       else
4478         iso->charset[i] = Qt;
4479       iso->invalid_designated[i] = 0;
4480     }
4481   iso->esc = ISO_ESC_NOTHING;
4482   iso->esc_bytes_index = 0;
4483   iso->register_left = 0;
4484   iso->register_right = 1;
4485   iso->switched_dir_and_no_valid_charset_yet = 0;
4486   iso->invalid_switch_dir = 0;
4487   iso->output_direction_sequence = 0;
4488   iso->output_literally = 0;
4489 #ifdef ENABLE_COMPOSITE_CHARS
4490   if (iso->composite_chars)
4491     Dynarr_reset (iso->composite_chars);
4492 #endif
4493 }
4494
4495 static int
4496 fit_to_be_escape_quoted (unsigned char c)
4497 {
4498   switch (c)
4499     {
4500     case ISO_CODE_ESC:
4501     case ISO_CODE_CSI:
4502     case ISO_CODE_SS2:
4503     case ISO_CODE_SS3:
4504     case ISO_CODE_SO:
4505     case ISO_CODE_SI:
4506       return 1;
4507
4508     default:
4509       return 0;
4510     }
4511 }
4512
4513 /* Parse one byte of an ISO2022 escape sequence.
4514    If the result is an invalid escape sequence, return 0 and
4515    do not change anything in STR.  Otherwise, if the result is
4516    an incomplete escape sequence, update ISO2022.ESC and
4517    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4518    all the state variables (but not ISO2022.ESC_BYTES) and
4519    return 1.
4520
4521    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4522    or invocation of an invalid character set and treat that as
4523    an unrecognized escape sequence.
4524
4525    ********************************************************************
4526
4527    #### Strategies for error annotation and coding orthogonalization
4528
4529    We really want to separate out a number of things.  Conceptually,
4530    there is a nested syntax.
4531
4532    At the top level is the ISO 2022 extension syntax, including charset
4533    designation and invocation, and certain auxiliary controls such as the
4534    ISO 6429 direction specification.  These are octet-oriented, with the
4535    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4536    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4537    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4538    (deprecated) special case in Unicode processing.
4539
4540    The middle layer is ISO 2022 character interpretation.  This will depend
4541    on the current state of the ISO 2022 registers, and assembles octets
4542    into the character's internal representation.
4543
4544    The lowest level is translating system control conventions.  At present
4545    this is restricted to newline translation, but one could imagine doing
4546    tab conversion or line wrapping here.  "Escape from Unicode" processing
4547    would be done at this level.
4548
4549    At each level the parser will verify the syntax.  In the case of a
4550    syntax error or warning (such as a redundant escape sequence that affects
4551    no characters), the parser will take some action, typically inserting the
4552    erroneous octets directly into the output and creating an annotation
4553    which can be used by higher level I/O to mark the affected region.
4554
4555    This should make it possible to do something sensible about separating
4556    newline convention processing from character construction, and about
4557    preventing ISO 2022 escape sequences from being recognized
4558    inappropriately.
4559
4560    The basic strategy will be to have octet classification tables, and
4561    switch processing according to the table entry.
4562
4563    It's possible that, by doing the processing with tables of functions or
4564    the like, the parser can be used for both detection and translation. */
4565
4566 static int
4567 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4568                    unsigned char c, unsigned int *flags,
4569                    int check_invalid_charsets)
4570 {
4571   /* (1) If we're at the end of a designation sequence, CS is the
4572      charset being designated and REG is the register to designate
4573      it to.
4574
4575      (2) If we're at the end of a locking-shift sequence, REG is
4576      the register to invoke and HALF (0 == left, 1 == right) is
4577      the half to invoke it into.
4578
4579      (3) If we're at the end of a single-shift sequence, REG is
4580      the register to invoke. */
4581   Lisp_Object cs = Qnil;
4582   int reg, half;
4583
4584   /* NOTE: This code does goto's all over the fucking place.
4585      The reason for this is that we're basically implementing
4586      a state machine here, and hierarchical languages like C
4587      don't really provide a clean way of doing this. */
4588
4589   if (! (*flags & CODING_STATE_ESCAPE))
4590     /* At beginning of escape sequence; we need to reset our
4591        escape-state variables. */
4592     iso->esc = ISO_ESC_NOTHING;
4593
4594   iso->output_literally = 0;
4595   iso->output_direction_sequence = 0;
4596
4597   switch (iso->esc)
4598     {
4599     case ISO_ESC_NOTHING:
4600       iso->esc_bytes_index = 0;
4601       switch (c)
4602         {
4603         case ISO_CODE_ESC:      /* Start escape sequence */
4604           *flags |= CODING_STATE_ESCAPE;
4605           iso->esc = ISO_ESC;
4606           goto not_done;
4607
4608         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4609           *flags |= CODING_STATE_ESCAPE;
4610           iso->esc = ISO_ESC_5_11;
4611           goto not_done;
4612
4613         case ISO_CODE_SO:       /* locking shift 1 */
4614           reg = 1; half = 0;
4615           goto locking_shift;
4616         case ISO_CODE_SI:       /* locking shift 0 */
4617           reg = 0; half = 0;
4618           goto locking_shift;
4619
4620         case ISO_CODE_SS2:      /* single shift */
4621           reg = 2;
4622           goto single_shift;
4623         case ISO_CODE_SS3:      /* single shift */
4624           reg = 3;
4625           goto single_shift;
4626
4627         default:                        /* Other control characters */
4628           return 0;
4629         }
4630
4631     case ISO_ESC:
4632       switch (c)
4633         {
4634           /**** single shift ****/
4635
4636         case 'N':       /* single shift 2 */
4637           reg = 2;
4638           goto single_shift;
4639         case 'O':       /* single shift 3 */
4640           reg = 3;
4641           goto single_shift;
4642
4643           /**** locking shift ****/
4644
4645         case '~':       /* locking shift 1 right */
4646           reg = 1; half = 1;
4647           goto locking_shift;
4648         case 'n':       /* locking shift 2 */
4649           reg = 2; half = 0;
4650           goto locking_shift;
4651         case '}':       /* locking shift 2 right */
4652           reg = 2; half = 1;
4653           goto locking_shift;
4654         case 'o':       /* locking shift 3 */
4655           reg = 3; half = 0;
4656           goto locking_shift;
4657         case '|':       /* locking shift 3 right */
4658           reg = 3; half = 1;
4659           goto locking_shift;
4660
4661 #ifdef ENABLE_COMPOSITE_CHARS
4662           /**** composite ****/
4663
4664         case '0':
4665           iso->esc = ISO_ESC_START_COMPOSITE;
4666           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4667             CODING_STATE_COMPOSITE;
4668           return 1;
4669
4670         case '1':
4671           iso->esc = ISO_ESC_END_COMPOSITE;
4672           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4673             ~CODING_STATE_COMPOSITE;
4674           return 1;
4675 #endif /* ENABLE_COMPOSITE_CHARS */
4676
4677           /**** directionality ****/
4678
4679         case '[':
4680           iso->esc = ISO_ESC_5_11;
4681           goto not_done;
4682
4683           /**** designation ****/
4684
4685         case '$':       /* multibyte charset prefix */
4686           iso->esc = ISO_ESC_2_4;
4687           goto not_done;
4688
4689         default:
4690           if (0x28 <= c && c <= 0x2F)
4691             {
4692               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4693               goto not_done;
4694             }
4695
4696           /* This function is called with CODESYS equal to nil when
4697              doing coding-system detection. */
4698           if (!NILP (codesys)
4699               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4700               && fit_to_be_escape_quoted (c))
4701             {
4702               iso->esc = ISO_ESC_LITERAL;
4703               *flags &= CODING_STATE_ISO2022_LOCK;
4704               return 1;
4705             }
4706
4707           /* bzzzt! */
4708           return 0;
4709         }
4710
4711
4712
4713       /**** directionality ****/
4714
4715     case ISO_ESC_5_11:          /* ISO6429 direction control */
4716       if (c == ']')
4717         {
4718           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4719           goto directionality;
4720         }
4721       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4722       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4723       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4724       else               return 0;
4725       goto not_done;
4726
4727     case ISO_ESC_5_11_0:
4728       if (c == ']')
4729         {
4730           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4731           goto directionality;
4732         }
4733       return 0;
4734
4735     case ISO_ESC_5_11_1:
4736       if (c == ']')
4737         {
4738           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4739           goto directionality;
4740         }
4741       return 0;
4742
4743     case ISO_ESC_5_11_2:
4744       if (c == ']')
4745         {
4746           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4747           goto directionality;
4748         }
4749       return 0;
4750
4751     directionality:
4752       iso->esc = ISO_ESC_DIRECTIONALITY;
4753       /* Various junk here to attempt to preserve the direction sequences
4754          literally in the text if they would otherwise be swallowed due
4755          to invalid designations that don't show up as actual charset
4756          changes in the text. */
4757       if (iso->invalid_switch_dir)
4758         {
4759           /* We already inserted a direction switch literally into the
4760              text.  We assume (#### this may not be right) that the
4761              next direction switch is the one going the other way,
4762              and we need to output that literally as well. */
4763           iso->output_literally = 1;
4764           iso->invalid_switch_dir = 0;
4765         }
4766       else
4767         {
4768           int jj;
4769
4770           /* If we are in the thrall of an invalid designation,
4771            then stick the directionality sequence literally into the
4772            output stream so it ends up in the original text again. */
4773           for (jj = 0; jj < 4; jj++)
4774             if (iso->invalid_designated[jj])
4775               break;
4776           if (jj < 4)
4777             {
4778               iso->output_literally = 1;
4779               iso->invalid_switch_dir = 1;
4780             }
4781           else
4782             /* Indicate that we haven't yet seen a valid designation,
4783                so that if a switch-dir is directly followed by an
4784                invalid designation, both get inserted literally. */
4785             iso->switched_dir_and_no_valid_charset_yet = 1;
4786         }
4787       return 1;
4788
4789
4790       /**** designation ****/
4791
4792     case ISO_ESC_2_4:
4793       if (0x28 <= c && c <= 0x2F)
4794         {
4795           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4796           goto not_done;
4797         }
4798       if (0x40 <= c && c <= 0x42)
4799         {
4800           cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4801                                       *flags & CODING_STATE_R2L ?
4802                                       CHARSET_RIGHT_TO_LEFT :
4803                                       CHARSET_LEFT_TO_RIGHT);
4804           reg = 0;
4805           goto designated;
4806         }
4807       return 0;
4808
4809     default:
4810       {
4811         /* int type =-1; */
4812         int chars = 0;
4813         int single = 0;
4814
4815         if (c < '0' || c > '~')
4816           return 0; /* bad final byte */
4817
4818         if (iso->esc >= ISO_ESC_2_8 &&
4819             iso->esc <= ISO_ESC_2_15)
4820           {
4821             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4822             single = 1; /* single-byte */
4823             reg = (iso->esc - ISO_ESC_2_8) & 3;
4824           }
4825         else if (iso->esc >= ISO_ESC_2_4_8 &&
4826                  iso->esc <= ISO_ESC_2_4_15)
4827           {
4828             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4829             single = -1; /* multi-byte */
4830             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4831           }
4832         else
4833           {
4834             /* Can this ever be reached? -slb */
4835             ABORT();
4836             return 0;
4837           }
4838
4839         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4840                                     *flags & CODING_STATE_R2L ?
4841                                     CHARSET_RIGHT_TO_LEFT :
4842                                     CHARSET_LEFT_TO_RIGHT);
4843         goto designated;
4844       }
4845     }
4846
4847  not_done:
4848   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4849   return -1;
4850
4851  single_shift:
4852   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4853     /* can't invoke something that ain't there. */
4854     return 0;
4855   iso->esc = ISO_ESC_SINGLE_SHIFT;
4856   *flags &= CODING_STATE_ISO2022_LOCK;
4857   if (reg == 2)
4858     *flags |= CODING_STATE_SS2;
4859   else
4860     *flags |= CODING_STATE_SS3;
4861   return 1;
4862
4863  locking_shift:
4864   if (check_invalid_charsets &&
4865       !CHARSETP (iso->charset[reg]))
4866     /* can't invoke something that ain't there. */
4867     return 0;
4868   if (half)
4869     iso->register_right = reg;
4870   else
4871     iso->register_left = reg;
4872   *flags &= CODING_STATE_ISO2022_LOCK;
4873   iso->esc = ISO_ESC_LOCKING_SHIFT;
4874   return 1;
4875
4876  designated:
4877   if (NILP (cs) && check_invalid_charsets)
4878     {
4879       iso->invalid_designated[reg] = 1;
4880       iso->charset[reg] = Vcharset_ascii;
4881       iso->esc = ISO_ESC_DESIGNATE;
4882       *flags &= CODING_STATE_ISO2022_LOCK;
4883       iso->output_literally = 1;
4884       if (iso->switched_dir_and_no_valid_charset_yet)
4885         {
4886           /* We encountered a switch-direction followed by an
4887              invalid designation.  Ensure that the switch-direction
4888              gets outputted; otherwise it will probably get eaten
4889              when the text is written out again. */
4890           iso->switched_dir_and_no_valid_charset_yet = 0;
4891           iso->output_direction_sequence = 1;
4892           /* And make sure that the switch-dir going the other
4893              way gets outputted, as well. */
4894           iso->invalid_switch_dir = 1;
4895         }
4896       return 1;
4897     }
4898   /* This function is called with CODESYS equal to nil when
4899      doing coding-system detection. */
4900   if (!NILP (codesys))
4901     {
4902       charset_conversion_spec_dynarr *dyn =
4903         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4904
4905       if (dyn)
4906         {
4907           int i;
4908
4909           for (i = 0; i < Dynarr_length (dyn); i++)
4910             {
4911               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4912               if (EQ (cs, spec->from_charset))
4913                 cs = spec->to_charset;
4914             }
4915         }
4916     }
4917
4918   iso->charset[reg] = cs;
4919   iso->esc = ISO_ESC_DESIGNATE;
4920   *flags &= CODING_STATE_ISO2022_LOCK;
4921   if (iso->invalid_designated[reg])
4922     {
4923       iso->invalid_designated[reg] = 0;
4924       iso->output_literally = 1;
4925     }
4926   if (iso->switched_dir_and_no_valid_charset_yet)
4927     iso->switched_dir_and_no_valid_charset_yet = 0;
4928   return 1;
4929 }
4930
4931 static int
4932 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Lstream_data_count n)
4933 {
4934   int mask;
4935
4936   /* #### There are serious deficiencies in the recognition mechanism
4937      here.  This needs to be much smarter if it's going to cut it.
4938      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4939      it should be detected as Latin-1.
4940      All the ISO2022 stuff in this file should be synced up with the
4941      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4942      Perhaps we should wait till R2L works in FSF Emacs? */
4943
4944   if (!st->iso2022.initted)
4945     {
4946       reset_iso2022 (Qnil, &st->iso2022.iso);
4947       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4948                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4949                           CODING_CATEGORY_ISO_8_1_MASK |
4950                           CODING_CATEGORY_ISO_8_2_MASK |
4951                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4952       st->iso2022.flags = 0;
4953       st->iso2022.high_byte_count = 0;
4954       st->iso2022.saw_single_shift = 0;
4955       st->iso2022.initted = 1;
4956     }
4957
4958   mask = st->iso2022.mask;
4959
4960   while (n--)
4961     {
4962       unsigned char c = *(unsigned char *)src++;
4963       if (c >= 0xA0)
4964         {
4965           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4966           st->iso2022.high_byte_count++;
4967         }
4968       else
4969         {
4970           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4971             {
4972               if (st->iso2022.high_byte_count & 1)
4973                 /* odd number of high bytes; assume not iso-8-2 */
4974                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4975             }
4976           st->iso2022.high_byte_count = 0;
4977           st->iso2022.saw_single_shift = 0;
4978           if (c > 0x80)
4979             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4980         }
4981       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4982           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4983         { /* control chars */
4984           switch (c)
4985             {
4986               /* Allow and ignore control characters that you might
4987                  reasonably see in a text file */
4988             case '\r':
4989             case '\n':
4990             case '\t':
4991             case  7: /* bell */
4992             case  8: /* backspace */
4993             case 11: /* vertical tab */
4994             case 12: /* form feed */
4995             case 26: /* MS-DOS C-z junk */
4996             case 31: /* '^_' -- for info */
4997               goto label_continue_loop;
4998
4999             default:
5000               break;
5001             }
5002         }
5003
5004       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5005           || BYTE_C1_P (c))
5006         {
5007           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5008                                  &st->iso2022.flags, 0))
5009             {
5010               switch (st->iso2022.iso.esc)
5011                 {
5012                 case ISO_ESC_DESIGNATE:
5013                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5014                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5015                   break;
5016                 case ISO_ESC_LOCKING_SHIFT:
5017                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5018                   goto ran_out_of_chars;
5019                 case ISO_ESC_SINGLE_SHIFT:
5020                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5021                   st->iso2022.saw_single_shift = 1;
5022                   break;
5023                 default:
5024                   break;
5025                 }
5026             }
5027           else
5028             {
5029               mask = 0;
5030               goto ran_out_of_chars;
5031             }
5032         }
5033     label_continue_loop:;
5034     }
5035
5036  ran_out_of_chars:
5037
5038   return mask;
5039 }
5040
5041 static int
5042 postprocess_iso2022_mask (int mask)
5043 {
5044   /* #### kind of cheesy */
5045   /* If seven-bit ISO is allowed, then assume that the encoding is
5046      entirely seven-bit and turn off the eight-bit ones. */
5047   if (mask & CODING_CATEGORY_ISO_7_MASK)
5048     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5049                CODING_CATEGORY_ISO_8_1_MASK |
5050                CODING_CATEGORY_ISO_8_2_MASK);
5051   return mask;
5052 }
5053
5054 /* If FLAGS is a null pointer or specifies right-to-left motion,
5055    output a switch-dir-to-left-to-right sequence to DST.
5056    Also update FLAGS if it is not a null pointer.
5057    If INTERNAL_P is set, we are outputting in internal format and
5058    need to handle the CSI differently. */
5059
5060 static void
5061 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5062                                  unsigned_char_dynarr *dst,
5063                                  unsigned int *flags,
5064                                  int internal_p)
5065 {
5066   if (!flags || (*flags & CODING_STATE_R2L))
5067     {
5068       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5069         {
5070           Dynarr_add (dst, ISO_CODE_ESC);
5071           Dynarr_add (dst, '[');
5072         }
5073       else if (internal_p)
5074         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5075       else
5076         Dynarr_add (dst, ISO_CODE_CSI);
5077       Dynarr_add (dst, '0');
5078       Dynarr_add (dst, ']');
5079       if (flags)
5080         *flags &= ~CODING_STATE_R2L;
5081     }
5082 }
5083
5084 /* If FLAGS is a null pointer or specifies a direction different from
5085    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5086    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5087    sequence to DST.  Also update FLAGS if it is not a null pointer.
5088    If INTERNAL_P is set, we are outputting in internal format and
5089    need to handle the CSI differently. */
5090
5091 static void
5092 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5093                           unsigned_char_dynarr *dst, unsigned int *flags,
5094                           int internal_p)
5095 {
5096   if ((!flags || (*flags & CODING_STATE_R2L)) &&
5097       direction == CHARSET_LEFT_TO_RIGHT)
5098     restore_left_to_right_direction (codesys, dst, flags, internal_p);
5099   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5100            && (!flags || !(*flags & CODING_STATE_R2L)) &&
5101            direction == CHARSET_RIGHT_TO_LEFT)
5102     {
5103       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5104         {
5105           Dynarr_add (dst, ISO_CODE_ESC);
5106           Dynarr_add (dst, '[');
5107         }
5108       else if (internal_p)
5109         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5110       else
5111         Dynarr_add (dst, ISO_CODE_CSI);
5112       Dynarr_add (dst, '2');
5113       Dynarr_add (dst, ']');
5114       if (flags)
5115         *flags |= CODING_STATE_R2L;
5116     }
5117 }
5118
5119 /* Convert ISO2022-format data to internal format. */
5120
5121 static void
5122 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5123                        unsigned_char_dynarr *dst, Lstream_data_count n)
5124 {
5125   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5126   unsigned int flags  = str->flags;
5127   unsigned int ch     = str->ch;
5128   eol_type_t eol_type = str->eol_type;
5129 #ifdef ENABLE_COMPOSITE_CHARS
5130   unsigned_char_dynarr *real_dst = dst;
5131 #endif
5132   Lisp_Object coding_system;
5133
5134   XSETCODING_SYSTEM (coding_system, str->codesys);
5135
5136 #ifdef ENABLE_COMPOSITE_CHARS
5137   if (flags & CODING_STATE_COMPOSITE)
5138     dst = str->iso2022.composite_chars;
5139 #endif /* ENABLE_COMPOSITE_CHARS */
5140
5141   while (n--)
5142     {
5143       unsigned char c = *(unsigned char *)src++;
5144       if (flags & CODING_STATE_ESCAPE)
5145         {       /* Within ESC sequence */
5146           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5147                                           c, &flags, 1);
5148
5149           if (retval)
5150             {
5151               switch (str->iso2022.esc)
5152                 {
5153 #ifdef ENABLE_COMPOSITE_CHARS
5154                 case ISO_ESC_START_COMPOSITE:
5155                   if (str->iso2022.composite_chars)
5156                     Dynarr_reset (str->iso2022.composite_chars);
5157                   else
5158                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5159                   dst = str->iso2022.composite_chars;
5160                   break;
5161                 case ISO_ESC_END_COMPOSITE:
5162                   {
5163                     Bufbyte comstr[MAX_EMCHAR_LEN];
5164                     Bytecount len;
5165                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5166                                                          Dynarr_length (dst));
5167                     dst = real_dst;
5168                     len = set_charptr_emchar (comstr, emch);
5169                     Dynarr_add_many (dst, comstr, len);
5170                     break;
5171                   }
5172 #endif /* ENABLE_COMPOSITE_CHARS */
5173
5174                 case ISO_ESC_LITERAL:
5175                   DECODE_ADD_BINARY_CHAR (c, dst);
5176                   break;
5177
5178                 default:
5179                   /* Everything else handled already */
5180                   break;
5181                 }
5182             }
5183
5184           /* Attempted error recovery. */
5185           if (str->iso2022.output_direction_sequence)
5186             ensure_correct_direction (flags & CODING_STATE_R2L ?
5187                                       CHARSET_RIGHT_TO_LEFT :
5188                                       CHARSET_LEFT_TO_RIGHT,
5189                                       str->codesys, dst, 0, 1);
5190           /* More error recovery. */
5191           if (!retval || str->iso2022.output_literally)
5192             {
5193               /* Output the (possibly invalid) sequence */
5194               int i;
5195               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5196                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5197               flags &= CODING_STATE_ISO2022_LOCK;
5198               if (!retval)
5199                 n++, src--;/* Repeat the loop with the same character. */
5200               else
5201                 {
5202                   /* No sense in reprocessing the final byte of the
5203                      escape sequence; it could mess things up anyway.
5204                      Just add it now. */
5205                   DECODE_ADD_BINARY_CHAR (c, dst);
5206                 }
5207             }
5208           ch = 0;
5209         }
5210       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5211         { /* Control characters */
5212
5213           /***** Error-handling *****/
5214
5215           /* If we were in the middle of a character, dump out the
5216              partial character. */
5217           DECODE_OUTPUT_PARTIAL_CHAR (ch);
5218
5219           /* If we just saw a single-shift character, dump it out.
5220              This may dump out the wrong sort of single-shift character,
5221              but least it will give an indication that something went
5222              wrong. */
5223           if (flags & CODING_STATE_SS2)
5224             {
5225               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5226               flags &= ~CODING_STATE_SS2;
5227             }
5228           if (flags & CODING_STATE_SS3)
5229             {
5230               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5231               flags &= ~CODING_STATE_SS3;
5232             }
5233
5234           /***** Now handle the control characters. *****/
5235
5236           /* Handle CR/LF */
5237           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5238
5239           flags &= CODING_STATE_ISO2022_LOCK;
5240
5241           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5242             DECODE_ADD_BINARY_CHAR (c, dst);
5243         }
5244       else
5245         {                       /* Graphic characters */
5246           Lisp_Object charset;
5247 #ifndef UTF2000
5248           Charset_ID lb;
5249 #endif
5250           int reg;
5251
5252           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5253
5254           /* Now determine the charset. */
5255           reg = ((flags & CODING_STATE_SS2) ? 2
5256                  : (flags & CODING_STATE_SS3) ? 3
5257                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5258                  : str->iso2022.register_left);
5259           charset = str->iso2022.charset[reg];
5260
5261           /* Error checking: */
5262           if (! CHARSETP (charset)
5263               || str->iso2022.invalid_designated[reg]
5264               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5265                   && XCHARSET_CHARS (charset) == 94))
5266             /* Mrmph.  We are trying to invoke a register that has no
5267                or an invalid charset in it, or trying to add a character
5268                outside the range of the charset.  Insert that char literally
5269                to preserve it for the output. */
5270             {
5271               DECODE_OUTPUT_PARTIAL_CHAR (ch);
5272               DECODE_ADD_BINARY_CHAR (c, dst);
5273             }
5274
5275           else
5276             {
5277               /* Things are probably hunky-dorey. */
5278
5279               /* Fetch reverse charset, maybe. */
5280               if (((flags & CODING_STATE_R2L) &&
5281                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5282                   ||
5283                   (!(flags & CODING_STATE_R2L) &&
5284                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5285                 {
5286                   Lisp_Object new_charset =
5287                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5288                   if (!NILP (new_charset))
5289                     charset = new_charset;
5290                 }
5291
5292 #ifdef UTF2000
5293               if (XCHARSET_DIMENSION (charset) == 1)
5294                 {
5295                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5296                   DECODE_ADD_UCS_CHAR
5297                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5298                 }
5299               else if (ch)
5300                 {
5301                   DECODE_ADD_UCS_CHAR
5302                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5303                   ch = 0;
5304                 }
5305               else
5306                 ch = c;
5307 #else
5308               lb = XCHARSET_LEADING_BYTE (charset);
5309               switch (XCHARSET_REP_BYTES (charset))
5310                 {
5311                 case 1: /* ASCII */
5312                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5313                   Dynarr_add (dst, c & 0x7F);
5314                   break;
5315
5316                 case 2: /* one-byte official */
5317                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5318                   Dynarr_add (dst, lb);
5319                   Dynarr_add (dst, c | 0x80);
5320                   break;
5321
5322                 case 3: /* one-byte private or two-byte official */
5323                   if (XCHARSET_PRIVATE_P (charset))
5324                     {
5325                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5326                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5327                       Dynarr_add (dst, lb);
5328                       Dynarr_add (dst, c | 0x80);
5329                     }
5330                   else
5331                     {
5332                       if (ch)
5333                         {
5334                           Dynarr_add (dst, lb);
5335                           Dynarr_add (dst, ch | 0x80);
5336                           Dynarr_add (dst, c | 0x80);
5337                           ch = 0;
5338                         }
5339                       else
5340                         ch = c;
5341                     }
5342                   break;
5343
5344                 default:        /* two-byte private */
5345                   if (ch)
5346                     {
5347                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5348                       Dynarr_add (dst, lb);
5349                       Dynarr_add (dst, ch | 0x80);
5350                       Dynarr_add (dst, c | 0x80);
5351                       ch = 0;
5352                     }
5353                   else
5354                     ch = c;
5355                 }
5356 #endif
5357             }
5358
5359           if (!ch)
5360             flags &= CODING_STATE_ISO2022_LOCK;
5361         }
5362
5363     label_continue_loop:;
5364     }
5365
5366   if (flags & CODING_STATE_END)
5367     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5368
5369   str->flags = flags;
5370   str->ch    = ch;
5371 }
5372
5373
5374 /***** ISO2022 encoder *****/
5375
5376 /* Designate CHARSET into register REG. */
5377
5378 static void
5379 iso2022_designate (Lisp_Object charset, unsigned char reg,
5380                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5381 {
5382   static const char inter94[] = "()*+";
5383   static const char inter96[] = ",-./";
5384   unsigned int type;
5385   unsigned char final;
5386   Lisp_Object old_charset = str->iso2022.charset[reg];
5387
5388   str->iso2022.charset[reg] = charset;
5389   if (!CHARSETP (charset))
5390     /* charset might be an initial nil or t. */
5391     return;
5392   type = XCHARSET_TYPE (charset);
5393   final = XCHARSET_FINAL (charset);
5394   if (!str->iso2022.force_charset_on_output[reg] &&
5395       CHARSETP (old_charset) &&
5396       XCHARSET_TYPE (old_charset) == type &&
5397       XCHARSET_FINAL (old_charset) == final)
5398     return;
5399
5400   str->iso2022.force_charset_on_output[reg] = 0;
5401
5402   {
5403     charset_conversion_spec_dynarr *dyn =
5404       str->codesys->iso2022.output_conv;
5405
5406     if (dyn)
5407       {
5408         int i;
5409
5410         for (i = 0; i < Dynarr_length (dyn); i++)
5411           {
5412             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5413             if (EQ (charset, spec->from_charset))
5414                 charset = spec->to_charset;
5415           }
5416       }
5417   }
5418
5419   Dynarr_add (dst, ISO_CODE_ESC);
5420   switch (type)
5421     {
5422     case CHARSET_TYPE_94:
5423       Dynarr_add (dst, inter94[reg]);
5424       break;
5425     case CHARSET_TYPE_96:
5426       Dynarr_add (dst, inter96[reg]);
5427       break;
5428     case CHARSET_TYPE_94X94:
5429       Dynarr_add (dst, '$');
5430       if (reg != 0
5431           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5432           || final < '@'
5433           || final > 'B')
5434         Dynarr_add (dst, inter94[reg]);
5435       break;
5436     case CHARSET_TYPE_96X96:
5437       Dynarr_add (dst, '$');
5438       Dynarr_add (dst, inter96[reg]);
5439       break;
5440     }
5441   Dynarr_add (dst, final);
5442 }
5443
5444 static void
5445 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5446 {
5447   if (str->iso2022.register_left != 0)
5448     {
5449       Dynarr_add (dst, ISO_CODE_SI);
5450       str->iso2022.register_left = 0;
5451     }
5452 }
5453
5454 static void
5455 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5456 {
5457   if (str->iso2022.register_left != 1)
5458     {
5459       Dynarr_add (dst, ISO_CODE_SO);
5460       str->iso2022.register_left = 1;
5461     }
5462 }
5463
5464 /* Convert internally-formatted data to ISO2022 format. */
5465
5466 static void
5467 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5468                        unsigned_char_dynarr *dst, Lstream_data_count n)
5469 {
5470   unsigned char charmask, c;
5471   unsigned char char_boundary;
5472   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5473   unsigned int flags          = str->flags;
5474   Emchar ch                   = str->ch;
5475   Lisp_Coding_System *codesys = str->codesys;
5476   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5477   int i;
5478   Lisp_Object charset;
5479   int half;
5480 #ifdef UTF2000
5481   unsigned int byte1, byte2;
5482 #endif
5483
5484 #ifdef ENABLE_COMPOSITE_CHARS
5485   /* flags for handling composite chars.  We do a little switcharoo
5486      on the source while we're outputting the composite char. */
5487   unsigned int saved_n = 0;
5488   const unsigned char *saved_src = NULL;
5489   int in_composite = 0;
5490 #endif /* ENABLE_COMPOSITE_CHARS */
5491
5492   char_boundary = str->iso2022.current_char_boundary;
5493   charset = str->iso2022.current_charset;
5494   half = str->iso2022.current_half;
5495
5496 #ifdef ENABLE_COMPOSITE_CHARS
5497  back_to_square_n:
5498 #endif
5499 #ifdef UTF2000
5500   while (n--)
5501     {
5502       c = *src++;
5503
5504       switch (char_boundary)
5505         {
5506         case 0:
5507           if ( c >= 0xfc )
5508             {
5509               ch = c & 0x01;
5510               char_boundary = 5;
5511             }
5512           else if ( c >= 0xf8 )
5513             {
5514               ch = c & 0x03;
5515               char_boundary = 4;
5516             }
5517           else if ( c >= 0xf0 )
5518             {
5519               ch = c & 0x07;
5520               char_boundary = 3;
5521             }
5522           else if ( c >= 0xe0 )
5523             {
5524               ch = c & 0x0f;
5525               char_boundary = 2;
5526             }
5527           else if ( c >= 0xc0 )
5528             {
5529               ch = c & 0x1f;
5530               char_boundary = 1;
5531             }
5532           else
5533             {
5534               ch = 0;
5535
5536               restore_left_to_right_direction (codesys, dst, &flags, 0);
5537
5538               /* Make sure G0 contains ASCII */
5539               if ((c > ' ' && c < ISO_CODE_DEL) ||
5540                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5541                 {
5542                   ensure_normal_shift (str, dst);
5543                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5544                 }
5545
5546               /* If necessary, restore everything to the default state
5547                  at end-of-line */
5548               if (c == '\n' &&
5549                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5550                 {
5551                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5552
5553                   ensure_normal_shift (str, dst);
5554
5555                   for (i = 0; i < 4; i++)
5556                     {
5557                       Lisp_Object initial_charset =
5558                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5559                       iso2022_designate (initial_charset, i, str, dst);
5560                     }
5561                 }
5562               if (c == '\n')
5563                 {
5564                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5565                     Dynarr_add (dst, '\r');
5566                   if (eol_type != EOL_CR)
5567                     Dynarr_add (dst, c);
5568                 }
5569               else
5570                 {
5571                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5572                       && fit_to_be_escape_quoted (c))
5573                     Dynarr_add (dst, ISO_CODE_ESC);
5574                   Dynarr_add (dst, c);
5575                 }
5576               char_boundary = 0;
5577             }
5578           break;
5579         case 1:
5580           ch = ( ch << 6 ) | ( c & 0x3f );
5581
5582           char_boundary = 0;
5583           if ( (0x80 <= ch) && (ch <= 0x9f) )
5584             {
5585               charmask = (half == 0 ? 0x00 : 0x80);
5586
5587               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5588                   && fit_to_be_escape_quoted (ch))
5589                 Dynarr_add (dst, ISO_CODE_ESC);
5590               /* you asked for it ... */
5591               Dynarr_add (dst, ch);
5592             }
5593           else
5594             {
5595               int reg;
5596
5597               BREAKUP_CHAR (ch, charset, byte1, byte2);
5598               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5599                                         codesys, dst, &flags, 0);
5600
5601               /* Now determine which register to use. */
5602               reg = -1;
5603               for (i = 0; i < 4; i++)
5604                 {
5605                   if (EQ (charset, str->iso2022.charset[i]) ||
5606                       EQ (charset,
5607                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5608                     {
5609                       reg = i;
5610                       break;
5611                     }
5612                 }
5613
5614               if (reg == -1)
5615                 {
5616                   if (XCHARSET_GRAPHIC (charset) != 0)
5617                     {
5618                       if (!NILP (str->iso2022.charset[1]) &&
5619                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5620                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5621                         reg = 1;
5622                       else if (!NILP (str->iso2022.charset[2]))
5623                         reg = 2;
5624                       else if (!NILP (str->iso2022.charset[3]))
5625                         reg = 3;
5626                       else
5627                         reg = 0;
5628                     }
5629                   else
5630                     reg = 0;
5631                 }
5632
5633               iso2022_designate (charset, reg, str, dst);
5634
5635               /* Now invoke that register. */
5636               switch (reg)
5637                 {
5638                 case 0:
5639                   ensure_normal_shift (str, dst);
5640                   half = 0;
5641                   break;
5642
5643                 case 1:
5644                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5645                     {
5646                       ensure_shift_out (str, dst);
5647                       half = 0;
5648                     }
5649                   else
5650                     half = 1;
5651                   break;
5652
5653                 case 2:
5654                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5655                     {
5656                       Dynarr_add (dst, ISO_CODE_ESC);
5657                       Dynarr_add (dst, 'N');
5658                       half = 0;
5659                     }
5660                   else
5661                     {
5662                       Dynarr_add (dst, ISO_CODE_SS2);
5663                       half = 1;
5664                     }
5665                   break;
5666
5667                 case 3:
5668                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5669                     {
5670                       Dynarr_add (dst, ISO_CODE_ESC);
5671                       Dynarr_add (dst, 'O');
5672                       half = 0;
5673                     }
5674                   else
5675                     {
5676                       Dynarr_add (dst, ISO_CODE_SS3);
5677                       half = 1;
5678                     }
5679                   break;
5680
5681                 default:
5682                   abort ();
5683                 }
5684
5685               charmask = (half == 0 ? 0x00 : 0x80);
5686
5687               switch (XCHARSET_DIMENSION (charset))
5688                 {
5689                 case 1:
5690                   Dynarr_add (dst, byte1 | charmask);
5691                   break;
5692                 case 2:
5693                   Dynarr_add (dst, byte1 | charmask);
5694                   Dynarr_add (dst, byte2 | charmask);
5695                   break;
5696                 default:
5697                   abort ();
5698                 }
5699             }
5700           ch =0;
5701           break;
5702         default:
5703           ch = ( ch << 6 ) | ( c & 0x3f );
5704           char_boundary--;
5705         }
5706     }
5707 #else /* not UTF2000 */
5708
5709   while (n--)
5710     {
5711       c = *src++;
5712
5713       if (BYTE_ASCII_P (c))
5714         {               /* Processing ASCII character */
5715           ch = 0;
5716
5717           restore_left_to_right_direction (codesys, dst, &flags, 0);
5718
5719           /* Make sure G0 contains ASCII */
5720           if ((c > ' ' && c < ISO_CODE_DEL) ||
5721               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5722             {
5723               ensure_normal_shift (str, dst);
5724               iso2022_designate (Vcharset_ascii, 0, str, dst);
5725             }
5726
5727           /* If necessary, restore everything to the default state
5728              at end-of-line */
5729           if (c == '\n' &&
5730               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5731             {
5732               restore_left_to_right_direction (codesys, dst, &flags, 0);
5733
5734               ensure_normal_shift (str, dst);
5735
5736               for (i = 0; i < 4; i++)
5737                 {
5738                   Lisp_Object initial_charset =
5739                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5740                   iso2022_designate (initial_charset, i, str, dst);
5741                 }
5742             }
5743           if (c == '\n')
5744             {
5745               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5746                 Dynarr_add (dst, '\r');
5747               if (eol_type != EOL_CR)
5748                 Dynarr_add (dst, c);
5749             }
5750           else
5751             {
5752               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5753                   && fit_to_be_escape_quoted (c))
5754                 Dynarr_add (dst, ISO_CODE_ESC);
5755               Dynarr_add (dst, c);
5756             }
5757           char_boundary = 1;
5758         }
5759
5760       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5761         { /* Processing Leading Byte */
5762           ch = 0;
5763           charset = CHARSET_BY_LEADING_BYTE (c);
5764           if (LEADING_BYTE_PREFIX_P(c))
5765             ch = c;
5766           else if (!EQ (charset, Vcharset_control_1)
5767 #ifdef ENABLE_COMPOSITE_CHARS
5768                    && !EQ (charset, Vcharset_composite)
5769 #endif
5770                    )
5771             {
5772               int reg;
5773
5774               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5775                                         codesys, dst, &flags, 0);
5776
5777               /* Now determine which register to use. */
5778               reg = -1;
5779               for (i = 0; i < 4; i++)
5780                 {
5781                   if (EQ (charset, str->iso2022.charset[i]) ||
5782                       EQ (charset,
5783                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5784                     {
5785                       reg = i;
5786                       break;
5787                     }
5788                 }
5789
5790               if (reg == -1)
5791                 {
5792                   if (XCHARSET_GRAPHIC (charset) != 0)
5793                     {
5794                       if (!NILP (str->iso2022.charset[1]) &&
5795                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5796                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5797                         reg = 1;
5798                       else if (!NILP (str->iso2022.charset[2]))
5799                         reg = 2;
5800                       else if (!NILP (str->iso2022.charset[3]))
5801                         reg = 3;
5802                       else
5803                         reg = 0;
5804                     }
5805                   else
5806                     reg = 0;
5807                 }
5808
5809               iso2022_designate (charset, reg, str, dst);
5810
5811               /* Now invoke that register. */
5812               switch (reg)
5813                 {
5814                 case 0:
5815                   ensure_normal_shift (str, dst);
5816                   half = 0;
5817                   break;
5818
5819                 case 1:
5820                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5821                     {
5822                       ensure_shift_out (str, dst);
5823                       half = 0;
5824                     }
5825                   else
5826                     half = 1;
5827                   break;
5828
5829                 case 2:
5830                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5831                     {
5832                       Dynarr_add (dst, ISO_CODE_ESC);
5833                       Dynarr_add (dst, 'N');
5834                       half = 0;
5835                     }
5836                   else
5837                     {
5838                       Dynarr_add (dst, ISO_CODE_SS2);
5839                       half = 1;
5840                     }
5841                   break;
5842
5843                 case 3:
5844                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5845                     {
5846                       Dynarr_add (dst, ISO_CODE_ESC);
5847                       Dynarr_add (dst, 'O');
5848                       half = 0;
5849                     }
5850                   else
5851                     {
5852                       Dynarr_add (dst, ISO_CODE_SS3);
5853                       half = 1;
5854                     }
5855                   break;
5856
5857                 default:
5858                   ABORT ();
5859                 }
5860             }
5861           char_boundary = 0;
5862         }
5863       else
5864         {                       /* Processing Non-ASCII character */
5865           charmask = (half == 0 ? 0x7F : 0xFF);
5866           char_boundary = 1;
5867           if (EQ (charset, Vcharset_control_1))
5868             {
5869               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5870                   && fit_to_be_escape_quoted (c))
5871                 Dynarr_add (dst, ISO_CODE_ESC);
5872               /* you asked for it ... */
5873               Dynarr_add (dst, c - 0x20);
5874             }
5875           else
5876             {
5877               switch (XCHARSET_REP_BYTES (charset))
5878                 {
5879                 case 2:
5880                   Dynarr_add (dst, c & charmask);
5881                   break;
5882                 case 3:
5883                   if (XCHARSET_PRIVATE_P (charset))
5884                     {
5885                       Dynarr_add (dst, c & charmask);
5886                       ch = 0;
5887                     }
5888                   else if (ch)
5889                     {
5890 #ifdef ENABLE_COMPOSITE_CHARS
5891                       if (EQ (charset, Vcharset_composite))
5892                         {
5893                           if (in_composite)
5894                             {
5895                               /* #### Bother! We don't know how to
5896                                  handle this yet. */
5897                               Dynarr_add (dst, '~');
5898                             }
5899                           else
5900                             {
5901                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5902                                                        ch & 0x7F, c & 0x7F);
5903                               Lisp_Object lstr = composite_char_string (emch);
5904                               saved_n = n;
5905                               saved_src = src;
5906                               in_composite = 1;
5907                               src = XSTRING_DATA   (lstr);
5908                               n   = XSTRING_LENGTH (lstr);
5909                               Dynarr_add (dst, ISO_CODE_ESC);
5910                               Dynarr_add (dst, '0'); /* start composing */
5911                             }
5912                         }
5913                       else
5914 #endif /* ENABLE_COMPOSITE_CHARS */
5915                         {
5916                           Dynarr_add (dst, ch & charmask);
5917                           Dynarr_add (dst, c & charmask);
5918                         }
5919                       ch = 0;
5920                     }
5921                   else
5922                     {
5923                       ch = c;
5924                       char_boundary = 0;
5925                     }
5926                   break;
5927                 case 4:
5928                   if (ch)
5929                     {
5930                       Dynarr_add (dst, ch & charmask);
5931                       Dynarr_add (dst, c & charmask);
5932                       ch = 0;
5933                     }
5934                   else
5935                     {
5936                       ch = c;
5937                       char_boundary = 0;
5938                     }
5939                   break;
5940                 default:
5941                   ABORT ();
5942                 }
5943             }
5944         }
5945     }
5946 #endif /* not UTF2000 */
5947
5948 #ifdef ENABLE_COMPOSITE_CHARS
5949   if (in_composite)
5950     {
5951       n = saved_n;
5952       src = saved_src;
5953       in_composite = 0;
5954       Dynarr_add (dst, ISO_CODE_ESC);
5955       Dynarr_add (dst, '1'); /* end composing */
5956       goto back_to_square_n; /* Wheeeeeeeee ..... */
5957     }
5958 #endif /* ENABLE_COMPOSITE_CHARS */
5959
5960 #ifdef UTF2000
5961   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5962 #else
5963   if (char_boundary && flags & CODING_STATE_END)
5964 #endif
5965     {
5966       restore_left_to_right_direction (codesys, dst, &flags, 0);
5967       ensure_normal_shift (str, dst);
5968       for (i = 0; i < 4; i++)
5969         {
5970           Lisp_Object initial_charset =
5971             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5972           iso2022_designate (initial_charset, i, str, dst);
5973         }
5974     }
5975
5976   str->flags = flags;
5977   str->ch    = ch;
5978   str->iso2022.current_char_boundary = char_boundary;
5979   str->iso2022.current_charset = charset;
5980   str->iso2022.current_half = half;
5981
5982   /* Verbum caro factum est! */
5983 }
5984 #endif /* MULE */
5985 \f
5986 /************************************************************************/
5987 /*                     No-conversion methods                            */
5988 /************************************************************************/
5989
5990 /* This is used when reading in "binary" files -- i.e. files that may
5991    contain all 256 possible byte values and that are not to be
5992    interpreted as being in any particular decoding. */
5993 static void
5994 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5995                              unsigned_char_dynarr *dst, Lstream_data_count n)
5996 {
5997   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5998   unsigned int flags  = str->flags;
5999   unsigned int ch     = str->ch;
6000   eol_type_t eol_type = str->eol_type;
6001
6002   while (n--)
6003     {
6004       unsigned char c = *(unsigned char *)src++;
6005
6006       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
6007       DECODE_ADD_BINARY_CHAR (c, dst);
6008     label_continue_loop:;
6009     }
6010
6011   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
6012
6013   str->flags = flags;
6014   str->ch    = ch;
6015 }
6016
6017 static void
6018 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6019                              unsigned_char_dynarr *dst, Lstream_data_count n)
6020 {
6021   unsigned char c;
6022   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6023   unsigned int flags  = str->flags;
6024   unsigned int ch     = str->ch;
6025   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6026 #ifdef UTF2000
6027   unsigned char char_boundary = str->iso2022.current_char_boundary;
6028 #endif
6029
6030   while (n--)
6031     {
6032       c = *src++;
6033 #ifdef UTF2000
6034       switch (char_boundary)
6035         {
6036         case 0:
6037           if ( c >= 0xfc )
6038             {
6039               ch = c & 0x01;
6040               char_boundary = 5;
6041             }
6042           else if ( c >= 0xf8 )
6043             {
6044               ch = c & 0x03;
6045               char_boundary = 4;
6046             }
6047           else if ( c >= 0xf0 )
6048             {
6049               ch = c & 0x07;
6050               char_boundary = 3;
6051             }
6052           else if ( c >= 0xe0 )
6053             {
6054               ch = c & 0x0f;
6055               char_boundary = 2;
6056             }
6057           else if ( c >= 0xc0 )
6058             {
6059               ch = c & 0x1f;
6060               char_boundary = 1;
6061             }
6062           else
6063             {
6064               ch = 0;
6065
6066               if (c == '\n')
6067                 {
6068                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6069                     Dynarr_add (dst, '\r');
6070                   if (eol_type != EOL_CR)
6071                     Dynarr_add (dst, c);
6072                 }
6073               else
6074                 Dynarr_add (dst, c);
6075               char_boundary = 0;
6076             }
6077           break;
6078         case 1:
6079           ch = ( ch << 6 ) | ( c & 0x3f );
6080           Dynarr_add (dst, ch & 0xff);
6081           char_boundary = 0;
6082           break;
6083         default:
6084           ch = ( ch << 6 ) | ( c & 0x3f );
6085           char_boundary--;
6086         }
6087 #else /* not UTF2000 */
6088       if (c == '\n')
6089         {
6090           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6091             Dynarr_add (dst, '\r');
6092           if (eol_type != EOL_CR)
6093             Dynarr_add (dst, '\n');
6094           ch = 0;
6095         }
6096       else if (BYTE_ASCII_P (c))
6097         {
6098           assert (ch == 0);
6099           Dynarr_add (dst, c);
6100         }
6101       else if (BUFBYTE_LEADING_BYTE_P (c))
6102         {
6103           assert (ch == 0);
6104           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6105               c == LEADING_BYTE_CONTROL_1)
6106             ch = c;
6107           else
6108             Dynarr_add (dst, '~'); /* untranslatable character */
6109         }
6110       else
6111         {
6112           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6113             Dynarr_add (dst, c);
6114           else if (ch == LEADING_BYTE_CONTROL_1)
6115             {
6116               assert (c < 0xC0);
6117               Dynarr_add (dst, c - 0x20);
6118             }
6119           /* else it should be the second or third byte of an
6120              untranslatable character, so ignore it */
6121           ch = 0;
6122         }
6123 #endif /* not UTF2000 */
6124     }
6125
6126   str->flags = flags;
6127   str->ch    = ch;
6128 #ifdef UTF2000
6129   str->iso2022.current_char_boundary = char_boundary;
6130 #endif
6131 }
6132
6133 \f
6134
6135 /************************************************************************/
6136 /*                             Initialization                           */
6137 /************************************************************************/
6138
6139 void
6140 syms_of_file_coding (void)
6141 {
6142   INIT_LRECORD_IMPLEMENTATION (coding_system);
6143
6144   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6145
6146   DEFSUBR (Fcoding_system_p);
6147   DEFSUBR (Ffind_coding_system);
6148   DEFSUBR (Fget_coding_system);
6149   DEFSUBR (Fcoding_system_list);
6150   DEFSUBR (Fcoding_system_name);
6151   DEFSUBR (Fmake_coding_system);
6152   DEFSUBR (Fcopy_coding_system);
6153   DEFSUBR (Fcoding_system_canonical_name_p);
6154   DEFSUBR (Fcoding_system_alias_p);
6155   DEFSUBR (Fcoding_system_aliasee);
6156   DEFSUBR (Fdefine_coding_system_alias);
6157   DEFSUBR (Fsubsidiary_coding_system);
6158
6159   DEFSUBR (Fcoding_system_type);
6160   DEFSUBR (Fcoding_system_doc_string);
6161 #ifdef MULE
6162   DEFSUBR (Fcoding_system_charset);
6163 #endif
6164   DEFSUBR (Fcoding_system_property);
6165
6166   DEFSUBR (Fcoding_category_list);
6167   DEFSUBR (Fset_coding_priority_list);
6168   DEFSUBR (Fcoding_priority_list);
6169   DEFSUBR (Fset_coding_category_system);
6170   DEFSUBR (Fcoding_category_system);
6171
6172   DEFSUBR (Fdetect_coding_region);
6173   DEFSUBR (Fdecode_coding_region);
6174   DEFSUBR (Fencode_coding_region);
6175 #ifdef MULE
6176   DEFSUBR (Fdecode_shift_jis_char);
6177   DEFSUBR (Fencode_shift_jis_char);
6178   DEFSUBR (Fdecode_big5_char);
6179   DEFSUBR (Fencode_big5_char);
6180 #ifndef UTF2000
6181   DEFSUBR (Fset_ucs_char);
6182   DEFSUBR (Fucs_char);
6183   DEFSUBR (Fset_char_ucs);
6184   DEFSUBR (Fchar_ucs);
6185 #endif /* not UTF2000 */
6186 #endif /* MULE */
6187   defsymbol (&Qcoding_systemp, "coding-system-p");
6188   defsymbol (&Qno_conversion, "no-conversion");
6189   defsymbol (&Qraw_text, "raw-text");
6190 #ifdef MULE
6191   defsymbol (&Qbig5, "big5");
6192   defsymbol (&Qshift_jis, "shift-jis");
6193   defsymbol (&Qucs4, "ucs-4");
6194   defsymbol (&Qutf8, "utf-8");
6195   defsymbol (&Qccl, "ccl");
6196   defsymbol (&Qiso2022, "iso2022");
6197 #endif /* MULE */
6198   defsymbol (&Qmnemonic, "mnemonic");
6199   defsymbol (&Qeol_type, "eol-type");
6200   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6201   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6202
6203   defsymbol (&Qcr, "cr");
6204   defsymbol (&Qlf, "lf");
6205   defsymbol (&Qcrlf, "crlf");
6206   defsymbol (&Qeol_cr, "eol-cr");
6207   defsymbol (&Qeol_lf, "eol-lf");
6208   defsymbol (&Qeol_crlf, "eol-crlf");
6209 #ifdef MULE
6210   defsymbol (&Qcharset_g0, "charset-g0");
6211   defsymbol (&Qcharset_g1, "charset-g1");
6212   defsymbol (&Qcharset_g2, "charset-g2");
6213   defsymbol (&Qcharset_g3, "charset-g3");
6214   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6215   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6216   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6217   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6218   defsymbol (&Qno_iso6429, "no-iso6429");
6219   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6220   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6221
6222   defsymbol (&Qshort, "short");
6223   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6224   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6225   defsymbol (&Qseven, "seven");
6226   defsymbol (&Qlock_shift, "lock-shift");
6227   defsymbol (&Qescape_quoted, "escape-quoted");
6228 #endif /* MULE */
6229   defsymbol (&Qencode, "encode");
6230   defsymbol (&Qdecode, "decode");
6231
6232 #ifdef MULE
6233   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6234              "shift-jis");
6235   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6236              "big5");
6237   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6238              "ucs-4");
6239   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6240              "utf-8");
6241   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6242              "iso-7");
6243   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6244              "iso-8-designate");
6245   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6246              "iso-8-1");
6247   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6248              "iso-8-2");
6249   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6250              "iso-lock-shift");
6251 #endif /* MULE */
6252   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6253              "no-conversion");
6254 }
6255
6256 void
6257 lstream_type_create_file_coding (void)
6258 {
6259   LSTREAM_HAS_METHOD (decoding, reader);
6260   LSTREAM_HAS_METHOD (decoding, writer);
6261   LSTREAM_HAS_METHOD (decoding, rewinder);
6262   LSTREAM_HAS_METHOD (decoding, seekable_p);
6263   LSTREAM_HAS_METHOD (decoding, flusher);
6264   LSTREAM_HAS_METHOD (decoding, closer);
6265   LSTREAM_HAS_METHOD (decoding, marker);
6266
6267   LSTREAM_HAS_METHOD (encoding, reader);
6268   LSTREAM_HAS_METHOD (encoding, writer);
6269   LSTREAM_HAS_METHOD (encoding, rewinder);
6270   LSTREAM_HAS_METHOD (encoding, seekable_p);
6271   LSTREAM_HAS_METHOD (encoding, flusher);
6272   LSTREAM_HAS_METHOD (encoding, closer);
6273   LSTREAM_HAS_METHOD (encoding, marker);
6274 }
6275
6276 void
6277 vars_of_file_coding (void)
6278 {
6279   int i;
6280
6281   fcd = xnew (struct file_coding_dump);
6282   dump_add_root_struct_ptr (&fcd, &fcd_description);
6283
6284   /* Initialize to something reasonable ... */
6285   for (i = 0; i < CODING_CATEGORY_LAST; i++)
6286     {
6287       fcd->coding_category_system[i] = Qnil;
6288       fcd->coding_category_by_priority[i] = i;
6289     }
6290
6291   Fprovide (intern ("file-coding"));
6292
6293   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6294 Coding system used for TTY keyboard input.
6295 Not used under a windowing system.
6296 */ );
6297   Vkeyboard_coding_system = Qnil;
6298
6299   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6300 Coding system used for TTY display output.
6301 Not used under a windowing system.
6302 */ );
6303   Vterminal_coding_system = Qnil;
6304
6305   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6306 Overriding coding system used when reading from a file or process.
6307 You should bind this variable with `let', but do not set it globally.
6308 If this is non-nil, it specifies the coding system that will be used
6309 to decode input on read operations, such as from a file or process.
6310 It overrides `buffer-file-coding-system-for-read',
6311 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6312 this one for permanent changes to the environment.  */ );
6313   Vcoding_system_for_read = Qnil;
6314
6315   DEFVAR_LISP ("coding-system-for-write",
6316                &Vcoding_system_for_write /*
6317 Overriding coding system used when writing to a file or process.
6318 You should bind this variable with `let', but do not set it globally.
6319 If this is non-nil, it specifies the coding system that will be used
6320 to encode output for write operations, such as to a file or process.
6321 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6322 Use those variables instead of this one for permanent changes to the
6323 environment.  */ );
6324   Vcoding_system_for_write = Qnil;
6325
6326   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6327 Coding system used to convert pathnames when accessing files.
6328 */ );
6329   Vfile_name_coding_system = Qnil;
6330
6331   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6332 Non-nil means the buffer contents are regarded as multi-byte form
6333 of characters, not a binary code.  This affects the display, file I/O,
6334 and behaviors of various editing commands.
6335
6336 Setting this to nil does not do anything.
6337 */ );
6338   enable_multibyte_characters = 1;
6339 }
6340
6341 void
6342 complex_vars_of_file_coding (void)
6343 {
6344   staticpro (&Vcoding_system_hash_table);
6345   Vcoding_system_hash_table =
6346     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6347
6348   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6349   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6350
6351 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6352 {                                               \
6353   struct codesys_prop csp;                      \
6354   csp.sym = (Sym);                              \
6355   csp.prop_type = (Prop_Type);                  \
6356   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6357 } while (0)
6358
6359   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6360   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6361   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6362   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6363   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6364   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6365   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6366 #ifdef MULE
6367   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6368   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6369   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6370   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6371   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6372   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6373   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6374   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6375   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6376   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6377   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6378   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6379   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6380   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6381   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6382   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6383   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6384
6385   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6386   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6387 #endif /* MULE */
6388   /* Need to create this here or we're really screwed. */
6389   Fmake_coding_system
6390     (Qraw_text, Qno_conversion,
6391      build_string ("Raw text, which means it converts only line-break-codes."),
6392      list2 (Qmnemonic, build_string ("Raw")));
6393
6394   Fmake_coding_system
6395     (Qbinary, Qno_conversion,
6396      build_string ("Binary, which means it does not convert anything."),
6397      list4 (Qeol_type, Qlf,
6398             Qmnemonic, build_string ("Binary")));
6399
6400 #ifdef UTF2000
6401   Fmake_coding_system
6402     (Qutf8, Qutf8,
6403      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6404      list2 (Qmnemonic, build_string ("UTF8")));
6405 #endif
6406
6407   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6408
6409   Fdefine_coding_system_alias (Qfile_name, Qbinary);
6410
6411   Fdefine_coding_system_alias (Qterminal, Qbinary);
6412   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6413
6414   /* Need this for bootstrapping */
6415   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6416     Fget_coding_system (Qraw_text);
6417
6418 #ifdef UTF2000
6419   fcd->coding_category_system[CODING_CATEGORY_UTF8]
6420    = Fget_coding_system (Qutf8);
6421 #endif
6422
6423 #if defined(MULE) && !defined(UTF2000)
6424   {
6425     size_t i;
6426
6427     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6428       fcd->ucs_to_mule_table[i] = Qnil;
6429   }
6430   staticpro (&mule_to_ucs_table);
6431   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6432 #endif /* defined(MULE) && !defined(UTF2000) */
6433 }