src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  50
  51
  52
  53 struct file_coding_dump {
  54   /* Coding system currently associated with each coding category. */
  55   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  56
  57   /* Table of all coding categories in decreasing order of priority.
  58      This describes a permutation of the possible coding categories. */
  59   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  60
  61 #ifndef UTF2000
  62   Lisp_Object ucs_to_mule_table[65536];
  63 #endif
  64 } *fcd;
  65
  66 static const struct lrecord_description fcd_description_1[] = {
  67   { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  68 #ifndef UTF2000
  69   { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table),      65536 },
  70 #endif
  71   { XD_END }
  72 };
  73
  74 static const struct struct_description fcd_description = {
  75   sizeof(struct file_coding_dump),
  76   fcd_description_1
  77 };
  78
  79 Lisp_Object mule_to_ucs_table;
  80
  81 Lisp_Object Qcoding_systemp;
  82
  83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  84 /* Qinternal in general.c */
  85
  86 Lisp_Object Qmnemonic, Qeol_type;
  87 Lisp_Object Qcr, Qcrlf, Qlf;
  88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  89 Lisp_Object Qpost_read_conversion;
  90 Lisp_Object Qpre_write_conversion;
  91
  92 #ifdef MULE
  93 Lisp_Object Qucs4, Qutf8;
  94 Lisp_Object Qbig5, Qshift_jis;
  95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  98 Lisp_Object Qno_iso6429;
  99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 100 Lisp_Object Qctext, Qescape_quoted;
 101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 102 #endif
 103 #ifdef UTF2000
 104 Lisp_Object Qdisable_composition;
 105 #endif
 106 Lisp_Object Qencode, Qdecode;
 107
 108 Lisp_Object Vcoding_system_hash_table;
 109
 110 int enable_multibyte_characters;
 111
 112 #ifdef MULE
 113 /* Additional information used by the ISO2022 decoder and detector. */
 114 struct iso2022_decoder
 115 {
 116   /* CHARSET holds the character sets currently assigned to the G0
 117      through G3 variables.  It is initialized from the array
 118      INITIAL_CHARSET in CODESYS. */
 119   Lisp_Object charset[4];
 120
 121   /* Which registers are currently invoked into the left (GL) and
 122      right (GR) halves of the 8-bit encoding space? */
 123   int register_left, register_right;
 124
 125   /* ISO_ESC holds a value indicating part of an escape sequence
 126      that has already been seen. */
 127   enum iso_esc_flag esc;
 128
 129   /* This records the bytes we've seen so far in an escape sequence,
 130      in case the sequence is invalid (we spit out the bytes unchanged). */
 131   unsigned char esc_bytes[8];
 132
 133   /* Index for next byte to store in ISO escape sequence. */
 134   int esc_bytes_index;
 135
 136 #ifdef ENABLE_COMPOSITE_CHARS
 137   /* Stuff seen so far when composing a string. */
 138   unsigned_char_dynarr *composite_chars;
 139 #endif
 140
 141   /* If we saw an invalid designation sequence for a particular
 142      register, we flag it here and switch to ASCII.  The next time we
 143      see a valid designation for this register, we turn off the flag
 144      and do the designation normally, but pretend the sequence was
 145      invalid.  The effect of all this is that (most of the time) the
 146      escape sequences for both the switch to the unknown charset, and
 147      the switch back to the known charset, get inserted literally into
 148      the buffer and saved out as such.  The hope is that we can
 149      preserve the escape sequences so that the resulting written out
 150      file makes sense.  If we don't do any of this, the designation
 151      to the invalid charset will be preserved but that switch back
 152      to the known charset will probably get eaten because it was
 153      the same charset that was already present in the register. */
 154   unsigned char invalid_designated[4];
 155
 156   /* We try to do similar things as above for direction-switching
 157      sequences.  If we encountered a direction switch while an
 158      invalid designation was present, or an invalid designation
 159      just after a direction switch (i.e. no valid designation
 160      encountered yet), we insert the direction-switch escape
 161      sequence literally into the output stream, and later on
 162      insert the corresponding direction-restoring escape sequence
 163      literally also. */
 164   unsigned int switched_dir_and_no_valid_charset_yet :1;
 165   unsigned int invalid_switch_dir :1;
 166
 167   /* Tells the decoder to output the escape sequence literally
 168      even though it was valid.  Used in the games we play to
 169      avoid lossage when we encounter invalid designations. */
 170   unsigned int output_literally :1;
 171   /* We encountered a direction switch followed by an invalid
 172      designation.  We didn't output the direction switch
 173      literally because we didn't know about the invalid designation;
 174      but we have to do so now. */
 175   unsigned int output_direction_sequence :1;
 176 };
 177 #endif /* MULE */
 178 EXFUN (Fcopy_coding_system, 2);
 179 #ifdef MULE
 180 struct detection_state;
 181
 182 static void
 183 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
 184                      unsigned_char_dynarr *dst, unsigned int n);
 185
 186 static int detect_coding_sjis (struct detection_state *st,
 187                                CONST unsigned char *src,
 188                                unsigned int n);
 189 static void decode_coding_sjis (Lstream *decoding,
 190                                 CONST unsigned char *src,
 191                                 unsigned_char_dynarr *dst,
 192                                 unsigned int n);
 193 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 194                             unsigned_char_dynarr *dst, unsigned int *flags);
 195 void char_finish_shift_jis (struct encoding_stream *str,
 196                             unsigned_char_dynarr *dst, unsigned int *flags);
 197
 198 static int detect_coding_big5 (struct detection_state *st,
 199                                CONST unsigned char *src,
 200                                unsigned int n);
 201 static void decode_coding_big5 (Lstream *decoding,
 202                                 CONST unsigned char *src,
 203                                 unsigned_char_dynarr *dst, unsigned int n);
 204 static void encode_coding_big5 (Lstream *encoding,
 205                                 CONST unsigned char *src,
 206                                 unsigned_char_dynarr *dst, unsigned int n);
 207 static int detect_coding_ucs4 (struct detection_state *st,
 208                                CONST unsigned char *src,
 209                                unsigned int n);
 210 static void decode_coding_ucs4 (Lstream *decoding,
 211                                 CONST unsigned char *src,
 212                                 unsigned_char_dynarr *dst, unsigned int n);
 213 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 214                        unsigned_char_dynarr *dst, unsigned int *flags);
 215 void char_finish_ucs4 (struct encoding_stream *str,
 216                        unsigned_char_dynarr *dst, unsigned int *flags);
 217
 218 static int detect_coding_utf8 (struct detection_state *st,
 219                                CONST unsigned char *src,
 220                                unsigned int n);
 221 static void decode_coding_utf8 (Lstream *decoding,
 222                                 CONST unsigned char *src,
 223                                 unsigned_char_dynarr *dst, unsigned int n);
 224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 225                        unsigned_char_dynarr *dst, unsigned int *flags);
 226 void char_finish_utf8 (struct encoding_stream *str,
 227                        unsigned_char_dynarr *dst, unsigned int *flags);
 228
 229 static int postprocess_iso2022_mask (int mask);
 230 static void reset_iso2022 (Lisp_Object coding_system,
 231                            struct iso2022_decoder *iso);
 232 static int detect_coding_iso2022 (struct detection_state *st,
 233                                   CONST unsigned char *src,
 234                                   unsigned int n);
 235 static void decode_coding_iso2022 (Lstream *decoding,
 236                                    CONST unsigned char *src,
 237                                    unsigned_char_dynarr *dst, unsigned int n);
 238 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 239                           unsigned_char_dynarr *dst, unsigned int *flags);
 240 void char_finish_iso2022 (struct encoding_stream *str,
 241                           unsigned_char_dynarr *dst, unsigned int *flags);
 242 #endif /* MULE */
 243 static void decode_coding_no_conversion (Lstream *decoding,
 244                                          CONST unsigned char *src,
 245                                          unsigned_char_dynarr *dst,
 246                                          unsigned int n);
 247 static void encode_coding_no_conversion (Lstream *encoding,
 248                                          CONST unsigned char *src,
 249                                          unsigned_char_dynarr *dst,
 250                                          unsigned int n);
 251 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 252                          unsigned_char_dynarr *dst, unsigned int n);
 253 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 254                          unsigned_char_dynarr *dst, unsigned int n);
 255
 256 typedef struct codesys_prop codesys_prop;
 257 struct codesys_prop
 258 {
 259   Lisp_Object sym;
 260   int prop_type;
 261 };
 262
 263 typedef struct
 264 {
 265   Dynarr_declare (codesys_prop);
 266 } codesys_prop_dynarr;
 267
 268 static const struct lrecord_description codesys_prop_description_1[] = {
 269   { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
 270   { XD_END }
 271 };
 272
 273 static const struct struct_description codesys_prop_description = {
 274   sizeof(codesys_prop),
 275   codesys_prop_description_1
 276 };
 277
 278 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 279   XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
 280   { XD_END }
 281 };
 282
 283 static const struct struct_description codesys_prop_dynarr_description = {
 284   sizeof(codesys_prop_dynarr),
 285   codesys_prop_dynarr_description_1
 286 };
 287
 288 codesys_prop_dynarr *the_codesys_prop_dynarr;
 289
 290 enum codesys_prop_enum
 291 {
 292   CODESYS_PROP_ALL_OK,
 293   CODESYS_PROP_ISO2022,
 294   CODESYS_PROP_CCL
 295 };
 296
 297 \f
 298 /************************************************************************/
 299 /*                       Coding system functions                        */
 300 /************************************************************************/
 301
 302 static Lisp_Object mark_coding_system (Lisp_Object);
 303 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 304 static void finalize_coding_system (void *header, int for_disksave);
 305
 306 #ifdef MULE
 307 static const struct lrecord_description ccs_description_1[] = {
 308   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 309   { XD_END }
 310 };
 311
 312 static const struct struct_description ccs_description = {
 313   sizeof(charset_conversion_spec),
 314   ccs_description_1
 315 };
 316
 317 static const struct lrecord_description ccsd_description_1[] = {
 318   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 319   { XD_END }
 320 };
 321
 322 static const struct struct_description ccsd_description = {
 323   sizeof(charset_conversion_spec_dynarr),
 324   ccsd_description_1
 325 };
 326 #endif
 327
 328 static const struct lrecord_description coding_system_description[] = {
 329   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 330   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 331   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 332 #ifdef MULE
 333   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 334   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 335   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 336   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 337 #endif
 338   { XD_END }
 339 };
 340
 341 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 342                                mark_coding_system, print_coding_system,
 343                                finalize_coding_system,
 344                                0, 0, coding_system_description,
 345                                struct Lisp_Coding_System);
 346
 347 static Lisp_Object
 348 mark_coding_system (Lisp_Object obj)
 349 {
 350   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 351
 352   mark_object (CODING_SYSTEM_NAME (codesys));
 353   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 354   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 355   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 356   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 357   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 358
 359   switch (CODING_SYSTEM_TYPE (codesys))
 360     {
 361 #ifdef MULE
 362       int i;
 363     case CODESYS_ISO2022:
 364       for (i = 0; i < 4; i++)
 365         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 366       if (codesys->iso2022.input_conv)
 367         {
 368           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 369             {
 370               struct charset_conversion_spec *ccs =
 371                 Dynarr_atp (codesys->iso2022.input_conv, i);
 372               mark_object (ccs->from_charset);
 373               mark_object (ccs->to_charset);
 374             }
 375         }
 376       if (codesys->iso2022.output_conv)
 377         {
 378           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 379             {
 380               struct charset_conversion_spec *ccs =
 381                 Dynarr_atp (codesys->iso2022.output_conv, i);
 382               mark_object (ccs->from_charset);
 383               mark_object (ccs->to_charset);
 384             }
 385         }
 386       break;
 387
 388     case CODESYS_CCL:
 389       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 390       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 391       break;
 392 #endif /* MULE */
 393     default:
 394       break;
 395     }
 396
 397   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 398   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 399 }
 400
 401 static void
 402 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 403                      int escapeflag)
 404 {
 405   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 406   if (print_readably)
 407     error ("printing unreadable object #<coding_system 0x%x>",
 408            c->header.uid);
 409
 410   write_c_string ("#<coding_system ", printcharfun);
 411   print_internal (c->name, printcharfun, 1);
 412   write_c_string (">", printcharfun);
 413 }
 414
 415 static void
 416 finalize_coding_system (void *header, int for_disksave)
 417 {
 418   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 419   /* Since coding systems never go away, this function is not
 420      necessary.  But it would be necessary if we changed things
 421      so that coding systems could go away. */
 422   if (!for_disksave) /* see comment in lstream.c */
 423     {
 424       switch (CODING_SYSTEM_TYPE (c))
 425         {
 426 #ifdef MULE
 427         case CODESYS_ISO2022:
 428           if (c->iso2022.input_conv)
 429             {
 430               Dynarr_free (c->iso2022.input_conv);
 431               c->iso2022.input_conv = 0;
 432             }
 433           if (c->iso2022.output_conv)
 434             {
 435               Dynarr_free (c->iso2022.output_conv);
 436               c->iso2022.output_conv = 0;
 437             }
 438           break;
 439 #endif /* MULE */
 440         default:
 441           break;
 442         }
 443     }
 444 }
 445
 446 static enum eol_type
 447 symbol_to_eol_type (Lisp_Object symbol)
 448 {
 449   CHECK_SYMBOL (symbol);
 450   if (NILP (symbol))      return EOL_AUTODETECT;
 451   if (EQ (symbol, Qlf))   return EOL_LF;
 452   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 453   if (EQ (symbol, Qcr))   return EOL_CR;
 454
 455   signal_simple_error ("Unrecognized eol type", symbol);
 456   return EOL_AUTODETECT; /* not reached */
 457 }
 458
 459 static Lisp_Object
 460 eol_type_to_symbol (enum eol_type type)
 461 {
 462   switch (type)
 463     {
 464     default: abort ();
 465     case EOL_LF:         return Qlf;
 466     case EOL_CRLF:       return Qcrlf;
 467     case EOL_CR:         return Qcr;
 468     case EOL_AUTODETECT: return Qnil;
 469     }
 470 }
 471
 472 static void
 473 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 474 {
 475   Lisp_Object codesys_obj;
 476   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 477   char *codesys_name = (char *) alloca (len + 7);
 478   int mlen = -1;
 479   char *codesys_mnemonic=0;
 480
 481   Lisp_Object codesys_name_sym, sub_codesys_obj;
 482
 483   /* kludge */
 484
 485   XSETCODING_SYSTEM (codesys_obj, codesys);
 486
 487   memcpy (codesys_name,
 488           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 489
 490   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 491     {
 492       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 493       codesys_mnemonic = (char *) alloca (mlen + 7);
 494       memcpy (codesys_mnemonic,
 495               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 496     }
 497
 498 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 499   strcpy (codesys_name + len, "-" op_sys);                                      \
 500   if (mlen != -1)                                                               \
 501     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 502   codesys_name_sym = intern (codesys_name);                                     \
 503   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 504   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 505   if (mlen != -1)                                                               \
 506     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 507       build_string (codesys_mnemonic);                                          \
 508   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 509 } while (0)
 510
 511   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 512   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 513   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 514 }
 515
 516 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 517 Return t if OBJECT is a coding system.
 518 A coding system is an object that defines how text containing multiple
 519 character sets is encoded into a stream of (typically 8-bit) bytes.
 520 The coding system is used to decode the stream into a series of
 521 characters (which may be from multiple charsets) when the text is read
 522 from a file or process, and is used to encode the text back into the
 523 same format when it is written out to a file or process.
 524
 525 For example, many ISO2022-compliant coding systems (such as Compound
 526 Text, which is used for inter-client data under the X Window System)
 527 use escape sequences to switch between different charsets -- Japanese
 528 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 529 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 530 `make-coding-system' for more information.
 531
 532 Coding systems are normally identified using a symbol, and the
 533 symbol is accepted in place of the actual coding system object whenever
 534 a coding system is called for. (This is similar to how faces work.)
 535 */
 536        (object))
 537 {
 538   return CODING_SYSTEMP (object) ? Qt : Qnil;
 539 }
 540
 541 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 542 Retrieve the coding system of the given name.
 543
 544 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 545 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 546 If there is no such coding system, nil is returned.  Otherwise the
 547 associated coding system object is returned.
 548 */
 549        (coding_system_or_name))
 550 {
 551   if (CODING_SYSTEMP (coding_system_or_name))
 552     return coding_system_or_name;
 553
 554   if (NILP (coding_system_or_name))
 555     coding_system_or_name = Qbinary;
 556   else
 557     CHECK_SYMBOL (coding_system_or_name);
 558
 559   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 560 }
 561
 562 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 563 Retrieve the coding system of the given name.
 564 Same as `find-coding-system' except that if there is no such
 565 coding system, an error is signaled instead of returning nil.
 566 */
 567        (name))
 568 {
 569   Lisp_Object coding_system = Ffind_coding_system (name);
 570
 571   if (NILP (coding_system))
 572     signal_simple_error ("No such coding system", name);
 573   return coding_system;
 574 }
 575
 576 /* We store the coding systems in hash tables with the names as the key and the
 577    actual coding system object as the value.  Occasionally we need to use them
 578    in a list format.  These routines provide us with that. */
 579 struct coding_system_list_closure
 580 {
 581   Lisp_Object *coding_system_list;
 582 };
 583
 584 static int
 585 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 586                                   void *coding_system_list_closure)
 587 {
 588   /* This function can GC */
 589   struct coding_system_list_closure *cscl =
 590     (struct coding_system_list_closure *) coding_system_list_closure;
 591   Lisp_Object *coding_system_list = cscl->coding_system_list;
 592
 593   *coding_system_list = Fcons (key, *coding_system_list);
 594   return 0;
 595 }
 596
 597 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 598 Return a list of the names of all defined coding systems.
 599 */
 600        ())
 601 {
 602   Lisp_Object coding_system_list = Qnil;
 603   struct gcpro gcpro1;
 604   struct coding_system_list_closure coding_system_list_closure;
 605
 606   GCPRO1 (coding_system_list);
 607   coding_system_list_closure.coding_system_list = &coding_system_list;
 608   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 609                  &coding_system_list_closure);
 610   UNGCPRO;
 611
 612   return coding_system_list;
 613 }
 614
 615 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 616 Return the name of the given coding system.
 617 */
 618        (coding_system))
 619 {
 620   coding_system = Fget_coding_system (coding_system);
 621   return XCODING_SYSTEM_NAME (coding_system);
 622 }
 623
 624 static Lisp_Coding_System *
 625 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 626 {
 627   Lisp_Coding_System *codesys =
 628     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 629
 630   zero_lcrecord (codesys);
 631   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 632   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 633   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 634   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 635   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 636   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 637   CODING_SYSTEM_TYPE     (codesys) = type;
 638   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 639 #ifdef MULE
 640   if (type == CODESYS_ISO2022)
 641     {
 642       int i;
 643       for (i = 0; i < 4; i++)
 644         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 645     }
 646   else if (type == CODESYS_CCL)
 647     {
 648       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 649       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 650     }
 651 #endif /* MULE */
 652   CODING_SYSTEM_NAME (codesys) = name;
 653
 654   return codesys;
 655 }
 656
 657 #ifdef MULE
 658 /* Given a list of charset conversion specs as specified in a Lisp
 659    program, parse it into STORE_HERE. */
 660
 661 static void
 662 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 663                                 Lisp_Object spec_list)
 664 {
 665   Lisp_Object rest;
 666
 667   EXTERNAL_LIST_LOOP (rest, spec_list)
 668     {
 669       Lisp_Object car = XCAR (rest);
 670       Lisp_Object from, to;
 671       struct charset_conversion_spec spec;
 672
 673       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 674         signal_simple_error ("Invalid charset conversion spec", car);
 675       from = Fget_charset (XCAR (car));
 676       to = Fget_charset (XCAR (XCDR (car)));
 677       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 678            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 679         signal_simple_error_2
 680           ("Attempted conversion between different charset types",
 681            from, to);
 682       spec.from_charset = from;
 683       spec.to_charset = to;
 684
 685       Dynarr_add (store_here, spec);
 686     }
 687 }
 688
 689 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 690    specs, return the equivalent as the Lisp programmer would see it.
 691
 692    If LOAD_HERE is 0, return Qnil. */
 693
 694 static Lisp_Object
 695 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 696 {
 697   int i;
 698   Lisp_Object result;
 699
 700   if (!load_here)
 701     return Qnil;
 702   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 703     {
 704       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 705       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 706     }
 707
 708   return Fnreverse (result);
 709 }
 710
 711 #endif /* MULE */
 712
 713 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 714 Register symbol NAME as a coding system.
 715
 716 TYPE describes the conversion method used and should be one of
 717
 718 nil or 'undecided
 719      Automatic conversion.  XEmacs attempts to detect the coding system
 720      used in the file.
 721 'no-conversion
 722      No conversion.  Use this for binary files and such.  On output,
 723      graphic characters that are not in ASCII or Latin-1 will be
 724      replaced by a ?. (For a no-conversion-encoded buffer, these
 725      characters will only be present if you explicitly insert them.)
 726 'shift-jis
 727      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 728 'ucs-4
 729      ISO 10646 UCS-4 encoding.
 730 'utf-8
 731      ISO 10646 UTF-8 encoding.
 732 'iso2022
 733      Any ISO2022-compliant encoding.  Among other things, this includes
 734      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 735      standard Unix encoding for Japanese and other languages), and
 736      Compound Text (the encoding used in X11).  You can specify more
 737      specific information about the conversion with the FLAGS argument.
 738 'big5
 739      Big5 (the encoding commonly used for Taiwanese).
 740 'ccl
 741      The conversion is performed using a user-written pseudo-code
 742      program.  CCL (Code Conversion Language) is the name of this
 743      pseudo-code.
 744 'internal
 745      Write out or read in the raw contents of the memory representing
 746      the buffer's text.  This is primarily useful for debugging
 747      purposes, and is only enabled when XEmacs has been compiled with
 748      DEBUG_XEMACS defined (via the --debug configure option).
 749      WARNING: Reading in a file using 'internal conversion can result
 750      in an internal inconsistency in the memory representing a
 751      buffer's text, which will produce unpredictable results and may
 752      cause XEmacs to crash.  Under normal circumstances you should
 753      never use 'internal conversion.
 754
 755 DOC-STRING is a string describing the coding system.
 756
 757 PROPS is a property list, describing the specific nature of the
 758 character set.  Recognized properties are:
 759
 760 'mnemonic
 761      String to be displayed in the modeline when this coding system is
 762      active.
 763
 764 'eol-type
 765      End-of-line conversion to be used.  It should be one of
 766
 767         nil
 768                 Automatically detect the end-of-line type (LF, CRLF,
 769                 or CR).  Also generate subsidiary coding systems named
 770                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 771                 identical to this coding system but have an EOL-TYPE
 772                 value of 'lf, 'crlf, and 'cr, respectively.
 773         'lf
 774                 The end of a line is marked externally using ASCII LF.
 775                 Since this is also the way that XEmacs represents an
 776                 end-of-line internally, specifying this option results
 777                 in no end-of-line conversion.  This is the standard
 778                 format for Unix text files.
 779         'crlf
 780                 The end of a line is marked externally using ASCII
 781                 CRLF.  This is the standard format for MS-DOS text
 782                 files.
 783         'cr
 784                 The end of a line is marked externally using ASCII CR.
 785                 This is the standard format for Macintosh text files.
 786         t
 787                 Automatically detect the end-of-line type but do not
 788                 generate subsidiary coding systems.  (This value is
 789                 converted to nil when stored internally, and
 790                 `coding-system-property' will return nil.)
 791
 792 'disable-composition
 793      If non-nil, composition/decomposition for combining characters
 794      are disabled.
 795
 796 'post-read-conversion
 797      Function called after a file has been read in, to perform the
 798      decoding.  Called with two arguments, BEG and END, denoting
 799      a region of the current buffer to be decoded.
 800
 801 'pre-write-conversion
 802      Function called before a file is written out, to perform the
 803      encoding.  Called with two arguments, BEG and END, denoting
 804      a region of the current buffer to be encoded.
 805
 806
 807 The following additional properties are recognized if TYPE is 'iso2022:
 808
 809 'charset-g0
 810 'charset-g1
 811 'charset-g2
 812 'charset-g3
 813      The character set initially designated to the G0 - G3 registers.
 814      The value should be one of
 815
 816           -- A charset object (designate that character set)
 817           -- nil (do not ever use this register)
 818           -- t (no character set is initially designated to
 819                 the register, but may be later on; this automatically
 820                 sets the corresponding `force-g*-on-output' property)
 821
 822 'force-g0-on-output
 823 'force-g1-on-output
 824 'force-g2-on-output
 825 'force-g2-on-output
 826      If non-nil, send an explicit designation sequence on output before
 827      using the specified register.
 828
 829 'short
 830      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 831      "ESC $ B" on output in place of the full designation sequences
 832      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 833
 834 'no-ascii-eol
 835      If non-nil, don't designate ASCII to G0 at each end of line on output.
 836      Setting this to non-nil also suppresses other state-resetting that
 837      normally happens at the end of a line.
 838
 839 'no-ascii-cntl
 840      If non-nil, don't designate ASCII to G0 before control chars on output.
 841
 842 'seven
 843      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 844      environment.
 845
 846 'lock-shift
 847      If non-nil, use locking-shift (SO/SI) instead of single-shift
 848      or designation by escape sequence.
 849
 850 'no-iso6429
 851      If non-nil, don't use ISO6429's direction specification.
 852
 853 'escape-quoted
 854      If non-nil, literal control characters that are the same as
 855      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 856      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 857      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 858      so that they can be properly distinguished from an escape sequence.
 859      (Note that doing this results in a non-portable encoding.) This
 860      encoding flag is used for byte-compiled files.  Note that ESC
 861      is a good choice for a quoting character because there are no
 862      escape sequences whose second byte is a character from the Control-0
 863      or Control-1 character sets; this is explicitly disallowed by the
 864      ISO2022 standard.
 865
 866 'input-charset-conversion
 867      A list of conversion specifications, specifying conversion of
 868      characters in one charset to another when decoding is performed.
 869      Each specification is a list of two elements: the source charset,
 870      and the destination charset.
 871
 872 'output-charset-conversion
 873      A list of conversion specifications, specifying conversion of
 874      characters in one charset to another when encoding is performed.
 875      The form of each specification is the same as for
 876      'input-charset-conversion.
 877
 878
 879 The following additional properties are recognized (and required)
 880 if TYPE is 'ccl:
 881
 882 'decode
 883      CCL program used for decoding (converting to internal format).
 884
 885 'encode
 886      CCL program used for encoding (converting to external format).
 887 */
 888        (name, type, doc_string, props))
 889 {
 890   Lisp_Coding_System *codesys;
 891   Lisp_Object rest, key, value;
 892   enum coding_system_type ty;
 893   int need_to_setup_eol_systems = 1;
 894
 895   /* Convert type to constant */
 896   if (NILP (type) || EQ (type, Qundecided))
 897                                       { ty = CODESYS_AUTODETECT; }
 898 #ifdef MULE
 899   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 900   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 901   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 902   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 903   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 904   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 905 #endif
 906   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 907 #ifdef DEBUG_XEMACS
 908   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 909 #endif
 910   else
 911     signal_simple_error ("Invalid coding system type", type);
 912
 913   CHECK_SYMBOL (name);
 914
 915   codesys = allocate_coding_system (ty, name);
 916
 917   if (NILP (doc_string))
 918     doc_string = build_string ("");
 919   else
 920     CHECK_STRING (doc_string);
 921   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 922
 923   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 924     {
 925       if (EQ (key, Qmnemonic))
 926         {
 927           if (!NILP (value))
 928             CHECK_STRING (value);
 929           CODING_SYSTEM_MNEMONIC (codesys) = value;
 930         }
 931
 932       else if (EQ (key, Qeol_type))
 933         {
 934           need_to_setup_eol_systems = NILP (value);
 935           if (EQ (value, Qt))
 936             value = Qnil;
 937           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 938         }
 939
 940       else if (EQ (key, Qpost_read_conversion))
 941         CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 942       else if (EQ (key, Qpre_write_conversion))
 943         CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 944 #ifdef UTF2000
 945       else if (EQ (key, Qdisable_composition))
 946         CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 947 #endif
 948 #ifdef MULE
 949       else if (ty == CODESYS_ISO2022)
 950         {
 951 #define FROB_INITIAL_CHARSET(charset_num) \
 952   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 953     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 954
 955           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 956           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 957           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 958           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 959
 960 #define FROB_FORCE_CHARSET(charset_num) \
 961   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 962
 963           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 964           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 965           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 966           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 967
 968 #define FROB_BOOLEAN_PROPERTY(prop) \
 969   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 970
 971           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 972           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 973           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 974           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 975           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 976           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 977           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 978
 979           else if (EQ (key, Qinput_charset_conversion))
 980             {
 981               codesys->iso2022.input_conv =
 982                 Dynarr_new (charset_conversion_spec);
 983               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 984                                               value);
 985             }
 986           else if (EQ (key, Qoutput_charset_conversion))
 987             {
 988               codesys->iso2022.output_conv =
 989                 Dynarr_new (charset_conversion_spec);
 990               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 991                                               value);
 992             }
 993           else
 994             signal_simple_error ("Unrecognized property", key);
 995         }
 996       else if (EQ (type, Qccl))
 997         {
 998           if (EQ (key, Qdecode))
 999             {
1000               CHECK_VECTOR (value);
1001               CODING_SYSTEM_CCL_DECODE (codesys) = value;
1002             }
1003           else if (EQ (key, Qencode))
1004             {
1005               CHECK_VECTOR (value);
1006               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
1007             }
1008           else
1009             signal_simple_error ("Unrecognized property", key);
1010         }
1011 #endif /* MULE */
1012       else
1013         signal_simple_error ("Unrecognized property", key);
1014     }
1015
1016   if (need_to_setup_eol_systems)
1017     setup_eol_coding_systems (codesys);
1018
1019   {
1020     Lisp_Object codesys_obj;
1021     XSETCODING_SYSTEM (codesys_obj, codesys);
1022     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1023     return codesys_obj;
1024   }
1025 }
1026
1027 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1028 Copy OLD-CODING-SYSTEM to NEW-NAME.
1029 If NEW-NAME does not name an existing coding system, a new one will
1030 be created.
1031 */
1032        (old_coding_system, new_name))
1033 {
1034   Lisp_Object new_coding_system;
1035   old_coding_system = Fget_coding_system (old_coding_system);
1036   new_coding_system = Ffind_coding_system (new_name);
1037   if (NILP (new_coding_system))
1038     {
1039       XSETCODING_SYSTEM (new_coding_system,
1040                          allocate_coding_system
1041                          (XCODING_SYSTEM_TYPE (old_coding_system),
1042                           new_name));
1043       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1044     }
1045
1046   {
1047     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1048     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1049     memcpy (((char *) to  ) + sizeof (to->header),
1050             ((char *) from) + sizeof (from->header),
1051             sizeof (*from) - sizeof (from->header));
1052     to->name = new_name;
1053   }
1054   return new_coding_system;
1055 }
1056
1057 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1058 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1059 */
1060        (alias, coding_system))
1061 {
1062   CHECK_SYMBOL (alias);
1063   if (!NILP (Ffind_coding_system (alias)))
1064     signal_simple_error ("Symbol already names a coding system", alias);
1065   coding_system = Fget_coding_system (coding_system);
1066   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1067
1068   /* Set up aliases for subsidiaries. */
1069   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1070     {
1071       Lisp_Object str;
1072       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1073 #define FROB(type, name)                                                        \
1074       do {                                                                      \
1075         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1076         if (!NILP (subsidiary))                                                 \
1077           Fdefine_coding_system_alias                                           \
1078             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1079       } while (0)
1080       FROB (LF,   "-unix");
1081       FROB (CRLF, "-dos");
1082       FROB (CR,   "-mac");
1083 #undef FROB
1084     }
1085   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1086      but it doesn't look intentional, so I'd rather return something
1087      meaningful or nothing at all. */
1088   return Qnil;
1089 }
1090
1091 static Lisp_Object
1092 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1093 {
1094   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1095   Lisp_Object new_coding_system;
1096
1097   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1098     return coding_system;
1099
1100   switch (type)
1101     {
1102     case EOL_AUTODETECT: return coding_system;
1103     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1104     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1105     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1106     default:       abort ();
1107     }
1108
1109   return NILP (new_coding_system) ? coding_system : new_coding_system;
1110 }
1111
1112 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1113 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1114 */
1115        (coding_system, eol_type))
1116 {
1117   coding_system = Fget_coding_system (coding_system);
1118
1119   return subsidiary_coding_system (coding_system,
1120                                    symbol_to_eol_type (eol_type));
1121 }
1122
1123 \f
1124 /************************************************************************/
1125 /*                         Coding system accessors                      */
1126 /************************************************************************/
1127
1128 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1129 Return the doc string for CODING-SYSTEM.
1130 */
1131        (coding_system))
1132 {
1133   coding_system = Fget_coding_system (coding_system);
1134   return XCODING_SYSTEM_DOC_STRING (coding_system);
1135 }
1136
1137 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1138 Return the type of CODING-SYSTEM.
1139 */
1140        (coding_system))
1141 {
1142   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1143     {
1144     default: abort ();
1145     case CODESYS_AUTODETECT:    return Qundecided;
1146 #ifdef MULE
1147     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1148     case CODESYS_ISO2022:       return Qiso2022;
1149     case CODESYS_BIG5:          return Qbig5;
1150     case CODESYS_UCS4:          return Qucs4;
1151     case CODESYS_UTF8:          return Qutf8;
1152     case CODESYS_CCL:           return Qccl;
1153 #endif
1154     case CODESYS_NO_CONVERSION: return Qno_conversion;
1155 #ifdef DEBUG_XEMACS
1156     case CODESYS_INTERNAL:      return Qinternal;
1157 #endif
1158     }
1159 }
1160
1161 #ifdef MULE
1162 static
1163 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1164 {
1165   Lisp_Object cs
1166     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1167
1168   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1169 }
1170
1171 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1172 Return initial charset of CODING-SYSTEM designated to GNUM.
1173 GNUM allows 0 .. 3.
1174 */
1175        (coding_system, gnum))
1176 {
1177   coding_system = Fget_coding_system (coding_system);
1178   CHECK_INT (gnum);
1179
1180   return coding_system_charset (coding_system, XINT (gnum));
1181 }
1182 #endif /* MULE */
1183
1184 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1185 Return the PROP property of CODING-SYSTEM.
1186 */
1187        (coding_system, prop))
1188 {
1189   int i, ok = 0;
1190   enum coding_system_type type;
1191
1192   coding_system = Fget_coding_system (coding_system);
1193   CHECK_SYMBOL (prop);
1194   type = XCODING_SYSTEM_TYPE (coding_system);
1195
1196   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1197     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1198       {
1199         ok = 1;
1200         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1201           {
1202           case CODESYS_PROP_ALL_OK:
1203             break;
1204 #ifdef MULE
1205           case CODESYS_PROP_ISO2022:
1206             if (type != CODESYS_ISO2022)
1207               signal_simple_error
1208                 ("Property only valid in ISO2022 coding systems",
1209                  prop);
1210             break;
1211
1212           case CODESYS_PROP_CCL:
1213             if (type != CODESYS_CCL)
1214               signal_simple_error
1215                 ("Property only valid in CCL coding systems",
1216                  prop);
1217             break;
1218 #endif /* MULE */
1219           default:
1220             abort ();
1221           }
1222       }
1223
1224   if (!ok)
1225     signal_simple_error ("Unrecognized property", prop);
1226
1227   if (EQ (prop, Qname))
1228     return XCODING_SYSTEM_NAME (coding_system);
1229   else if (EQ (prop, Qtype))
1230     return Fcoding_system_type (coding_system);
1231   else if (EQ (prop, Qdoc_string))
1232     return XCODING_SYSTEM_DOC_STRING (coding_system);
1233   else if (EQ (prop, Qmnemonic))
1234     return XCODING_SYSTEM_MNEMONIC (coding_system);
1235   else if (EQ (prop, Qeol_type))
1236     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1237   else if (EQ (prop, Qeol_lf))
1238     return XCODING_SYSTEM_EOL_LF (coding_system);
1239   else if (EQ (prop, Qeol_crlf))
1240     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1241   else if (EQ (prop, Qeol_cr))
1242     return XCODING_SYSTEM_EOL_CR (coding_system);
1243   else if (EQ (prop, Qpost_read_conversion))
1244     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1245   else if (EQ (prop, Qpre_write_conversion))
1246     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1247 #ifdef MULE
1248   else if (type == CODESYS_ISO2022)
1249     {
1250       if (EQ (prop, Qcharset_g0))
1251         return coding_system_charset (coding_system, 0);
1252       else if (EQ (prop, Qcharset_g1))
1253         return coding_system_charset (coding_system, 1);
1254       else if (EQ (prop, Qcharset_g2))
1255         return coding_system_charset (coding_system, 2);
1256       else if (EQ (prop, Qcharset_g3))
1257         return coding_system_charset (coding_system, 3);
1258
1259 #define FORCE_CHARSET(charset_num) \
1260   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1261    (coding_system, charset_num) ? Qt : Qnil)
1262
1263       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1264       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1265       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1266       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1267
1268 #define LISP_BOOLEAN(prop) \
1269   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1270
1271       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1272       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1273       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1274       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1275       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1276       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1277       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1278
1279       else if (EQ (prop, Qinput_charset_conversion))
1280         return
1281           unparse_charset_conversion_specs
1282             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1283       else if (EQ (prop, Qoutput_charset_conversion))
1284         return
1285           unparse_charset_conversion_specs
1286             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1287       else
1288         abort ();
1289     }
1290   else if (type == CODESYS_CCL)
1291     {
1292       if (EQ (prop, Qdecode))
1293         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1294       else if (EQ (prop, Qencode))
1295         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1296       else
1297         abort ();
1298     }
1299 #endif /* MULE */
1300   else
1301     abort ();
1302
1303   return Qnil; /* not reached */
1304 }
1305
1306 \f
1307 /************************************************************************/
1308 /*                       Coding category functions                      */
1309 /************************************************************************/
1310
1311 static int
1312 decode_coding_category (Lisp_Object symbol)
1313 {
1314   int i;
1315
1316   CHECK_SYMBOL (symbol);
1317   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1318     if (EQ (coding_category_symbol[i], symbol))
1319       return i;
1320
1321   signal_simple_error ("Unrecognized coding category", symbol);
1322   return 0; /* not reached */
1323 }
1324
1325 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1326 Return a list of all recognized coding categories.
1327 */
1328        ())
1329 {
1330   int i;
1331   Lisp_Object list = Qnil;
1332
1333   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1334     list = Fcons (coding_category_symbol[i], list);
1335   return list;
1336 }
1337
1338 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1339 Change the priority order of the coding categories.
1340 LIST should be list of coding categories, in descending order of
1341 priority.  Unspecified coding categories will be lower in priority
1342 than all specified ones, in the same relative order they were in
1343 previously.
1344 */
1345        (list))
1346 {
1347   int category_to_priority[CODING_CATEGORY_LAST + 1];
1348   int i, j;
1349   Lisp_Object rest;
1350
1351   /* First generate a list that maps coding categories to priorities. */
1352
1353   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1354     category_to_priority[i] = -1;
1355
1356   /* Highest priority comes from the specified list. */
1357   i = 0;
1358   EXTERNAL_LIST_LOOP (rest, list)
1359     {
1360       int cat = decode_coding_category (XCAR (rest));
1361
1362       if (category_to_priority[cat] >= 0)
1363         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1364       category_to_priority[cat] = i++;
1365     }
1366
1367   /* Now go through the existing categories by priority to retrieve
1368      the categories not yet specified and preserve their priority
1369      order. */
1370   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1371     {
1372       int cat = fcd->coding_category_by_priority[j];
1373       if (category_to_priority[cat] < 0)
1374         category_to_priority[cat] = i++;
1375     }
1376
1377   /* Now we need to construct the inverse of the mapping we just
1378      constructed. */
1379
1380   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1381     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1382
1383   /* Phew!  That was confusing. */
1384   return Qnil;
1385 }
1386
1387 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1388 Return a list of coding categories in descending order of priority.
1389 */
1390        ())
1391 {
1392   int i;
1393   Lisp_Object list = Qnil;
1394
1395   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1396     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1397                   list);
1398   return list;
1399 }
1400
1401 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1402 Change the coding system associated with a coding category.
1403 */
1404        (coding_category, coding_system))
1405 {
1406   int cat = decode_coding_category (coding_category);
1407
1408   coding_system = Fget_coding_system (coding_system);
1409   fcd->coding_category_system[cat] = coding_system;
1410   return Qnil;
1411 }
1412
1413 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1414 Return the coding system associated with a coding category.
1415 */
1416        (coding_category))
1417 {
1418   int cat = decode_coding_category (coding_category);
1419   Lisp_Object sys = fcd->coding_category_system[cat];
1420
1421   if (!NILP (sys))
1422     return XCODING_SYSTEM_NAME (sys);
1423   return Qnil;
1424 }
1425
1426 \f
1427 /************************************************************************/
1428 /*                     Detecting the encoding of data                   */
1429 /************************************************************************/
1430
1431 struct detection_state
1432 {
1433   enum eol_type eol_type;
1434   int seen_non_ascii;
1435   int mask;
1436 #ifdef MULE
1437   struct
1438     {
1439       int mask;
1440       int in_second_byte;
1441     }
1442   big5;
1443
1444   struct
1445     {
1446       int mask;
1447       int in_second_byte;
1448     }
1449   shift_jis;
1450
1451   struct
1452     {
1453       int mask;
1454       int in_byte;
1455   }
1456   ucs4;
1457
1458   struct
1459     {
1460       int mask;
1461       int in_byte;
1462     }
1463   utf8;
1464
1465   struct
1466     {
1467       int mask;
1468       int initted;
1469       struct iso2022_decoder iso;
1470       unsigned int flags;
1471       int high_byte_count;
1472       unsigned int saw_single_shift:1;
1473     }
1474   iso2022;
1475 #endif
1476   struct
1477     {
1478       int seen_anything;
1479       int just_saw_cr;
1480     }
1481   eol;
1482 };
1483
1484 static int
1485 acceptable_control_char_p (int c)
1486 {
1487   switch (c)
1488     {
1489       /* Allow and ignore control characters that you might
1490          reasonably see in a text file */
1491     case '\r':
1492     case '\n':
1493     case '\t':
1494     case  7: /* bell */
1495     case  8: /* backspace */
1496     case 11: /* vertical tab */
1497     case 12: /* form feed */
1498     case 26: /* MS-DOS C-z junk */
1499     case 31: /* '^_' -- for info */
1500       return 1;
1501     default:
1502       return 0;
1503     }
1504 }
1505
1506 static int
1507 mask_has_at_most_one_bit_p (int mask)
1508 {
1509   /* Perhaps the only thing useful you learn from intensive Microsoft
1510      technical interviews */
1511   return (mask & (mask - 1)) == 0;
1512 }
1513
1514 static enum eol_type
1515 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1516                  unsigned int n)
1517 {
1518   int c;
1519
1520   while (n--)
1521     {
1522       c = *src++;
1523       if (c == '\n')
1524         {
1525           if (st->eol.just_saw_cr)
1526             return EOL_CRLF;
1527           else if (st->eol.seen_anything)
1528             return EOL_LF;
1529         }
1530       else if (st->eol.just_saw_cr)
1531         return EOL_CR;
1532       else if (c == '\r')
1533         st->eol.just_saw_cr = 1;
1534       else
1535         st->eol.just_saw_cr = 0;
1536       st->eol.seen_anything = 1;
1537     }
1538
1539   return EOL_AUTODETECT;
1540 }
1541
1542 /* Attempt to determine the encoding and EOL type of the given text.
1543    Before calling this function for the first type, you must initialize
1544    st->eol_type as appropriate and initialize st->mask to ~0.
1545
1546    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1547    not yet known.
1548
1549    st->mask holds the determined coding category mask, or ~0 if only
1550    ASCII has been seen so far.
1551
1552    Returns:
1553
1554    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1555         is present in st->mask
1556    1 == definitive answers are here for both st->eol_type and st->mask
1557 */
1558
1559 static int
1560 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1561                     unsigned int n, int just_do_eol)
1562 {
1563   int c;
1564
1565   if (st->eol_type == EOL_AUTODETECT)
1566     st->eol_type = detect_eol_type (st, src, n);
1567
1568   if (just_do_eol)
1569     return st->eol_type != EOL_AUTODETECT;
1570
1571   if (!st->seen_non_ascii)
1572     {
1573       for (; n; n--, src++)
1574         {
1575           c = *src;
1576           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1577             {
1578               st->seen_non_ascii = 1;
1579 #ifdef MULE
1580               st->shift_jis.mask = ~0;
1581               st->big5.mask = ~0;
1582               st->ucs4.mask = ~0;
1583               st->utf8.mask = ~0;
1584               st->iso2022.mask = ~0;
1585 #endif
1586               break;
1587             }
1588         }
1589     }
1590
1591   if (!n)
1592     return 0;
1593 #ifdef MULE
1594   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1595     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1596   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1597     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1598   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1599     st->big5.mask = detect_coding_big5 (st, src, n);
1600   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1601     st->utf8.mask = detect_coding_utf8 (st, src, n);
1602   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1603     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1604
1605   st->mask
1606     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1607     | st->utf8.mask | st->ucs4.mask;
1608 #endif
1609   {
1610     int retval = mask_has_at_most_one_bit_p (st->mask);
1611     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1612     return retval && st->eol_type != EOL_AUTODETECT;
1613   }
1614 }
1615
1616 static Lisp_Object
1617 coding_system_from_mask (int mask)
1618 {
1619   if (mask == ~0)
1620     {
1621       /* If the file was entirely or basically ASCII, use the
1622          default value of `buffer-file-coding-system'. */
1623       Lisp_Object retval =
1624         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1625       if (!NILP (retval))
1626         {
1627           retval = Ffind_coding_system (retval);
1628           if (NILP (retval))
1629             {
1630               warn_when_safe
1631                 (Qbad_variable, Qwarning,
1632                  "Invalid `default-buffer-file-coding-system', set to nil");
1633               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1634             }
1635         }
1636       if (NILP (retval))
1637         retval = Fget_coding_system (Qraw_text);
1638       return retval;
1639     }
1640   else
1641     {
1642       int i;
1643       int cat = -1;
1644 #ifdef MULE
1645       mask = postprocess_iso2022_mask (mask);
1646 #endif
1647       /* Look through the coding categories by priority and find
1648          the first one that is allowed. */
1649       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1650         {
1651           cat = fcd->coding_category_by_priority[i];
1652           if ((mask & (1 << cat)) &&
1653               !NILP (fcd->coding_category_system[cat]))
1654             break;
1655         }
1656       if (cat >= 0)
1657         return fcd->coding_category_system[cat];
1658       else
1659         return Fget_coding_system (Qraw_text);
1660     }
1661 }
1662
1663 /* Given a seekable read stream and potential coding system and EOL type
1664    as specified, do any autodetection that is called for.  If the
1665    coding system and/or EOL type are not `autodetect', they will be left
1666    alone; but this function will never return an autodetect coding system
1667    or EOL type.
1668
1669    This function does not automatically fetch subsidiary coding systems;
1670    that should be unnecessary with the explicit eol-type argument. */
1671
1672 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1673
1674 void
1675 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1676                               enum eol_type *eol_type_in_out)
1677 {
1678   struct detection_state decst;
1679
1680   if (*eol_type_in_out == EOL_AUTODETECT)
1681     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1682
1683   xzero (decst);
1684   decst.eol_type = *eol_type_in_out;
1685   decst.mask = ~0;
1686
1687   /* If autodetection is called for, do it now. */
1688   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1689       || *eol_type_in_out == EOL_AUTODETECT)
1690     {
1691       Extbyte buf[4096];
1692       Lisp_Object coding_system = Qnil;
1693       Extbyte *p;
1694       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1695       Extbyte *scan_end;
1696
1697       /* Look for initial "-*-"; mode line prefix */
1698       for (p = buf,
1699              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1700            p <= scan_end
1701              && *p != '\n'
1702              && *p != '\r';
1703            p++)
1704         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1705           {
1706             Extbyte *local_vars_beg = p + 3;
1707             /* Look for final "-*-"; mode line suffix */
1708             for (p = local_vars_beg,
1709                    scan_end = buf + nread - LENGTH ("-*-");
1710                  p <= scan_end
1711                    && *p != '\n'
1712                    && *p != '\r';
1713                  p++)
1714               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1715                 {
1716                   Extbyte *suffix = p;
1717                   /* Look for "coding:" */
1718                   for (p = local_vars_beg,
1719                          scan_end = suffix - LENGTH ("coding:?");
1720                        p <= scan_end;
1721                        p++)
1722                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1723                         && (p == local_vars_beg
1724                             || (*(p-1) == ' '  ||
1725                                 *(p-1) == '\t' ||
1726                                 *(p-1) == ';')))
1727                       {
1728                         Extbyte save;
1729                         int n;
1730                         p += LENGTH ("coding:");
1731                         while (*p == ' ' || *p == '\t') p++;
1732
1733                         /* Get coding system name */
1734                         save = *suffix; *suffix = '\0';
1735                         /* Characters valid in a MIME charset name (rfc 1521),
1736                            and in a Lisp symbol name. */
1737                         n = strspn ( (char *) p,
1738                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1739                                     "abcdefghijklmnopqrstuvwxyz"
1740                                     "0123456789"
1741                                     "!$%&*+-.^_{|}~");
1742                         *suffix = save;
1743                         if (n > 0)
1744                           {
1745                             save = p[n]; p[n] = '\0';
1746                             coding_system =
1747                               Ffind_coding_system (intern ((char *) p));
1748                             p[n] = save;
1749                           }
1750                         break;
1751                       }
1752                   break;
1753                 }
1754             break;
1755           }
1756
1757       if (NILP (coding_system))
1758         do
1759           {
1760             if (detect_coding_type (&decst, buf, nread,
1761                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1762                                     != CODESYS_AUTODETECT))
1763               break;
1764             nread = Lstream_read (stream, buf, sizeof (buf));
1765             if (nread == 0)
1766               break;
1767           }
1768         while (1);
1769
1770       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1771                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1772         do
1773           {
1774             if (detect_coding_type (&decst, buf, nread, 1))
1775               break;
1776             nread = Lstream_read (stream, buf, sizeof (buf));
1777             if (!nread)
1778               break;
1779           }
1780         while (1);
1781
1782       *eol_type_in_out = decst.eol_type;
1783       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1784         {
1785           if (NILP (coding_system))
1786             *codesys_in_out = coding_system_from_mask (decst.mask);
1787           else
1788             *codesys_in_out = coding_system;
1789         }
1790     }
1791
1792   /* If we absolutely can't determine the EOL type, just assume LF. */
1793   if (*eol_type_in_out == EOL_AUTODETECT)
1794     *eol_type_in_out = EOL_LF;
1795
1796   Lstream_rewind (stream);
1797 }
1798
1799 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1800 Detect coding system of the text in the region between START and END.
1801 Returned a list of possible coding systems ordered by priority.
1802 If only ASCII characters are found, it returns 'undecided or one of
1803 its subsidiary coding systems according to a detected end-of-line
1804 type.  Optional arg BUFFER defaults to the current buffer.
1805 */
1806        (start, end, buffer))
1807 {
1808   Lisp_Object val = Qnil;
1809   struct buffer *buf = decode_buffer (buffer, 0);
1810   Bufpos b, e;
1811   Lisp_Object instream, lb_instream;
1812   Lstream *istr, *lb_istr;
1813   struct detection_state decst;
1814   struct gcpro gcpro1, gcpro2;
1815
1816   get_buffer_range_char (buf, start, end, &b, &e, 0);
1817   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1818   lb_istr = XLSTREAM (lb_instream);
1819   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1820   istr = XLSTREAM (instream);
1821   GCPRO2 (instream, lb_instream);
1822   xzero (decst);
1823   decst.eol_type = EOL_AUTODETECT;
1824   decst.mask = ~0;
1825   while (1)
1826     {
1827       unsigned char random_buffer[4096];
1828       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1829
1830       if (!nread)
1831         break;
1832       if (detect_coding_type (&decst, random_buffer, nread, 0))
1833         break;
1834     }
1835
1836   if (decst.mask == ~0)
1837     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1838                                     decst.eol_type);
1839   else
1840     {
1841       int i;
1842
1843       val = Qnil;
1844 #ifdef MULE
1845       decst.mask = postprocess_iso2022_mask (decst.mask);
1846 #endif
1847       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1848         {
1849           int sys = fcd->coding_category_by_priority[i];
1850           if (decst.mask & (1 << sys))
1851             {
1852               Lisp_Object codesys = fcd->coding_category_system[sys];
1853               if (!NILP (codesys))
1854                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1855               val = Fcons (codesys, val);
1856             }
1857         }
1858     }
1859   Lstream_close (istr);
1860   UNGCPRO;
1861   Lstream_delete (istr);
1862   Lstream_delete (lb_istr);
1863   return val;
1864 }
1865
1866 \f
1867 /************************************************************************/
1868 /*           Converting to internal Mule format ("decoding")            */
1869 /************************************************************************/
1870
1871 /* A decoding stream is a stream used for decoding text (i.e.
1872    converting from some external format to internal format).
1873    The decoding-stream object keeps track of the actual coding
1874    stream, the stream that is at the other end, and data that
1875    needs to be persistent across the lifetime of the stream. */
1876
1877 /* Handle the EOL stuff related to just-read-in character C.
1878    EOL_TYPE is the EOL type of the coding stream.
1879    FLAGS is the current value of FLAGS in the coding stream, and may
1880    be modified by this macro.  (The macro only looks at the
1881    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1882    bytes are to be written.  You need to also define a local goto
1883    label "label_continue_loop" that is at the end of the main
1884    character-reading loop.
1885
1886    If C is a CR character, then this macro handles it entirely and
1887    jumps to label_continue_loop.  Otherwise, this macro does not add
1888    anything to DST, and continues normally.  You should continue
1889    processing C normally after this macro. */
1890
1891 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1892 do {                                                            \
1893   if (c == '\r')                                                \
1894     {                                                           \
1895       if (eol_type == EOL_CR)                                   \
1896         Dynarr_add (dst, '\n');                                 \
1897       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1898         Dynarr_add (dst, c);                                    \
1899       else                                                      \
1900         flags |= CODING_STATE_CR;                               \
1901       goto label_continue_loop;                                 \
1902     }                                                           \
1903   else if (flags & CODING_STATE_CR)                             \
1904     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1905       if (c != '\n')                                            \
1906         Dynarr_add (dst, '\r');                                 \
1907       flags &= ~CODING_STATE_CR;                                \
1908     }                                                           \
1909 } while (0)
1910
1911 /* C should be a binary character in the range 0 - 255; convert
1912    to internal format and add to Dynarr DST. */
1913
1914 #ifdef UTF2000
1915 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1916 do {                                            \
1917   if (BYTE_ASCII_P (c))                         \
1918     Dynarr_add (dst, c);                        \
1919   else                                          \
1920     {                                           \
1921       Dynarr_add (dst, (c >> 6) | 0xc0);        \
1922       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
1923     }                                           \
1924 } while (0)
1925
1926 INLINE void
1927 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
1928 {
1929   if ( c <= 0x7f )
1930     {
1931       Dynarr_add (dst, c);
1932     }
1933   else if ( c <= 0x7ff )
1934     {
1935       Dynarr_add (dst, (c >> 6) | 0xc0);
1936       Dynarr_add (dst, (c & 0x3f) | 0x80);
1937     }
1938   else if ( c <= 0xffff )
1939     {
1940       Dynarr_add (dst,  (c >> 12) | 0xe0);
1941       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1942       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1943     }
1944   else if ( c <= 0x1fffff )
1945     {
1946       Dynarr_add (dst,  (c >> 18) | 0xf0);
1947       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1948       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1949       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1950     }
1951   else if ( c <= 0x3ffffff )
1952     {
1953       Dynarr_add (dst,  (c >> 24) | 0xf8);
1954       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1955       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1956       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1957       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1958     }
1959   else
1960     {
1961       Dynarr_add (dst,  (c >> 30) | 0xfc);
1962       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
1963       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
1964       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
1965       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
1966       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
1967     }
1968 }
1969 #else
1970 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1971 do {                                            \
1972   if (BYTE_ASCII_P (c))                         \
1973     Dynarr_add (dst, c);                        \
1974   else if (BYTE_C1_P (c))                       \
1975     {                                           \
1976       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1977       Dynarr_add (dst, c + 0x20);               \
1978     }                                           \
1979   else                                          \
1980     {                                           \
1981       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1982       Dynarr_add (dst, c);                      \
1983     }                                           \
1984 } while (0)
1985 #endif
1986
1987 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1988 do {                                    \
1989   if (ch)                               \
1990     {                                   \
1991       DECODE_ADD_BINARY_CHAR (ch, dst); \
1992       ch = 0;                           \
1993     }                                   \
1994 } while (0)
1995
1996 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1997 do {                                    \
1998   if (flags & CODING_STATE_END)         \
1999     {                                   \
2000       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2001       if (flags & CODING_STATE_CR)      \
2002         Dynarr_add (dst, '\r');         \
2003     }                                   \
2004 } while (0)
2005
2006 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2007
2008 struct decoding_stream
2009 {
2010   /* Coding system that governs the conversion. */
2011   Lisp_Coding_System *codesys;
2012
2013   /* Stream that we read the encoded data from or
2014      write the decoded data to. */
2015   Lstream *other_end;
2016
2017   /* If we are reading, then we can return only a fixed amount of
2018      data, so if the conversion resulted in too much data, we store it
2019      here for retrieval the next time around. */
2020   unsigned_char_dynarr *runoff;
2021
2022   /* FLAGS holds flags indicating the current state of the decoding.
2023      Some of these flags are dependent on the coding system. */
2024   unsigned int flags;
2025
2026   /* CH holds a partially built-up character.  Since we only deal
2027      with one- and two-byte characters at the moment, we only use
2028      this to store the first byte of a two-byte character. */
2029   unsigned int ch;
2030
2031   /* EOL_TYPE specifies the type of end-of-line conversion that
2032      currently applies.  We need to keep this separate from the
2033      EOL type stored in CODESYS because the latter might indicate
2034      automatic EOL-type detection while the former will always
2035      indicate a particular EOL type. */
2036   enum eol_type eol_type;
2037 #ifdef MULE
2038   /* Additional ISO2022 information.  We define the structure above
2039      because it's also needed by the detection routines. */
2040   struct iso2022_decoder iso2022;
2041
2042   /* Additional information (the state of the running CCL program)
2043      used by the CCL decoder. */
2044   struct ccl_program ccl;
2045
2046   /* counter for UTF-8 or UCS-4 */
2047   unsigned char counter;
2048 #endif
2049 #ifdef UTF2000
2050   unsigned combined_char_count;
2051   Emchar combined_chars[16];
2052   Lisp_Object combining_table;
2053 #endif
2054   struct detection_state decst;
2055 };
2056
2057 #ifdef UTF2000
2058 extern Lisp_Object Vcharacter_composition_table;
2059
2060 INLINE void
2061 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2062 {
2063   unsigned i;
2064
2065   for (i = 0; i < str->combined_char_count; i++)
2066     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2067   str->combined_char_count = 0;
2068   str->combining_table = Qnil;
2069 }
2070
2071 void
2072 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2073                  Emchar character, unsigned_char_dynarr* dst)
2074 {
2075   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2076     DECODE_ADD_UCS_CHAR (character, dst);
2077   else if (!CHAR_CODE_TABLE_P (str->combining_table))
2078     {
2079       Lisp_Object ret
2080         = get_char_code_table (character, Vcharacter_composition_table);
2081
2082       if (NILP (ret))
2083         DECODE_ADD_UCS_CHAR (character, dst);
2084       else
2085         {
2086           str->combined_chars[0] = character;
2087           str->combined_char_count = 1;
2088           str->combining_table = ret;
2089         }
2090     }
2091   else
2092     {
2093       Lisp_Object ret
2094         = get_char_code_table (character, str->combining_table);
2095
2096       if (CHARP (ret))
2097         {
2098           Emchar char2 = XCHARVAL (ret);
2099           ret = get_char_code_table (char2, Vcharacter_composition_table);
2100           if (NILP (ret))
2101             {
2102               DECODE_ADD_UCS_CHAR (char2, dst);
2103               str->combined_char_count = 0;
2104               str->combining_table = Qnil;
2105             }
2106           else
2107             {
2108               str->combined_chars[0] = char2;
2109               str->combined_char_count = 1;
2110               str->combining_table = ret;
2111             }
2112         }
2113       else if (CHAR_CODE_TABLE_P (ret))
2114         {
2115           str->combined_chars[str->combined_char_count++] = character;
2116           str->combining_table = ret;
2117         }
2118       else
2119         {
2120           COMPOSE_FLUSH_CHARS (str, dst);
2121           DECODE_ADD_UCS_CHAR (character, dst);
2122         }
2123     }
2124 }
2125 #else /* not UTF2000 */
2126 #define COMPOSE_FLUSH_CHARS(str, dst)
2127 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2128 #endif /* UTF2000 */
2129
2130 static ssize_t decoding_reader (Lstream *stream,
2131                                 unsigned char *data, size_t size);
2132 static ssize_t decoding_writer (Lstream *stream,
2133                                 CONST unsigned char *data, size_t size);
2134 static int decoding_rewinder   (Lstream *stream);
2135 static int decoding_seekable_p (Lstream *stream);
2136 static int decoding_flusher    (Lstream *stream);
2137 static int decoding_closer     (Lstream *stream);
2138
2139 static Lisp_Object decoding_marker (Lisp_Object stream);
2140
2141 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2142                                sizeof (struct decoding_stream));
2143
2144 static Lisp_Object
2145 decoding_marker (Lisp_Object stream)
2146 {
2147   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2148   Lisp_Object str_obj;
2149
2150   /* We do not need to mark the coding systems or charsets stored
2151      within the stream because they are stored in a global list
2152      and automatically marked. */
2153
2154   XSETLSTREAM (str_obj, str);
2155   mark_object (str_obj);
2156   if (str->imp->marker)
2157     return (str->imp->marker) (str_obj);
2158   else
2159     return Qnil;
2160 }
2161
2162 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2163    so we read data from the other end, decode it, and store it into DATA. */
2164
2165 static ssize_t
2166 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2167 {
2168   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2169   unsigned char *orig_data = data;
2170   ssize_t read_size;
2171   int error_occurred = 0;
2172
2173   /* We need to interface to mule_decode(), which expects to take some
2174      amount of data and store the result into a Dynarr.  We have
2175      mule_decode() store into str->runoff, and take data from there
2176      as necessary. */
2177
2178   /* We loop until we have enough data, reading chunks from the other
2179      end and decoding it. */
2180   while (1)
2181     {
2182       /* Take data from the runoff if we can.  Make sure to take at
2183          most SIZE bytes, and delete the data from the runoff. */
2184       if (Dynarr_length (str->runoff) > 0)
2185         {
2186           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2187           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2188           Dynarr_delete_many (str->runoff, 0, chunk);
2189           data += chunk;
2190           size -= chunk;
2191         }
2192
2193       if (size == 0)
2194         break; /* No more room for data */
2195
2196       if (str->flags & CODING_STATE_END)
2197         /* This means that on the previous iteration, we hit the EOF on
2198            the other end.  We loop once more so that mule_decode() can
2199            output any final stuff it may be holding, or any "go back
2200            to a sane state" escape sequences. (This latter makes sense
2201            during encoding.) */
2202         break;
2203
2204       /* Exhausted the runoff, so get some more.  DATA has at least
2205          SIZE bytes left of storage in it, so it's OK to read directly
2206          into it.  (We'll be overwriting above, after we've decoded it
2207          into the runoff.) */
2208       read_size = Lstream_read (str->other_end, data, size);
2209       if (read_size < 0)
2210         {
2211           error_occurred = 1;
2212           break;
2213         }
2214       if (read_size == 0)
2215         /* There might be some more end data produced in the translation.
2216            See the comment above. */
2217         str->flags |= CODING_STATE_END;
2218       mule_decode (stream, data, str->runoff, read_size);
2219     }
2220
2221   if (data - orig_data == 0)
2222     return error_occurred ? -1 : 0;
2223   else
2224     return data - orig_data;
2225 }
2226
2227 static ssize_t
2228 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2229 {
2230   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2231   ssize_t retval;
2232
2233   /* Decode all our data into the runoff, and then attempt to write
2234      it all out to the other end.  Remove whatever chunk we succeeded
2235      in writing. */
2236   mule_decode (stream, data, str->runoff, size);
2237   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2238                           Dynarr_length (str->runoff));
2239   if (retval > 0)
2240     Dynarr_delete_many (str->runoff, 0, retval);
2241   /* Do NOT return retval.  The return value indicates how much
2242      of the incoming data was written, not how many bytes were
2243      written. */
2244   return size;
2245 }
2246
2247 static void
2248 reset_decoding_stream (struct decoding_stream *str)
2249 {
2250 #ifdef MULE
2251   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2252     {
2253       Lisp_Object coding_system;
2254       XSETCODING_SYSTEM (coding_system, str->codesys);
2255       reset_iso2022 (coding_system, &str->iso2022);
2256     }
2257   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2258     {
2259       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2260     }
2261   str->counter = 0;
2262 #endif /* MULE */
2263 #ifdef UTF2000
2264   str->combined_char_count = 0;
2265   str->combining_table = Qnil;
2266 #endif
2267   str->flags = str->ch = 0;
2268 }
2269
2270 static int
2271 decoding_rewinder (Lstream *stream)
2272 {
2273   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2274   reset_decoding_stream (str);
2275   Dynarr_reset (str->runoff);
2276   return Lstream_rewind (str->other_end);
2277 }
2278
2279 static int
2280 decoding_seekable_p (Lstream *stream)
2281 {
2282   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2283   return Lstream_seekable_p (str->other_end);
2284 }
2285
2286 static int
2287 decoding_flusher (Lstream *stream)
2288 {
2289   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2290   return Lstream_flush (str->other_end);
2291 }
2292
2293 static int
2294 decoding_closer (Lstream *stream)
2295 {
2296   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2297   if (stream->flags & LSTREAM_FL_WRITE)
2298     {
2299       str->flags |= CODING_STATE_END;
2300       decoding_writer (stream, 0, 0);
2301     }
2302   Dynarr_free (str->runoff);
2303 #ifdef MULE
2304 #ifdef ENABLE_COMPOSITE_CHARS
2305   if (str->iso2022.composite_chars)
2306     Dynarr_free (str->iso2022.composite_chars);
2307 #endif
2308 #endif
2309   return Lstream_close (str->other_end);
2310 }
2311
2312 Lisp_Object
2313 decoding_stream_coding_system (Lstream *stream)
2314 {
2315   Lisp_Object coding_system;
2316   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2317
2318   XSETCODING_SYSTEM (coding_system, str->codesys);
2319   return subsidiary_coding_system (coding_system, str->eol_type);
2320 }
2321
2322 void
2323 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2324 {
2325   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2326   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2327   str->codesys = cs;
2328   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2329     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2330   reset_decoding_stream (str);
2331 }
2332
2333 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2334    stream for writing, no automatic code detection will be performed.
2335    The reason for this is that automatic code detection requires a
2336    seekable input.  Things will also fail if you open a decoding
2337    stream for reading using a non-fully-specified coding system and
2338    a non-seekable input stream. */
2339
2340 static Lisp_Object
2341 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2342                         CONST char *mode)
2343 {
2344   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2345   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2346   Lisp_Object obj;
2347
2348   xzero (*str);
2349   str->other_end = stream;
2350   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2351   str->eol_type = EOL_AUTODETECT;
2352   if (!strcmp (mode, "r")
2353       && Lstream_seekable_p (stream))
2354     /* We can determine the coding system now. */
2355     determine_real_coding_system (stream, &codesys, &str->eol_type);
2356   set_decoding_stream_coding_system (lstr, codesys);
2357   str->decst.eol_type = str->eol_type;
2358   str->decst.mask = ~0;
2359   XSETLSTREAM (obj, lstr);
2360   return obj;
2361 }
2362
2363 Lisp_Object
2364 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2365 {
2366   return make_decoding_stream_1 (stream, codesys, "r");
2367 }
2368
2369 Lisp_Object
2370 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2371 {
2372   return make_decoding_stream_1 (stream, codesys, "w");
2373 }
2374
2375 /* Note: the decode_coding_* functions all take the same
2376    arguments as mule_decode(), which is to say some SRC data of
2377    size N, which is to be stored into dynamic array DST.
2378    DECODING is the stream within which the decoding is
2379    taking place, but no data is actually read from or
2380    written to that stream; that is handled in decoding_reader()
2381    or decoding_writer().  This allows the same functions to
2382    be used for both reading and writing. */
2383
2384 static void
2385 mule_decode (Lstream *decoding, CONST unsigned char *src,
2386              unsigned_char_dynarr *dst, unsigned int n)
2387 {
2388   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2389
2390   /* If necessary, do encoding-detection now.  We do this when
2391      we're a writing stream or a non-seekable reading stream,
2392      meaning that we can't just process the whole input,
2393      rewind, and start over. */
2394
2395   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2396       str->eol_type == EOL_AUTODETECT)
2397     {
2398       Lisp_Object codesys;
2399
2400       XSETCODING_SYSTEM (codesys, str->codesys);
2401       detect_coding_type (&str->decst, src, n,
2402                           CODING_SYSTEM_TYPE (str->codesys) !=
2403                           CODESYS_AUTODETECT);
2404       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2405           str->decst.mask != ~0)
2406         /* #### This is cheesy.  What we really ought to do is
2407            buffer up a certain amount of data so as to get a
2408            less random result. */
2409         codesys = coding_system_from_mask (str->decst.mask);
2410       str->eol_type = str->decst.eol_type;
2411       if (XCODING_SYSTEM (codesys) != str->codesys)
2412         {
2413           /* Preserve the CODING_STATE_END flag in case it was set.
2414              If we erase it, bad things might happen. */
2415           int was_end = str->flags & CODING_STATE_END;
2416           set_decoding_stream_coding_system (decoding, codesys);
2417           if (was_end)
2418             str->flags |= CODING_STATE_END;
2419         }
2420     }
2421
2422   switch (CODING_SYSTEM_TYPE (str->codesys))
2423     {
2424 #ifdef DEBUG_XEMACS
2425     case CODESYS_INTERNAL:
2426       Dynarr_add_many (dst, src, n);
2427       break;
2428 #endif
2429     case CODESYS_AUTODETECT:
2430       /* If we got this far and still haven't decided on the coding
2431          system, then do no conversion. */
2432     case CODESYS_NO_CONVERSION:
2433       decode_coding_no_conversion (decoding, src, dst, n);
2434       break;
2435 #ifdef MULE
2436     case CODESYS_SHIFT_JIS:
2437       decode_coding_sjis (decoding, src, dst, n);
2438       break;
2439     case CODESYS_BIG5:
2440       decode_coding_big5 (decoding, src, dst, n);
2441       break;
2442     case CODESYS_UCS4:
2443       decode_coding_ucs4 (decoding, src, dst, n);
2444       break;
2445     case CODESYS_UTF8:
2446       decode_coding_utf8 (decoding, src, dst, n);
2447       break;
2448     case CODESYS_CCL:
2449       str->ccl.last_block = str->flags & CODING_STATE_END;
2450       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2451       break;
2452     case CODESYS_ISO2022:
2453       decode_coding_iso2022 (decoding, src, dst, n);
2454       break;
2455 #endif /* MULE */
2456     default:
2457       abort ();
2458     }
2459 }
2460
2461 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2462 Decode the text between START and END which is encoded in CODING-SYSTEM.
2463 This is useful if you've read in encoded text from a file without decoding
2464 it (e.g. you read in a JIS-formatted file but used the `binary' or
2465 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2466 Return length of decoded text.
2467 BUFFER defaults to the current buffer if unspecified.
2468 */
2469        (start, end, coding_system, buffer))
2470 {
2471   Bufpos b, e;
2472   struct buffer *buf = decode_buffer (buffer, 0);
2473   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2474   Lstream *istr, *ostr;
2475   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2476
2477   get_buffer_range_char (buf, start, end, &b, &e, 0);
2478
2479   barf_if_buffer_read_only (buf, b, e);
2480
2481   coding_system = Fget_coding_system (coding_system);
2482   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2483   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2484   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2485                                               coding_system);
2486   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2487                                            Fget_coding_system (Qbinary));
2488   istr = XLSTREAM (instream);
2489   ostr = XLSTREAM (outstream);
2490   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2491
2492   /* The chain of streams looks like this:
2493
2494      [BUFFER] <----- send through
2495                      ------> [ENCODE AS BINARY]
2496                              ------> [DECODE AS SPECIFIED]
2497                                      ------> [BUFFER]
2498    */
2499
2500   while (1)
2501     {
2502       char tempbuf[1024]; /* some random amount */
2503       Bufpos newpos, even_newer_pos;
2504       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2505       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2506
2507       if (!size_in_bytes)
2508         break;
2509       newpos = lisp_buffer_stream_startpos (istr);
2510       Lstream_write (ostr, tempbuf, size_in_bytes);
2511       even_newer_pos = lisp_buffer_stream_startpos (istr);
2512       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2513                            even_newer_pos, 0);
2514     }
2515   Lstream_close (istr);
2516   Lstream_close (ostr);
2517   UNGCPRO;
2518   Lstream_delete (istr);
2519   Lstream_delete (ostr);
2520   Lstream_delete (XLSTREAM (de_outstream));
2521   Lstream_delete (XLSTREAM (lb_outstream));
2522   return Qnil;
2523 }
2524
2525 \f
2526 /************************************************************************/
2527 /*           Converting to an external encoding ("encoding")            */
2528 /************************************************************************/
2529
2530 /* An encoding stream is an output stream.  When you create the
2531    stream, you specify the coding system that governs the encoding
2532    and another stream that the resulting encoded data is to be
2533    sent to, and then start sending data to it. */
2534
2535 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2536
2537 struct encoding_stream
2538 {
2539   /* Coding system that governs the conversion. */
2540   Lisp_Coding_System *codesys;
2541
2542   /* Stream that we read the encoded data from or
2543      write the decoded data to. */
2544   Lstream *other_end;
2545
2546   /* If we are reading, then we can return only a fixed amount of
2547      data, so if the conversion resulted in too much data, we store it
2548      here for retrieval the next time around. */
2549   unsigned_char_dynarr *runoff;
2550
2551   /* FLAGS holds flags indicating the current state of the encoding.
2552      Some of these flags are dependent on the coding system. */
2553   unsigned int flags;
2554
2555   /* CH holds a partially built-up character.  Since we only deal
2556      with one- and two-byte characters at the moment, we only use
2557      this to store the first byte of a two-byte character. */
2558   unsigned int ch;
2559 #ifdef MULE
2560   /* Additional information used by the ISO2022 encoder. */
2561   struct
2562     {
2563       /* CHARSET holds the character sets currently assigned to the G0
2564          through G3 registers.  It is initialized from the array
2565          INITIAL_CHARSET in CODESYS. */
2566       Lisp_Object charset[4];
2567
2568       /* Which registers are currently invoked into the left (GL) and
2569          right (GR) halves of the 8-bit encoding space? */
2570       int register_left, register_right;
2571
2572       /* Whether we need to explicitly designate the charset in the
2573          G? register before using it.  It is initialized from the
2574          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2575       unsigned char force_charset_on_output[4];
2576
2577       /* Other state variables that need to be preserved across
2578          invocations. */
2579       Lisp_Object current_charset;
2580       int current_half;
2581       int current_char_boundary;
2582     } iso2022;
2583
2584   void (*encode_char) (struct encoding_stream *str, Emchar c,
2585                        unsigned_char_dynarr *dst, unsigned int *flags);
2586   void (*finish) (struct encoding_stream *str,
2587                   unsigned_char_dynarr *dst, unsigned int *flags);
2588
2589   /* Additional information (the state of the running CCL program)
2590      used by the CCL encoder. */
2591   struct ccl_program ccl;
2592 #endif /* MULE */
2593 };
2594
2595 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2596 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2597                                 size_t size);
2598 static int encoding_rewinder   (Lstream *stream);
2599 static int encoding_seekable_p (Lstream *stream);
2600 static int encoding_flusher    (Lstream *stream);
2601 static int encoding_closer     (Lstream *stream);
2602
2603 static Lisp_Object encoding_marker (Lisp_Object stream);
2604
2605 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2606                                sizeof (struct encoding_stream));
2607
2608 static Lisp_Object
2609 encoding_marker (Lisp_Object stream)
2610 {
2611   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2612   Lisp_Object str_obj;
2613
2614   /* We do not need to mark the coding systems or charsets stored
2615      within the stream because they are stored in a global list
2616      and automatically marked. */
2617
2618   XSETLSTREAM (str_obj, str);
2619   mark_object (str_obj);
2620   if (str->imp->marker)
2621     return (str->imp->marker) (str_obj);
2622   else
2623     return Qnil;
2624 }
2625
2626 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2627    so we read data from the other end, encode it, and store it into DATA. */
2628
2629 static ssize_t
2630 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2631 {
2632   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2633   unsigned char *orig_data = data;
2634   ssize_t read_size;
2635   int error_occurred = 0;
2636
2637   /* We need to interface to mule_encode(), which expects to take some
2638      amount of data and store the result into a Dynarr.  We have
2639      mule_encode() store into str->runoff, and take data from there
2640      as necessary. */
2641
2642   /* We loop until we have enough data, reading chunks from the other
2643      end and encoding it. */
2644   while (1)
2645     {
2646       /* Take data from the runoff if we can.  Make sure to take at
2647          most SIZE bytes, and delete the data from the runoff. */
2648       if (Dynarr_length (str->runoff) > 0)
2649         {
2650           int chunk = min ((int) size, Dynarr_length (str->runoff));
2651           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2652           Dynarr_delete_many (str->runoff, 0, chunk);
2653           data += chunk;
2654           size -= chunk;
2655         }
2656
2657       if (size == 0)
2658         break; /* No more room for data */
2659
2660       if (str->flags & CODING_STATE_END)
2661         /* This means that on the previous iteration, we hit the EOF on
2662            the other end.  We loop once more so that mule_encode() can
2663            output any final stuff it may be holding, or any "go back
2664            to a sane state" escape sequences. (This latter makes sense
2665            during encoding.) */
2666         break;
2667
2668       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2669          left of storage in it, so it's OK to read directly into it.
2670          (We'll be overwriting above, after we've encoded it into the
2671          runoff.) */
2672       read_size = Lstream_read (str->other_end, data, size);
2673       if (read_size < 0)
2674         {
2675           error_occurred = 1;
2676           break;
2677         }
2678       if (read_size == 0)
2679         /* There might be some more end data produced in the translation.
2680            See the comment above. */
2681         str->flags |= CODING_STATE_END;
2682       mule_encode (stream, data, str->runoff, read_size);
2683     }
2684
2685   if (data == orig_data)
2686     return error_occurred ? -1 : 0;
2687   else
2688     return data - orig_data;
2689 }
2690
2691 static ssize_t
2692 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2693 {
2694   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2695   ssize_t retval;
2696
2697   /* Encode all our data into the runoff, and then attempt to write
2698      it all out to the other end.  Remove whatever chunk we succeeded
2699      in writing. */
2700   mule_encode (stream, data, str->runoff, size);
2701   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2702                           Dynarr_length (str->runoff));
2703   if (retval > 0)
2704     Dynarr_delete_many (str->runoff, 0, retval);
2705   /* Do NOT return retval.  The return value indicates how much
2706      of the incoming data was written, not how many bytes were
2707      written. */
2708   return size;
2709 }
2710
2711 static void
2712 reset_encoding_stream (struct encoding_stream *str)
2713 {
2714 #ifdef MULE
2715   switch (CODING_SYSTEM_TYPE (str->codesys))
2716     {
2717     case CODESYS_ISO2022:
2718       {
2719         int i;
2720
2721         str->encode_char = &char_encode_iso2022;
2722         str->finish = &char_finish_iso2022;
2723         for (i = 0; i < 4; i++)
2724           {
2725             str->iso2022.charset[i] =
2726               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2727             str->iso2022.force_charset_on_output[i] =
2728               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2729           }
2730         str->iso2022.register_left = 0;
2731         str->iso2022.register_right = 1;
2732         str->iso2022.current_charset = Qnil;
2733         str->iso2022.current_half = 0;
2734         break;
2735       }
2736     case CODESYS_CCL:
2737       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2738       break;
2739     case CODESYS_UTF8:
2740       str->encode_char = &char_encode_utf8;
2741       str->finish = &char_finish_utf8;
2742       break;
2743     case CODESYS_UCS4:
2744       str->encode_char = &char_encode_ucs4;
2745       str->finish = &char_finish_ucs4;
2746       break;
2747     case CODESYS_SHIFT_JIS:
2748       str->encode_char = &char_encode_shift_jis;
2749       str->finish = &char_finish_shift_jis;
2750       break;
2751     default:
2752       break;
2753     }
2754 #endif /* MULE */
2755   str->iso2022.current_char_boundary = 0;
2756   str->flags = str->ch = 0;
2757 }
2758
2759 static int
2760 encoding_rewinder (Lstream *stream)
2761 {
2762   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2763   reset_encoding_stream (str);
2764   Dynarr_reset (str->runoff);
2765   return Lstream_rewind (str->other_end);
2766 }
2767
2768 static int
2769 encoding_seekable_p (Lstream *stream)
2770 {
2771   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2772   return Lstream_seekable_p (str->other_end);
2773 }
2774
2775 static int
2776 encoding_flusher (Lstream *stream)
2777 {
2778   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2779   return Lstream_flush (str->other_end);
2780 }
2781
2782 static int
2783 encoding_closer (Lstream *stream)
2784 {
2785   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2786   if (stream->flags & LSTREAM_FL_WRITE)
2787     {
2788       str->flags |= CODING_STATE_END;
2789       encoding_writer (stream, 0, 0);
2790     }
2791   Dynarr_free (str->runoff);
2792   return Lstream_close (str->other_end);
2793 }
2794
2795 Lisp_Object
2796 encoding_stream_coding_system (Lstream *stream)
2797 {
2798   Lisp_Object coding_system;
2799   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2800
2801   XSETCODING_SYSTEM (coding_system, str->codesys);
2802   return coding_system;
2803 }
2804
2805 void
2806 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2807 {
2808   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2809   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2810   str->codesys = cs;
2811   reset_encoding_stream (str);
2812 }
2813
2814 static Lisp_Object
2815 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2816                         CONST char *mode)
2817 {
2818   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2819   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2820   Lisp_Object obj;
2821
2822   xzero (*str);
2823   str->runoff = Dynarr_new (unsigned_char);
2824   str->other_end = stream;
2825   set_encoding_stream_coding_system (lstr, codesys);
2826   XSETLSTREAM (obj, lstr);
2827   return obj;
2828 }
2829
2830 Lisp_Object
2831 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2832 {
2833   return make_encoding_stream_1 (stream, codesys, "r");
2834 }
2835
2836 Lisp_Object
2837 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2838 {
2839   return make_encoding_stream_1 (stream, codesys, "w");
2840 }
2841
2842 /* Convert N bytes of internally-formatted data stored in SRC to an
2843    external format, according to the encoding stream ENCODING.
2844    Store the encoded data into DST. */
2845
2846 static void
2847 mule_encode (Lstream *encoding, CONST unsigned char *src,
2848              unsigned_char_dynarr *dst, unsigned int n)
2849 {
2850   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2851
2852   switch (CODING_SYSTEM_TYPE (str->codesys))
2853     {
2854 #ifdef DEBUG_XEMACS
2855     case CODESYS_INTERNAL:
2856       Dynarr_add_many (dst, src, n);
2857       break;
2858 #endif
2859     case CODESYS_AUTODETECT:
2860       /* If we got this far and still haven't decided on the coding
2861          system, then do no conversion. */
2862     case CODESYS_NO_CONVERSION:
2863       encode_coding_no_conversion (encoding, src, dst, n);
2864       break;
2865 #ifdef MULE
2866     case CODESYS_BIG5:
2867       encode_coding_big5 (encoding, src, dst, n);
2868       break;
2869     case CODESYS_CCL:
2870       str->ccl.last_block = str->flags & CODING_STATE_END;
2871       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2872       break;
2873 #endif /* MULE */
2874     default:
2875       text_encode_generic (encoding, src, dst, n);
2876     }
2877 }
2878
2879 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2880 Encode the text between START and END using CODING-SYSTEM.
2881 This will, for example, convert Japanese characters into stuff such as
2882 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2883 text.  BUFFER defaults to the current buffer if unspecified.
2884 */
2885        (start, end, coding_system, buffer))
2886 {
2887   Bufpos b, e;
2888   struct buffer *buf = decode_buffer (buffer, 0);
2889   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2890   Lstream *istr, *ostr;
2891   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2892
2893   get_buffer_range_char (buf, start, end, &b, &e, 0);
2894
2895   barf_if_buffer_read_only (buf, b, e);
2896
2897   coding_system = Fget_coding_system (coding_system);
2898   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2899   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2900   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2901                                               Fget_coding_system (Qbinary));
2902   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2903                                            coding_system);
2904   istr = XLSTREAM (instream);
2905   ostr = XLSTREAM (outstream);
2906   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2907   /* The chain of streams looks like this:
2908
2909      [BUFFER] <----- send through
2910                      ------> [ENCODE AS SPECIFIED]
2911                              ------> [DECODE AS BINARY]
2912                                      ------> [BUFFER]
2913    */
2914   while (1)
2915     {
2916       char tempbuf[1024]; /* some random amount */
2917       Bufpos newpos, even_newer_pos;
2918       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2919       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2920
2921       if (!size_in_bytes)
2922         break;
2923       newpos = lisp_buffer_stream_startpos (istr);
2924       Lstream_write (ostr, tempbuf, size_in_bytes);
2925       even_newer_pos = lisp_buffer_stream_startpos (istr);
2926       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2927                            even_newer_pos, 0);
2928     }
2929
2930   {
2931     Charcount retlen =
2932       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2933     Lstream_close (istr);
2934     Lstream_close (ostr);
2935     UNGCPRO;
2936     Lstream_delete (istr);
2937     Lstream_delete (ostr);
2938     Lstream_delete (XLSTREAM (de_outstream));
2939     Lstream_delete (XLSTREAM (lb_outstream));
2940     return make_int (retlen);
2941   }
2942 }
2943
2944 #ifdef MULE
2945 \f
2946 static void
2947 text_encode_generic (Lstream *encoding, CONST unsigned char *src,
2948                      unsigned_char_dynarr *dst, unsigned int n)
2949 {
2950   unsigned char c;
2951   unsigned char char_boundary;
2952   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2953   unsigned int flags          = str->flags;
2954   Emchar ch                   = str->ch;
2955
2956   char_boundary = str->iso2022.current_char_boundary;
2957
2958   while (n--)
2959     {
2960       c = *src++;
2961
2962       if (char_boundary == 0)
2963         {
2964           if (c >= 0xfc)
2965             {
2966               ch = c & 0x01;
2967               char_boundary = 5;
2968             }
2969           else if (c >= 0xf8)
2970             {
2971               ch = c & 0x03;
2972               char_boundary = 4;
2973             }
2974           else if (c >= 0xf0)
2975             {
2976               ch = c & 0x07;
2977               char_boundary = 3;
2978             }
2979           else if (c >= 0xe0)
2980             {
2981               ch = c & 0x0f;
2982               char_boundary = 2;
2983             }
2984           else if (c >= 0xc0)
2985             {
2986               ch = c & 0x1f;
2987               char_boundary = 1;
2988             }
2989           else
2990             (*str->encode_char) (str, c, dst, &flags);
2991         }
2992       else if (char_boundary == 1)
2993         {
2994           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
2995           ch =0;
2996           char_boundary = 0;
2997         }
2998       else
2999         {
3000           ch = (ch << 6) | (c & 0x3f);
3001           char_boundary--;
3002         }
3003     }
3004
3005   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3006     {
3007       (*str->finish) (str, dst, &flags);
3008     }
3009
3010   str->flags = flags;
3011   str->ch    = ch;
3012   str->iso2022.current_char_boundary = char_boundary;
3013 }
3014
3015 \f
3016 /************************************************************************/
3017 /*                          Shift-JIS methods                           */
3018 /************************************************************************/
3019
3020 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3021    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3022    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3023    encoded by "position-code + 0x80".  A character of JISX0208
3024    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3025    position-codes are divided and shifted so that it fit in the range
3026    below.
3027
3028    --- CODE RANGE of Shift-JIS ---
3029    (character set)      (range)
3030    ASCII                0x00 .. 0x7F
3031    JISX0201-Kana        0xA0 .. 0xDF
3032    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3033             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3034    -------------------------------
3035
3036 */
3037
3038 /* Is this the first byte of a Shift-JIS two-byte char? */
3039
3040 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3041   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3042
3043 /* Is this the second byte of a Shift-JIS two-byte char? */
3044
3045 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3046   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3047
3048 #define BYTE_SJIS_KATAKANA_P(c) \
3049   ((c) >= 0xA1 && (c) <= 0xDF)
3050
3051 static int
3052 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
3053                     unsigned int n)
3054 {
3055   int c;
3056
3057   while (n--)
3058     {
3059       c = *src++;
3060       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3061         return 0;
3062       if (st->shift_jis.in_second_byte)
3063         {
3064           st->shift_jis.in_second_byte = 0;
3065           if (c < 0x40)
3066             return 0;
3067         }
3068       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3069         st->shift_jis.in_second_byte = 1;
3070     }
3071   return CODING_CATEGORY_SHIFT_JIS_MASK;
3072 }
3073
3074 /* Convert Shift-JIS data to internal format. */
3075
3076 static void
3077 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
3078                     unsigned_char_dynarr *dst, unsigned int n)
3079 {
3080   unsigned char c;
3081   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3082   unsigned int flags  = str->flags;
3083   unsigned int ch     = str->ch;
3084   eol_type_t eol_type = str->eol_type;
3085
3086   while (n--)
3087     {
3088       c = *src++;
3089
3090       if (ch)
3091         {
3092           /* Previous character was first byte of Shift-JIS Kanji char. */
3093           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3094             {
3095               unsigned char e1, e2;
3096
3097               DECODE_SJIS (ch, c, e1, e2);
3098 #ifdef UTF2000
3099               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3100                                             e1 & 0x7F,
3101                                             e2 & 0x7F), dst);
3102 #else
3103               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3104               Dynarr_add (dst, e1);
3105               Dynarr_add (dst, e2);
3106 #endif
3107             }
3108           else
3109             {
3110               DECODE_ADD_BINARY_CHAR (ch, dst);
3111               DECODE_ADD_BINARY_CHAR (c, dst);
3112             }
3113           ch = 0;
3114         }
3115       else
3116         {
3117           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3118           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3119             ch = c;
3120           else if (BYTE_SJIS_KATAKANA_P (c))
3121             {
3122 #ifdef UTF2000
3123               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3124                                             c & 0x7F, 0), dst);
3125 #else
3126               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3127               Dynarr_add (dst, c);
3128 #endif
3129             }
3130 #ifdef UTF2000
3131           else if (c > 32)
3132             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3133                                           c, 0), dst);
3134 #endif
3135           else
3136             DECODE_ADD_BINARY_CHAR (c, dst);
3137         }
3138     label_continue_loop:;
3139     }
3140
3141   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3142
3143   str->flags = flags;
3144   str->ch    = ch;
3145 }
3146
3147 /* Convert internal character representation to Shift_JIS. */
3148
3149 void
3150 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3151                        unsigned_char_dynarr *dst, unsigned int *flags)
3152 {
3153   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3154
3155   if (ch == '\n')
3156     {
3157       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3158         Dynarr_add (dst, '\r');
3159       if (eol_type != EOL_CR)
3160         Dynarr_add (dst, ch);
3161     }
3162   else
3163     {
3164       Lisp_Object charset;
3165       unsigned int c1, c2, s1, s2;
3166 #ifdef UTF2000
3167       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3168
3169       if (code_point >= 0)
3170         {
3171           charset = Vcharset_latin_jisx0201;
3172           c1 = code_point;
3173           c2 = 0;
3174         }
3175       else
3176 #endif
3177         BREAKUP_CHAR (ch, charset, c1, c2);
3178
3179       if (EQ(charset, Vcharset_katakana_jisx0201))
3180         {
3181           Dynarr_add (dst, c1 | 0x80);
3182         }
3183       else if (c2 == 0)
3184         {
3185           Dynarr_add (dst, c1);
3186         }
3187       else if (EQ(charset, Vcharset_japanese_jisx0208))
3188         {
3189           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3190           Dynarr_add (dst, s1);
3191           Dynarr_add (dst, s2);
3192         }
3193       else
3194         Dynarr_add (dst, '?');
3195     }
3196 }
3197
3198 void
3199 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3200                        unsigned int *flags)
3201 {
3202 }
3203
3204 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3205 Decode a JISX0208 character of Shift-JIS coding-system.
3206 CODE is the character code in Shift-JIS as a cons of type bytes.
3207 Return the corresponding character.
3208 */
3209        (code))
3210 {
3211   unsigned char c1, c2, s1, s2;
3212
3213   CHECK_CONS (code);
3214   CHECK_INT (XCAR (code));
3215   CHECK_INT (XCDR (code));
3216   s1 = XINT (XCAR (code));
3217   s2 = XINT (XCDR (code));
3218   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3219       BYTE_SJIS_TWO_BYTE_2_P (s2))
3220     {
3221       DECODE_SJIS (s1, s2, c1, c2);
3222       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3223                                    c1 & 0x7F, c2 & 0x7F));
3224     }
3225   else
3226     return Qnil;
3227 }
3228
3229 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3230 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
3231 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3232 */
3233        (ch))
3234 {
3235   Lisp_Object charset;
3236   int c1, c2, s1, s2;
3237
3238   CHECK_CHAR_COERCE_INT (ch);
3239   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3240   if (EQ (charset, Vcharset_japanese_jisx0208))
3241     {
3242       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3243       return Fcons (make_int (s1), make_int (s2));
3244     }
3245   else
3246     return Qnil;
3247 }
3248
3249 \f
3250 /************************************************************************/
3251 /*                            Big5 methods                              */
3252 /************************************************************************/
3253
3254 /* BIG5 is a coding system encoding two character sets: ASCII and
3255    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3256    character set and is encoded in two-byte.
3257
3258    --- CODE RANGE of BIG5 ---
3259    (character set)      (range)
3260    ASCII                0x00 .. 0x7F
3261    Big5 (1st byte)      0xA1 .. 0xFE
3262         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3263    --------------------------
3264
3265    Since the number of characters in Big5 is larger than maximum
3266    characters in Emacs' charset (96x96), it can't be handled as one
3267    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3268    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3269    contains frequently used characters and the latter contains less
3270    frequently used characters.  */
3271
3272 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3273   ((c) >= 0xA1 && (c) <= 0xFE)
3274
3275 /* Is this the second byte of a Shift-JIS two-byte char? */
3276
3277 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3278   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3279
3280 /* Number of Big5 characters which have the same code in 1st byte.  */
3281
3282 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3283
3284 /* Code conversion macros.  These are macros because they are used in
3285    inner loops during code conversion.
3286
3287    Note that temporary variables in macros introduce the classic
3288    dynamic-scoping problems with variable names.  We use capital-
3289    lettered variables in the assumption that XEmacs does not use
3290    capital letters in variables except in a very formalized way
3291    (e.g. Qstring). */
3292
3293 /* Convert Big5 code (b1, b2) into its internal string representation
3294    (lb, c1, c2). */
3295
3296 /* There is a much simpler way to split the Big5 charset into two.
3297    For the moment I'm going to leave the algorithm as-is because it
3298    claims to separate out the most-used characters into a single
3299    charset, which perhaps will lead to optimizations in various
3300    places.
3301
3302    The way the algorithm works is something like this:
3303
3304    Big5 can be viewed as a 94x157 charset, where the row is
3305    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3306    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3307    the split between low and high column numbers is apparently
3308    meaningless; ascending rows produce less and less frequent chars.
3309    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3310    the first charset, and the upper half (0xC9 .. 0xFE) to the
3311    second.  To do the conversion, we convert the character into
3312    a single number where 0 .. 156 is the first row, 157 .. 313
3313    is the second, etc.  That way, the characters are ordered by
3314    decreasing frequency.  Then we just chop the space in two
3315    and coerce the result into a 94x94 space.
3316    */
3317
3318 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3319 {                                                                       \
3320   int B1 = b1, B2 = b2;                                                 \
3321   unsigned int I                                                        \
3322     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3323                                                                         \
3324   if (B1 < 0xC9)                                                        \
3325     {                                                                   \
3326       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3327     }                                                                   \
3328   else                                                                  \
3329     {                                                                   \
3330       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3331       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3332     }                                                                   \
3333   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3334   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3335 } while (0)
3336
3337 /* Convert the internal string representation of a Big5 character
3338    (lb, c1, c2) into Big5 code (b1, b2). */
3339
3340 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3341 {                                                                       \
3342   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3343                                                                         \
3344   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3345     {                                                                   \
3346       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3347     }                                                                   \
3348   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3349   b2 = I % BIG5_SAME_ROW;                                               \
3350   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3351 } while (0)
3352
3353 static int
3354 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3355                     unsigned int n)
3356 {
3357   int c;
3358
3359   while (n--)
3360     {
3361       c = *src++;
3362       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3363           (c >= 0x80 && c <= 0xA0))
3364         return 0;
3365       if (st->big5.in_second_byte)
3366         {
3367           st->big5.in_second_byte = 0;
3368           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3369             return 0;
3370         }
3371       else if (c >= 0xA1)
3372         st->big5.in_second_byte = 1;
3373     }
3374   return CODING_CATEGORY_BIG5_MASK;
3375 }
3376
3377 /* Convert Big5 data to internal format. */
3378
3379 static void
3380 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3381                     unsigned_char_dynarr *dst, unsigned int n)
3382 {
3383   unsigned char c;
3384   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3385   unsigned int flags  = str->flags;
3386   unsigned int ch     = str->ch;
3387   eol_type_t eol_type = str->eol_type;
3388
3389   while (n--)
3390     {
3391       c = *src++;
3392       if (ch)
3393         {
3394           /* Previous character was first byte of Big5 char. */
3395           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3396             {
3397               unsigned char b1, b2, b3;
3398               DECODE_BIG5 (ch, c, b1, b2, b3);
3399               Dynarr_add (dst, b1);
3400               Dynarr_add (dst, b2);
3401               Dynarr_add (dst, b3);
3402             }
3403           else
3404             {
3405               DECODE_ADD_BINARY_CHAR (ch, dst);
3406               DECODE_ADD_BINARY_CHAR (c, dst);
3407             }
3408           ch = 0;
3409         }
3410       else
3411         {
3412           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3413           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3414             ch = c;
3415           else
3416             DECODE_ADD_BINARY_CHAR (c, dst);
3417         }
3418     label_continue_loop:;
3419     }
3420
3421   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3422
3423   str->flags = flags;
3424   str->ch    = ch;
3425 }
3426
3427 /* Convert internally-formatted data to Big5. */
3428
3429 static void
3430 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3431                     unsigned_char_dynarr *dst, unsigned int n)
3432 {
3433 #ifndef UTF2000
3434   unsigned char c;
3435   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3436   unsigned int flags  = str->flags;
3437   unsigned int ch     = str->ch;
3438   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3439
3440   while (n--)
3441     {
3442       c = *src++;
3443       if (c == '\n')
3444         {
3445           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3446             Dynarr_add (dst, '\r');
3447           if (eol_type != EOL_CR)
3448             Dynarr_add (dst, '\n');
3449         }
3450       else if (BYTE_ASCII_P (c))
3451         {
3452           /* ASCII. */
3453           Dynarr_add (dst, c);
3454         }
3455       else if (BUFBYTE_LEADING_BYTE_P (c))
3456         {
3457           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3458               c == LEADING_BYTE_CHINESE_BIG5_2)
3459             {
3460               /* A recognized leading byte. */
3461               ch = c;
3462               continue; /* not done with this character. */
3463             }
3464           /* otherwise just ignore this character. */
3465         }
3466       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3467                ch == LEADING_BYTE_CHINESE_BIG5_2)
3468         {
3469           /* Previous char was a recognized leading byte. */
3470           ch = (ch << 8) | c;
3471           continue; /* not done with this character. */
3472         }
3473       else if (ch)
3474         {
3475           /* Encountering second byte of a Big5 character. */
3476           unsigned char b1, b2;
3477
3478           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3479           Dynarr_add (dst, b1);
3480           Dynarr_add (dst, b2);
3481         }
3482
3483       ch = 0;
3484     }
3485
3486   str->flags = flags;
3487   str->ch    = ch;
3488 #endif
3489 }
3490
3491
3492 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3493 Decode a Big5 character CODE of BIG5 coding-system.
3494 CODE is the character code in BIG5, a cons of two integers.
3495 Return the corresponding character.
3496 */
3497        (code))
3498 {
3499   unsigned char c1, c2, b1, b2;
3500
3501   CHECK_CONS (code);
3502   CHECK_INT (XCAR (code));
3503   CHECK_INT (XCDR (code));
3504   b1 = XINT (XCAR (code));
3505   b2 = XINT (XCDR (code));
3506   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3507       BYTE_BIG5_TWO_BYTE_2_P (b2))
3508     {
3509       Charset_ID leading_byte;
3510       Lisp_Object charset;
3511       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3512       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3513       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3514     }
3515   else
3516     return Qnil;
3517 }
3518
3519 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3520 Encode the Big5 character CH to BIG5 coding-system.
3521 Return the corresponding character code in Big5.
3522 */
3523        (ch))
3524 {
3525   Lisp_Object charset;
3526   int c1, c2, b1, b2;
3527
3528   CHECK_CHAR_COERCE_INT (ch);
3529   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3530   if (EQ (charset, Vcharset_chinese_big5_1) ||
3531       EQ (charset, Vcharset_chinese_big5_2))
3532     {
3533       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3534                    b1, b2);
3535       return Fcons (make_int (b1), make_int (b2));
3536     }
3537   else
3538     return Qnil;
3539 }
3540
3541 \f
3542 /************************************************************************/
3543 /*                           UCS-4 methods                              */
3544 /************************************************************************/
3545
3546 static int
3547 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3548                     unsigned int n)
3549 {
3550   while (n--)
3551     {
3552       int c = *src++;
3553       switch (st->ucs4.in_byte)
3554         {
3555         case 0:
3556           if (c >= 128)
3557             return 0;
3558           else
3559             st->ucs4.in_byte++;
3560           break;
3561         case 3:
3562           st->ucs4.in_byte = 0;
3563           break;
3564         default:
3565           st->ucs4.in_byte++;
3566         }
3567     }
3568   return CODING_CATEGORY_UCS4_MASK;
3569 }
3570
3571 static void
3572 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3573                     unsigned_char_dynarr *dst, unsigned int n)
3574 {
3575   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3576   unsigned int flags = str->flags;
3577   unsigned int ch    = str->ch;
3578   unsigned char counter = str->counter;
3579
3580   while (n--)
3581     {
3582       unsigned char c = *src++;
3583       switch (counter)
3584         {
3585         case 0:
3586           ch = c;
3587           counter = 3;
3588           break;
3589         case 1:
3590           DECODE_ADD_UCS_CHAR ((ch << 8) | c, dst);
3591           ch = 0;
3592           counter = 0;
3593           break;
3594         default:
3595           ch = ( ch << 8 ) | c;
3596           counter--;
3597         }
3598     }
3599   if (counter & CODING_STATE_END)
3600     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3601
3602   str->flags = flags;
3603   str->ch    = ch;
3604   str->counter = counter;
3605 }
3606
3607 void
3608 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3609                   unsigned_char_dynarr *dst, unsigned int *flags)
3610 {
3611   Dynarr_add (dst, ch >> 24);
3612   Dynarr_add (dst, ch >> 16);
3613   Dynarr_add (dst, ch >>  8);
3614   Dynarr_add (dst, ch      );
3615 }
3616
3617 void
3618 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3619                   unsigned int *flags)
3620 {
3621 }
3622
3623 \f
3624 /************************************************************************/
3625 /*                           UTF-8 methods                              */
3626 /************************************************************************/
3627
3628 static int
3629 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3630                     unsigned int n)
3631 {
3632   while (n--)
3633     {
3634       unsigned char c = *src++;
3635       switch (st->utf8.in_byte)
3636         {
3637         case 0:
3638           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3639             return 0;
3640           else if (c >= 0xfc)
3641             st->utf8.in_byte = 5;
3642           else if (c >= 0xf8)
3643             st->utf8.in_byte = 4;
3644           else if (c >= 0xf0)
3645             st->utf8.in_byte = 3;
3646           else if (c >= 0xe0)
3647             st->utf8.in_byte = 2;
3648           else if (c >= 0xc0)
3649             st->utf8.in_byte = 1;
3650           else if (c >= 0x80)
3651             return 0;
3652           break;
3653         default:
3654           if ((c & 0xc0) != 0x80)
3655             return 0;
3656           else
3657             st->utf8.in_byte--;
3658         }
3659     }
3660   return CODING_CATEGORY_UTF8_MASK;
3661 }
3662
3663 static void
3664 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3665                     unsigned_char_dynarr *dst, unsigned int n)
3666 {
3667   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3668   unsigned int flags  = str->flags;
3669   unsigned int ch     = str->ch;
3670   eol_type_t eol_type = str->eol_type;
3671   unsigned char counter = str->counter;
3672
3673   while (n--)
3674     {
3675       unsigned char c = *src++;
3676       switch (counter)
3677         {
3678         case 0:
3679           if ( c >= 0xfc )
3680             {
3681               ch = c & 0x01;
3682               counter = 5;
3683             }
3684           else if ( c >= 0xf8 )
3685             {
3686               ch = c & 0x03;
3687               counter = 4;
3688             }
3689           else if ( c >= 0xf0 )
3690             {
3691               ch = c & 0x07;
3692               counter = 3;
3693             }
3694           else if ( c >= 0xe0 )
3695             {
3696               ch = c & 0x0f;
3697               counter = 2;
3698             }
3699           else if ( c >= 0xc0 )
3700             {
3701               ch = c & 0x1f;
3702               counter = 1;
3703             }
3704           else
3705             {
3706               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3707               DECODE_ADD_UCS_CHAR (c, dst);
3708             }
3709           break;
3710         case 1:
3711           ch = ( ch << 6 ) | ( c & 0x3f );
3712           DECODE_ADD_UCS_CHAR (ch, dst);
3713           ch = 0;
3714           counter = 0;
3715           break;
3716         default:
3717           ch = ( ch << 6 ) | ( c & 0x3f );
3718           counter--;
3719         }
3720     label_continue_loop:;
3721     }
3722
3723   if (flags & CODING_STATE_END)
3724     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3725
3726   str->flags = flags;
3727   str->ch    = ch;
3728   str->counter = counter;
3729 }
3730
3731 void
3732 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
3733                   unsigned_char_dynarr *dst, unsigned int *flags)
3734 {
3735   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3736
3737   if (ch == '\n')
3738     {
3739       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3740         Dynarr_add (dst, '\r');
3741       if (eol_type != EOL_CR)
3742         Dynarr_add (dst, ch);
3743     }
3744   else if (ch <= 0x7f)
3745     {
3746       Dynarr_add (dst, ch);
3747     }
3748   else if (ch <= 0x7ff)
3749     {
3750       Dynarr_add (dst, (ch >> 6) | 0xc0);
3751       Dynarr_add (dst, (ch & 0x3f) | 0x80);
3752     }
3753   else if (ch <= 0xffff)
3754     {
3755       Dynarr_add (dst,  (ch >> 12) | 0xe0);
3756       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3757       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3758     }
3759   else if (ch <= 0x1fffff)
3760     {
3761       Dynarr_add (dst,  (ch >> 18) | 0xf0);
3762       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3763       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3764       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3765     }
3766   else if (ch <= 0x3ffffff)
3767     {
3768       Dynarr_add (dst,  (ch >> 24) | 0xf8);
3769       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3770       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3771       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3772       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3773     }
3774   else
3775     {
3776       Dynarr_add (dst,  (ch >> 30) | 0xfc);
3777       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
3778       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
3779       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
3780       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
3781       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
3782     }
3783 }
3784
3785 void
3786 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3787                   unsigned int *flags)
3788 {
3789 }
3790
3791 \f
3792 /************************************************************************/
3793 /*                           ISO2022 methods                            */
3794 /************************************************************************/
3795
3796 /* The following note describes the coding system ISO2022 briefly.
3797    Since the intention of this note is to help understand the
3798    functions in this file, some parts are NOT ACCURATE or OVERLY
3799    SIMPLIFIED.  For thorough understanding, please refer to the
3800    original document of ISO2022.
3801
3802    ISO2022 provides many mechanisms to encode several character sets
3803    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3804    is encoded using bytes less than 128.  This may make the encoded
3805    text a little bit longer, but the text passes more easily through
3806    several gateways, some of which strip off MSB (Most Signigant Bit).
3807
3808    There are two kinds of character sets: control character set and
3809    graphic character set.  The former contains control characters such
3810    as `newline' and `escape' to provide control functions (control
3811    functions are also provided by escape sequences).  The latter
3812    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3813    two control character sets and many graphic character sets.
3814
3815    Graphic character sets are classified into one of the following
3816    four classes, according to the number of bytes (DIMENSION) and
3817    number of characters in one dimension (CHARS) of the set:
3818    - DIMENSION1_CHARS94
3819    - DIMENSION1_CHARS96
3820    - DIMENSION2_CHARS94
3821    - DIMENSION2_CHARS96
3822
3823    In addition, each character set is assigned an identification tag,
3824    unique for each set, called "final character" (denoted as <F>
3825    hereafter).  The <F> of each character set is decided by ECMA(*)
3826    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3827    (0x30..0x3F are for private use only).
3828
3829    Note (*): ECMA = European Computer Manufacturers Association
3830
3831    Here are examples of graphic character set [NAME(<F>)]:
3832         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3833         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3834         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3835         o DIMENSION2_CHARS96 -- none for the moment
3836
3837    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3838         C0 [0x00..0x1F] -- control character plane 0
3839         GL [0x20..0x7F] -- graphic character plane 0
3840         C1 [0x80..0x9F] -- control character plane 1
3841         GR [0xA0..0xFF] -- graphic character plane 1
3842
3843    A control character set is directly designated and invoked to C0 or
3844    C1 by an escape sequence.  The most common case is that:
3845    - ISO646's  control character set is designated/invoked to C0, and
3846    - ISO6429's control character set is designated/invoked to C1,
3847    and usually these designations/invocations are omitted in encoded
3848    text.  In a 7-bit environment, only C0 can be used, and a control
3849    character for C1 is encoded by an appropriate escape sequence to
3850    fit into the environment.  All control characters for C1 are
3851    defined to have corresponding escape sequences.
3852
3853    A graphic character set is at first designated to one of four
3854    graphic registers (G0 through G3), then these graphic registers are
3855    invoked to GL or GR.  These designations and invocations can be
3856    done independently.  The most common case is that G0 is invoked to
3857    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3858    these invocations and designations are omitted in encoded text.
3859    In a 7-bit environment, only GL can be used.
3860
3861    When a graphic character set of CHARS94 is invoked to GL, codes
3862    0x20 and 0x7F of the GL area work as control characters SPACE and
3863    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3864    be used.
3865
3866    There are two ways of invocation: locking-shift and single-shift.
3867    With locking-shift, the invocation lasts until the next different
3868    invocation, whereas with single-shift, the invocation affects the
3869    following character only and doesn't affect the locking-shift
3870    state.  Invocations are done by the following control characters or
3871    escape sequences:
3872
3873    ----------------------------------------------------------------------
3874    abbrev  function                  cntrl escape seq   description
3875    ----------------------------------------------------------------------
3876    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
3877    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
3878    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
3879    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
3880    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
3881    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
3882    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
3883    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
3884    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
3885    ----------------------------------------------------------------------
3886    (*) These are not used by any known coding system.
3887
3888    Control characters for these functions are defined by macros
3889    ISO_CODE_XXX in `coding.h'.
3890
3891    Designations are done by the following escape sequences:
3892    ----------------------------------------------------------------------
3893    escape sequence      description
3894    ----------------------------------------------------------------------
3895    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
3896    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
3897    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
3898    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
3899    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
3900    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
3901    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
3902    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
3903    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
3904    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
3905    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
3906    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
3907    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
3908    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
3909    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
3910    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
3911    ----------------------------------------------------------------------
3912
3913    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3914    of dimension 1, chars 94, and final character <F>, etc...
3915
3916    Note (*): Although these designations are not allowed in ISO2022,
3917    Emacs accepts them on decoding, and produces them on encoding
3918    CHARS96 character sets in a coding system which is characterized as
3919    7-bit environment, non-locking-shift, and non-single-shift.
3920
3921    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3922    '(' can be omitted.  We refer to this as "short-form" hereafter.
3923
3924    Now you may notice that there are a lot of ways for encoding the
3925    same multilingual text in ISO2022.  Actually, there exist many
3926    coding systems such as Compound Text (used in X11's inter client
3927    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3928    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3929    localized platforms), and all of these are variants of ISO2022.
3930
3931    In addition to the above, Emacs handles two more kinds of escape
3932    sequences: ISO6429's direction specification and Emacs' private
3933    sequence for specifying character composition.
3934
3935    ISO6429's direction specification takes the following form:
3936         o CSI ']'      -- end of the current direction
3937         o CSI '0' ']'  -- end of the current direction
3938         o CSI '1' ']'  -- start of left-to-right text
3939         o CSI '2' ']'  -- start of right-to-left text
3940    The control character CSI (0x9B: control sequence introducer) is
3941    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3942
3943    Character composition specification takes the following form:
3944         o ESC '0' -- start character composition
3945         o ESC '1' -- end character composition
3946    Since these are not standard escape sequences of any ISO standard,
3947    their use with these meanings is restricted to Emacs only.  */
3948
3949 static void
3950 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3951 {
3952   int i;
3953
3954   for (i = 0; i < 4; i++)
3955     {
3956       if (!NILP (coding_system))
3957         iso->charset[i] =
3958           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3959       else
3960         iso->charset[i] = Qt;
3961       iso->invalid_designated[i] = 0;
3962     }
3963   iso->esc = ISO_ESC_NOTHING;
3964   iso->esc_bytes_index = 0;
3965   iso->register_left = 0;
3966   iso->register_right = 1;
3967   iso->switched_dir_and_no_valid_charset_yet = 0;
3968   iso->invalid_switch_dir = 0;
3969   iso->output_direction_sequence = 0;
3970   iso->output_literally = 0;
3971 #ifdef ENABLE_COMPOSITE_CHARS
3972   if (iso->composite_chars)
3973     Dynarr_reset (iso->composite_chars);
3974 #endif
3975 }
3976
3977 static int
3978 fit_to_be_escape_quoted (unsigned char c)
3979 {
3980   switch (c)
3981     {
3982     case ISO_CODE_ESC:
3983     case ISO_CODE_CSI:
3984     case ISO_CODE_SS2:
3985     case ISO_CODE_SS3:
3986     case ISO_CODE_SO:
3987     case ISO_CODE_SI:
3988       return 1;
3989
3990     default:
3991       return 0;
3992     }
3993 }
3994
3995 /* Parse one byte of an ISO2022 escape sequence.
3996    If the result is an invalid escape sequence, return 0 and
3997    do not change anything in STR.  Otherwise, if the result is
3998    an incomplete escape sequence, update ISO2022.ESC and
3999    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4000    all the state variables (but not ISO2022.ESC_BYTES) and
4001    return 1.
4002
4003    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4004    or invocation of an invalid character set and treat that as
4005    an unrecognized escape sequence. */
4006
4007 static int
4008 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4009                    unsigned char c, unsigned int *flags,
4010                    int check_invalid_charsets)
4011 {
4012   /* (1) If we're at the end of a designation sequence, CS is the
4013      charset being designated and REG is the register to designate
4014      it to.
4015
4016      (2) If we're at the end of a locking-shift sequence, REG is
4017      the register to invoke and HALF (0 == left, 1 == right) is
4018      the half to invoke it into.
4019
4020      (3) If we're at the end of a single-shift sequence, REG is
4021      the register to invoke. */
4022   Lisp_Object cs = Qnil;
4023   int reg, half;
4024
4025   /* NOTE: This code does goto's all over the fucking place.
4026      The reason for this is that we're basically implementing
4027      a state machine here, and hierarchical languages like C
4028      don't really provide a clean way of doing this. */
4029
4030   if (! (*flags & CODING_STATE_ESCAPE))
4031     /* At beginning of escape sequence; we need to reset our
4032        escape-state variables. */
4033     iso->esc = ISO_ESC_NOTHING;
4034
4035   iso->output_literally = 0;
4036   iso->output_direction_sequence = 0;
4037
4038   switch (iso->esc)
4039     {
4040     case ISO_ESC_NOTHING:
4041       iso->esc_bytes_index = 0;
4042       switch (c)
4043         {
4044         case ISO_CODE_ESC:      /* Start escape sequence */
4045           *flags |= CODING_STATE_ESCAPE;
4046           iso->esc = ISO_ESC;
4047           goto not_done;
4048
4049         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4050           *flags |= CODING_STATE_ESCAPE;
4051           iso->esc = ISO_ESC_5_11;
4052           goto not_done;
4053
4054         case ISO_CODE_SO:       /* locking shift 1 */
4055           reg = 1; half = 0;
4056           goto locking_shift;
4057         case ISO_CODE_SI:       /* locking shift 0 */
4058           reg = 0; half = 0;
4059           goto locking_shift;
4060
4061         case ISO_CODE_SS2:      /* single shift */
4062           reg = 2;
4063           goto single_shift;
4064         case ISO_CODE_SS3:      /* single shift */
4065           reg = 3;
4066           goto single_shift;
4067
4068         default:                        /* Other control characters */
4069           return 0;
4070         }
4071
4072     case ISO_ESC:
4073       switch (c)
4074         {
4075           /**** single shift ****/
4076
4077         case 'N':       /* single shift 2 */
4078           reg = 2;
4079           goto single_shift;
4080         case 'O':       /* single shift 3 */
4081           reg = 3;
4082           goto single_shift;
4083
4084           /**** locking shift ****/
4085
4086         case '~':       /* locking shift 1 right */
4087           reg = 1; half = 1;
4088           goto locking_shift;
4089         case 'n':       /* locking shift 2 */
4090           reg = 2; half = 0;
4091           goto locking_shift;
4092         case '}':       /* locking shift 2 right */
4093           reg = 2; half = 1;
4094           goto locking_shift;
4095         case 'o':       /* locking shift 3 */
4096           reg = 3; half = 0;
4097           goto locking_shift;
4098         case '|':       /* locking shift 3 right */
4099           reg = 3; half = 1;
4100           goto locking_shift;
4101
4102 #ifdef ENABLE_COMPOSITE_CHARS
4103           /**** composite ****/
4104
4105         case '0':
4106           iso->esc = ISO_ESC_START_COMPOSITE;
4107           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4108             CODING_STATE_COMPOSITE;
4109           return 1;
4110
4111         case '1':
4112           iso->esc = ISO_ESC_END_COMPOSITE;
4113           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4114             ~CODING_STATE_COMPOSITE;
4115           return 1;
4116 #endif /* ENABLE_COMPOSITE_CHARS */
4117
4118           /**** directionality ****/
4119
4120         case '[':
4121           iso->esc = ISO_ESC_5_11;
4122           goto not_done;
4123
4124           /**** designation ****/
4125
4126         case '$':       /* multibyte charset prefix */
4127           iso->esc = ISO_ESC_2_4;
4128           goto not_done;
4129
4130         default:
4131           if (0x28 <= c && c <= 0x2F)
4132             {
4133               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4134               goto not_done;
4135             }
4136
4137           /* This function is called with CODESYS equal to nil when
4138              doing coding-system detection. */
4139           if (!NILP (codesys)
4140               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4141               && fit_to_be_escape_quoted (c))
4142             {
4143               iso->esc = ISO_ESC_LITERAL;
4144               *flags &= CODING_STATE_ISO2022_LOCK;
4145               return 1;
4146             }
4147
4148           /* bzzzt! */
4149           return 0;
4150         }
4151
4152
4153
4154       /**** directionality ****/
4155
4156     case ISO_ESC_5_11:          /* ISO6429 direction control */
4157       if (c == ']')
4158         {
4159           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4160           goto directionality;
4161         }
4162       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4163       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4164       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4165       else               return 0;
4166       goto not_done;
4167
4168     case ISO_ESC_5_11_0:
4169       if (c == ']')
4170         {
4171           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4172           goto directionality;
4173         }
4174       return 0;
4175
4176     case ISO_ESC_5_11_1:
4177       if (c == ']')
4178         {
4179           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4180           goto directionality;
4181         }
4182       return 0;
4183
4184     case ISO_ESC_5_11_2:
4185       if (c == ']')
4186         {
4187           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4188           goto directionality;
4189         }
4190       return 0;
4191
4192     directionality:
4193       iso->esc = ISO_ESC_DIRECTIONALITY;
4194       /* Various junk here to attempt to preserve the direction sequences
4195          literally in the text if they would otherwise be swallowed due
4196          to invalid designations that don't show up as actual charset
4197          changes in the text. */
4198       if (iso->invalid_switch_dir)
4199         {
4200           /* We already inserted a direction switch literally into the
4201              text.  We assume (#### this may not be right) that the
4202              next direction switch is the one going the other way,
4203              and we need to output that literally as well. */
4204           iso->output_literally = 1;
4205           iso->invalid_switch_dir = 0;
4206         }
4207       else
4208         {
4209           int jj;
4210
4211           /* If we are in the thrall of an invalid designation,
4212            then stick the directionality sequence literally into the
4213            output stream so it ends up in the original text again. */
4214           for (jj = 0; jj < 4; jj++)
4215             if (iso->invalid_designated[jj])
4216               break;
4217           if (jj < 4)
4218             {
4219               iso->output_literally = 1;
4220               iso->invalid_switch_dir = 1;
4221             }
4222           else
4223             /* Indicate that we haven't yet seen a valid designation,
4224                so that if a switch-dir is directly followed by an
4225                invalid designation, both get inserted literally. */
4226             iso->switched_dir_and_no_valid_charset_yet = 1;
4227         }
4228       return 1;
4229
4230
4231       /**** designation ****/
4232
4233     case ISO_ESC_2_4:
4234       if (0x28 <= c && c <= 0x2F)
4235         {
4236           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4237           goto not_done;
4238         }
4239       if (0x40 <= c && c <= 0x42)
4240         {
4241           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4242                                       *flags & CODING_STATE_R2L ?
4243                                       CHARSET_RIGHT_TO_LEFT :
4244                                       CHARSET_LEFT_TO_RIGHT);
4245           reg = 0;
4246           goto designated;
4247         }
4248       return 0;
4249
4250     default:
4251       {
4252         int type =-1;
4253
4254         if (c < '0' || c > '~')
4255           return 0; /* bad final byte */
4256
4257         if (iso->esc >= ISO_ESC_2_8 &&
4258             iso->esc <= ISO_ESC_2_15)
4259           {
4260             type = ((iso->esc >= ISO_ESC_2_12) ?
4261                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4262             reg = (iso->esc - ISO_ESC_2_8) & 3;
4263           }
4264         else if (iso->esc >= ISO_ESC_2_4_8 &&
4265                  iso->esc <= ISO_ESC_2_4_15)
4266           {
4267             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4268                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4269             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4270           }
4271         else
4272           {
4273             /* Can this ever be reached? -slb */
4274             abort();
4275           }
4276
4277         cs = CHARSET_BY_ATTRIBUTES (type, c,
4278                                     *flags & CODING_STATE_R2L ?
4279                                     CHARSET_RIGHT_TO_LEFT :
4280                                     CHARSET_LEFT_TO_RIGHT);
4281         goto designated;
4282       }
4283     }
4284
4285  not_done:
4286   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4287   return -1;
4288
4289  single_shift:
4290   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4291     /* can't invoke something that ain't there. */
4292     return 0;
4293   iso->esc = ISO_ESC_SINGLE_SHIFT;
4294   *flags &= CODING_STATE_ISO2022_LOCK;
4295   if (reg == 2)
4296     *flags |= CODING_STATE_SS2;
4297   else
4298     *flags |= CODING_STATE_SS3;
4299   return 1;
4300
4301  locking_shift:
4302   if (check_invalid_charsets &&
4303       !CHARSETP (iso->charset[reg]))
4304     /* can't invoke something that ain't there. */
4305     return 0;
4306   if (half)
4307     iso->register_right = reg;
4308   else
4309     iso->register_left = reg;
4310   *flags &= CODING_STATE_ISO2022_LOCK;
4311   iso->esc = ISO_ESC_LOCKING_SHIFT;
4312   return 1;
4313
4314  designated:
4315   if (NILP (cs) && check_invalid_charsets)
4316     {
4317       iso->invalid_designated[reg] = 1;
4318       iso->charset[reg] = Vcharset_ascii;
4319       iso->esc = ISO_ESC_DESIGNATE;
4320       *flags &= CODING_STATE_ISO2022_LOCK;
4321       iso->output_literally = 1;
4322       if (iso->switched_dir_and_no_valid_charset_yet)
4323         {
4324           /* We encountered a switch-direction followed by an
4325              invalid designation.  Ensure that the switch-direction
4326              gets outputted; otherwise it will probably get eaten
4327              when the text is written out again. */
4328           iso->switched_dir_and_no_valid_charset_yet = 0;
4329           iso->output_direction_sequence = 1;
4330           /* And make sure that the switch-dir going the other
4331              way gets outputted, as well. */
4332           iso->invalid_switch_dir = 1;
4333         }
4334       return 1;
4335     }
4336   /* This function is called with CODESYS equal to nil when
4337      doing coding-system detection. */
4338   if (!NILP (codesys))
4339     {
4340       charset_conversion_spec_dynarr *dyn =
4341         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4342
4343       if (dyn)
4344         {
4345           int i;
4346
4347           for (i = 0; i < Dynarr_length (dyn); i++)
4348             {
4349               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4350               if (EQ (cs, spec->from_charset))
4351                 cs = spec->to_charset;
4352             }
4353         }
4354     }
4355
4356   iso->charset[reg] = cs;
4357   iso->esc = ISO_ESC_DESIGNATE;
4358   *flags &= CODING_STATE_ISO2022_LOCK;
4359   if (iso->invalid_designated[reg])
4360     {
4361       iso->invalid_designated[reg] = 0;
4362       iso->output_literally = 1;
4363     }
4364   if (iso->switched_dir_and_no_valid_charset_yet)
4365     iso->switched_dir_and_no_valid_charset_yet = 0;
4366   return 1;
4367 }
4368
4369 static int
4370 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4371                        unsigned int n)
4372 {
4373   int mask;
4374
4375   /* #### There are serious deficiencies in the recognition mechanism
4376      here.  This needs to be much smarter if it's going to cut it.
4377      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4378      it should be detected as Latin-1.
4379      All the ISO2022 stuff in this file should be synced up with the
4380      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4381      Perhaps we should wait till R2L works in FSF Emacs? */
4382
4383   if (!st->iso2022.initted)
4384     {
4385       reset_iso2022 (Qnil, &st->iso2022.iso);
4386       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4387                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4388                           CODING_CATEGORY_ISO_8_1_MASK |
4389                           CODING_CATEGORY_ISO_8_2_MASK |
4390                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4391       st->iso2022.flags = 0;
4392       st->iso2022.high_byte_count = 0;
4393       st->iso2022.saw_single_shift = 0;
4394       st->iso2022.initted = 1;
4395     }
4396
4397   mask = st->iso2022.mask;
4398
4399   while (n--)
4400     {
4401       int c = *src++;
4402       if (c >= 0xA0)
4403         {
4404           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4405           st->iso2022.high_byte_count++;
4406         }
4407       else
4408         {
4409           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4410             {
4411               if (st->iso2022.high_byte_count & 1)
4412                 /* odd number of high bytes; assume not iso-8-2 */
4413                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4414             }
4415           st->iso2022.high_byte_count = 0;
4416           st->iso2022.saw_single_shift = 0;
4417           if (c > 0x80)
4418             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4419         }
4420       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4421           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4422         { /* control chars */
4423           switch (c)
4424             {
4425               /* Allow and ignore control characters that you might
4426                  reasonably see in a text file */
4427             case '\r':
4428             case '\n':
4429             case '\t':
4430             case  7: /* bell */
4431             case  8: /* backspace */
4432             case 11: /* vertical tab */
4433             case 12: /* form feed */
4434             case 26: /* MS-DOS C-z junk */
4435             case 31: /* '^_' -- for info */
4436               goto label_continue_loop;
4437
4438             default:
4439               break;
4440             }
4441         }
4442
4443       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4444           || BYTE_C1_P (c))
4445         {
4446           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4447                                  &st->iso2022.flags, 0))
4448             {
4449               switch (st->iso2022.iso.esc)
4450                 {
4451                 case ISO_ESC_DESIGNATE:
4452                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4453                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4454                   break;
4455                 case ISO_ESC_LOCKING_SHIFT:
4456                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4457                   goto ran_out_of_chars;
4458                 case ISO_ESC_SINGLE_SHIFT:
4459                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4460                   st->iso2022.saw_single_shift = 1;
4461                   break;
4462                 default:
4463                   break;
4464                 }
4465             }
4466           else
4467             {
4468               mask = 0;
4469               goto ran_out_of_chars;
4470             }
4471         }
4472     label_continue_loop:;
4473     }
4474
4475  ran_out_of_chars:
4476
4477   return mask;
4478 }
4479
4480 static int
4481 postprocess_iso2022_mask (int mask)
4482 {
4483   /* #### kind of cheesy */
4484   /* If seven-bit ISO is allowed, then assume that the encoding is
4485      entirely seven-bit and turn off the eight-bit ones. */
4486   if (mask & CODING_CATEGORY_ISO_7_MASK)
4487     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4488                CODING_CATEGORY_ISO_8_1_MASK |
4489                CODING_CATEGORY_ISO_8_2_MASK);
4490   return mask;
4491 }
4492
4493 /* If FLAGS is a null pointer or specifies right-to-left motion,
4494    output a switch-dir-to-left-to-right sequence to DST.
4495    Also update FLAGS if it is not a null pointer.
4496    If INTERNAL_P is set, we are outputting in internal format and
4497    need to handle the CSI differently. */
4498
4499 static void
4500 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4501                                  unsigned_char_dynarr *dst,
4502                                  unsigned int *flags,
4503                                  int internal_p)
4504 {
4505   if (!flags || (*flags & CODING_STATE_R2L))
4506     {
4507       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4508         {
4509           Dynarr_add (dst, ISO_CODE_ESC);
4510           Dynarr_add (dst, '[');
4511         }
4512       else if (internal_p)
4513         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4514       else
4515         Dynarr_add (dst, ISO_CODE_CSI);
4516       Dynarr_add (dst, '0');
4517       Dynarr_add (dst, ']');
4518       if (flags)
4519         *flags &= ~CODING_STATE_R2L;
4520     }
4521 }
4522
4523 /* If FLAGS is a null pointer or specifies a direction different from
4524    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4525    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4526    sequence to DST.  Also update FLAGS if it is not a null pointer.
4527    If INTERNAL_P is set, we are outputting in internal format and
4528    need to handle the CSI differently. */
4529
4530 static void
4531 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4532                           unsigned_char_dynarr *dst, unsigned int *flags,
4533                           int internal_p)
4534 {
4535   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4536       direction == CHARSET_LEFT_TO_RIGHT)
4537     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4538   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4539            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4540            direction == CHARSET_RIGHT_TO_LEFT)
4541     {
4542       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4543         {
4544           Dynarr_add (dst, ISO_CODE_ESC);
4545           Dynarr_add (dst, '[');
4546         }
4547       else if (internal_p)
4548         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4549       else
4550         Dynarr_add (dst, ISO_CODE_CSI);
4551       Dynarr_add (dst, '2');
4552       Dynarr_add (dst, ']');
4553       if (flags)
4554         *flags |= CODING_STATE_R2L;
4555     }
4556 }
4557
4558 /* Convert ISO2022-format data to internal format. */
4559
4560 static void
4561 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4562                        unsigned_char_dynarr *dst, unsigned int n)
4563 {
4564   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4565   unsigned int flags  = str->flags;
4566   unsigned int ch     = str->ch;
4567   eol_type_t eol_type = str->eol_type;
4568 #ifdef ENABLE_COMPOSITE_CHARS
4569   unsigned_char_dynarr *real_dst = dst;
4570 #endif
4571   Lisp_Object coding_system;
4572
4573   XSETCODING_SYSTEM (coding_system, str->codesys);
4574
4575 #ifdef ENABLE_COMPOSITE_CHARS
4576   if (flags & CODING_STATE_COMPOSITE)
4577     dst = str->iso2022.composite_chars;
4578 #endif /* ENABLE_COMPOSITE_CHARS */
4579
4580   while (n--)
4581     {
4582       unsigned char c = *src++;
4583       if (flags & CODING_STATE_ESCAPE)
4584         {       /* Within ESC sequence */
4585           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4586                                           c, &flags, 1);
4587
4588           if (retval)
4589             {
4590               switch (str->iso2022.esc)
4591                 {
4592 #ifdef ENABLE_COMPOSITE_CHARS
4593                 case ISO_ESC_START_COMPOSITE:
4594                   if (str->iso2022.composite_chars)
4595                     Dynarr_reset (str->iso2022.composite_chars);
4596                   else
4597                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4598                   dst = str->iso2022.composite_chars;
4599                   break;
4600                 case ISO_ESC_END_COMPOSITE:
4601                   {
4602                     Bufbyte comstr[MAX_EMCHAR_LEN];
4603                     Bytecount len;
4604                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4605                                                          Dynarr_length (dst));
4606                     dst = real_dst;
4607                     len = set_charptr_emchar (comstr, emch);
4608                     Dynarr_add_many (dst, comstr, len);
4609                     break;
4610                   }
4611 #endif /* ENABLE_COMPOSITE_CHARS */
4612
4613                 case ISO_ESC_LITERAL:
4614                   COMPOSE_FLUSH_CHARS (str, dst);
4615                   DECODE_ADD_BINARY_CHAR (c, dst);
4616                   break;
4617
4618                 default:
4619                   /* Everything else handled already */
4620                   break;
4621                 }
4622             }
4623
4624           /* Attempted error recovery. */
4625           if (str->iso2022.output_direction_sequence)
4626             ensure_correct_direction (flags & CODING_STATE_R2L ?
4627                                       CHARSET_RIGHT_TO_LEFT :
4628                                       CHARSET_LEFT_TO_RIGHT,
4629                                       str->codesys, dst, 0, 1);
4630           /* More error recovery. */
4631           if (!retval || str->iso2022.output_literally)
4632             {
4633               /* Output the (possibly invalid) sequence */
4634               int i;
4635               COMPOSE_FLUSH_CHARS (str, dst);
4636               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4637                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4638               flags &= CODING_STATE_ISO2022_LOCK;
4639               if (!retval)
4640                 n++, src--;/* Repeat the loop with the same character. */
4641               else
4642                 {
4643                   /* No sense in reprocessing the final byte of the
4644                      escape sequence; it could mess things up anyway.
4645                      Just add it now. */
4646                   COMPOSE_FLUSH_CHARS (str, dst);
4647                   DECODE_ADD_BINARY_CHAR (c, dst);
4648                 }
4649             }
4650           ch = 0;
4651         }
4652       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4653         { /* Control characters */
4654
4655           /***** Error-handling *****/
4656
4657           /* If we were in the middle of a character, dump out the
4658              partial character. */
4659           if (ch)
4660             {
4661               COMPOSE_FLUSH_CHARS (str, dst);
4662               DECODE_ADD_BINARY_CHAR (ch, dst);
4663               ch = 0;
4664             }
4665
4666           /* If we just saw a single-shift character, dump it out.
4667              This may dump out the wrong sort of single-shift character,
4668              but least it will give an indication that something went
4669              wrong. */
4670           if (flags & CODING_STATE_SS2)
4671             {
4672               COMPOSE_FLUSH_CHARS (str, dst);
4673               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4674               flags &= ~CODING_STATE_SS2;
4675             }
4676           if (flags & CODING_STATE_SS3)
4677             {
4678               COMPOSE_FLUSH_CHARS (str, dst);
4679               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4680               flags &= ~CODING_STATE_SS3;
4681             }
4682
4683           /***** Now handle the control characters. *****/
4684
4685           /* Handle CR/LF */
4686 #ifdef UTF2000
4687           if (c == '\r')
4688             {
4689               COMPOSE_FLUSH_CHARS (str, dst);
4690               if (eol_type == EOL_CR)
4691                 Dynarr_add (dst, '\n');
4692               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4693                 Dynarr_add (dst, c);
4694               else
4695                 flags |= CODING_STATE_CR;
4696               goto label_continue_loop;
4697             }
4698           else if (flags & CODING_STATE_CR)
4699             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4700               if (c != '\n')
4701                 Dynarr_add (dst, '\r');
4702               flags &= ~CODING_STATE_CR;
4703             }
4704 #else
4705           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4706 #endif
4707
4708           flags &= CODING_STATE_ISO2022_LOCK;
4709
4710           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4711             {
4712               COMPOSE_FLUSH_CHARS (str, dst);
4713               DECODE_ADD_BINARY_CHAR (c, dst);
4714             }
4715         }
4716       else
4717         {                       /* Graphic characters */
4718           Lisp_Object charset;
4719 #ifndef UTF2000
4720           Charset_ID lb;
4721 #endif
4722           int reg;
4723
4724 #ifdef UTF2000
4725           if (c == '\r')
4726             {
4727               COMPOSE_FLUSH_CHARS (str, dst);
4728               if (eol_type == EOL_CR)
4729                 Dynarr_add (dst, '\n');
4730               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
4731                 Dynarr_add (dst, c);
4732               else
4733                 flags |= CODING_STATE_CR;
4734               goto label_continue_loop;
4735             }
4736           else if (flags & CODING_STATE_CR)
4737             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
4738               if (c != '\n')
4739                 Dynarr_add (dst, '\r');
4740               flags &= ~CODING_STATE_CR;
4741             }
4742 #else
4743           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4744 #endif
4745
4746           /* Now determine the charset. */
4747           reg = ((flags & CODING_STATE_SS2) ? 2
4748                  : (flags & CODING_STATE_SS3) ? 3
4749                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4750                  : str->iso2022.register_left);
4751           charset = str->iso2022.charset[reg];
4752
4753           /* Error checking: */
4754           if (! CHARSETP (charset)
4755               || str->iso2022.invalid_designated[reg]
4756               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4757                   && XCHARSET_CHARS (charset) == 94))
4758             /* Mrmph.  We are trying to invoke a register that has no
4759                or an invalid charset in it, or trying to add a character
4760                outside the range of the charset.  Insert that char literally
4761                to preserve it for the output. */
4762             {
4763               COMPOSE_FLUSH_CHARS (str, dst);
4764               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4765               DECODE_ADD_BINARY_CHAR (c, dst);
4766             }
4767
4768           else
4769             {
4770               /* Things are probably hunky-dorey. */
4771
4772               /* Fetch reverse charset, maybe. */
4773               if (((flags & CODING_STATE_R2L) &&
4774                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4775                   ||
4776                   (!(flags & CODING_STATE_R2L) &&
4777                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4778                 {
4779                   Lisp_Object new_charset =
4780                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4781                   if (!NILP (new_charset))
4782                     charset = new_charset;
4783                 }
4784
4785 #ifdef UTF2000
4786               if (XCHARSET_DIMENSION (charset) == 1)
4787                 {
4788                   if (ch)
4789                     {
4790                       COMPOSE_FLUSH_CHARS (str, dst);
4791                       DECODE_ADD_BINARY_CHAR (ch, dst);
4792                       ch = 0;
4793                     }
4794                   COMPOSE_ADD_CHAR (str,
4795                                     MAKE_CHAR (charset, c & 0x7F, 0), dst);
4796                 }
4797               else if (ch)
4798                 {
4799                   COMPOSE_ADD_CHAR (str,
4800                                     MAKE_CHAR (charset, ch & 0x7F, c & 0x7F),
4801                                     dst);
4802                   ch = 0;
4803                 }
4804               else
4805                 ch = c;
4806 #else
4807               lb = XCHARSET_LEADING_BYTE (charset);
4808               switch (XCHARSET_REP_BYTES (charset))
4809                 {
4810                 case 1: /* ASCII */
4811                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4812                   Dynarr_add (dst, c & 0x7F);
4813                   break;
4814
4815                 case 2: /* one-byte official */
4816                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4817                   Dynarr_add (dst, lb);
4818                   Dynarr_add (dst, c | 0x80);
4819                   break;
4820
4821                 case 3: /* one-byte private or two-byte official */
4822                   if (XCHARSET_PRIVATE_P (charset))
4823                     {
4824                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4825                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4826                       Dynarr_add (dst, lb);
4827                       Dynarr_add (dst, c | 0x80);
4828                     }
4829                   else
4830                     {
4831                       if (ch)
4832                         {
4833                           Dynarr_add (dst, lb);
4834                           Dynarr_add (dst, ch | 0x80);
4835                           Dynarr_add (dst, c | 0x80);
4836                           ch = 0;
4837                         }
4838                       else
4839                         ch = c;
4840                     }
4841                   break;
4842
4843                 default:        /* two-byte private */
4844                   if (ch)
4845                     {
4846                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4847                       Dynarr_add (dst, lb);
4848                       Dynarr_add (dst, ch | 0x80);
4849                       Dynarr_add (dst, c | 0x80);
4850                       ch = 0;
4851                     }
4852                   else
4853                     ch = c;
4854                 }
4855 #endif
4856             }
4857
4858           if (!ch)
4859             flags &= CODING_STATE_ISO2022_LOCK;
4860         }
4861
4862     label_continue_loop:;
4863     }
4864
4865   if (flags & CODING_STATE_END)
4866     {
4867       COMPOSE_FLUSH_CHARS (str, dst);
4868       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4869     }
4870   str->flags = flags;
4871   str->ch    = ch;
4872 }
4873
4874
4875 /***** ISO2022 encoder *****/
4876
4877 /* Designate CHARSET into register REG. */
4878
4879 static void
4880 iso2022_designate (Lisp_Object charset, unsigned char reg,
4881                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4882 {
4883   static CONST char inter94[] = "()*+";
4884   static CONST char inter96[] = ",-./";
4885   unsigned short chars;
4886   unsigned char dimension;
4887   unsigned char final;
4888   Lisp_Object old_charset = str->iso2022.charset[reg];
4889
4890   str->iso2022.charset[reg] = charset;
4891   if (!CHARSETP (charset))
4892     /* charset might be an initial nil or t. */
4893     return;
4894   chars = XCHARSET_CHARS (charset);
4895   dimension = XCHARSET_DIMENSION (charset);
4896   final = XCHARSET_FINAL (charset);
4897   if (!str->iso2022.force_charset_on_output[reg] &&
4898       CHARSETP (old_charset) &&
4899       XCHARSET_CHARS (old_charset) == chars &&
4900       XCHARSET_DIMENSION (old_charset) == dimension &&
4901       XCHARSET_FINAL (old_charset) == final)
4902     return;
4903
4904   str->iso2022.force_charset_on_output[reg] = 0;
4905
4906   {
4907     charset_conversion_spec_dynarr *dyn =
4908       str->codesys->iso2022.output_conv;
4909
4910     if (dyn)
4911       {
4912         int i;
4913
4914         for (i = 0; i < Dynarr_length (dyn); i++)
4915           {
4916             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4917             if (EQ (charset, spec->from_charset))
4918                 charset = spec->to_charset;
4919           }
4920       }
4921   }
4922
4923   Dynarr_add (dst, ISO_CODE_ESC);
4924   switch (chars)
4925     {
4926     case 94:
4927       if (dimension == 1)
4928         Dynarr_add (dst, inter94[reg]);
4929       else
4930         {
4931           Dynarr_add (dst, '$');
4932           if (reg != 0
4933               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4934               || final < '@'
4935               || final > 'B')
4936             Dynarr_add (dst, inter94[reg]);
4937         }
4938       break;
4939     case 96:
4940       if (dimension == 1)
4941         Dynarr_add (dst, inter96[reg]);
4942       else
4943         {
4944           Dynarr_add (dst, '$');
4945           Dynarr_add (dst, inter96[reg]);
4946         }
4947       break;
4948     }
4949   Dynarr_add (dst, final);
4950 }
4951
4952 static void
4953 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4954 {
4955   if (str->iso2022.register_left != 0)
4956     {
4957       Dynarr_add (dst, ISO_CODE_SI);
4958       str->iso2022.register_left = 0;
4959     }
4960 }
4961
4962 static void
4963 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4964 {
4965   if (str->iso2022.register_left != 1)
4966     {
4967       Dynarr_add (dst, ISO_CODE_SO);
4968       str->iso2022.register_left = 1;
4969     }
4970 }
4971
4972 void
4973 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
4974                      unsigned_char_dynarr *dst, unsigned int *flags)
4975 {
4976   unsigned char charmask;
4977   Lisp_Coding_System* codesys = str->codesys;
4978   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
4979   int i;
4980   Lisp_Object charset = str->iso2022.current_charset;
4981   int half = str->iso2022.current_half;
4982   unsigned int byte1, byte2;
4983
4984   if (ch <= 0x7F)
4985     {
4986       restore_left_to_right_direction (codesys, dst, flags, 0);
4987
4988       /* Make sure G0 contains ASCII */
4989       if ((ch > ' ' && ch < ISO_CODE_DEL)
4990           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4991         {
4992           ensure_normal_shift (str, dst);
4993           iso2022_designate (Vcharset_ascii, 0, str, dst);
4994         }
4995
4996       /* If necessary, restore everything to the default state
4997          at end-of-line */
4998       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4999         {
5000           restore_left_to_right_direction (codesys, dst, flags, 0);
5001
5002           ensure_normal_shift (str, dst);
5003
5004           for (i = 0; i < 4; i++)
5005             {
5006               Lisp_Object initial_charset =
5007                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5008               iso2022_designate (initial_charset, i, str, dst);
5009             }
5010         }
5011       if (ch == '\n')
5012         {
5013           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5014             Dynarr_add (dst, '\r');
5015           if (eol_type != EOL_CR)
5016             Dynarr_add (dst, ch);
5017         }
5018       else
5019         {
5020           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5021               && fit_to_be_escape_quoted (ch))
5022             Dynarr_add (dst, ISO_CODE_ESC);
5023           Dynarr_add (dst, ch);
5024         }
5025     }
5026   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5027     {
5028       charmask = (half == 0 ? 0x00 : 0x80);
5029
5030       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5031           && fit_to_be_escape_quoted (ch))
5032         Dynarr_add (dst, ISO_CODE_ESC);
5033       /* you asked for it ... */
5034       Dynarr_add (dst, ch);
5035     }
5036   else
5037     {
5038       int reg;
5039
5040       /* Now determine which register to use. */
5041       reg = -1;
5042       for (i = 0; i < 4; i++)
5043         {
5044           int code_point;
5045
5046           if ((CHARSETP (charset = str->iso2022.charset[i])
5047                && ((code_point = charset_code_point (charset, ch)) >= 0))
5048               ||
5049               (CHARSETP
5050                (charset
5051                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5052                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5053             {
5054               if (XCHARSET_DIMENSION (charset) == 1)
5055                 {
5056                   byte1 = code_point;
5057                   byte2 = 0;
5058                 }
5059               else /* if (XCHARSET_DIMENSION (charset) == 2) */
5060                 {
5061                   byte1 = code_point >> 8;
5062                   byte2 = code_point & 255;
5063                 }
5064               reg = i;
5065               break;
5066             }
5067         }
5068       if (reg == -1)
5069         {
5070           Lisp_Object original_default_coded_charset_priority_list
5071             = Vdefault_coded_charset_priority_list;
5072
5073           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5074             {
5075               BREAKUP_CHAR (ch, charset, byte1, byte2);
5076               if (XCHARSET_FINAL (charset))
5077                 goto found;
5078               Vdefault_coded_charset_priority_list
5079                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5080                                Vdefault_coded_charset_priority_list));
5081             }
5082           BREAKUP_CHAR (ch, charset, byte1, byte2);
5083           if (!XCHARSET_FINAL (charset))
5084             {
5085               charset = Vcharset_ascii;
5086               byte1 = '~';
5087             }
5088         found:
5089           Vdefault_coded_charset_priority_list
5090             = original_default_coded_charset_priority_list;
5091         }
5092       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5093                                 codesys, dst, flags, 0);
5094
5095       if (reg == -1)
5096         {
5097           if (XCHARSET_GRAPHIC (charset) != 0)
5098             {
5099               if (!NILP (str->iso2022.charset[1]) &&
5100                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5101                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5102                 reg = 1;
5103               else if (!NILP (str->iso2022.charset[2]))
5104                 reg = 2;
5105               else if (!NILP (str->iso2022.charset[3]))
5106                 reg = 3;
5107               else
5108                 reg = 0;
5109             }
5110           else
5111             reg = 0;
5112         }
5113
5114       iso2022_designate (charset, reg, str, dst);
5115
5116       /* Now invoke that register. */
5117       switch (reg)
5118         {
5119         case 0:
5120           ensure_normal_shift (str, dst);
5121           half = 0;
5122           break;
5123         case 1:
5124           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5125             {
5126               ensure_shift_out (str, dst);
5127               half = 0;
5128             }
5129           else
5130             half = 1;
5131           break;
5132         case 2:
5133           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5134             {
5135               Dynarr_add (dst, ISO_CODE_ESC);
5136               Dynarr_add (dst, 'N');
5137               half = 0;
5138             }
5139           else
5140             {
5141               Dynarr_add (dst, ISO_CODE_SS2);
5142               half = 1;
5143             }
5144           break;
5145         case 3:
5146           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5147             {
5148               Dynarr_add (dst, ISO_CODE_ESC);
5149               Dynarr_add (dst, 'O');
5150               half = 0;
5151             }
5152           else
5153             {
5154               Dynarr_add (dst, ISO_CODE_SS3);
5155               half = 1;
5156             }
5157           break;
5158         default:
5159           abort ();
5160         }
5161
5162       charmask = (half == 0 ? 0x00 : 0x80);
5163
5164       switch (XCHARSET_DIMENSION (charset))
5165         {
5166         case 1:
5167           Dynarr_add (dst, byte1 | charmask);
5168           break;
5169         case 2:
5170           Dynarr_add (dst, byte1 | charmask);
5171           Dynarr_add (dst, byte2 | charmask);
5172           break;
5173         default:
5174           abort ();
5175         }
5176     }
5177   str->iso2022.current_charset = charset;
5178   str->iso2022.current_half = half;
5179 }
5180
5181 void
5182 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5183                      unsigned int *flags)
5184 {
5185   Lisp_Coding_System* codesys = str->codesys;
5186   int i;
5187
5188   restore_left_to_right_direction (codesys, dst, flags, 0);
5189   ensure_normal_shift (str, dst);
5190   for (i = 0; i < 4; i++)
5191     {
5192       Lisp_Object initial_charset
5193         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5194       iso2022_designate (initial_charset, i, str, dst);
5195     }
5196 }
5197 #endif /* MULE */
5198 \f
5199 /************************************************************************/
5200 /*                     No-conversion methods                            */
5201 /************************************************************************/
5202
5203 /* This is used when reading in "binary" files -- i.e. files that may
5204    contain all 256 possible byte values and that are not to be
5205    interpreted as being in any particular decoding. */
5206 static void
5207 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5208                              unsigned_char_dynarr *dst, unsigned int n)
5209 {
5210   unsigned char c;
5211   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5212   unsigned int flags  = str->flags;
5213   unsigned int ch     = str->ch;
5214   eol_type_t eol_type = str->eol_type;
5215
5216   while (n--)
5217     {
5218       c = *src++;
5219
5220       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5221       DECODE_ADD_BINARY_CHAR (c, dst);
5222     label_continue_loop:;
5223     }
5224
5225   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5226
5227   str->flags = flags;
5228   str->ch    = ch;
5229 }
5230
5231 static void
5232 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5233                              unsigned_char_dynarr *dst, unsigned int n)
5234 {
5235   unsigned char c;
5236   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5237   unsigned int flags  = str->flags;
5238   unsigned int ch     = str->ch;
5239   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5240 #ifdef UTF2000
5241   unsigned char char_boundary = str->iso2022.current_char_boundary;
5242 #endif
5243
5244   while (n--)
5245     {
5246       c = *src++;
5247 #ifdef UTF2000
5248       if (char_boundary == 0)
5249         if ( c >= 0xfc )
5250           {
5251             ch = c & 0x01;
5252             char_boundary = 5;
5253           }
5254         else if ( c >= 0xf8 )
5255           {
5256             ch = c & 0x03;
5257             char_boundary = 4;
5258           }
5259         else if ( c >= 0xf0 )
5260           {
5261             ch = c & 0x07;
5262             char_boundary = 3;
5263           }
5264         else if ( c >= 0xe0 )
5265           {
5266             ch = c & 0x0f;
5267             char_boundary = 2;
5268           }
5269         else if ( c >= 0xc0 )
5270           {
5271             ch = c & 0x1f;
5272             char_boundary = 1;
5273           }
5274         else
5275           {
5276             ch = 0;
5277             if (c == '\n')
5278               {
5279                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5280                   Dynarr_add (dst, '\r');
5281                 if (eol_type != EOL_CR)
5282                   Dynarr_add (dst, c);
5283               }
5284             else
5285               Dynarr_add (dst, c);
5286             char_boundary = 0;
5287           }
5288       else if (char_boundary == 1)
5289         {
5290           ch = ( ch << 6 ) | ( c & 0x3f );
5291           Dynarr_add (dst, ch & 0xff);
5292           char_boundary = 0;
5293         }
5294       else
5295         {
5296           ch = ( ch << 6 ) | ( c & 0x3f );
5297           char_boundary--;
5298         }
5299 #else /* not UTF2000 */
5300       if (c == '\n')
5301         {
5302           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5303             Dynarr_add (dst, '\r');
5304           if (eol_type != EOL_CR)
5305             Dynarr_add (dst, '\n');
5306           ch = 0;
5307         }
5308       else if (BYTE_ASCII_P (c))
5309         {
5310           assert (ch == 0);
5311           Dynarr_add (dst, c);
5312         }
5313       else if (BUFBYTE_LEADING_BYTE_P (c))
5314         {
5315           assert (ch == 0);
5316           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5317               c == LEADING_BYTE_CONTROL_1)
5318             ch = c;
5319           else
5320             Dynarr_add (dst, '~'); /* untranslatable character */
5321         }
5322       else
5323         {
5324           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5325             Dynarr_add (dst, c);
5326           else if (ch == LEADING_BYTE_CONTROL_1)
5327             {
5328               assert (c < 0xC0);
5329               Dynarr_add (dst, c - 0x20);
5330             }
5331           /* else it should be the second or third byte of an
5332              untranslatable character, so ignore it */
5333           ch = 0;
5334         }
5335 #endif /* not UTF2000 */
5336     }
5337
5338   str->flags = flags;
5339   str->ch    = ch;
5340 #ifdef UTF2000
5341   str->iso2022.current_char_boundary = char_boundary;
5342 #endif
5343 }
5344
5345 \f
5346 /************************************************************************/
5347 /*                   Simple internal/external functions                 */
5348 /************************************************************************/
5349
5350 static Extbyte_dynarr *conversion_out_dynarr;
5351 static Bufbyte_dynarr *conversion_in_dynarr;
5352
5353 /* Determine coding system from coding format */
5354
5355 /* #### not correct for all values of `fmt'! */
5356 static Lisp_Object
5357 external_data_format_to_coding_system (enum external_data_format fmt)
5358 {
5359   switch (fmt)
5360     {
5361     case FORMAT_FILENAME:
5362     case FORMAT_TERMINAL:
5363       if (EQ (Vfile_name_coding_system, Qnil) ||
5364           EQ (Vfile_name_coding_system, Qbinary))
5365         return Qnil;
5366       else
5367         return Fget_coding_system (Vfile_name_coding_system);
5368 #ifdef MULE
5369     case FORMAT_CTEXT:
5370       return Fget_coding_system (Qctext);
5371 #endif
5372     default:
5373       return Qnil;
5374     }
5375 }
5376
5377 Extbyte *
5378 convert_to_external_format (CONST Bufbyte *ptr,
5379                             Bytecount len,
5380                             Extcount *len_out,
5381                             enum external_data_format fmt)
5382 {
5383   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5384
5385   if (!conversion_out_dynarr)
5386     conversion_out_dynarr = Dynarr_new (Extbyte);
5387   else
5388     Dynarr_reset (conversion_out_dynarr);
5389
5390   if (NILP (coding_system))
5391     {
5392       CONST Bufbyte *end = ptr + len;
5393
5394       for (; ptr < end;)
5395         {
5396 #ifdef UTF2000
5397           Bufbyte c =
5398             (*ptr < 0xc0) ? *ptr :
5399             ((*ptr & 0x1f) << 6) | (*(ptr+1) & 0x3f);
5400 #else
5401           Bufbyte c =
5402             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5403             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5404             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5405             '~';
5406 #endif
5407           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5408           INC_CHARPTR (ptr);
5409         }
5410
5411 #ifdef ERROR_CHECK_BUFPOS
5412       assert (ptr == end);
5413 #endif
5414     }
5415   else
5416     {
5417       Lisp_Object instream, outstream, da_outstream;
5418       Lstream *istr, *ostr;
5419       struct gcpro gcpro1, gcpro2, gcpro3;
5420       char tempbuf[1024]; /* some random amount */
5421
5422       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5423       da_outstream = make_dynarr_output_stream
5424         ((unsigned_char_dynarr *) conversion_out_dynarr);
5425       outstream =
5426         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5427       istr = XLSTREAM (instream);
5428       ostr = XLSTREAM (outstream);
5429       GCPRO3 (instream, outstream, da_outstream);
5430       while (1)
5431         {
5432           int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5433           if (!size_in_bytes)
5434             break;
5435           Lstream_write (ostr, tempbuf, size_in_bytes);
5436         }
5437       Lstream_close (istr);
5438       Lstream_close (ostr);
5439       UNGCPRO;
5440       Lstream_delete (istr);
5441       Lstream_delete (ostr);
5442       Lstream_delete (XLSTREAM (da_outstream));
5443     }
5444
5445   *len_out = Dynarr_length (conversion_out_dynarr);
5446   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5447   return Dynarr_atp (conversion_out_dynarr, 0);
5448 }
5449
5450 Bufbyte *
5451 convert_from_external_format (CONST Extbyte *ptr,
5452                               Extcount len,
5453                               Bytecount *len_out,
5454                               enum external_data_format fmt)
5455 {
5456   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5457
5458   if (!conversion_in_dynarr)
5459     conversion_in_dynarr = Dynarr_new (Bufbyte);
5460   else
5461     Dynarr_reset (conversion_in_dynarr);
5462
5463   if (NILP (coding_system))
5464     {
5465       CONST Extbyte *end = ptr + len;
5466       for (; ptr < end; ptr++)
5467         {
5468           Extbyte c = *ptr;
5469           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5470         }
5471     }
5472   else
5473     {
5474       Lisp_Object instream, outstream, da_outstream;
5475       Lstream *istr, *ostr;
5476       struct gcpro gcpro1, gcpro2, gcpro3;
5477       char tempbuf[1024]; /* some random amount */
5478
5479       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5480       da_outstream = make_dynarr_output_stream
5481         ((unsigned_char_dynarr *) conversion_in_dynarr);
5482       outstream =
5483         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5484       istr = XLSTREAM (instream);
5485       ostr = XLSTREAM (outstream);
5486       GCPRO3 (instream, outstream, da_outstream);
5487       while (1)
5488         {
5489           ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5490           if (!size_in_bytes)
5491             break;
5492           Lstream_write (ostr, tempbuf, size_in_bytes);
5493         }
5494       Lstream_close (istr);
5495       Lstream_close (ostr);
5496       UNGCPRO;
5497       Lstream_delete (istr);
5498       Lstream_delete (ostr);
5499       Lstream_delete (XLSTREAM (da_outstream));
5500     }
5501
5502   *len_out = Dynarr_length (conversion_in_dynarr);
5503   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5504   return Dynarr_atp (conversion_in_dynarr, 0);
5505 }
5506
5507 \f
5508 /************************************************************************/
5509 /*                             Initialization                           */
5510 /************************************************************************/
5511
5512 void
5513 syms_of_file_coding (void)
5514 {
5515   deferror (&Qcoding_system_error, "coding-system-error",
5516             "Coding-system error", Qio_error);
5517
5518   DEFSUBR (Fcoding_system_p);
5519   DEFSUBR (Ffind_coding_system);
5520   DEFSUBR (Fget_coding_system);
5521   DEFSUBR (Fcoding_system_list);
5522   DEFSUBR (Fcoding_system_name);
5523   DEFSUBR (Fmake_coding_system);
5524   DEFSUBR (Fcopy_coding_system);
5525   DEFSUBR (Fdefine_coding_system_alias);
5526   DEFSUBR (Fsubsidiary_coding_system);
5527
5528   DEFSUBR (Fcoding_system_type);
5529   DEFSUBR (Fcoding_system_doc_string);
5530 #ifdef MULE
5531   DEFSUBR (Fcoding_system_charset);
5532 #endif
5533   DEFSUBR (Fcoding_system_property);
5534
5535   DEFSUBR (Fcoding_category_list);
5536   DEFSUBR (Fset_coding_priority_list);
5537   DEFSUBR (Fcoding_priority_list);
5538   DEFSUBR (Fset_coding_category_system);
5539   DEFSUBR (Fcoding_category_system);
5540
5541   DEFSUBR (Fdetect_coding_region);
5542   DEFSUBR (Fdecode_coding_region);
5543   DEFSUBR (Fencode_coding_region);
5544 #ifdef MULE
5545   DEFSUBR (Fdecode_shift_jis_char);
5546   DEFSUBR (Fencode_shift_jis_char);
5547   DEFSUBR (Fdecode_big5_char);
5548   DEFSUBR (Fencode_big5_char);
5549 #endif /* MULE */
5550   defsymbol (&Qcoding_systemp, "coding-system-p");
5551   defsymbol (&Qno_conversion, "no-conversion");
5552   defsymbol (&Qraw_text, "raw-text");
5553 #ifdef MULE
5554   defsymbol (&Qbig5, "big5");
5555   defsymbol (&Qshift_jis, "shift-jis");
5556   defsymbol (&Qucs4, "ucs-4");
5557   defsymbol (&Qutf8, "utf-8");
5558   defsymbol (&Qccl, "ccl");
5559   defsymbol (&Qiso2022, "iso2022");
5560 #endif /* MULE */
5561   defsymbol (&Qmnemonic, "mnemonic");
5562   defsymbol (&Qeol_type, "eol-type");
5563   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5564   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5565
5566   defsymbol (&Qcr, "cr");
5567   defsymbol (&Qlf, "lf");
5568   defsymbol (&Qcrlf, "crlf");
5569   defsymbol (&Qeol_cr, "eol-cr");
5570   defsymbol (&Qeol_lf, "eol-lf");
5571   defsymbol (&Qeol_crlf, "eol-crlf");
5572 #ifdef MULE
5573   defsymbol (&Qcharset_g0, "charset-g0");
5574   defsymbol (&Qcharset_g1, "charset-g1");
5575   defsymbol (&Qcharset_g2, "charset-g2");
5576   defsymbol (&Qcharset_g3, "charset-g3");
5577   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5578   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5579   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5580   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5581   defsymbol (&Qno_iso6429, "no-iso6429");
5582   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5583   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5584
5585   defsymbol (&Qshort, "short");
5586   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5587   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5588   defsymbol (&Qseven, "seven");
5589   defsymbol (&Qlock_shift, "lock-shift");
5590   defsymbol (&Qescape_quoted, "escape-quoted");
5591 #endif /* MULE */
5592 #ifdef UTF2000
5593   defsymbol (&Qdisable_composition, "disable-composition");
5594 #endif
5595   defsymbol (&Qencode, "encode");
5596   defsymbol (&Qdecode, "decode");
5597
5598 #ifdef MULE
5599   defsymbol (&Qctext, "ctext");
5600   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5601              "shift-jis");
5602   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5603              "big5");
5604   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5605              "ucs-4");
5606   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5607              "utf-8");
5608   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5609              "iso-7");
5610   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5611              "iso-8-designate");
5612   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5613              "iso-8-1");
5614   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5615              "iso-8-2");
5616   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5617              "iso-lock-shift");
5618 #endif /* MULE */
5619   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5620              "no-conversion");
5621 }
5622
5623 void
5624 lstream_type_create_file_coding (void)
5625 {
5626   LSTREAM_HAS_METHOD (decoding, reader);
5627   LSTREAM_HAS_METHOD (decoding, writer);
5628   LSTREAM_HAS_METHOD (decoding, rewinder);
5629   LSTREAM_HAS_METHOD (decoding, seekable_p);
5630   LSTREAM_HAS_METHOD (decoding, flusher);
5631   LSTREAM_HAS_METHOD (decoding, closer);
5632   LSTREAM_HAS_METHOD (decoding, marker);
5633
5634   LSTREAM_HAS_METHOD (encoding, reader);
5635   LSTREAM_HAS_METHOD (encoding, writer);
5636   LSTREAM_HAS_METHOD (encoding, rewinder);
5637   LSTREAM_HAS_METHOD (encoding, seekable_p);
5638   LSTREAM_HAS_METHOD (encoding, flusher);
5639   LSTREAM_HAS_METHOD (encoding, closer);
5640   LSTREAM_HAS_METHOD (encoding, marker);
5641 }
5642
5643 void
5644 vars_of_file_coding (void)
5645 {
5646   int i;
5647
5648   fcd = xnew (struct file_coding_dump);
5649   dumpstruct (&fcd, &fcd_description);
5650
5651   /* Initialize to something reasonable ... */
5652   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5653     {
5654       fcd->coding_category_system[i] = Qnil;
5655       fcd->coding_category_by_priority[i] = i;
5656     }
5657
5658   Fprovide (intern ("file-coding"));
5659
5660   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5661 Coding system used for TTY keyboard input.
5662 Not used under a windowing system.
5663 */ );
5664   Vkeyboard_coding_system = Qnil;
5665
5666   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5667 Coding system used for TTY display output.
5668 Not used under a windowing system.
5669 */ );
5670   Vterminal_coding_system = Qnil;
5671
5672   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5673 Overriding coding system used when writing a file or process.
5674 You should *bind* this, not set it.  If this is non-nil, it specifies
5675 the coding system that will be used when a file or process is read
5676 in, and overrides `buffer-file-coding-system-for-read',
5677 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5678 this one for permanent changes to the environment.
5679 */ );
5680   Vcoding_system_for_read = Qnil;
5681
5682   DEFVAR_LISP ("coding-system-for-write",
5683                &Vcoding_system_for_write /*
5684 Overriding coding system used when writing a file or process.
5685 You should *bind* this, not set it.  If this is non-nil, it specifies
5686 the coding system that will be used when a file or process is wrote
5687 in, and overrides `buffer-file-coding-system',
5688 `write-region-pre-hook', etc.  Use those variables instead of this one
5689 for permanent changes to the environment.
5690 */ );
5691   Vcoding_system_for_write = Qnil;
5692
5693   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5694 Coding system used to convert pathnames when accessing files.
5695 */ );
5696   Vfile_name_coding_system = Qnil;
5697
5698   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5699 Non-nil means the buffer contents are regarded as multi-byte form
5700 of characters, not a binary code.  This affects the display, file I/O,
5701 and behaviors of various editing commands.
5702
5703 Setting this to nil does not do anything.
5704 */ );
5705   enable_multibyte_characters = 1;
5706 }
5707
5708 void
5709 complex_vars_of_file_coding (void)
5710 {
5711   staticpro (&Vcoding_system_hash_table);
5712   Vcoding_system_hash_table =
5713     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5714
5715   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5716   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5717
5718 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5719 {                                               \
5720   struct codesys_prop csp;                      \
5721   csp.sym = (Sym);                              \
5722   csp.prop_type = (Prop_Type);                  \
5723   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5724 } while (0)
5725
5726   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5727   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5728   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5729   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5730   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5731   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5732   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5733 #ifdef MULE
5734   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5735   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5736   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5737   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5738   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5739   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5740   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5741   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5742   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5743   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5744   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5745   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5746   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5747   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5748   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5749   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5750   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5751
5752   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5753   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5754 #endif /* MULE */
5755   /* Need to create this here or we're really screwed. */
5756   Fmake_coding_system
5757     (Qraw_text, Qno_conversion,
5758      build_string ("Raw text, which means it converts only line-break-codes."),
5759      list2 (Qmnemonic, build_string ("Raw")));
5760
5761   Fmake_coding_system
5762     (Qbinary, Qno_conversion,
5763      build_string ("Binary, which means it does not convert anything."),
5764      list4 (Qeol_type, Qlf,
5765             Qmnemonic, build_string ("Binary")));
5766
5767 #ifdef UTF2000
5768   Fmake_coding_system
5769     (Qutf8, Qutf8,
5770      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5771      list2 (Qmnemonic, build_string ("UTF8")));
5772 #endif
5773
5774   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5775
5776   /* Need this for bootstrapping */
5777   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5778     Fget_coding_system (Qraw_text);
5779
5780 #ifdef UTF2000
5781   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5782    = Fget_coding_system (Qutf8);
5783 #endif
5784 }