git.chise.org Git - chise/xemacs-chise.git.1/blob - src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #ifdef MULE
  34 #include "mule-ccl.h"
  35 #include "chartab.h"
  36 #endif
  37 #include "file-coding.h"
  38
  39 Lisp_Object Qcoding_system_error;
  40
  41 Lisp_Object Vkeyboard_coding_system;
  42 Lisp_Object Vterminal_coding_system;
  43 Lisp_Object Vcoding_system_for_read;
  44 Lisp_Object Vcoding_system_for_write;
  45 Lisp_Object Vfile_name_coding_system;
  46
  47 /* Table of symbols identifying each coding category. */
  48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  49
  50
  51
  52 struct file_coding_dump {
  53   /* Coding system currently associated with each coding category. */
  54   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  55
  56   /* Table of all coding categories in decreasing order of priority.
  57      This describes a permutation of the possible coding categories. */
  58   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  59
  60   Lisp_Object ucs_to_mule_table[65536];
  61 } *fcd;
  62
  63 static const struct lrecord_description fcd_description_1[] = {
  64   { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  65   { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table),      65536 },
  66   { XD_END }
  67 };
  68
  69 static const struct struct_description fcd_description = {
  70   sizeof(struct file_coding_dump),
  71   fcd_description_1
  72 };
  73
  74 Lisp_Object mule_to_ucs_table;
  75
  76 Lisp_Object Qcoding_systemp;
  77
  78 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  79 /* Qinternal in general.c */
  80
  81 Lisp_Object Qmnemonic, Qeol_type;
  82 Lisp_Object Qcr, Qcrlf, Qlf;
  83 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  84 Lisp_Object Qpost_read_conversion;
  85 Lisp_Object Qpre_write_conversion;
  86
  87 #ifdef MULE
  88 Lisp_Object Qucs4, Qutf8;
  89 Lisp_Object Qbig5, Qshift_jis;
  90 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  91 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  92 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  93 Lisp_Object Qno_iso6429;
  94 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
  95 Lisp_Object Qctext, Qescape_quoted;
  96 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
  97 #endif
  98 Lisp_Object Qencode, Qdecode;
  99
 100 Lisp_Object Vcoding_system_hash_table;
 101
 102 int enable_multibyte_characters;
 103
 104 #ifdef MULE
 105 /* Additional information used by the ISO2022 decoder and detector. */
 106 struct iso2022_decoder
 107 {
 108   /* CHARSET holds the character sets currently assigned to the G0
 109      through G3 variables.  It is initialized from the array
 110      INITIAL_CHARSET in CODESYS. */
 111   Lisp_Object charset[4];
 112
 113   /* Which registers are currently invoked into the left (GL) and
 114      right (GR) halves of the 8-bit encoding space? */
 115   int register_left, register_right;
 116
 117   /* ISO_ESC holds a value indicating part of an escape sequence
 118      that has already been seen. */
 119   enum iso_esc_flag esc;
 120
 121   /* This records the bytes we've seen so far in an escape sequence,
 122      in case the sequence is invalid (we spit out the bytes unchanged). */
 123   unsigned char esc_bytes[8];
 124
 125   /* Index for next byte to store in ISO escape sequence. */
 126   int esc_bytes_index;
 127
 128 #ifdef ENABLE_COMPOSITE_CHARS
 129   /* Stuff seen so far when composing a string. */
 130   unsigned_char_dynarr *composite_chars;
 131 #endif
 132
 133   /* If we saw an invalid designation sequence for a particular
 134      register, we flag it here and switch to ASCII.  The next time we
 135      see a valid designation for this register, we turn off the flag
 136      and do the designation normally, but pretend the sequence was
 137      invalid.  The effect of all this is that (most of the time) the
 138      escape sequences for both the switch to the unknown charset, and
 139      the switch back to the known charset, get inserted literally into
 140      the buffer and saved out as such.  The hope is that we can
 141      preserve the escape sequences so that the resulting written out
 142      file makes sense.  If we don't do any of this, the designation
 143      to the invalid charset will be preserved but that switch back
 144      to the known charset will probably get eaten because it was
 145      the same charset that was already present in the register. */
 146   unsigned char invalid_designated[4];
 147
 148   /* We try to do similar things as above for direction-switching
 149      sequences.  If we encountered a direction switch while an
 150      invalid designation was present, or an invalid designation
 151      just after a direction switch (i.e. no valid designation
 152      encountered yet), we insert the direction-switch escape
 153      sequence literally into the output stream, and later on
 154      insert the corresponding direction-restoring escape sequence
 155      literally also. */
 156   unsigned int switched_dir_and_no_valid_charset_yet :1;
 157   unsigned int invalid_switch_dir :1;
 158
 159   /* Tells the decoder to output the escape sequence literally
 160      even though it was valid.  Used in the games we play to
 161      avoid lossage when we encounter invalid designations. */
 162   unsigned int output_literally :1;
 163   /* We encountered a direction switch followed by an invalid
 164      designation.  We didn't output the direction switch
 165      literally because we didn't know about the invalid designation;
 166      but we have to do so now. */
 167   unsigned int output_direction_sequence :1;
 168 };
 169 #endif /* MULE */
 170 EXFUN (Fcopy_coding_system, 2);
 171 #ifdef MULE
 172 struct detection_state;
 173 static int detect_coding_sjis (struct detection_state *st,
 174                                CONST unsigned char *src,
 175                                unsigned int n);
 176 static void decode_coding_sjis (Lstream *decoding,
 177                                 CONST unsigned char *src,
 178                                 unsigned_char_dynarr *dst,
 179                                 unsigned int n);
 180 static void encode_coding_sjis (Lstream *encoding,
 181                                 CONST unsigned char *src,
 182                                 unsigned_char_dynarr *dst,
 183                                 unsigned int n);
 184 static int detect_coding_big5 (struct detection_state *st,
 185                                CONST unsigned char *src,
 186                                unsigned int n);
 187 static void decode_coding_big5 (Lstream *decoding,
 188                                 CONST unsigned char *src,
 189                                 unsigned_char_dynarr *dst, unsigned int n);
 190 static void encode_coding_big5 (Lstream *encoding,
 191                                 CONST unsigned char *src,
 192                                 unsigned_char_dynarr *dst, unsigned int n);
 193 static int detect_coding_ucs4 (struct detection_state *st,
 194                                CONST unsigned char *src,
 195                                unsigned int n);
 196 static void decode_coding_ucs4 (Lstream *decoding,
 197                                 CONST unsigned char *src,
 198                                 unsigned_char_dynarr *dst, unsigned int n);
 199 static void encode_coding_ucs4 (Lstream *encoding,
 200                                 CONST unsigned char *src,
 201                                 unsigned_char_dynarr *dst, unsigned int n);
 202 static int detect_coding_utf8 (struct detection_state *st,
 203                                CONST unsigned char *src,
 204                                unsigned int n);
 205 static void decode_coding_utf8 (Lstream *decoding,
 206                                 CONST unsigned char *src,
 207                                 unsigned_char_dynarr *dst, unsigned int n);
 208 static void encode_coding_utf8 (Lstream *encoding,
 209                                 CONST unsigned char *src,
 210                                 unsigned_char_dynarr *dst, unsigned int n);
 211 static int postprocess_iso2022_mask (int mask);
 212 static void reset_iso2022 (Lisp_Object coding_system,
 213                            struct iso2022_decoder *iso);
 214 static int detect_coding_iso2022 (struct detection_state *st,
 215                                   CONST unsigned char *src,
 216                                   unsigned int n);
 217 static void decode_coding_iso2022 (Lstream *decoding,
 218                                    CONST unsigned char *src,
 219                                    unsigned_char_dynarr *dst, unsigned int n);
 220 static void encode_coding_iso2022 (Lstream *encoding,
 221                                    CONST unsigned char *src,
 222                                    unsigned_char_dynarr *dst, unsigned int n);
 223 #endif /* MULE */
 224 static void decode_coding_no_conversion (Lstream *decoding,
 225                                          CONST unsigned char *src,
 226                                          unsigned_char_dynarr *dst,
 227                                          unsigned int n);
 228 static void encode_coding_no_conversion (Lstream *encoding,
 229                                          CONST unsigned char *src,
 230                                          unsigned_char_dynarr *dst,
 231                                          unsigned int n);
 232 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
 233                          unsigned_char_dynarr *dst, unsigned int n);
 234 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
 235                          unsigned_char_dynarr *dst, unsigned int n);
 236
 237 typedef struct codesys_prop codesys_prop;
 238 struct codesys_prop
 239 {
 240   Lisp_Object sym;
 241   int prop_type;
 242 };
 243
 244 typedef struct
 245 {
 246   Dynarr_declare (codesys_prop);
 247 } codesys_prop_dynarr;
 248
 249 static const struct lrecord_description codesys_prop_description_1[] = {
 250   { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
 251   { XD_END }
 252 };
 253
 254 static const struct struct_description codesys_prop_description = {
 255   sizeof(codesys_prop),
 256   codesys_prop_description_1
 257 };
 258
 259 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 260   XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
 261   { XD_END }
 262 };
 263
 264 static const struct struct_description codesys_prop_dynarr_description = {
 265   sizeof(codesys_prop_dynarr),
 266   codesys_prop_dynarr_description_1
 267 };
 268
 269 codesys_prop_dynarr *the_codesys_prop_dynarr;
 270
 271 enum codesys_prop_enum
 272 {
 273   CODESYS_PROP_ALL_OK,
 274   CODESYS_PROP_ISO2022,
 275   CODESYS_PROP_CCL
 276 };
 277
 278 \f
 279 /************************************************************************/
 280 /*                       Coding system functions                        */
 281 /************************************************************************/
 282
 283 static Lisp_Object mark_coding_system (Lisp_Object);
 284 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 285 static void finalize_coding_system (void *header, int for_disksave);
 286
 287 #ifdef MULE
 288 static const struct lrecord_description ccs_description_1[] = {
 289   { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
 290   { XD_END }
 291 };
 292
 293 static const struct struct_description ccs_description = {
 294   sizeof(charset_conversion_spec),
 295   ccs_description_1
 296 };
 297
 298 static const struct lrecord_description ccsd_description_1[] = {
 299   XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
 300   { XD_END }
 301 };
 302
 303 static const struct struct_description ccsd_description = {
 304   sizeof(charset_conversion_spec_dynarr),
 305   ccsd_description_1
 306 };
 307 #endif
 308
 309 static const struct lrecord_description coding_system_description[] = {
 310   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
 311   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
 312   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
 313 #ifdef MULE
 314   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
 315   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 316   { XD_STRUCT_PTR,  offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 317   { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
 318 #endif
 319   { XD_END }
 320 };
 321
 322 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 323                                mark_coding_system, print_coding_system,
 324                                finalize_coding_system,
 325                                0, 0, coding_system_description,
 326                                struct Lisp_Coding_System);
 327
 328 static Lisp_Object
 329 mark_coding_system (Lisp_Object obj)
 330 {
 331   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 332
 333   mark_object (CODING_SYSTEM_NAME (codesys));
 334   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 335   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 336   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 337   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 338   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 339
 340   switch (CODING_SYSTEM_TYPE (codesys))
 341     {
 342 #ifdef MULE
 343       int i;
 344     case CODESYS_ISO2022:
 345       for (i = 0; i < 4; i++)
 346         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 347       if (codesys->iso2022.input_conv)
 348         {
 349           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 350             {
 351               struct charset_conversion_spec *ccs =
 352                 Dynarr_atp (codesys->iso2022.input_conv, i);
 353               mark_object (ccs->from_charset);
 354               mark_object (ccs->to_charset);
 355             }
 356         }
 357       if (codesys->iso2022.output_conv)
 358         {
 359           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 360             {
 361               struct charset_conversion_spec *ccs =
 362                 Dynarr_atp (codesys->iso2022.output_conv, i);
 363               mark_object (ccs->from_charset);
 364               mark_object (ccs->to_charset);
 365             }
 366         }
 367       break;
 368
 369     case CODESYS_CCL:
 370       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 371       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 372       break;
 373 #endif /* MULE */
 374     default:
 375       break;
 376     }
 377
 378   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 379   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 380 }
 381
 382 static void
 383 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 384                      int escapeflag)
 385 {
 386   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 387   if (print_readably)
 388     error ("printing unreadable object #<coding_system 0x%x>",
 389            c->header.uid);
 390
 391   write_c_string ("#<coding_system ", printcharfun);
 392   print_internal (c->name, printcharfun, 1);
 393   write_c_string (">", printcharfun);
 394 }
 395
 396 static void
 397 finalize_coding_system (void *header, int for_disksave)
 398 {
 399   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 400   /* Since coding systems never go away, this function is not
 401      necessary.  But it would be necessary if we changed things
 402      so that coding systems could go away. */
 403   if (!for_disksave) /* see comment in lstream.c */
 404     {
 405       switch (CODING_SYSTEM_TYPE (c))
 406         {
 407 #ifdef MULE
 408         case CODESYS_ISO2022:
 409           if (c->iso2022.input_conv)
 410             {
 411               Dynarr_free (c->iso2022.input_conv);
 412               c->iso2022.input_conv = 0;
 413             }
 414           if (c->iso2022.output_conv)
 415             {
 416               Dynarr_free (c->iso2022.output_conv);
 417               c->iso2022.output_conv = 0;
 418             }
 419           break;
 420 #endif /* MULE */
 421         default:
 422           break;
 423         }
 424     }
 425 }
 426
 427 static eol_type_t
 428 symbol_to_eol_type (Lisp_Object symbol)
 429 {
 430   CHECK_SYMBOL (symbol);
 431   if (NILP (symbol))      return EOL_AUTODETECT;
 432   if (EQ (symbol, Qlf))   return EOL_LF;
 433   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 434   if (EQ (symbol, Qcr))   return EOL_CR;
 435
 436   signal_simple_error ("Unrecognized eol type", symbol);
 437   return EOL_AUTODETECT; /* not reached */
 438 }
 439
 440 static Lisp_Object
 441 eol_type_to_symbol (eol_type_t type)
 442 {
 443   switch (type)
 444     {
 445     default: abort ();
 446     case EOL_LF:         return Qlf;
 447     case EOL_CRLF:       return Qcrlf;
 448     case EOL_CR:         return Qcr;
 449     case EOL_AUTODETECT: return Qnil;
 450     }
 451 }
 452
 453 static void
 454 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 455 {
 456   Lisp_Object codesys_obj;
 457   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 458   char *codesys_name = (char *) alloca (len + 7);
 459   int mlen = -1;
 460   char *codesys_mnemonic=0;
 461
 462   Lisp_Object codesys_name_sym, sub_codesys_obj;
 463
 464   /* kludge */
 465
 466   XSETCODING_SYSTEM (codesys_obj, codesys);
 467
 468   memcpy (codesys_name,
 469           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 470
 471   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 472     {
 473       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 474       codesys_mnemonic = (char *) alloca (mlen + 7);
 475       memcpy (codesys_mnemonic,
 476               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 477     }
 478
 479 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 480   strcpy (codesys_name + len, "-" op_sys);                                      \
 481   if (mlen != -1)                                                               \
 482     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 483   codesys_name_sym = intern (codesys_name);                                     \
 484   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 485   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 486   if (mlen != -1)                                                               \
 487     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 488       build_string (codesys_mnemonic);                                          \
 489   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 490 } while (0)
 491
 492   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 493   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 494   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 495 }
 496
 497 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 498 Return t if OBJECT is a coding system.
 499 A coding system is an object that defines how text containing multiple
 500 character sets is encoded into a stream of (typically 8-bit) bytes.
 501 The coding system is used to decode the stream into a series of
 502 characters (which may be from multiple charsets) when the text is read
 503 from a file or process, and is used to encode the text back into the
 504 same format when it is written out to a file or process.
 505
 506 For example, many ISO2022-compliant coding systems (such as Compound
 507 Text, which is used for inter-client data under the X Window System)
 508 use escape sequences to switch between different charsets -- Japanese
 509 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 510 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 511 `make-coding-system' for more information.
 512
 513 Coding systems are normally identified using a symbol, and the
 514 symbol is accepted in place of the actual coding system object whenever
 515 a coding system is called for. (This is similar to how faces work.)
 516 */
 517        (object))
 518 {
 519   return CODING_SYSTEMP (object) ? Qt : Qnil;
 520 }
 521
 522 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 523 Retrieve the coding system of the given name.
 524
 525 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 526 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 527 If there is no such coding system, nil is returned.  Otherwise the
 528 associated coding system object is returned.
 529 */
 530        (coding_system_or_name))
 531 {
 532   if (CODING_SYSTEMP (coding_system_or_name))
 533     return coding_system_or_name;
 534
 535   if (NILP (coding_system_or_name))
 536     coding_system_or_name = Qbinary;
 537   else
 538     CHECK_SYMBOL (coding_system_or_name);
 539
 540   return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 541 }
 542
 543 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 544 Retrieve the coding system of the given name.
 545 Same as `find-coding-system' except that if there is no such
 546 coding system, an error is signaled instead of returning nil.
 547 */
 548        (name))
 549 {
 550   Lisp_Object coding_system = Ffind_coding_system (name);
 551
 552   if (NILP (coding_system))
 553     signal_simple_error ("No such coding system", name);
 554   return coding_system;
 555 }
 556
 557 /* We store the coding systems in hash tables with the names as the key and the
 558    actual coding system object as the value.  Occasionally we need to use them
 559    in a list format.  These routines provide us with that. */
 560 struct coding_system_list_closure
 561 {
 562   Lisp_Object *coding_system_list;
 563 };
 564
 565 static int
 566 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 567                                   void *coding_system_list_closure)
 568 {
 569   /* This function can GC */
 570   struct coding_system_list_closure *cscl =
 571     (struct coding_system_list_closure *) coding_system_list_closure;
 572   Lisp_Object *coding_system_list = cscl->coding_system_list;
 573
 574   *coding_system_list = Fcons (key, *coding_system_list);
 575   return 0;
 576 }
 577
 578 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 579 Return a list of the names of all defined coding systems.
 580 */
 581        ())
 582 {
 583   Lisp_Object coding_system_list = Qnil;
 584   struct gcpro gcpro1;
 585   struct coding_system_list_closure coding_system_list_closure;
 586
 587   GCPRO1 (coding_system_list);
 588   coding_system_list_closure.coding_system_list = &coding_system_list;
 589   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 590                  &coding_system_list_closure);
 591   UNGCPRO;
 592
 593   return coding_system_list;
 594 }
 595
 596 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 597 Return the name of the given coding system.
 598 */
 599        (coding_system))
 600 {
 601   coding_system = Fget_coding_system (coding_system);
 602   return XCODING_SYSTEM_NAME (coding_system);
 603 }
 604
 605 static Lisp_Coding_System *
 606 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 607 {
 608   Lisp_Coding_System *codesys =
 609     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 610
 611   zero_lcrecord (codesys);
 612   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 613   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 614   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 615   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 616   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 617   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 618   CODING_SYSTEM_TYPE     (codesys) = type;
 619   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 620 #ifdef MULE
 621   if (type == CODESYS_ISO2022)
 622     {
 623       int i;
 624       for (i = 0; i < 4; i++)
 625         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 626     }
 627   else if (type == CODESYS_CCL)
 628     {
 629       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 630       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 631     }
 632 #endif /* MULE */
 633   CODING_SYSTEM_NAME (codesys) = name;
 634
 635   return codesys;
 636 }
 637
 638 #ifdef MULE
 639 /* Given a list of charset conversion specs as specified in a Lisp
 640    program, parse it into STORE_HERE. */
 641
 642 static void
 643 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 644                                 Lisp_Object spec_list)
 645 {
 646   Lisp_Object rest;
 647
 648   EXTERNAL_LIST_LOOP (rest, spec_list)
 649     {
 650       Lisp_Object car = XCAR (rest);
 651       Lisp_Object from, to;
 652       struct charset_conversion_spec spec;
 653
 654       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 655         signal_simple_error ("Invalid charset conversion spec", car);
 656       from = Fget_charset (XCAR (car));
 657       to = Fget_charset (XCAR (XCDR (car)));
 658       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 659         signal_simple_error_2
 660           ("Attempted conversion between different charset types",
 661            from, to);
 662       spec.from_charset = from;
 663       spec.to_charset = to;
 664
 665       Dynarr_add (store_here, spec);
 666     }
 667 }
 668
 669 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 670    specs, return the equivalent as the Lisp programmer would see it.
 671
 672    If LOAD_HERE is 0, return Qnil. */
 673
 674 static Lisp_Object
 675 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 676 {
 677   int i;
 678   Lisp_Object result;
 679
 680   if (!load_here)
 681     return Qnil;
 682   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 683     {
 684       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 685       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 686     }
 687
 688   return Fnreverse (result);
 689 }
 690
 691 #endif /* MULE */
 692
 693 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 694 Register symbol NAME as a coding system.
 695
 696 TYPE describes the conversion method used and should be one of
 697
 698 nil or 'undecided
 699      Automatic conversion.  XEmacs attempts to detect the coding system
 700      used in the file.
 701 'no-conversion
 702      No conversion.  Use this for binary files and such.  On output,
 703      graphic characters that are not in ASCII or Latin-1 will be
 704      replaced by a ?. (For a no-conversion-encoded buffer, these
 705      characters will only be present if you explicitly insert them.)
 706 'shift-jis
 707      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 708 'ucs-4
 709      ISO 10646 UCS-4 encoding.
 710 'utf-8
 711      ISO 10646 UTF-8 encoding.
 712 'iso2022
 713      Any ISO2022-compliant encoding.  Among other things, this includes
 714      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 715      standard Unix encoding for Japanese and other languages), and
 716      Compound Text (the encoding used in X11).  You can specify more
 717      specific information about the conversion with the FLAGS argument.
 718 'big5
 719      Big5 (the encoding commonly used for Taiwanese).
 720 'ccl
 721      The conversion is performed using a user-written pseudo-code
 722      program.  CCL (Code Conversion Language) is the name of this
 723      pseudo-code.
 724 'internal
 725      Write out or read in the raw contents of the memory representing
 726      the buffer's text.  This is primarily useful for debugging
 727      purposes, and is only enabled when XEmacs has been compiled with
 728      DEBUG_XEMACS defined (via the --debug configure option).
 729      WARNING: Reading in a file using 'internal conversion can result
 730      in an internal inconsistency in the memory representing a
 731      buffer's text, which will produce unpredictable results and may
 732      cause XEmacs to crash.  Under normal circumstances you should
 733      never use 'internal conversion.
 734
 735 DOC-STRING is a string describing the coding system.
 736
 737 PROPS is a property list, describing the specific nature of the
 738 character set.  Recognized properties are:
 739
 740 'mnemonic
 741      String to be displayed in the modeline when this coding system is
 742      active.
 743
 744 'eol-type
 745      End-of-line conversion to be used.  It should be one of
 746
 747         nil
 748                 Automatically detect the end-of-line type (LF, CRLF,
 749                 or CR).  Also generate subsidiary coding systems named
 750                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 751                 identical to this coding system but have an EOL-TYPE
 752                 value of 'lf, 'crlf, and 'cr, respectively.
 753         'lf
 754                 The end of a line is marked externally using ASCII LF.
 755                 Since this is also the way that XEmacs represents an
 756                 end-of-line internally, specifying this option results
 757                 in no end-of-line conversion.  This is the standard
 758                 format for Unix text files.
 759         'crlf
 760                 The end of a line is marked externally using ASCII
 761                 CRLF.  This is the standard format for MS-DOS text
 762                 files.
 763         'cr
 764                 The end of a line is marked externally using ASCII CR.
 765                 This is the standard format for Macintosh text files.
 766         t
 767                 Automatically detect the end-of-line type but do not
 768                 generate subsidiary coding systems.  (This value is
 769                 converted to nil when stored internally, and
 770                 `coding-system-property' will return nil.)
 771
 772 'post-read-conversion
 773      Function called after a file has been read in, to perform the
 774      decoding.  Called with two arguments, BEG and END, denoting
 775      a region of the current buffer to be decoded.
 776
 777 'pre-write-conversion
 778      Function called before a file is written out, to perform the
 779      encoding.  Called with two arguments, BEG and END, denoting
 780      a region of the current buffer to be encoded.
 781
 782
 783 The following additional properties are recognized if TYPE is 'iso2022:
 784
 785 'charset-g0
 786 'charset-g1
 787 'charset-g2
 788 'charset-g3
 789      The character set initially designated to the G0 - G3 registers.
 790      The value should be one of
 791
 792           -- A charset object (designate that character set)
 793           -- nil (do not ever use this register)
 794           -- t (no character set is initially designated to
 795                 the register, but may be later on; this automatically
 796                 sets the corresponding `force-g*-on-output' property)
 797
 798 'force-g0-on-output
 799 'force-g1-on-output
 800 'force-g2-on-output
 801 'force-g2-on-output
 802      If non-nil, send an explicit designation sequence on output before
 803      using the specified register.
 804
 805 'short
 806      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 807      "ESC $ B" on output in place of the full designation sequences
 808      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 809
 810 'no-ascii-eol
 811      If non-nil, don't designate ASCII to G0 at each end of line on output.
 812      Setting this to non-nil also suppresses other state-resetting that
 813      normally happens at the end of a line.
 814
 815 'no-ascii-cntl
 816      If non-nil, don't designate ASCII to G0 before control chars on output.
 817
 818 'seven
 819      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 820      environment.
 821
 822 'lock-shift
 823      If non-nil, use locking-shift (SO/SI) instead of single-shift
 824      or designation by escape sequence.
 825
 826 'no-iso6429
 827      If non-nil, don't use ISO6429's direction specification.
 828
 829 'escape-quoted
 830      If non-nil, literal control characters that are the same as
 831      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 832      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 833      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 834      so that they can be properly distinguished from an escape sequence.
 835      (Note that doing this results in a non-portable encoding.) This
 836      encoding flag is used for byte-compiled files.  Note that ESC
 837      is a good choice for a quoting character because there are no
 838      escape sequences whose second byte is a character from the Control-0
 839      or Control-1 character sets; this is explicitly disallowed by the
 840      ISO2022 standard.
 841
 842 'input-charset-conversion
 843      A list of conversion specifications, specifying conversion of
 844      characters in one charset to another when decoding is performed.
 845      Each specification is a list of two elements: the source charset,
 846      and the destination charset.
 847
 848 'output-charset-conversion
 849      A list of conversion specifications, specifying conversion of
 850      characters in one charset to another when encoding is performed.
 851      The form of each specification is the same as for
 852      'input-charset-conversion.
 853
 854
 855 The following additional properties are recognized (and required)
 856 if TYPE is 'ccl:
 857
 858 'decode
 859      CCL program used for decoding (converting to internal format).
 860
 861 'encode
 862      CCL program used for encoding (converting to external format).
 863 */
 864        (name, type, doc_string, props))
 865 {
 866   Lisp_Coding_System *codesys;
 867   Lisp_Object rest, key, value;
 868   enum coding_system_type ty;
 869   int need_to_setup_eol_systems = 1;
 870
 871   /* Convert type to constant */
 872   if (NILP (type) || EQ (type, Qundecided))
 873                                       { ty = CODESYS_AUTODETECT; }
 874 #ifdef MULE
 875   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 876   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 877   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 878   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 879   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 880   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 881 #endif
 882   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 883 #ifdef DEBUG_XEMACS
 884   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 885 #endif
 886   else
 887     signal_simple_error ("Invalid coding system type", type);
 888
 889   CHECK_SYMBOL (name);
 890
 891   codesys = allocate_coding_system (ty, name);
 892
 893   if (NILP (doc_string))
 894     doc_string = build_string ("");
 895   else
 896     CHECK_STRING (doc_string);
 897   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 898
 899   EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
 900     {
 901       if (EQ (key, Qmnemonic))
 902         {
 903           if (!NILP (value))
 904             CHECK_STRING (value);
 905           CODING_SYSTEM_MNEMONIC (codesys) = value;
 906         }
 907
 908       else if (EQ (key, Qeol_type))
 909         {
 910           need_to_setup_eol_systems = NILP (value);
 911           if (EQ (value, Qt))
 912             value = Qnil;
 913           CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 914         }
 915
 916       else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 917       else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 918 #ifdef MULE
 919       else if (ty == CODESYS_ISO2022)
 920         {
 921 #define FROB_INITIAL_CHARSET(charset_num) \
 922   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 923     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 924
 925           if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 926           else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 927           else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 928           else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 929
 930 #define FROB_FORCE_CHARSET(charset_num) \
 931   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 932
 933           else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 934           else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 935           else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 936           else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 937
 938 #define FROB_BOOLEAN_PROPERTY(prop) \
 939   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 940
 941           else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 942           else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 943           else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 944           else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 945           else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 946           else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 947           else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 948
 949           else if (EQ (key, Qinput_charset_conversion))
 950             {
 951               codesys->iso2022.input_conv =
 952                 Dynarr_new (charset_conversion_spec);
 953               parse_charset_conversion_specs (codesys->iso2022.input_conv,
 954                                               value);
 955             }
 956           else if (EQ (key, Qoutput_charset_conversion))
 957             {
 958               codesys->iso2022.output_conv =
 959                 Dynarr_new (charset_conversion_spec);
 960               parse_charset_conversion_specs (codesys->iso2022.output_conv,
 961                                               value);
 962             }
 963           else
 964             signal_simple_error ("Unrecognized property", key);
 965         }
 966       else if (EQ (type, Qccl))
 967         {
 968           if (EQ (key, Qdecode))
 969             {
 970               CHECK_VECTOR (value);
 971               CODING_SYSTEM_CCL_DECODE (codesys) = value;
 972             }
 973           else if (EQ (key, Qencode))
 974             {
 975               CHECK_VECTOR (value);
 976               CODING_SYSTEM_CCL_ENCODE (codesys) = value;
 977             }
 978           else
 979             signal_simple_error ("Unrecognized property", key);
 980         }
 981 #endif /* MULE */
 982       else
 983         signal_simple_error ("Unrecognized property", key);
 984     }
 985
 986   if (need_to_setup_eol_systems)
 987     setup_eol_coding_systems (codesys);
 988
 989   {
 990     Lisp_Object codesys_obj;
 991     XSETCODING_SYSTEM (codesys_obj, codesys);
 992     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
 993     return codesys_obj;
 994   }
 995 }
 996
 997 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
 998 Copy OLD-CODING-SYSTEM to NEW-NAME.
 999 If NEW-NAME does not name an existing coding system, a new one will
1000 be created.
1001 */
1002        (old_coding_system, new_name))
1003 {
1004   Lisp_Object new_coding_system;
1005   old_coding_system = Fget_coding_system (old_coding_system);
1006   new_coding_system = Ffind_coding_system (new_name);
1007   if (NILP (new_coding_system))
1008     {
1009       XSETCODING_SYSTEM (new_coding_system,
1010                          allocate_coding_system
1011                          (XCODING_SYSTEM_TYPE (old_coding_system),
1012                           new_name));
1013       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1014     }
1015
1016   {
1017     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1018     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1019     memcpy (((char *) to  ) + sizeof (to->header),
1020             ((char *) from) + sizeof (from->header),
1021             sizeof (*from) - sizeof (from->header));
1022     to->name = new_name;
1023   }
1024   return new_coding_system;
1025 }
1026
1027 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1028 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1029 */
1030        (alias, coding_system))
1031 {
1032   CHECK_SYMBOL (alias);
1033   if (!NILP (Ffind_coding_system (alias)))
1034     signal_simple_error ("Symbol already names a coding system", alias);
1035   coding_system = Fget_coding_system (coding_system);
1036   Fputhash (alias, coding_system, Vcoding_system_hash_table);
1037
1038   /* Set up aliases for subsidiaries. */
1039   if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1040     {
1041       Lisp_Object str;
1042       XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1043 #define FROB(type, name)                                                        \
1044       do {                                                                      \
1045         Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system);     \
1046         if (!NILP (subsidiary))                                                 \
1047           Fdefine_coding_system_alias                                           \
1048             (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary);   \
1049       } while (0)
1050       FROB (LF,   "-unix");
1051       FROB (CRLF, "-dos");
1052       FROB (CR,   "-mac");
1053 #undef FROB
1054     }
1055   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1056      but it doesn't look intentional, so I'd rather return something
1057      meaningful or nothing at all. */
1058   return Qnil;
1059 }
1060
1061 static Lisp_Object
1062 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1063 {
1064   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1065   Lisp_Object new_coding_system;
1066
1067   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1068     return coding_system;
1069
1070   switch (type)
1071     {
1072     case EOL_AUTODETECT: return coding_system;
1073     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1074     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1075     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1076     default:       abort ();
1077     }
1078
1079   return NILP (new_coding_system) ? coding_system : new_coding_system;
1080 }
1081
1082 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1083 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1084 */
1085        (coding_system, eol_type))
1086 {
1087   coding_system = Fget_coding_system (coding_system);
1088
1089   return subsidiary_coding_system (coding_system,
1090                                    symbol_to_eol_type (eol_type));
1091 }
1092
1093 \f
1094 /************************************************************************/
1095 /*                         Coding system accessors                      */
1096 /************************************************************************/
1097
1098 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1099 Return the doc string for CODING-SYSTEM.
1100 */
1101        (coding_system))
1102 {
1103   coding_system = Fget_coding_system (coding_system);
1104   return XCODING_SYSTEM_DOC_STRING (coding_system);
1105 }
1106
1107 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1108 Return the type of CODING-SYSTEM.
1109 */
1110        (coding_system))
1111 {
1112   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1113     {
1114     default: abort ();
1115     case CODESYS_AUTODETECT:    return Qundecided;
1116 #ifdef MULE
1117     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1118     case CODESYS_ISO2022:       return Qiso2022;
1119     case CODESYS_BIG5:          return Qbig5;
1120     case CODESYS_UCS4:          return Qucs4;
1121     case CODESYS_UTF8:          return Qutf8;
1122     case CODESYS_CCL:           return Qccl;
1123 #endif
1124     case CODESYS_NO_CONVERSION: return Qno_conversion;
1125 #ifdef DEBUG_XEMACS
1126     case CODESYS_INTERNAL:      return Qinternal;
1127 #endif
1128     }
1129 }
1130
1131 #ifdef MULE
1132 static
1133 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1134 {
1135   Lisp_Object cs
1136     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1137
1138   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1139 }
1140
1141 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1142 Return initial charset of CODING-SYSTEM designated to GNUM.
1143 GNUM allows 0 .. 3.
1144 */
1145        (coding_system, gnum))
1146 {
1147   coding_system = Fget_coding_system (coding_system);
1148   CHECK_INT (gnum);
1149
1150   return coding_system_charset (coding_system, XINT (gnum));
1151 }
1152 #endif /* MULE */
1153
1154 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1155 Return the PROP property of CODING-SYSTEM.
1156 */
1157        (coding_system, prop))
1158 {
1159   int i, ok = 0;
1160   enum coding_system_type type;
1161
1162   coding_system = Fget_coding_system (coding_system);
1163   CHECK_SYMBOL (prop);
1164   type = XCODING_SYSTEM_TYPE (coding_system);
1165
1166   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1167     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1168       {
1169         ok = 1;
1170         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1171           {
1172           case CODESYS_PROP_ALL_OK:
1173             break;
1174 #ifdef MULE
1175           case CODESYS_PROP_ISO2022:
1176             if (type != CODESYS_ISO2022)
1177               signal_simple_error
1178                 ("Property only valid in ISO2022 coding systems",
1179                  prop);
1180             break;
1181
1182           case CODESYS_PROP_CCL:
1183             if (type != CODESYS_CCL)
1184               signal_simple_error
1185                 ("Property only valid in CCL coding systems",
1186                  prop);
1187             break;
1188 #endif /* MULE */
1189           default:
1190             abort ();
1191           }
1192       }
1193
1194   if (!ok)
1195     signal_simple_error ("Unrecognized property", prop);
1196
1197   if (EQ (prop, Qname))
1198     return XCODING_SYSTEM_NAME (coding_system);
1199   else if (EQ (prop, Qtype))
1200     return Fcoding_system_type (coding_system);
1201   else if (EQ (prop, Qdoc_string))
1202     return XCODING_SYSTEM_DOC_STRING (coding_system);
1203   else if (EQ (prop, Qmnemonic))
1204     return XCODING_SYSTEM_MNEMONIC (coding_system);
1205   else if (EQ (prop, Qeol_type))
1206     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1207   else if (EQ (prop, Qeol_lf))
1208     return XCODING_SYSTEM_EOL_LF (coding_system);
1209   else if (EQ (prop, Qeol_crlf))
1210     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1211   else if (EQ (prop, Qeol_cr))
1212     return XCODING_SYSTEM_EOL_CR (coding_system);
1213   else if (EQ (prop, Qpost_read_conversion))
1214     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1215   else if (EQ (prop, Qpre_write_conversion))
1216     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1217 #ifdef MULE
1218   else if (type == CODESYS_ISO2022)
1219     {
1220       if (EQ (prop, Qcharset_g0))
1221         return coding_system_charset (coding_system, 0);
1222       else if (EQ (prop, Qcharset_g1))
1223         return coding_system_charset (coding_system, 1);
1224       else if (EQ (prop, Qcharset_g2))
1225         return coding_system_charset (coding_system, 2);
1226       else if (EQ (prop, Qcharset_g3))
1227         return coding_system_charset (coding_system, 3);
1228
1229 #define FORCE_CHARSET(charset_num) \
1230   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1231    (coding_system, charset_num) ? Qt : Qnil)
1232
1233       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1234       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1235       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1236       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1237
1238 #define LISP_BOOLEAN(prop) \
1239   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1240
1241       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1242       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1243       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1244       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1245       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1246       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1247       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1248
1249       else if (EQ (prop, Qinput_charset_conversion))
1250         return
1251           unparse_charset_conversion_specs
1252             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1253       else if (EQ (prop, Qoutput_charset_conversion))
1254         return
1255           unparse_charset_conversion_specs
1256             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1257       else
1258         abort ();
1259     }
1260   else if (type == CODESYS_CCL)
1261     {
1262       if (EQ (prop, Qdecode))
1263         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1264       else if (EQ (prop, Qencode))
1265         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1266       else
1267         abort ();
1268     }
1269 #endif /* MULE */
1270   else
1271     abort ();
1272
1273   return Qnil; /* not reached */
1274 }
1275
1276 \f
1277 /************************************************************************/
1278 /*                       Coding category functions                      */
1279 /************************************************************************/
1280
1281 static int
1282 decode_coding_category (Lisp_Object symbol)
1283 {
1284   int i;
1285
1286   CHECK_SYMBOL (symbol);
1287   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1288     if (EQ (coding_category_symbol[i], symbol))
1289       return i;
1290
1291   signal_simple_error ("Unrecognized coding category", symbol);
1292   return 0; /* not reached */
1293 }
1294
1295 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1296 Return a list of all recognized coding categories.
1297 */
1298        ())
1299 {
1300   int i;
1301   Lisp_Object list = Qnil;
1302
1303   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1304     list = Fcons (coding_category_symbol[i], list);
1305   return list;
1306 }
1307
1308 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1309 Change the priority order of the coding categories.
1310 LIST should be list of coding categories, in descending order of
1311 priority.  Unspecified coding categories will be lower in priority
1312 than all specified ones, in the same relative order they were in
1313 previously.
1314 */
1315        (list))
1316 {
1317   int category_to_priority[CODING_CATEGORY_LAST + 1];
1318   int i, j;
1319   Lisp_Object rest;
1320
1321   /* First generate a list that maps coding categories to priorities. */
1322
1323   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1324     category_to_priority[i] = -1;
1325
1326   /* Highest priority comes from the specified list. */
1327   i = 0;
1328   EXTERNAL_LIST_LOOP (rest, list)
1329     {
1330       int cat = decode_coding_category (XCAR (rest));
1331
1332       if (category_to_priority[cat] >= 0)
1333         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1334       category_to_priority[cat] = i++;
1335     }
1336
1337   /* Now go through the existing categories by priority to retrieve
1338      the categories not yet specified and preserve their priority
1339      order. */
1340   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1341     {
1342       int cat = fcd->coding_category_by_priority[j];
1343       if (category_to_priority[cat] < 0)
1344         category_to_priority[cat] = i++;
1345     }
1346
1347   /* Now we need to construct the inverse of the mapping we just
1348      constructed. */
1349
1350   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1351     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1352
1353   /* Phew!  That was confusing. */
1354   return Qnil;
1355 }
1356
1357 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1358 Return a list of coding categories in descending order of priority.
1359 */
1360        ())
1361 {
1362   int i;
1363   Lisp_Object list = Qnil;
1364
1365   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1366     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1367                   list);
1368   return list;
1369 }
1370
1371 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1372 Change the coding system associated with a coding category.
1373 */
1374        (coding_category, coding_system))
1375 {
1376   int cat = decode_coding_category (coding_category);
1377
1378   coding_system = Fget_coding_system (coding_system);
1379   fcd->coding_category_system[cat] = coding_system;
1380   return Qnil;
1381 }
1382
1383 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1384 Return the coding system associated with a coding category.
1385 */
1386        (coding_category))
1387 {
1388   int cat = decode_coding_category (coding_category);
1389   Lisp_Object sys = fcd->coding_category_system[cat];
1390
1391   if (!NILP (sys))
1392     return XCODING_SYSTEM_NAME (sys);
1393   return Qnil;
1394 }
1395
1396 \f
1397 /************************************************************************/
1398 /*                     Detecting the encoding of data                   */
1399 /************************************************************************/
1400
1401 struct detection_state
1402 {
1403   eol_type_t eol_type;
1404   int seen_non_ascii;
1405   int mask;
1406 #ifdef MULE
1407   struct
1408     {
1409       int mask;
1410       int in_second_byte;
1411     }
1412   big5;
1413
1414   struct
1415     {
1416       int mask;
1417       int in_second_byte;
1418     }
1419   shift_jis;
1420
1421   struct
1422     {
1423       int mask;
1424       int in_byte;
1425   }
1426   ucs4;
1427
1428   struct
1429     {
1430       int mask;
1431       int in_byte;
1432     }
1433   utf8;
1434
1435   struct
1436     {
1437       int mask;
1438       int initted;
1439       struct iso2022_decoder iso;
1440       unsigned int flags;
1441       int high_byte_count;
1442       unsigned int saw_single_shift:1;
1443     }
1444   iso2022;
1445 #endif
1446   struct
1447     {
1448       int seen_anything;
1449       int just_saw_cr;
1450     }
1451   eol;
1452 };
1453
1454 static int
1455 acceptable_control_char_p (int c)
1456 {
1457   switch (c)
1458     {
1459       /* Allow and ignore control characters that you might
1460          reasonably see in a text file */
1461     case '\r':
1462     case '\n':
1463     case '\t':
1464     case  7: /* bell */
1465     case  8: /* backspace */
1466     case 11: /* vertical tab */
1467     case 12: /* form feed */
1468     case 26: /* MS-DOS C-z junk */
1469     case 31: /* '^_' -- for info */
1470       return 1;
1471     default:
1472       return 0;
1473     }
1474 }
1475
1476 static int
1477 mask_has_at_most_one_bit_p (int mask)
1478 {
1479   /* Perhaps the only thing useful you learn from intensive Microsoft
1480      technical interviews */
1481   return (mask & (mask - 1)) == 0;
1482 }
1483
1484 static eol_type_t
1485 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1486                  unsigned int n)
1487 {
1488   int c;
1489
1490   while (n--)
1491     {
1492       c = *src++;
1493       if (c == '\n')
1494         {
1495           if (st->eol.just_saw_cr)
1496             return EOL_CRLF;
1497           else if (st->eol.seen_anything)
1498             return EOL_LF;
1499         }
1500       else if (st->eol.just_saw_cr)
1501         return EOL_CR;
1502       else if (c == '\r')
1503         st->eol.just_saw_cr = 1;
1504       else
1505         st->eol.just_saw_cr = 0;
1506       st->eol.seen_anything = 1;
1507     }
1508
1509   return EOL_AUTODETECT;
1510 }
1511
1512 /* Attempt to determine the encoding and EOL type of the given text.
1513    Before calling this function for the first type, you must initialize
1514    st->eol_type as appropriate and initialize st->mask to ~0.
1515
1516    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1517    not yet known.
1518
1519    st->mask holds the determined coding category mask, or ~0 if only
1520    ASCII has been seen so far.
1521
1522    Returns:
1523
1524    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1525         is present in st->mask
1526    1 == definitive answers are here for both st->eol_type and st->mask
1527 */
1528
1529 static int
1530 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1531                     unsigned int n, int just_do_eol)
1532 {
1533   int c;
1534
1535   if (st->eol_type == EOL_AUTODETECT)
1536     st->eol_type = detect_eol_type (st, src, n);
1537
1538   if (just_do_eol)
1539     return st->eol_type != EOL_AUTODETECT;
1540
1541   if (!st->seen_non_ascii)
1542     {
1543       for (; n; n--, src++)
1544         {
1545           c = *src;
1546           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1547             {
1548               st->seen_non_ascii = 1;
1549 #ifdef MULE
1550               st->shift_jis.mask = ~0;
1551               st->big5.mask = ~0;
1552               st->ucs4.mask = ~0;
1553               st->utf8.mask = ~0;
1554               st->iso2022.mask = ~0;
1555 #endif
1556               break;
1557             }
1558         }
1559     }
1560
1561   if (!n)
1562     return 0;
1563 #ifdef MULE
1564   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1565     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1566   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1567     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1568   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1569     st->big5.mask = detect_coding_big5 (st, src, n);
1570   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1571     st->utf8.mask = detect_coding_utf8 (st, src, n);
1572   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1573     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1574
1575   st->mask
1576     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1577     | st->utf8.mask | st->ucs4.mask;
1578 #endif
1579   {
1580     int retval = mask_has_at_most_one_bit_p (st->mask);
1581     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1582     return retval && st->eol_type != EOL_AUTODETECT;
1583   }
1584 }
1585
1586 static Lisp_Object
1587 coding_system_from_mask (int mask)
1588 {
1589   if (mask == ~0)
1590     {
1591       /* If the file was entirely or basically ASCII, use the
1592          default value of `buffer-file-coding-system'. */
1593       Lisp_Object retval =
1594         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1595       if (!NILP (retval))
1596         {
1597           retval = Ffind_coding_system (retval);
1598           if (NILP (retval))
1599             {
1600               warn_when_safe
1601                 (Qbad_variable, Qwarning,
1602                  "Invalid `default-buffer-file-coding-system', set to nil");
1603               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1604             }
1605         }
1606       if (NILP (retval))
1607         retval = Fget_coding_system (Qraw_text);
1608       return retval;
1609     }
1610   else
1611     {
1612       int i;
1613       int cat = -1;
1614 #ifdef MULE
1615       mask = postprocess_iso2022_mask (mask);
1616 #endif
1617       /* Look through the coding categories by priority and find
1618          the first one that is allowed. */
1619       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1620         {
1621           cat = fcd->coding_category_by_priority[i];
1622           if ((mask & (1 << cat)) &&
1623               !NILP (fcd->coding_category_system[cat]))
1624             break;
1625         }
1626       if (cat >= 0)
1627         return fcd->coding_category_system[cat];
1628       else
1629         return Fget_coding_system (Qraw_text);
1630     }
1631 }
1632
1633 /* Given a seekable read stream and potential coding system and EOL type
1634    as specified, do any autodetection that is called for.  If the
1635    coding system and/or EOL type are not `autodetect', they will be left
1636    alone; but this function will never return an autodetect coding system
1637    or EOL type.
1638
1639    This function does not automatically fetch subsidiary coding systems;
1640    that should be unnecessary with the explicit eol-type argument. */
1641
1642 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1643
1644 void
1645 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1646                               eol_type_t *eol_type_in_out)
1647 {
1648   struct detection_state decst;
1649
1650   if (*eol_type_in_out == EOL_AUTODETECT)
1651     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1652
1653   xzero (decst);
1654   decst.eol_type = *eol_type_in_out;
1655   decst.mask = ~0;
1656
1657   /* If autodetection is called for, do it now. */
1658   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1659       || *eol_type_in_out == EOL_AUTODETECT)
1660     {
1661       Extbyte buf[4096];
1662       Lisp_Object coding_system = Qnil;
1663       Extbyte *p;
1664       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1665       Extbyte *scan_end;
1666
1667       /* Look for initial "-*-"; mode line prefix */
1668       for (p = buf,
1669              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1670            p <= scan_end
1671              && *p != '\n'
1672              && *p != '\r';
1673            p++)
1674         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1675           {
1676             Extbyte *local_vars_beg = p + 3;
1677             /* Look for final "-*-"; mode line suffix */
1678             for (p = local_vars_beg,
1679                    scan_end = buf + nread - LENGTH ("-*-");
1680                  p <= scan_end
1681                    && *p != '\n'
1682                    && *p != '\r';
1683                  p++)
1684               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1685                 {
1686                   Extbyte *suffix = p;
1687                   /* Look for "coding:" */
1688                   for (p = local_vars_beg,
1689                          scan_end = suffix - LENGTH ("coding:?");
1690                        p <= scan_end;
1691                        p++)
1692                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1693                         && (p == local_vars_beg
1694                             || (*(p-1) == ' '  ||
1695                                 *(p-1) == '\t' ||
1696                                 *(p-1) == ';')))
1697                       {
1698                         Extbyte save;
1699                         int n;
1700                         p += LENGTH ("coding:");
1701                         while (*p == ' ' || *p == '\t') p++;
1702
1703                         /* Get coding system name */
1704                         save = *suffix; *suffix = '\0';
1705                         /* Characters valid in a MIME charset name (rfc 1521),
1706                            and in a Lisp symbol name. */
1707                         n = strspn ( (char *) p,
1708                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1709                                     "abcdefghijklmnopqrstuvwxyz"
1710                                     "0123456789"
1711                                     "!$%&*+-.^_{|}~");
1712                         *suffix = save;
1713                         if (n > 0)
1714                           {
1715                             save = p[n]; p[n] = '\0';
1716                             coding_system =
1717                               Ffind_coding_system (intern ((char *) p));
1718                             p[n] = save;
1719                           }
1720                         break;
1721                       }
1722                   break;
1723                 }
1724             break;
1725           }
1726
1727       if (NILP (coding_system))
1728         do
1729           {
1730             if (detect_coding_type (&decst, buf, nread,
1731                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1732                                     != CODESYS_AUTODETECT))
1733               break;
1734             nread = Lstream_read (stream, buf, sizeof (buf));
1735             if (nread == 0)
1736               break;
1737           }
1738         while (1);
1739
1740       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1741                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1742         do
1743           {
1744             if (detect_coding_type (&decst, buf, nread, 1))
1745               break;
1746             nread = Lstream_read (stream, buf, sizeof (buf));
1747             if (!nread)
1748               break;
1749           }
1750         while (1);
1751
1752       *eol_type_in_out = decst.eol_type;
1753       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1754         {
1755           if (NILP (coding_system))
1756             *codesys_in_out = coding_system_from_mask (decst.mask);
1757           else
1758             *codesys_in_out = coding_system;
1759         }
1760     }
1761
1762   /* If we absolutely can't determine the EOL type, just assume LF. */
1763   if (*eol_type_in_out == EOL_AUTODETECT)
1764     *eol_type_in_out = EOL_LF;
1765
1766   Lstream_rewind (stream);
1767 }
1768
1769 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1770 Detect coding system of the text in the region between START and END.
1771 Returned a list of possible coding systems ordered by priority.
1772 If only ASCII characters are found, it returns 'undecided or one of
1773 its subsidiary coding systems according to a detected end-of-line
1774 type.  Optional arg BUFFER defaults to the current buffer.
1775 */
1776        (start, end, buffer))
1777 {
1778   Lisp_Object val = Qnil;
1779   struct buffer *buf = decode_buffer (buffer, 0);
1780   Bufpos b, e;
1781   Lisp_Object instream, lb_instream;
1782   Lstream *istr, *lb_istr;
1783   struct detection_state decst;
1784   struct gcpro gcpro1, gcpro2;
1785
1786   get_buffer_range_char (buf, start, end, &b, &e, 0);
1787   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1788   lb_istr = XLSTREAM (lb_instream);
1789   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1790   istr = XLSTREAM (instream);
1791   GCPRO2 (instream, lb_instream);
1792   xzero (decst);
1793   decst.eol_type = EOL_AUTODETECT;
1794   decst.mask = ~0;
1795   while (1)
1796     {
1797       unsigned char random_buffer[4096];
1798       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1799
1800       if (!nread)
1801         break;
1802       if (detect_coding_type (&decst, random_buffer, nread, 0))
1803         break;
1804     }
1805
1806   if (decst.mask == ~0)
1807     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1808                                     decst.eol_type);
1809   else
1810     {
1811       int i;
1812
1813       val = Qnil;
1814 #ifdef MULE
1815       decst.mask = postprocess_iso2022_mask (decst.mask);
1816 #endif
1817       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1818         {
1819           int sys = fcd->coding_category_by_priority[i];
1820           if (decst.mask & (1 << sys))
1821             {
1822               Lisp_Object codesys = fcd->coding_category_system[sys];
1823               if (!NILP (codesys))
1824                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1825               val = Fcons (codesys, val);
1826             }
1827         }
1828     }
1829   Lstream_close (istr);
1830   UNGCPRO;
1831   Lstream_delete (istr);
1832   Lstream_delete (lb_istr);
1833   return val;
1834 }
1835
1836 \f
1837 /************************************************************************/
1838 /*           Converting to internal Mule format ("decoding")            */
1839 /************************************************************************/
1840
1841 /* A decoding stream is a stream used for decoding text (i.e.
1842    converting from some external format to internal format).
1843    The decoding-stream object keeps track of the actual coding
1844    stream, the stream that is at the other end, and data that
1845    needs to be persistent across the lifetime of the stream. */
1846
1847 /* Handle the EOL stuff related to just-read-in character C.
1848    EOL_TYPE is the EOL type of the coding stream.
1849    FLAGS is the current value of FLAGS in the coding stream, and may
1850    be modified by this macro.  (The macro only looks at the
1851    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1852    bytes are to be written.  You need to also define a local goto
1853    label "label_continue_loop" that is at the end of the main
1854    character-reading loop.
1855
1856    If C is a CR character, then this macro handles it entirely and
1857    jumps to label_continue_loop.  Otherwise, this macro does not add
1858    anything to DST, and continues normally.  You should continue
1859    processing C normally after this macro. */
1860
1861 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
1862 do {                                                            \
1863   if (c == '\r')                                                \
1864     {                                                           \
1865       if (eol_type == EOL_CR)                                   \
1866         Dynarr_add (dst, '\n');                                 \
1867       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1868         Dynarr_add (dst, c);                                    \
1869       else                                                      \
1870         flags |= CODING_STATE_CR;                               \
1871       goto label_continue_loop;                                 \
1872     }                                                           \
1873   else if (flags & CODING_STATE_CR)                             \
1874     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
1875       if (c != '\n')                                            \
1876         Dynarr_add (dst, '\r');                                 \
1877       flags &= ~CODING_STATE_CR;                                \
1878     }                                                           \
1879 } while (0)
1880
1881 /* C should be a binary character in the range 0 - 255; convert
1882    to internal format and add to Dynarr DST. */
1883
1884 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
1885 do {                                            \
1886   if (BYTE_ASCII_P (c))                         \
1887     Dynarr_add (dst, c);                        \
1888   else if (BYTE_C1_P (c))                       \
1889     {                                           \
1890       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1891       Dynarr_add (dst, c + 0x20);               \
1892     }                                           \
1893   else                                          \
1894     {                                           \
1895       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1896       Dynarr_add (dst, c);                      \
1897     }                                           \
1898 } while (0)
1899
1900 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
1901 do {                                    \
1902   if (ch)                               \
1903     {                                   \
1904       DECODE_ADD_BINARY_CHAR (ch, dst); \
1905       ch = 0;                           \
1906     }                                   \
1907 } while (0)
1908
1909 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1910 do {                                    \
1911   if (flags & CODING_STATE_END)         \
1912     {                                   \
1913       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
1914       if (flags & CODING_STATE_CR)      \
1915         Dynarr_add (dst, '\r');         \
1916     }                                   \
1917 } while (0)
1918
1919 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1920
1921 struct decoding_stream
1922 {
1923   /* Coding system that governs the conversion. */
1924   Lisp_Coding_System *codesys;
1925
1926   /* Stream that we read the encoded data from or
1927      write the decoded data to. */
1928   Lstream *other_end;
1929
1930   /* If we are reading, then we can return only a fixed amount of
1931      data, so if the conversion resulted in too much data, we store it
1932      here for retrieval the next time around. */
1933   unsigned_char_dynarr *runoff;
1934
1935   /* FLAGS holds flags indicating the current state of the decoding.
1936      Some of these flags are dependent on the coding system. */
1937   unsigned int flags;
1938
1939   /* CH holds a partially built-up character.  Since we only deal
1940      with one- and two-byte characters at the moment, we only use
1941      this to store the first byte of a two-byte character. */
1942   unsigned int ch;
1943
1944   /* EOL_TYPE specifies the type of end-of-line conversion that
1945      currently applies.  We need to keep this separate from the
1946      EOL type stored in CODESYS because the latter might indicate
1947      automatic EOL-type detection while the former will always
1948      indicate a particular EOL type. */
1949   eol_type_t eol_type;
1950 #ifdef MULE
1951   /* Additional ISO2022 information.  We define the structure above
1952      because it's also needed by the detection routines. */
1953   struct iso2022_decoder iso2022;
1954
1955   /* Additional information (the state of the running CCL program)
1956      used by the CCL decoder. */
1957   struct ccl_program ccl;
1958
1959   /* counter for UTF-8 or UCS-4 */
1960   unsigned char counter;
1961 #endif
1962   struct detection_state decst;
1963 };
1964
1965 static ssize_t decoding_reader (Lstream *stream,
1966                                 unsigned char *data, size_t size);
1967 static ssize_t decoding_writer (Lstream *stream,
1968                                 CONST unsigned char *data, size_t size);
1969 static int decoding_rewinder   (Lstream *stream);
1970 static int decoding_seekable_p (Lstream *stream);
1971 static int decoding_flusher    (Lstream *stream);
1972 static int decoding_closer     (Lstream *stream);
1973
1974 static Lisp_Object decoding_marker (Lisp_Object stream);
1975
1976 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1977                                sizeof (struct decoding_stream));
1978
1979 static Lisp_Object
1980 decoding_marker (Lisp_Object stream)
1981 {
1982   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1983   Lisp_Object str_obj;
1984
1985   /* We do not need to mark the coding systems or charsets stored
1986      within the stream because they are stored in a global list
1987      and automatically marked. */
1988
1989   XSETLSTREAM (str_obj, str);
1990   mark_object (str_obj);
1991   if (str->imp->marker)
1992     return (str->imp->marker) (str_obj);
1993   else
1994     return Qnil;
1995 }
1996
1997 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
1998    so we read data from the other end, decode it, and store it into DATA. */
1999
2000 static ssize_t
2001 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2002 {
2003   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2004   unsigned char *orig_data = data;
2005   ssize_t read_size;
2006   int error_occurred = 0;
2007
2008   /* We need to interface to mule_decode(), which expects to take some
2009      amount of data and store the result into a Dynarr.  We have
2010      mule_decode() store into str->runoff, and take data from there
2011      as necessary. */
2012
2013   /* We loop until we have enough data, reading chunks from the other
2014      end and decoding it. */
2015   while (1)
2016     {
2017       /* Take data from the runoff if we can.  Make sure to take at
2018          most SIZE bytes, and delete the data from the runoff. */
2019       if (Dynarr_length (str->runoff) > 0)
2020         {
2021           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2022           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2023           Dynarr_delete_many (str->runoff, 0, chunk);
2024           data += chunk;
2025           size -= chunk;
2026         }
2027
2028       if (size == 0)
2029         break; /* No more room for data */
2030
2031       if (str->flags & CODING_STATE_END)
2032         /* This means that on the previous iteration, we hit the EOF on
2033            the other end.  We loop once more so that mule_decode() can
2034            output any final stuff it may be holding, or any "go back
2035            to a sane state" escape sequences. (This latter makes sense
2036            during encoding.) */
2037         break;
2038
2039       /* Exhausted the runoff, so get some more.  DATA has at least
2040          SIZE bytes left of storage in it, so it's OK to read directly
2041          into it.  (We'll be overwriting above, after we've decoded it
2042          into the runoff.) */
2043       read_size = Lstream_read (str->other_end, data, size);
2044       if (read_size < 0)
2045         {
2046           error_occurred = 1;
2047           break;
2048         }
2049       if (read_size == 0)
2050         /* There might be some more end data produced in the translation.
2051            See the comment above. */
2052         str->flags |= CODING_STATE_END;
2053       mule_decode (stream, data, str->runoff, read_size);
2054     }
2055
2056   if (data - orig_data == 0)
2057     return error_occurred ? -1 : 0;
2058   else
2059     return data - orig_data;
2060 }
2061
2062 static ssize_t
2063 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2064 {
2065   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2066   ssize_t retval;
2067
2068   /* Decode all our data into the runoff, and then attempt to write
2069      it all out to the other end.  Remove whatever chunk we succeeded
2070      in writing. */
2071   mule_decode (stream, data, str->runoff, size);
2072   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2073                           Dynarr_length (str->runoff));
2074   if (retval > 0)
2075     Dynarr_delete_many (str->runoff, 0, retval);
2076   /* Do NOT return retval.  The return value indicates how much
2077      of the incoming data was written, not how many bytes were
2078      written. */
2079   return size;
2080 }
2081
2082 static void
2083 reset_decoding_stream (struct decoding_stream *str)
2084 {
2085 #ifdef MULE
2086   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2087     {
2088       Lisp_Object coding_system;
2089       XSETCODING_SYSTEM (coding_system, str->codesys);
2090       reset_iso2022 (coding_system, &str->iso2022);
2091     }
2092   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2093     {
2094       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2095     }
2096   str->counter = 0;
2097 #endif /* MULE */
2098   str->flags = str->ch = 0;
2099 }
2100
2101 static int
2102 decoding_rewinder (Lstream *stream)
2103 {
2104   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2105   reset_decoding_stream (str);
2106   Dynarr_reset (str->runoff);
2107   return Lstream_rewind (str->other_end);
2108 }
2109
2110 static int
2111 decoding_seekable_p (Lstream *stream)
2112 {
2113   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2114   return Lstream_seekable_p (str->other_end);
2115 }
2116
2117 static int
2118 decoding_flusher (Lstream *stream)
2119 {
2120   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2121   return Lstream_flush (str->other_end);
2122 }
2123
2124 static int
2125 decoding_closer (Lstream *stream)
2126 {
2127   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2128   if (stream->flags & LSTREAM_FL_WRITE)
2129     {
2130       str->flags |= CODING_STATE_END;
2131       decoding_writer (stream, 0, 0);
2132     }
2133   Dynarr_free (str->runoff);
2134 #ifdef MULE
2135 #ifdef ENABLE_COMPOSITE_CHARS
2136   if (str->iso2022.composite_chars)
2137     Dynarr_free (str->iso2022.composite_chars);
2138 #endif
2139 #endif
2140   return Lstream_close (str->other_end);
2141 }
2142
2143 Lisp_Object
2144 decoding_stream_coding_system (Lstream *stream)
2145 {
2146   Lisp_Object coding_system;
2147   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2148
2149   XSETCODING_SYSTEM (coding_system, str->codesys);
2150   return subsidiary_coding_system (coding_system, str->eol_type);
2151 }
2152
2153 void
2154 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2155 {
2156   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2157   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2158   str->codesys = cs;
2159   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2160     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2161   reset_decoding_stream (str);
2162 }
2163
2164 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2165    stream for writing, no automatic code detection will be performed.
2166    The reason for this is that automatic code detection requires a
2167    seekable input.  Things will also fail if you open a decoding
2168    stream for reading using a non-fully-specified coding system and
2169    a non-seekable input stream. */
2170
2171 static Lisp_Object
2172 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2173                         CONST char *mode)
2174 {
2175   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2176   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2177   Lisp_Object obj;
2178
2179   xzero (*str);
2180   str->other_end = stream;
2181   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2182   str->eol_type = EOL_AUTODETECT;
2183   if (!strcmp (mode, "r")
2184       && Lstream_seekable_p (stream))
2185     /* We can determine the coding system now. */
2186     determine_real_coding_system (stream, &codesys, &str->eol_type);
2187   set_decoding_stream_coding_system (lstr, codesys);
2188   str->decst.eol_type = str->eol_type;
2189   str->decst.mask = ~0;
2190   XSETLSTREAM (obj, lstr);
2191   return obj;
2192 }
2193
2194 Lisp_Object
2195 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2196 {
2197   return make_decoding_stream_1 (stream, codesys, "r");
2198 }
2199
2200 Lisp_Object
2201 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2202 {
2203   return make_decoding_stream_1 (stream, codesys, "w");
2204 }
2205
2206 /* Note: the decode_coding_* functions all take the same
2207    arguments as mule_decode(), which is to say some SRC data of
2208    size N, which is to be stored into dynamic array DST.
2209    DECODING is the stream within which the decoding is
2210    taking place, but no data is actually read from or
2211    written to that stream; that is handled in decoding_reader()
2212    or decoding_writer().  This allows the same functions to
2213    be used for both reading and writing. */
2214
2215 static void
2216 mule_decode (Lstream *decoding, CONST unsigned char *src,
2217              unsigned_char_dynarr *dst, unsigned int n)
2218 {
2219   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2220
2221   /* If necessary, do encoding-detection now.  We do this when
2222      we're a writing stream or a non-seekable reading stream,
2223      meaning that we can't just process the whole input,
2224      rewind, and start over. */
2225
2226   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2227       str->eol_type == EOL_AUTODETECT)
2228     {
2229       Lisp_Object codesys;
2230
2231       XSETCODING_SYSTEM (codesys, str->codesys);
2232       detect_coding_type (&str->decst, src, n,
2233                           CODING_SYSTEM_TYPE (str->codesys) !=
2234                           CODESYS_AUTODETECT);
2235       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2236           str->decst.mask != ~0)
2237         /* #### This is cheesy.  What we really ought to do is
2238            buffer up a certain amount of data so as to get a
2239            less random result. */
2240         codesys = coding_system_from_mask (str->decst.mask);
2241       str->eol_type = str->decst.eol_type;
2242       if (XCODING_SYSTEM (codesys) != str->codesys)
2243         {
2244           /* Preserve the CODING_STATE_END flag in case it was set.
2245              If we erase it, bad things might happen. */
2246           int was_end = str->flags & CODING_STATE_END;
2247           set_decoding_stream_coding_system (decoding, codesys);
2248           if (was_end)
2249             str->flags |= CODING_STATE_END;
2250         }
2251     }
2252
2253   switch (CODING_SYSTEM_TYPE (str->codesys))
2254     {
2255 #ifdef DEBUG_XEMACS
2256     case CODESYS_INTERNAL:
2257       Dynarr_add_many (dst, src, n);
2258       break;
2259 #endif
2260     case CODESYS_AUTODETECT:
2261       /* If we got this far and still haven't decided on the coding
2262          system, then do no conversion. */
2263     case CODESYS_NO_CONVERSION:
2264       decode_coding_no_conversion (decoding, src, dst, n);
2265       break;
2266 #ifdef MULE
2267     case CODESYS_SHIFT_JIS:
2268       decode_coding_sjis (decoding, src, dst, n);
2269       break;
2270     case CODESYS_BIG5:
2271       decode_coding_big5 (decoding, src, dst, n);
2272       break;
2273     case CODESYS_UCS4:
2274       decode_coding_ucs4 (decoding, src, dst, n);
2275       break;
2276     case CODESYS_UTF8:
2277       decode_coding_utf8 (decoding, src, dst, n);
2278       break;
2279     case CODESYS_CCL:
2280       str->ccl.last_block = str->flags & CODING_STATE_END;
2281       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2282       break;
2283     case CODESYS_ISO2022:
2284       decode_coding_iso2022 (decoding, src, dst, n);
2285       break;
2286 #endif /* MULE */
2287     default:
2288       abort ();
2289     }
2290 }
2291
2292 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2293 Decode the text between START and END which is encoded in CODING-SYSTEM.
2294 This is useful if you've read in encoded text from a file without decoding
2295 it (e.g. you read in a JIS-formatted file but used the `binary' or
2296 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2297 Return length of decoded text.
2298 BUFFER defaults to the current buffer if unspecified.
2299 */
2300        (start, end, coding_system, buffer))
2301 {
2302   Bufpos b, e;
2303   struct buffer *buf = decode_buffer (buffer, 0);
2304   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2305   Lstream *istr, *ostr;
2306   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2307
2308   get_buffer_range_char (buf, start, end, &b, &e, 0);
2309
2310   barf_if_buffer_read_only (buf, b, e);
2311
2312   coding_system = Fget_coding_system (coding_system);
2313   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2314   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2315   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2316                                               coding_system);
2317   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2318                                            Fget_coding_system (Qbinary));
2319   istr = XLSTREAM (instream);
2320   ostr = XLSTREAM (outstream);
2321   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2322
2323   /* The chain of streams looks like this:
2324
2325      [BUFFER] <----- send through
2326                      ------> [ENCODE AS BINARY]
2327                              ------> [DECODE AS SPECIFIED]
2328                                      ------> [BUFFER]
2329    */
2330
2331   while (1)
2332     {
2333       char tempbuf[1024]; /* some random amount */
2334       Bufpos newpos, even_newer_pos;
2335       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2336       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2337
2338       if (!size_in_bytes)
2339         break;
2340       newpos = lisp_buffer_stream_startpos (istr);
2341       Lstream_write (ostr, tempbuf, size_in_bytes);
2342       even_newer_pos = lisp_buffer_stream_startpos (istr);
2343       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2344                            even_newer_pos, 0);
2345     }
2346   Lstream_close (istr);
2347   Lstream_close (ostr);
2348   UNGCPRO;
2349   Lstream_delete (istr);
2350   Lstream_delete (ostr);
2351   Lstream_delete (XLSTREAM (de_outstream));
2352   Lstream_delete (XLSTREAM (lb_outstream));
2353   return Qnil;
2354 }
2355
2356 \f
2357 /************************************************************************/
2358 /*           Converting to an external encoding ("encoding")            */
2359 /************************************************************************/
2360
2361 /* An encoding stream is an output stream.  When you create the
2362    stream, you specify the coding system that governs the encoding
2363    and another stream that the resulting encoded data is to be
2364    sent to, and then start sending data to it. */
2365
2366 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2367
2368 struct encoding_stream
2369 {
2370   /* Coding system that governs the conversion. */
2371   Lisp_Coding_System *codesys;
2372
2373   /* Stream that we read the encoded data from or
2374      write the decoded data to. */
2375   Lstream *other_end;
2376
2377   /* If we are reading, then we can return only a fixed amount of
2378      data, so if the conversion resulted in too much data, we store it
2379      here for retrieval the next time around. */
2380   unsigned_char_dynarr *runoff;
2381
2382   /* FLAGS holds flags indicating the current state of the encoding.
2383      Some of these flags are dependent on the coding system. */
2384   unsigned int flags;
2385
2386   /* CH holds a partially built-up character.  Since we only deal
2387      with one- and two-byte characters at the moment, we only use
2388      this to store the first byte of a two-byte character. */
2389   unsigned int ch;
2390 #ifdef MULE
2391   /* Additional information used by the ISO2022 encoder. */
2392   struct
2393     {
2394       /* CHARSET holds the character sets currently assigned to the G0
2395          through G3 registers.  It is initialized from the array
2396          INITIAL_CHARSET in CODESYS. */
2397       Lisp_Object charset[4];
2398
2399       /* Which registers are currently invoked into the left (GL) and
2400          right (GR) halves of the 8-bit encoding space? */
2401       int register_left, register_right;
2402
2403       /* Whether we need to explicitly designate the charset in the
2404          G? register before using it.  It is initialized from the
2405          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2406       unsigned char force_charset_on_output[4];
2407
2408       /* Other state variables that need to be preserved across
2409          invocations. */
2410       Lisp_Object current_charset;
2411       int current_half;
2412       int current_char_boundary;
2413     } iso2022;
2414
2415   /* Additional information (the state of the running CCL program)
2416      used by the CCL encoder. */
2417   struct ccl_program ccl;
2418 #endif /* MULE */
2419 };
2420
2421 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2422 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2423                                 size_t size);
2424 static int encoding_rewinder   (Lstream *stream);
2425 static int encoding_seekable_p (Lstream *stream);
2426 static int encoding_flusher    (Lstream *stream);
2427 static int encoding_closer     (Lstream *stream);
2428
2429 static Lisp_Object encoding_marker (Lisp_Object stream);
2430
2431 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2432                                sizeof (struct encoding_stream));
2433
2434 static Lisp_Object
2435 encoding_marker (Lisp_Object stream)
2436 {
2437   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2438   Lisp_Object str_obj;
2439
2440   /* We do not need to mark the coding systems or charsets stored
2441      within the stream because they are stored in a global list
2442      and automatically marked. */
2443
2444   XSETLSTREAM (str_obj, str);
2445   mark_object (str_obj);
2446   if (str->imp->marker)
2447     return (str->imp->marker) (str_obj);
2448   else
2449     return Qnil;
2450 }
2451
2452 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2453    so we read data from the other end, encode it, and store it into DATA. */
2454
2455 static ssize_t
2456 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2457 {
2458   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2459   unsigned char *orig_data = data;
2460   ssize_t read_size;
2461   int error_occurred = 0;
2462
2463   /* We need to interface to mule_encode(), which expects to take some
2464      amount of data and store the result into a Dynarr.  We have
2465      mule_encode() store into str->runoff, and take data from there
2466      as necessary. */
2467
2468   /* We loop until we have enough data, reading chunks from the other
2469      end and encoding it. */
2470   while (1)
2471     {
2472       /* Take data from the runoff if we can.  Make sure to take at
2473          most SIZE bytes, and delete the data from the runoff. */
2474       if (Dynarr_length (str->runoff) > 0)
2475         {
2476           int chunk = min ((int) size, Dynarr_length (str->runoff));
2477           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2478           Dynarr_delete_many (str->runoff, 0, chunk);
2479           data += chunk;
2480           size -= chunk;
2481         }
2482
2483       if (size == 0)
2484         break; /* No more room for data */
2485
2486       if (str->flags & CODING_STATE_END)
2487         /* This means that on the previous iteration, we hit the EOF on
2488            the other end.  We loop once more so that mule_encode() can
2489            output any final stuff it may be holding, or any "go back
2490            to a sane state" escape sequences. (This latter makes sense
2491            during encoding.) */
2492         break;
2493
2494       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2495          left of storage in it, so it's OK to read directly into it.
2496          (We'll be overwriting above, after we've encoded it into the
2497          runoff.) */
2498       read_size = Lstream_read (str->other_end, data, size);
2499       if (read_size < 0)
2500         {
2501           error_occurred = 1;
2502           break;
2503         }
2504       if (read_size == 0)
2505         /* There might be some more end data produced in the translation.
2506            See the comment above. */
2507         str->flags |= CODING_STATE_END;
2508       mule_encode (stream, data, str->runoff, read_size);
2509     }
2510
2511   if (data == orig_data)
2512     return error_occurred ? -1 : 0;
2513   else
2514     return data - orig_data;
2515 }
2516
2517 static ssize_t
2518 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2519 {
2520   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2521   ssize_t retval;
2522
2523   /* Encode all our data into the runoff, and then attempt to write
2524      it all out to the other end.  Remove whatever chunk we succeeded
2525      in writing. */
2526   mule_encode (stream, data, str->runoff, size);
2527   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2528                           Dynarr_length (str->runoff));
2529   if (retval > 0)
2530     Dynarr_delete_many (str->runoff, 0, retval);
2531   /* Do NOT return retval.  The return value indicates how much
2532      of the incoming data was written, not how many bytes were
2533      written. */
2534   return size;
2535 }
2536
2537 static void
2538 reset_encoding_stream (struct encoding_stream *str)
2539 {
2540 #ifdef MULE
2541   switch (CODING_SYSTEM_TYPE (str->codesys))
2542     {
2543     case CODESYS_ISO2022:
2544       {
2545         int i;
2546
2547         for (i = 0; i < 4; i++)
2548           {
2549             str->iso2022.charset[i] =
2550               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2551             str->iso2022.force_charset_on_output[i] =
2552               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2553           }
2554         str->iso2022.register_left = 0;
2555         str->iso2022.register_right = 1;
2556         str->iso2022.current_charset = Qnil;
2557         str->iso2022.current_half = 0;
2558         str->iso2022.current_char_boundary = 1;
2559         break;
2560       }
2561     case CODESYS_CCL:
2562       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2563       break;
2564     default:
2565       break;
2566     }
2567 #endif /* MULE */
2568
2569   str->flags = str->ch = 0;
2570 }
2571
2572 static int
2573 encoding_rewinder (Lstream *stream)
2574 {
2575   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2576   reset_encoding_stream (str);
2577   Dynarr_reset (str->runoff);
2578   return Lstream_rewind (str->other_end);
2579 }
2580
2581 static int
2582 encoding_seekable_p (Lstream *stream)
2583 {
2584   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2585   return Lstream_seekable_p (str->other_end);
2586 }
2587
2588 static int
2589 encoding_flusher (Lstream *stream)
2590 {
2591   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2592   return Lstream_flush (str->other_end);
2593 }
2594
2595 static int
2596 encoding_closer (Lstream *stream)
2597 {
2598   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2599   if (stream->flags & LSTREAM_FL_WRITE)
2600     {
2601       str->flags |= CODING_STATE_END;
2602       encoding_writer (stream, 0, 0);
2603     }
2604   Dynarr_free (str->runoff);
2605   return Lstream_close (str->other_end);
2606 }
2607
2608 Lisp_Object
2609 encoding_stream_coding_system (Lstream *stream)
2610 {
2611   Lisp_Object coding_system;
2612   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2613
2614   XSETCODING_SYSTEM (coding_system, str->codesys);
2615   return coding_system;
2616 }
2617
2618 void
2619 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2620 {
2621   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2622   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2623   str->codesys = cs;
2624   reset_encoding_stream (str);
2625 }
2626
2627 static Lisp_Object
2628 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2629                         CONST char *mode)
2630 {
2631   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2632   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2633   Lisp_Object obj;
2634
2635   xzero (*str);
2636   str->runoff = Dynarr_new (unsigned_char);
2637   str->other_end = stream;
2638   set_encoding_stream_coding_system (lstr, codesys);
2639   XSETLSTREAM (obj, lstr);
2640   return obj;
2641 }
2642
2643 Lisp_Object
2644 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2645 {
2646   return make_encoding_stream_1 (stream, codesys, "r");
2647 }
2648
2649 Lisp_Object
2650 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2651 {
2652   return make_encoding_stream_1 (stream, codesys, "w");
2653 }
2654
2655 /* Convert N bytes of internally-formatted data stored in SRC to an
2656    external format, according to the encoding stream ENCODING.
2657    Store the encoded data into DST. */
2658
2659 static void
2660 mule_encode (Lstream *encoding, CONST unsigned char *src,
2661              unsigned_char_dynarr *dst, unsigned int n)
2662 {
2663   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2664
2665   switch (CODING_SYSTEM_TYPE (str->codesys))
2666     {
2667 #ifdef DEBUG_XEMACS
2668     case CODESYS_INTERNAL:
2669       Dynarr_add_many (dst, src, n);
2670       break;
2671 #endif
2672     case CODESYS_AUTODETECT:
2673       /* If we got this far and still haven't decided on the coding
2674          system, then do no conversion. */
2675     case CODESYS_NO_CONVERSION:
2676       encode_coding_no_conversion (encoding, src, dst, n);
2677       break;
2678 #ifdef MULE
2679     case CODESYS_SHIFT_JIS:
2680       encode_coding_sjis (encoding, src, dst, n);
2681       break;
2682     case CODESYS_BIG5:
2683       encode_coding_big5 (encoding, src, dst, n);
2684       break;
2685     case CODESYS_UCS4:
2686       encode_coding_ucs4 (encoding, src, dst, n);
2687       break;
2688     case CODESYS_UTF8:
2689       encode_coding_utf8 (encoding, src, dst, n);
2690       break;
2691     case CODESYS_CCL:
2692       str->ccl.last_block = str->flags & CODING_STATE_END;
2693       ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2694       break;
2695     case CODESYS_ISO2022:
2696       encode_coding_iso2022 (encoding, src, dst, n);
2697       break;
2698 #endif /* MULE */
2699     default:
2700       abort ();
2701     }
2702 }
2703
2704 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2705 Encode the text between START and END using CODING-SYSTEM.
2706 This will, for example, convert Japanese characters into stuff such as
2707 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2708 text.  BUFFER defaults to the current buffer if unspecified.
2709 */
2710        (start, end, coding_system, buffer))
2711 {
2712   Bufpos b, e;
2713   struct buffer *buf = decode_buffer (buffer, 0);
2714   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2715   Lstream *istr, *ostr;
2716   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2717
2718   get_buffer_range_char (buf, start, end, &b, &e, 0);
2719
2720   barf_if_buffer_read_only (buf, b, e);
2721
2722   coding_system = Fget_coding_system (coding_system);
2723   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2724   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2725   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2726                                               Fget_coding_system (Qbinary));
2727   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2728                                            coding_system);
2729   istr = XLSTREAM (instream);
2730   ostr = XLSTREAM (outstream);
2731   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2732   /* The chain of streams looks like this:
2733
2734      [BUFFER] <----- send through
2735                      ------> [ENCODE AS SPECIFIED]
2736                              ------> [DECODE AS BINARY]
2737                                      ------> [BUFFER]
2738    */
2739   while (1)
2740     {
2741       char tempbuf[1024]; /* some random amount */
2742       Bufpos newpos, even_newer_pos;
2743       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2744       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2745
2746       if (!size_in_bytes)
2747         break;
2748       newpos = lisp_buffer_stream_startpos (istr);
2749       Lstream_write (ostr, tempbuf, size_in_bytes);
2750       even_newer_pos = lisp_buffer_stream_startpos (istr);
2751       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2752                            even_newer_pos, 0);
2753     }
2754
2755   {
2756     Charcount retlen =
2757       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2758     Lstream_close (istr);
2759     Lstream_close (ostr);
2760     UNGCPRO;
2761     Lstream_delete (istr);
2762     Lstream_delete (ostr);
2763     Lstream_delete (XLSTREAM (de_outstream));
2764     Lstream_delete (XLSTREAM (lb_outstream));
2765     return make_int (retlen);
2766   }
2767 }
2768
2769 #ifdef MULE
2770 \f
2771 /************************************************************************/
2772 /*                          Shift-JIS methods                           */
2773 /************************************************************************/
2774
2775 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2776    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2777    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2778    encoded by "position-code + 0x80".  A character of JISX0208
2779    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2780    position-codes are divided and shifted so that it fit in the range
2781    below.
2782
2783    --- CODE RANGE of Shift-JIS ---
2784    (character set)      (range)
2785    ASCII                0x00 .. 0x7F
2786    JISX0201-Kana        0xA0 .. 0xDF
2787    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2788             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2789    -------------------------------
2790
2791 */
2792
2793 /* Is this the first byte of a Shift-JIS two-byte char? */
2794
2795 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2796   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2797
2798 /* Is this the second byte of a Shift-JIS two-byte char? */
2799
2800 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2801   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2802
2803 #define BYTE_SJIS_KATAKANA_P(c) \
2804   ((c) >= 0xA1 && (c) <= 0xDF)
2805
2806 static int
2807 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2808                     unsigned int n)
2809 {
2810   int c;
2811
2812   while (n--)
2813     {
2814       c = *src++;
2815       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2816         return 0;
2817       if (st->shift_jis.in_second_byte)
2818         {
2819           st->shift_jis.in_second_byte = 0;
2820           if (c < 0x40)
2821             return 0;
2822         }
2823       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2824         st->shift_jis.in_second_byte = 1;
2825     }
2826   return CODING_CATEGORY_SHIFT_JIS_MASK;
2827 }
2828
2829 /* Convert Shift-JIS data to internal format. */
2830
2831 static void
2832 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2833                     unsigned_char_dynarr *dst, unsigned int n)
2834 {
2835   unsigned char c;
2836   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2837   unsigned int flags  = str->flags;
2838   unsigned int ch     = str->ch;
2839   eol_type_t eol_type = str->eol_type;
2840
2841   while (n--)
2842     {
2843       c = *src++;
2844
2845       if (ch)
2846         {
2847           /* Previous character was first byte of Shift-JIS Kanji char. */
2848           if (BYTE_SJIS_TWO_BYTE_2_P (c))
2849             {
2850               unsigned char e1, e2;
2851
2852               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2853               DECODE_SJIS (ch, c, e1, e2);
2854               Dynarr_add (dst, e1);
2855               Dynarr_add (dst, e2);
2856             }
2857           else
2858             {
2859               DECODE_ADD_BINARY_CHAR (ch, dst);
2860               DECODE_ADD_BINARY_CHAR (c, dst);
2861             }
2862           ch = 0;
2863         }
2864       else
2865         {
2866           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2867           if (BYTE_SJIS_TWO_BYTE_1_P (c))
2868             ch = c;
2869           else if (BYTE_SJIS_KATAKANA_P (c))
2870             {
2871               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2872               Dynarr_add (dst, c);
2873             }
2874           else
2875             DECODE_ADD_BINARY_CHAR (c, dst);
2876         }
2877     label_continue_loop:;
2878     }
2879
2880   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2881
2882   str->flags = flags;
2883   str->ch    = ch;
2884 }
2885
2886 /* Convert internally-formatted data to Shift-JIS. */
2887
2888 static void
2889 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2890                     unsigned_char_dynarr *dst, unsigned int n)
2891 {
2892   unsigned char c;
2893   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2894   unsigned int flags  = str->flags;
2895   unsigned int ch     = str->ch;
2896   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2897
2898   while (n--)
2899     {
2900       c = *src++;
2901       if (c == '\n')
2902         {
2903           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2904             Dynarr_add (dst, '\r');
2905           if (eol_type != EOL_CR)
2906             Dynarr_add (dst, '\n');
2907           ch = 0;
2908         }
2909       else if (BYTE_ASCII_P (c))
2910         {
2911           Dynarr_add (dst, c);
2912           ch = 0;
2913         }
2914       else if (BUFBYTE_LEADING_BYTE_P (c))
2915         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2916               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2917               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2918       else if (ch)
2919         {
2920           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2921             {
2922               Dynarr_add (dst, c);
2923               ch = 0;
2924             }
2925           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2926                    ch == LEADING_BYTE_JAPANESE_JISX0208)
2927             ch = c;
2928           else
2929             {
2930               unsigned char j1, j2;
2931               ENCODE_SJIS (ch, c, j1, j2);
2932               Dynarr_add (dst, j1);
2933               Dynarr_add (dst, j2);
2934               ch = 0;
2935             }
2936         }
2937     }
2938
2939   str->flags = flags;
2940   str->ch    = ch;
2941 }
2942
2943 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2944 Decode a JISX0208 character of Shift-JIS coding-system.
2945 CODE is the character code in Shift-JIS as a cons of type bytes.
2946 Return the corresponding character.
2947 */
2948        (code))
2949 {
2950   unsigned char c1, c2, s1, s2;
2951
2952   CHECK_CONS (code);
2953   CHECK_INT (XCAR (code));
2954   CHECK_INT (XCDR (code));
2955   s1 = XINT (XCAR (code));
2956   s2 = XINT (XCDR (code));
2957   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2958       BYTE_SJIS_TWO_BYTE_2_P (s2))
2959     {
2960       DECODE_SJIS (s1, s2, c1, c2);
2961       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2962                                    c1 & 0x7F, c2 & 0x7F));
2963     }
2964   else
2965     return Qnil;
2966 }
2967
2968 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2969 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2970 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2971 */
2972        (ch))
2973 {
2974   Lisp_Object charset;
2975   int c1, c2, s1, s2;
2976
2977   CHECK_CHAR_COERCE_INT (ch);
2978   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2979   if (EQ (charset, Vcharset_japanese_jisx0208))
2980     {
2981       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2982       return Fcons (make_int (s1), make_int (s2));
2983     }
2984   else
2985     return Qnil;
2986 }
2987
2988 \f
2989 /************************************************************************/
2990 /*                            Big5 methods                              */
2991 /************************************************************************/
2992
2993 /* BIG5 is a coding system encoding two character sets: ASCII and
2994    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2995    character set and is encoded in two-byte.
2996
2997    --- CODE RANGE of BIG5 ---
2998    (character set)      (range)
2999    ASCII                0x00 .. 0x7F
3000    Big5 (1st byte)      0xA1 .. 0xFE
3001         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3002    --------------------------
3003
3004    Since the number of characters in Big5 is larger than maximum
3005    characters in Emacs' charset (96x96), it can't be handled as one
3006    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3007    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3008    contains frequently used characters and the latter contains less
3009    frequently used characters.  */
3010
3011 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3012   ((c) >= 0xA1 && (c) <= 0xFE)
3013
3014 /* Is this the second byte of a Shift-JIS two-byte char? */
3015
3016 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3017   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3018
3019 /* Number of Big5 characters which have the same code in 1st byte.  */
3020
3021 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3022
3023 /* Code conversion macros.  These are macros because they are used in
3024    inner loops during code conversion.
3025
3026    Note that temporary variables in macros introduce the classic
3027    dynamic-scoping problems with variable names.  We use capital-
3028    lettered variables in the assumption that XEmacs does not use
3029    capital letters in variables except in a very formalized way
3030    (e.g. Qstring). */
3031
3032 /* Convert Big5 code (b1, b2) into its internal string representation
3033    (lb, c1, c2). */
3034
3035 /* There is a much simpler way to split the Big5 charset into two.
3036    For the moment I'm going to leave the algorithm as-is because it
3037    claims to separate out the most-used characters into a single
3038    charset, which perhaps will lead to optimizations in various
3039    places.
3040
3041    The way the algorithm works is something like this:
3042
3043    Big5 can be viewed as a 94x157 charset, where the row is
3044    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3045    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3046    the split between low and high column numbers is apparently
3047    meaningless; ascending rows produce less and less frequent chars.
3048    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3049    the first charset, and the upper half (0xC9 .. 0xFE) to the
3050    second.  To do the conversion, we convert the character into
3051    a single number where 0 .. 156 is the first row, 157 .. 313
3052    is the second, etc.  That way, the characters are ordered by
3053    decreasing frequency.  Then we just chop the space in two
3054    and coerce the result into a 94x94 space.
3055    */
3056
3057 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3058 {                                                                       \
3059   int B1 = b1, B2 = b2;                                                 \
3060   unsigned int I                                                        \
3061     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3062                                                                         \
3063   if (B1 < 0xC9)                                                        \
3064     {                                                                   \
3065       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3066     }                                                                   \
3067   else                                                                  \
3068     {                                                                   \
3069       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3070       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3071     }                                                                   \
3072   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3073   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3074 } while (0)
3075
3076 /* Convert the internal string representation of a Big5 character
3077    (lb, c1, c2) into Big5 code (b1, b2). */
3078
3079 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3080 {                                                                       \
3081   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3082                                                                         \
3083   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3084     {                                                                   \
3085       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3086     }                                                                   \
3087   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3088   b2 = I % BIG5_SAME_ROW;                                               \
3089   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3090 } while (0)
3091
3092 static int
3093 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3094                     unsigned int n)
3095 {
3096   int c;
3097
3098   while (n--)
3099     {
3100       c = *src++;
3101       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3102           (c >= 0x80 && c <= 0xA0))
3103         return 0;
3104       if (st->big5.in_second_byte)
3105         {
3106           st->big5.in_second_byte = 0;
3107           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3108             return 0;
3109         }
3110       else if (c >= 0xA1)
3111         st->big5.in_second_byte = 1;
3112     }
3113   return CODING_CATEGORY_BIG5_MASK;
3114 }
3115
3116 /* Convert Big5 data to internal format. */
3117
3118 static void
3119 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3120                     unsigned_char_dynarr *dst, unsigned int n)
3121 {
3122   unsigned char c;
3123   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3124   unsigned int flags  = str->flags;
3125   unsigned int ch     = str->ch;
3126   eol_type_t eol_type = str->eol_type;
3127
3128   while (n--)
3129     {
3130       c = *src++;
3131       if (ch)
3132         {
3133           /* Previous character was first byte of Big5 char. */
3134           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3135             {
3136               unsigned char b1, b2, b3;
3137               DECODE_BIG5 (ch, c, b1, b2, b3);
3138               Dynarr_add (dst, b1);
3139               Dynarr_add (dst, b2);
3140               Dynarr_add (dst, b3);
3141             }
3142           else
3143             {
3144               DECODE_ADD_BINARY_CHAR (ch, dst);
3145               DECODE_ADD_BINARY_CHAR (c, dst);
3146             }
3147           ch = 0;
3148         }
3149       else
3150         {
3151           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3152           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3153             ch = c;
3154           else
3155             DECODE_ADD_BINARY_CHAR (c, dst);
3156         }
3157     label_continue_loop:;
3158     }
3159
3160   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3161
3162   str->flags = flags;
3163   str->ch    = ch;
3164 }
3165
3166 /* Convert internally-formatted data to Big5. */
3167
3168 static void
3169 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3170                     unsigned_char_dynarr *dst, unsigned int n)
3171 {
3172   unsigned char c;
3173   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3174   unsigned int flags  = str->flags;
3175   unsigned int ch     = str->ch;
3176   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3177
3178   while (n--)
3179     {
3180       c = *src++;
3181       if (c == '\n')
3182         {
3183           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3184             Dynarr_add (dst, '\r');
3185           if (eol_type != EOL_CR)
3186             Dynarr_add (dst, '\n');
3187         }
3188       else if (BYTE_ASCII_P (c))
3189         {
3190           /* ASCII. */
3191           Dynarr_add (dst, c);
3192         }
3193       else if (BUFBYTE_LEADING_BYTE_P (c))
3194         {
3195           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3196               c == LEADING_BYTE_CHINESE_BIG5_2)
3197             {
3198               /* A recognized leading byte. */
3199               ch = c;
3200               continue; /* not done with this character. */
3201             }
3202           /* otherwise just ignore this character. */
3203         }
3204       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3205                ch == LEADING_BYTE_CHINESE_BIG5_2)
3206         {
3207           /* Previous char was a recognized leading byte. */
3208           ch = (ch << 8) | c;
3209           continue; /* not done with this character. */
3210         }
3211       else if (ch)
3212         {
3213           /* Encountering second byte of a Big5 character. */
3214           unsigned char b1, b2;
3215
3216           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3217           Dynarr_add (dst, b1);
3218           Dynarr_add (dst, b2);
3219         }
3220
3221       ch = 0;
3222     }
3223
3224   str->flags = flags;
3225   str->ch    = ch;
3226 }
3227
3228
3229 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3230 Decode a Big5 character CODE of BIG5 coding-system.
3231 CODE is the character code in BIG5, a cons of two integers.
3232 Return the corresponding character.
3233 */
3234        (code))
3235 {
3236   unsigned char c1, c2, b1, b2;
3237
3238   CHECK_CONS (code);
3239   CHECK_INT (XCAR (code));
3240   CHECK_INT (XCDR (code));
3241   b1 = XINT (XCAR (code));
3242   b2 = XINT (XCDR (code));
3243   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3244       BYTE_BIG5_TWO_BYTE_2_P (b2))
3245     {
3246       int leading_byte;
3247       Lisp_Object charset;
3248       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3249       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3250       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3251     }
3252   else
3253     return Qnil;
3254 }
3255
3256 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3257 Encode the Big5 character CH to BIG5 coding-system.
3258 Return the corresponding character code in Big5.
3259 */
3260        (ch))
3261 {
3262   Lisp_Object charset;
3263   int c1, c2, b1, b2;
3264
3265   CHECK_CHAR_COERCE_INT (ch);
3266   BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3267   if (EQ (charset, Vcharset_chinese_big5_1) ||
3268       EQ (charset, Vcharset_chinese_big5_2))
3269     {
3270       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3271                    b1, b2);
3272       return Fcons (make_int (b1), make_int (b2));
3273     }
3274   else
3275     return Qnil;
3276 }
3277
3278 \f
3279 /************************************************************************/
3280 /*                           UCS-4 methods                              */
3281 /*                                                                      */
3282 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3283 /*                                                                      */
3284 /************************************************************************/
3285
3286
3287 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3288 Map UCS-4 code CODE to Mule character CHARACTER.
3289
3290 Return T on success, NIL on failure.
3291 */
3292        (code, character))
3293 {
3294   unsigned int c;
3295
3296   CHECK_CHAR (character);
3297   CHECK_INT (code);
3298   c = XINT (code);
3299
3300   if (c < sizeof (fcd->ucs_to_mule_table))
3301     {
3302       fcd->ucs_to_mule_table[c] = character;
3303       return Qt;
3304     }
3305   else
3306     return Qnil;
3307 }
3308
3309 static Lisp_Object
3310 ucs_to_char (unsigned long code)
3311 {
3312   if (code < sizeof (fcd->ucs_to_mule_table))
3313     {
3314       return fcd->ucs_to_mule_table[code];
3315     }
3316   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3317     {
3318       unsigned int c;
3319
3320       code -= 0xe00000;
3321       c = code % (94 * 94);
3322       return make_char
3323         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3324                     (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3325                      CHARSET_LEFT_TO_RIGHT),
3326                     c / 94 + 33, c % 94 + 33));
3327     }
3328   else
3329     return Qnil;
3330 }
3331
3332 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3333 Return Mule character corresponding to UCS code CODE (a positive integer).
3334 */
3335        (code))
3336 {
3337   CHECK_NATNUM (code);
3338   return ucs_to_char (XINT (code));
3339 }
3340
3341 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3342 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3343 */
3344        (character, code))
3345 {
3346   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3347           Fset_char_ucs is more restrictive on index arg, but should
3348           check code arg in a char_table method. */
3349   CHECK_CHAR (character);
3350   CHECK_NATNUM (code);
3351   return Fput_char_table (character, code, mule_to_ucs_table);
3352 }
3353
3354 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3355 Return the UCS code (a positive integer) corresponding to CHARACTER.
3356 */
3357        (character))
3358 {
3359   return Fget_char_table (character, mule_to_ucs_table);
3360 }
3361
3362 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3363    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3364    is not found, instead.
3365    #### do something more appropriate (use blob?)
3366         Danger, Will Robinson!  Data loss.  Should we signal user? */
3367 static void
3368 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3369 {
3370   Lisp_Object chr = ucs_to_char (ch);
3371
3372   if (! NILP (chr))
3373     {
3374       Bufbyte work[MAX_EMCHAR_LEN];
3375       int len;
3376
3377       ch = XCHAR (chr);
3378       len = (ch < 128) ?
3379         simple_set_charptr_emchar (work, ch) :
3380         non_ascii_set_charptr_emchar (work, ch);
3381       Dynarr_add_many (dst, work, len);
3382     }
3383   else
3384     {
3385       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3386       Dynarr_add (dst, 34 + 128);
3387       Dynarr_add (dst, 46 + 128);
3388     }
3389 }
3390
3391 static unsigned long
3392 mule_char_to_ucs4 (Lisp_Object charset,
3393                    unsigned char h, unsigned char l)
3394 {
3395   Lisp_Object code
3396     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3397                        mule_to_ucs_table);
3398
3399   if (INTP (code))
3400     {
3401       return XINT (code);
3402     }
3403   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3404             (XCHARSET_CHARS (charset) == 94) )
3405     {
3406       unsigned char final = XCHARSET_FINAL (charset);
3407
3408       if ( ('@' <= final) && (final < 0x7f) )
3409         {
3410           return 0xe00000 + (final - '@') * 94 * 94
3411             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3412         }
3413       else
3414         {
3415           return '?';
3416         }
3417     }
3418   else
3419     {
3420       return '?';
3421     }
3422 }
3423
3424 static void
3425 encode_ucs4 (Lisp_Object charset,
3426              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3427 {
3428   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3429   Dynarr_add (dst,  code >> 24);
3430   Dynarr_add (dst, (code >> 16) & 255);
3431   Dynarr_add (dst, (code >>  8) & 255);
3432   Dynarr_add (dst,  code        & 255);
3433 }
3434
3435 static int
3436 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3437                     unsigned int n)
3438 {
3439   while (n--)
3440     {
3441       int c = *src++;
3442       switch (st->ucs4.in_byte)
3443         {
3444         case 0:
3445           if (c >= 128)
3446             return 0;
3447           else
3448             st->ucs4.in_byte++;
3449           break;
3450         case 3:
3451           st->ucs4.in_byte = 0;
3452           break;
3453         default:
3454           st->ucs4.in_byte++;
3455         }
3456     }
3457   return CODING_CATEGORY_UCS4_MASK;
3458 }
3459
3460 static void
3461 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3462                     unsigned_char_dynarr *dst, unsigned int n)
3463 {
3464   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3465   unsigned int flags = str->flags;
3466   unsigned int ch    = str->ch;
3467   unsigned char counter = str->counter;
3468
3469   while (n--)
3470     {
3471       unsigned char c = *src++;
3472       switch (counter)
3473         {
3474         case 0:
3475           ch = c;
3476           counter = 3;
3477           break;
3478         case 1:
3479           decode_ucs4 ( ( ch << 8 ) | c, dst);
3480           ch = 0;
3481           counter = 0;
3482           break;
3483         default:
3484           ch = ( ch << 8 ) | c;
3485           counter--;
3486         }
3487     }
3488   if (counter & CODING_STATE_END)
3489     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3490
3491   str->flags = flags;
3492   str->ch    = ch;
3493   str->counter = counter;
3494 }
3495
3496 static void
3497 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3498                     unsigned_char_dynarr *dst, unsigned int n)
3499 {
3500   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3501   unsigned int flags = str->flags;
3502   unsigned int ch = str->ch;
3503   unsigned char char_boundary = str->iso2022.current_char_boundary;
3504   Lisp_Object charset = str->iso2022.current_charset;
3505
3506 #ifdef ENABLE_COMPOSITE_CHARS
3507   /* flags for handling composite chars.  We do a little switcharoo
3508      on the source while we're outputting the composite char. */
3509   unsigned int saved_n = 0;
3510   CONST unsigned char *saved_src = NULL;
3511   int in_composite = 0;
3512
3513  back_to_square_n:
3514 #endif
3515
3516   while (n--)
3517     {
3518       unsigned char c = *src++;
3519
3520       if (BYTE_ASCII_P (c))
3521         {               /* Processing ASCII character */
3522           ch = 0;
3523           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3524           char_boundary = 1;
3525         }
3526       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3527         { /* Processing Leading Byte */
3528           ch = 0;
3529           charset = CHARSET_BY_LEADING_BYTE (c);
3530           if (LEADING_BYTE_PREFIX_P(c))
3531             ch = c;
3532           char_boundary = 0;
3533         }
3534       else
3535         {                       /* Processing Non-ASCII character */
3536           char_boundary = 1;
3537           if (EQ (charset, Vcharset_control_1))
3538             {
3539               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3540             }
3541           else
3542             {
3543               switch (XCHARSET_REP_BYTES (charset))
3544                 {
3545                 case 2:
3546                   encode_ucs4 (charset, c, 0, dst);
3547                   break;
3548                 case 3:
3549                   if (XCHARSET_PRIVATE_P (charset))
3550                     {
3551                       encode_ucs4 (charset, c, 0, dst);
3552                       ch = 0;
3553                     }
3554                   else if (ch)
3555                     {
3556 #ifdef ENABLE_COMPOSITE_CHARS
3557                       if (EQ (charset, Vcharset_composite))
3558                         {
3559                           if (in_composite)
3560                             {
3561                               /* #### Bother! We don't know how to
3562                                  handle this yet. */
3563                               Dynarr_add (dst, 0);
3564                               Dynarr_add (dst, 0);
3565                               Dynarr_add (dst, 0);
3566                               Dynarr_add (dst, '~');
3567                             }
3568                           else
3569                             {
3570                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3571                                                        ch & 0x7F, c & 0x7F);
3572                               Lisp_Object lstr = composite_char_string (emch);
3573                               saved_n = n;
3574                               saved_src = src;
3575                               in_composite = 1;
3576                               src = XSTRING_DATA   (lstr);
3577                               n   = XSTRING_LENGTH (lstr);
3578                             }
3579                         }
3580                       else
3581 #endif /* ENABLE_COMPOSITE_CHARS */
3582                         {
3583                           encode_ucs4(charset, ch, c, dst);
3584                         }
3585                       ch = 0;
3586                     }
3587                   else
3588                     {
3589                       ch = c;
3590                       char_boundary = 0;
3591                     }
3592                   break;
3593                 case 4:
3594                   if (ch)
3595                     {
3596                       encode_ucs4 (charset, ch, c, dst);
3597                       ch = 0;
3598                     }
3599                   else
3600                     {
3601                       ch = c;
3602                       char_boundary = 0;
3603                     }
3604                   break;
3605                 default:
3606                   abort ();
3607                 }
3608             }
3609         }
3610     }
3611
3612 #ifdef ENABLE_COMPOSITE_CHARS
3613   if (in_composite)
3614     {
3615       n = saved_n;
3616       src = saved_src;
3617       in_composite = 0;
3618       goto back_to_square_n; /* Wheeeeeeeee ..... */
3619     }
3620 #endif /* ENABLE_COMPOSITE_CHARS */
3621
3622   str->flags = flags;
3623   str->ch = ch;
3624   str->iso2022.current_char_boundary = char_boundary;
3625   str->iso2022.current_charset = charset;
3626
3627   /* Verbum caro factum est! */
3628 }
3629
3630 \f
3631 /************************************************************************/
3632 /*                           UTF-8 methods                              */
3633 /************************************************************************/
3634
3635 static int
3636 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3637                     unsigned int n)
3638 {
3639   while (n--)
3640     {
3641       unsigned char c = *src++;
3642       switch (st->utf8.in_byte)
3643         {
3644         case 0:
3645           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3646             return 0;
3647           else if (c >= 0xfc)
3648             st->utf8.in_byte = 5;
3649           else if (c >= 0xf8)
3650             st->utf8.in_byte = 4;
3651           else if (c >= 0xf0)
3652             st->utf8.in_byte = 3;
3653           else if (c >= 0xe0)
3654             st->utf8.in_byte = 2;
3655           else if (c >= 0xc0)
3656             st->utf8.in_byte = 1;
3657           else if (c >= 0x80)
3658             return 0;
3659           break;
3660         default:
3661           if ((c & 0xc0) != 0x80)
3662             return 0;
3663           else
3664             st->utf8.in_byte--;
3665         }
3666     }
3667   return CODING_CATEGORY_UTF8_MASK;
3668 }
3669
3670 static void
3671 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3672                     unsigned_char_dynarr *dst, unsigned int n)
3673 {
3674   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3675   unsigned int flags  = str->flags;
3676   unsigned int ch     = str->ch;
3677   eol_type_t eol_type = str->eol_type;
3678   unsigned char counter = str->counter;
3679
3680   while (n--)
3681     {
3682       unsigned char c = *src++;
3683       switch (counter)
3684         {
3685         case 0:
3686           if ( c >= 0xfc )
3687             {
3688               ch = c & 0x01;
3689               counter = 5;
3690             }
3691           else if ( c >= 0xf8 )
3692             {
3693               ch = c & 0x03;
3694               counter = 4;
3695             }
3696           else if ( c >= 0xf0 )
3697             {
3698               ch = c & 0x07;
3699               counter = 3;
3700             }
3701           else if ( c >= 0xe0 )
3702             {
3703               ch = c & 0x0f;
3704               counter = 2;
3705             }
3706           else if ( c >= 0xc0 )
3707             {
3708               ch = c & 0x1f;
3709               counter = 1;
3710             }
3711           else
3712             {
3713               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3714               decode_ucs4 (c, dst);
3715             }
3716           break;
3717         case 1:
3718           ch = ( ch << 6 ) | ( c & 0x3f );
3719           decode_ucs4 (ch, dst);
3720           ch = 0;
3721           counter = 0;
3722           break;
3723         default:
3724           ch = ( ch << 6 ) | ( c & 0x3f );
3725           counter--;
3726         }
3727     label_continue_loop:;
3728     }
3729
3730   if (flags & CODING_STATE_END)
3731     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3732
3733   str->flags = flags;
3734   str->ch    = ch;
3735   str->counter = counter;
3736 }
3737
3738 static void
3739 encode_utf8 (Lisp_Object charset,
3740              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3741 {
3742   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3743   if ( code <= 0x7f )
3744     {
3745       Dynarr_add (dst, code);
3746     }
3747   else if ( code <= 0x7ff )
3748     {
3749       Dynarr_add (dst, (code >> 6) | 0xc0);
3750       Dynarr_add (dst, (code & 0x3f) | 0x80);
3751     }
3752   else if ( code <= 0xffff )
3753     {
3754       Dynarr_add (dst,  (code >> 12) | 0xe0);
3755       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3756       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3757     }
3758   else if ( code <= 0x1fffff )
3759     {
3760       Dynarr_add (dst,  (code >> 18) | 0xf0);
3761       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3762       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3763       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3764     }
3765   else if ( code <= 0x3ffffff )
3766     {
3767       Dynarr_add (dst,  (code >> 24) | 0xf8);
3768       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3769       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3770       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3771       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3772     }
3773   else
3774     {
3775       Dynarr_add (dst,  (code >> 30) | 0xfc);
3776       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3777       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3778       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3779       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
3780       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
3781     }
3782 }
3783
3784 static void
3785 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3786                     unsigned_char_dynarr *dst, unsigned int n)
3787 {
3788   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3789   unsigned int flags  = str->flags;
3790   unsigned int ch     = str->ch;
3791   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3792   unsigned char char_boundary = str->iso2022.current_char_boundary;
3793   Lisp_Object charset = str->iso2022.current_charset;
3794
3795 #ifdef ENABLE_COMPOSITE_CHARS
3796   /* flags for handling composite chars.  We do a little switcharoo
3797      on the source while we're outputting the composite char. */
3798   unsigned int saved_n = 0;
3799   CONST unsigned char *saved_src = NULL;
3800   int in_composite = 0;
3801
3802  back_to_square_n:
3803 #endif /* ENABLE_COMPOSITE_CHARS */
3804
3805   while (n--)
3806     {
3807       unsigned char c = *src++;
3808
3809       if (BYTE_ASCII_P (c))
3810         {               /* Processing ASCII character */
3811           ch = 0;
3812           if (c == '\n')
3813             {
3814               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3815                 Dynarr_add (dst, '\r');
3816               if (eol_type != EOL_CR)
3817                 Dynarr_add (dst, c);
3818             }
3819           else
3820             encode_utf8 (Vcharset_ascii, c, 0, dst);
3821           char_boundary = 1;
3822         }
3823       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3824         { /* Processing Leading Byte */
3825           ch = 0;
3826           charset = CHARSET_BY_LEADING_BYTE (c);
3827           if (LEADING_BYTE_PREFIX_P(c))
3828             ch = c;
3829           char_boundary = 0;
3830         }
3831       else
3832         {                       /* Processing Non-ASCII character */
3833           char_boundary = 1;
3834           if (EQ (charset, Vcharset_control_1))
3835             {
3836               encode_utf8 (Vcharset_control_1, c, 0, dst);
3837             }
3838           else
3839             {
3840               switch (XCHARSET_REP_BYTES (charset))
3841                 {
3842                 case 2:
3843                   encode_utf8 (charset, c, 0, dst);
3844                   break;
3845                 case 3:
3846                   if (XCHARSET_PRIVATE_P (charset))
3847                     {
3848                       encode_utf8 (charset, c, 0, dst);
3849                       ch = 0;
3850                     }
3851                   else if (ch)
3852                     {
3853 #ifdef ENABLE_COMPOSITE_CHARS
3854                       if (EQ (charset, Vcharset_composite))
3855                         {
3856                           if (in_composite)
3857                             {
3858                               /* #### Bother! We don't know how to
3859                                  handle this yet. */
3860                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
3861                             }
3862                           else
3863                             {
3864                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3865                                                        ch & 0x7F, c & 0x7F);
3866                               Lisp_Object lstr = composite_char_string (emch);
3867                               saved_n = n;
3868                               saved_src = src;
3869                               in_composite = 1;
3870                               src = XSTRING_DATA   (lstr);
3871                               n   = XSTRING_LENGTH (lstr);
3872                             }
3873                         }
3874                       else
3875 #endif /* ENABLE_COMPOSITE_CHARS */
3876                         {
3877                           encode_utf8 (charset, ch, c, dst);
3878                         }
3879                       ch = 0;
3880                     }
3881                   else
3882                     {
3883                       ch = c;
3884                       char_boundary = 0;
3885                     }
3886                   break;
3887                 case 4:
3888                   if (ch)
3889                     {
3890                       encode_utf8 (charset, ch, c, dst);
3891                       ch = 0;
3892                     }
3893                   else
3894                     {
3895                       ch = c;
3896                       char_boundary = 0;
3897                     }
3898                   break;
3899                 default:
3900                   abort ();
3901                 }
3902             }
3903         }
3904     }
3905
3906 #ifdef ENABLE_COMPOSITE_CHARS
3907   if (in_composite)
3908     {
3909       n = saved_n;
3910       src = saved_src;
3911       in_composite = 0;
3912       goto back_to_square_n; /* Wheeeeeeeee ..... */
3913     }
3914 #endif
3915
3916   str->flags = flags;
3917   str->ch    = ch;
3918   str->iso2022.current_char_boundary = char_boundary;
3919   str->iso2022.current_charset = charset;
3920
3921   /* Verbum caro factum est! */
3922 }
3923
3924 \f
3925 /************************************************************************/
3926 /*                           ISO2022 methods                            */
3927 /************************************************************************/
3928
3929 /* The following note describes the coding system ISO2022 briefly.
3930    Since the intention of this note is to help understand the
3931    functions in this file, some parts are NOT ACCURATE or OVERLY
3932    SIMPLIFIED.  For thorough understanding, please refer to the
3933    original document of ISO2022.
3934
3935    ISO2022 provides many mechanisms to encode several character sets
3936    in 7-bit and 8-bit environments.  For 7-bit environments, all text
3937    is encoded using bytes less than 128.  This may make the encoded
3938    text a little bit longer, but the text passes more easily through
3939    several gateways, some of which strip off MSB (Most Signigant Bit).
3940
3941    There are two kinds of character sets: control character set and
3942    graphic character set.  The former contains control characters such
3943    as `newline' and `escape' to provide control functions (control
3944    functions are also provided by escape sequences).  The latter
3945    contains graphic characters such as 'A' and '-'.  Emacs recognizes
3946    two control character sets and many graphic character sets.
3947
3948    Graphic character sets are classified into one of the following
3949    four classes, according to the number of bytes (DIMENSION) and
3950    number of characters in one dimension (CHARS) of the set:
3951    - DIMENSION1_CHARS94
3952    - DIMENSION1_CHARS96
3953    - DIMENSION2_CHARS94
3954    - DIMENSION2_CHARS96
3955
3956    In addition, each character set is assigned an identification tag,
3957    unique for each set, called "final character" (denoted as <F>
3958    hereafter).  The <F> of each character set is decided by ECMA(*)
3959    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
3960    (0x30..0x3F are for private use only).
3961
3962    Note (*): ECMA = European Computer Manufacturers Association
3963
3964    Here are examples of graphic character set [NAME(<F>)]:
3965         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3966         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3967         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3968         o DIMENSION2_CHARS96 -- none for the moment
3969
3970    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3971         C0 [0x00..0x1F] -- control character plane 0
3972         GL [0x20..0x7F] -- graphic character plane 0
3973         C1 [0x80..0x9F] -- control character plane 1
3974         GR [0xA0..0xFF] -- graphic character plane 1
3975
3976    A control character set is directly designated and invoked to C0 or
3977    C1 by an escape sequence.  The most common case is that:
3978    - ISO646's  control character set is designated/invoked to C0, and
3979    - ISO6429's control character set is designated/invoked to C1,
3980    and usually these designations/invocations are omitted in encoded
3981    text.  In a 7-bit environment, only C0 can be used, and a control
3982    character for C1 is encoded by an appropriate escape sequence to
3983    fit into the environment.  All control characters for C1 are
3984    defined to have corresponding escape sequences.
3985
3986    A graphic character set is at first designated to one of four
3987    graphic registers (G0 through G3), then these graphic registers are
3988    invoked to GL or GR.  These designations and invocations can be
3989    done independently.  The most common case is that G0 is invoked to
3990    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
3991    these invocations and designations are omitted in encoded text.
3992    In a 7-bit environment, only GL can be used.
3993
3994    When a graphic character set of CHARS94 is invoked to GL, codes
3995    0x20 and 0x7F of the GL area work as control characters SPACE and
3996    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3997    be used.
3998
3999    There are two ways of invocation: locking-shift and single-shift.
4000    With locking-shift, the invocation lasts until the next different
4001    invocation, whereas with single-shift, the invocation affects the
4002    following character only and doesn't affect the locking-shift
4003    state.  Invocations are done by the following control characters or
4004    escape sequences:
4005
4006    ----------------------------------------------------------------------
4007    abbrev  function                  cntrl escape seq   description
4008    ----------------------------------------------------------------------
4009    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4010    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4011    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4012    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4013    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4014    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4015    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4016    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4017    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4018    ----------------------------------------------------------------------
4019    (*) These are not used by any known coding system.
4020
4021    Control characters for these functions are defined by macros
4022    ISO_CODE_XXX in `coding.h'.
4023
4024    Designations are done by the following escape sequences:
4025    ----------------------------------------------------------------------
4026    escape sequence      description
4027    ----------------------------------------------------------------------
4028    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4029    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4030    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4031    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4032    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4033    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4034    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4035    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4036    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4037    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4038    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4039    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4040    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4041    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4042    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4043    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4044    ----------------------------------------------------------------------
4045
4046    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4047    of dimension 1, chars 94, and final character <F>, etc...
4048
4049    Note (*): Although these designations are not allowed in ISO2022,
4050    Emacs accepts them on decoding, and produces them on encoding
4051    CHARS96 character sets in a coding system which is characterized as
4052    7-bit environment, non-locking-shift, and non-single-shift.
4053
4054    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4055    '(' can be omitted.  We refer to this as "short-form" hereafter.
4056
4057    Now you may notice that there are a lot of ways for encoding the
4058    same multilingual text in ISO2022.  Actually, there exist many
4059    coding systems such as Compound Text (used in X11's inter client
4060    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4061    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4062    localized platforms), and all of these are variants of ISO2022.
4063
4064    In addition to the above, Emacs handles two more kinds of escape
4065    sequences: ISO6429's direction specification and Emacs' private
4066    sequence for specifying character composition.
4067
4068    ISO6429's direction specification takes the following form:
4069         o CSI ']'      -- end of the current direction
4070         o CSI '0' ']'  -- end of the current direction
4071         o CSI '1' ']'  -- start of left-to-right text
4072         o CSI '2' ']'  -- start of right-to-left text
4073    The control character CSI (0x9B: control sequence introducer) is
4074    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4075
4076    Character composition specification takes the following form:
4077         o ESC '0' -- start character composition
4078         o ESC '1' -- end character composition
4079    Since these are not standard escape sequences of any ISO standard,
4080    their use with these meanings is restricted to Emacs only.  */
4081
4082 static void
4083 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4084 {
4085   int i;
4086
4087   for (i = 0; i < 4; i++)
4088     {
4089       if (!NILP (coding_system))
4090         iso->charset[i] =
4091           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4092       else
4093         iso->charset[i] = Qt;
4094       iso->invalid_designated[i] = 0;
4095     }
4096   iso->esc = ISO_ESC_NOTHING;
4097   iso->esc_bytes_index = 0;
4098   iso->register_left = 0;
4099   iso->register_right = 1;
4100   iso->switched_dir_and_no_valid_charset_yet = 0;
4101   iso->invalid_switch_dir = 0;
4102   iso->output_direction_sequence = 0;
4103   iso->output_literally = 0;
4104 #ifdef ENABLE_COMPOSITE_CHARS
4105   if (iso->composite_chars)
4106     Dynarr_reset (iso->composite_chars);
4107 #endif
4108 }
4109
4110 static int
4111 fit_to_be_escape_quoted (unsigned char c)
4112 {
4113   switch (c)
4114     {
4115     case ISO_CODE_ESC:
4116     case ISO_CODE_CSI:
4117     case ISO_CODE_SS2:
4118     case ISO_CODE_SS3:
4119     case ISO_CODE_SO:
4120     case ISO_CODE_SI:
4121       return 1;
4122
4123     default:
4124       return 0;
4125     }
4126 }
4127
4128 /* Parse one byte of an ISO2022 escape sequence.
4129    If the result is an invalid escape sequence, return 0 and
4130    do not change anything in STR.  Otherwise, if the result is
4131    an incomplete escape sequence, update ISO2022.ESC and
4132    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4133    all the state variables (but not ISO2022.ESC_BYTES) and
4134    return 1.
4135
4136    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4137    or invocation of an invalid character set and treat that as
4138    an unrecognized escape sequence. */
4139
4140 static int
4141 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4142                    unsigned char c, unsigned int *flags,
4143                    int check_invalid_charsets)
4144 {
4145   /* (1) If we're at the end of a designation sequence, CS is the
4146      charset being designated and REG is the register to designate
4147      it to.
4148
4149      (2) If we're at the end of a locking-shift sequence, REG is
4150      the register to invoke and HALF (0 == left, 1 == right) is
4151      the half to invoke it into.
4152
4153      (3) If we're at the end of a single-shift sequence, REG is
4154      the register to invoke. */
4155   Lisp_Object cs = Qnil;
4156   int reg, half;
4157
4158   /* NOTE: This code does goto's all over the fucking place.
4159      The reason for this is that we're basically implementing
4160      a state machine here, and hierarchical languages like C
4161      don't really provide a clean way of doing this. */
4162
4163   if (! (*flags & CODING_STATE_ESCAPE))
4164     /* At beginning of escape sequence; we need to reset our
4165        escape-state variables. */
4166     iso->esc = ISO_ESC_NOTHING;
4167
4168   iso->output_literally = 0;
4169   iso->output_direction_sequence = 0;
4170
4171   switch (iso->esc)
4172     {
4173     case ISO_ESC_NOTHING:
4174       iso->esc_bytes_index = 0;
4175       switch (c)
4176         {
4177         case ISO_CODE_ESC:      /* Start escape sequence */
4178           *flags |= CODING_STATE_ESCAPE;
4179           iso->esc = ISO_ESC;
4180           goto not_done;
4181
4182         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4183           *flags |= CODING_STATE_ESCAPE;
4184           iso->esc = ISO_ESC_5_11;
4185           goto not_done;
4186
4187         case ISO_CODE_SO:       /* locking shift 1 */
4188           reg = 1; half = 0;
4189           goto locking_shift;
4190         case ISO_CODE_SI:       /* locking shift 0 */
4191           reg = 0; half = 0;
4192           goto locking_shift;
4193
4194         case ISO_CODE_SS2:      /* single shift */
4195           reg = 2;
4196           goto single_shift;
4197         case ISO_CODE_SS3:      /* single shift */
4198           reg = 3;
4199           goto single_shift;
4200
4201         default:                        /* Other control characters */
4202           return 0;
4203         }
4204
4205     case ISO_ESC:
4206       switch (c)
4207         {
4208           /**** single shift ****/
4209
4210         case 'N':       /* single shift 2 */
4211           reg = 2;
4212           goto single_shift;
4213         case 'O':       /* single shift 3 */
4214           reg = 3;
4215           goto single_shift;
4216
4217           /**** locking shift ****/
4218
4219         case '~':       /* locking shift 1 right */
4220           reg = 1; half = 1;
4221           goto locking_shift;
4222         case 'n':       /* locking shift 2 */
4223           reg = 2; half = 0;
4224           goto locking_shift;
4225         case '}':       /* locking shift 2 right */
4226           reg = 2; half = 1;
4227           goto locking_shift;
4228         case 'o':       /* locking shift 3 */
4229           reg = 3; half = 0;
4230           goto locking_shift;
4231         case '|':       /* locking shift 3 right */
4232           reg = 3; half = 1;
4233           goto locking_shift;
4234
4235 #ifdef ENABLE_COMPOSITE_CHARS
4236           /**** composite ****/
4237
4238         case '0':
4239           iso->esc = ISO_ESC_START_COMPOSITE;
4240           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4241             CODING_STATE_COMPOSITE;
4242           return 1;
4243
4244         case '1':
4245           iso->esc = ISO_ESC_END_COMPOSITE;
4246           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4247             ~CODING_STATE_COMPOSITE;
4248           return 1;
4249 #endif /* ENABLE_COMPOSITE_CHARS */
4250
4251           /**** directionality ****/
4252
4253         case '[':
4254           iso->esc = ISO_ESC_5_11;
4255           goto not_done;
4256
4257           /**** designation ****/
4258
4259         case '$':       /* multibyte charset prefix */
4260           iso->esc = ISO_ESC_2_4;
4261           goto not_done;
4262
4263         default:
4264           if (0x28 <= c && c <= 0x2F)
4265             {
4266               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4267               goto not_done;
4268             }
4269
4270           /* This function is called with CODESYS equal to nil when
4271              doing coding-system detection. */
4272           if (!NILP (codesys)
4273               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4274               && fit_to_be_escape_quoted (c))
4275             {
4276               iso->esc = ISO_ESC_LITERAL;
4277               *flags &= CODING_STATE_ISO2022_LOCK;
4278               return 1;
4279             }
4280
4281           /* bzzzt! */
4282           return 0;
4283         }
4284
4285
4286
4287       /**** directionality ****/
4288
4289     case ISO_ESC_5_11:          /* ISO6429 direction control */
4290       if (c == ']')
4291         {
4292           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4293           goto directionality;
4294         }
4295       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4296       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4297       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4298       else               return 0;
4299       goto not_done;
4300
4301     case ISO_ESC_5_11_0:
4302       if (c == ']')
4303         {
4304           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4305           goto directionality;
4306         }
4307       return 0;
4308
4309     case ISO_ESC_5_11_1:
4310       if (c == ']')
4311         {
4312           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4313           goto directionality;
4314         }
4315       return 0;
4316
4317     case ISO_ESC_5_11_2:
4318       if (c == ']')
4319         {
4320           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4321           goto directionality;
4322         }
4323       return 0;
4324
4325     directionality:
4326       iso->esc = ISO_ESC_DIRECTIONALITY;
4327       /* Various junk here to attempt to preserve the direction sequences
4328          literally in the text if they would otherwise be swallowed due
4329          to invalid designations that don't show up as actual charset
4330          changes in the text. */
4331       if (iso->invalid_switch_dir)
4332         {
4333           /* We already inserted a direction switch literally into the
4334              text.  We assume (#### this may not be right) that the
4335              next direction switch is the one going the other way,
4336              and we need to output that literally as well. */
4337           iso->output_literally = 1;
4338           iso->invalid_switch_dir = 0;
4339         }
4340       else
4341         {
4342           int jj;
4343
4344           /* If we are in the thrall of an invalid designation,
4345            then stick the directionality sequence literally into the
4346            output stream so it ends up in the original text again. */
4347           for (jj = 0; jj < 4; jj++)
4348             if (iso->invalid_designated[jj])
4349               break;
4350           if (jj < 4)
4351             {
4352               iso->output_literally = 1;
4353               iso->invalid_switch_dir = 1;
4354             }
4355           else
4356             /* Indicate that we haven't yet seen a valid designation,
4357                so that if a switch-dir is directly followed by an
4358                invalid designation, both get inserted literally. */
4359             iso->switched_dir_and_no_valid_charset_yet = 1;
4360         }
4361       return 1;
4362
4363
4364       /**** designation ****/
4365
4366     case ISO_ESC_2_4:
4367       if (0x28 <= c && c <= 0x2F)
4368         {
4369           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4370           goto not_done;
4371         }
4372       if (0x40 <= c && c <= 0x42)
4373         {
4374           cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4375                                       *flags & CODING_STATE_R2L ?
4376                                       CHARSET_RIGHT_TO_LEFT :
4377                                       CHARSET_LEFT_TO_RIGHT);
4378           reg = 0;
4379           goto designated;
4380         }
4381       return 0;
4382
4383     default:
4384       {
4385         int type =-1;
4386
4387         if (c < '0' || c > '~')
4388           return 0; /* bad final byte */
4389
4390         if (iso->esc >= ISO_ESC_2_8 &&
4391             iso->esc <= ISO_ESC_2_15)
4392           {
4393             type = ((iso->esc >= ISO_ESC_2_12) ?
4394                     CHARSET_TYPE_96 : CHARSET_TYPE_94);
4395             reg = (iso->esc - ISO_ESC_2_8) & 3;
4396           }
4397         else if (iso->esc >= ISO_ESC_2_4_8 &&
4398                  iso->esc <= ISO_ESC_2_4_15)
4399           {
4400             type = ((iso->esc >= ISO_ESC_2_4_12) ?
4401                     CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4402             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4403           }
4404         else
4405           {
4406             /* Can this ever be reached? -slb */
4407             abort();
4408           }
4409
4410         cs = CHARSET_BY_ATTRIBUTES (type, c,
4411                                     *flags & CODING_STATE_R2L ?
4412                                     CHARSET_RIGHT_TO_LEFT :
4413                                     CHARSET_LEFT_TO_RIGHT);
4414         goto designated;
4415       }
4416     }
4417
4418  not_done:
4419   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4420   return -1;
4421
4422  single_shift:
4423   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4424     /* can't invoke something that ain't there. */
4425     return 0;
4426   iso->esc = ISO_ESC_SINGLE_SHIFT;
4427   *flags &= CODING_STATE_ISO2022_LOCK;
4428   if (reg == 2)
4429     *flags |= CODING_STATE_SS2;
4430   else
4431     *flags |= CODING_STATE_SS3;
4432   return 1;
4433
4434  locking_shift:
4435   if (check_invalid_charsets &&
4436       !CHARSETP (iso->charset[reg]))
4437     /* can't invoke something that ain't there. */
4438     return 0;
4439   if (half)
4440     iso->register_right = reg;
4441   else
4442     iso->register_left = reg;
4443   *flags &= CODING_STATE_ISO2022_LOCK;
4444   iso->esc = ISO_ESC_LOCKING_SHIFT;
4445   return 1;
4446
4447  designated:
4448   if (NILP (cs) && check_invalid_charsets)
4449     {
4450       iso->invalid_designated[reg] = 1;
4451       iso->charset[reg] = Vcharset_ascii;
4452       iso->esc = ISO_ESC_DESIGNATE;
4453       *flags &= CODING_STATE_ISO2022_LOCK;
4454       iso->output_literally = 1;
4455       if (iso->switched_dir_and_no_valid_charset_yet)
4456         {
4457           /* We encountered a switch-direction followed by an
4458              invalid designation.  Ensure that the switch-direction
4459              gets outputted; otherwise it will probably get eaten
4460              when the text is written out again. */
4461           iso->switched_dir_and_no_valid_charset_yet = 0;
4462           iso->output_direction_sequence = 1;
4463           /* And make sure that the switch-dir going the other
4464              way gets outputted, as well. */
4465           iso->invalid_switch_dir = 1;
4466         }
4467       return 1;
4468     }
4469   /* This function is called with CODESYS equal to nil when
4470      doing coding-system detection. */
4471   if (!NILP (codesys))
4472     {
4473       charset_conversion_spec_dynarr *dyn =
4474         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4475
4476       if (dyn)
4477         {
4478           int i;
4479
4480           for (i = 0; i < Dynarr_length (dyn); i++)
4481             {
4482               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4483               if (EQ (cs, spec->from_charset))
4484                 cs = spec->to_charset;
4485             }
4486         }
4487     }
4488
4489   iso->charset[reg] = cs;
4490   iso->esc = ISO_ESC_DESIGNATE;
4491   *flags &= CODING_STATE_ISO2022_LOCK;
4492   if (iso->invalid_designated[reg])
4493     {
4494       iso->invalid_designated[reg] = 0;
4495       iso->output_literally = 1;
4496     }
4497   if (iso->switched_dir_and_no_valid_charset_yet)
4498     iso->switched_dir_and_no_valid_charset_yet = 0;
4499   return 1;
4500 }
4501
4502 static int
4503 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4504                        unsigned int n)
4505 {
4506   int mask;
4507
4508   /* #### There are serious deficiencies in the recognition mechanism
4509      here.  This needs to be much smarter if it's going to cut it.
4510      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4511      it should be detected as Latin-1.
4512      All the ISO2022 stuff in this file should be synced up with the
4513      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4514      Perhaps we should wait till R2L works in FSF Emacs? */
4515
4516   if (!st->iso2022.initted)
4517     {
4518       reset_iso2022 (Qnil, &st->iso2022.iso);
4519       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4520                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4521                           CODING_CATEGORY_ISO_8_1_MASK |
4522                           CODING_CATEGORY_ISO_8_2_MASK |
4523                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4524       st->iso2022.flags = 0;
4525       st->iso2022.high_byte_count = 0;
4526       st->iso2022.saw_single_shift = 0;
4527       st->iso2022.initted = 1;
4528     }
4529
4530   mask = st->iso2022.mask;
4531
4532   while (n--)
4533     {
4534       int c = *src++;
4535       if (c >= 0xA0)
4536         {
4537           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4538           st->iso2022.high_byte_count++;
4539         }
4540       else
4541         {
4542           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4543             {
4544               if (st->iso2022.high_byte_count & 1)
4545                 /* odd number of high bytes; assume not iso-8-2 */
4546                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4547             }
4548           st->iso2022.high_byte_count = 0;
4549           st->iso2022.saw_single_shift = 0;
4550           if (c > 0x80)
4551             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4552         }
4553       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4554           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4555         { /* control chars */
4556           switch (c)
4557             {
4558               /* Allow and ignore control characters that you might
4559                  reasonably see in a text file */
4560             case '\r':
4561             case '\n':
4562             case '\t':
4563             case  7: /* bell */
4564             case  8: /* backspace */
4565             case 11: /* vertical tab */
4566             case 12: /* form feed */
4567             case 26: /* MS-DOS C-z junk */
4568             case 31: /* '^_' -- for info */
4569               goto label_continue_loop;
4570
4571             default:
4572               break;
4573             }
4574         }
4575
4576       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4577           || BYTE_C1_P (c))
4578         {
4579           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4580                                  &st->iso2022.flags, 0))
4581             {
4582               switch (st->iso2022.iso.esc)
4583                 {
4584                 case ISO_ESC_DESIGNATE:
4585                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4586                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4587                   break;
4588                 case ISO_ESC_LOCKING_SHIFT:
4589                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4590                   goto ran_out_of_chars;
4591                 case ISO_ESC_SINGLE_SHIFT:
4592                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4593                   st->iso2022.saw_single_shift = 1;
4594                   break;
4595                 default:
4596                   break;
4597                 }
4598             }
4599           else
4600             {
4601               mask = 0;
4602               goto ran_out_of_chars;
4603             }
4604         }
4605     label_continue_loop:;
4606     }
4607
4608  ran_out_of_chars:
4609
4610   return mask;
4611 }
4612
4613 static int
4614 postprocess_iso2022_mask (int mask)
4615 {
4616   /* #### kind of cheesy */
4617   /* If seven-bit ISO is allowed, then assume that the encoding is
4618      entirely seven-bit and turn off the eight-bit ones. */
4619   if (mask & CODING_CATEGORY_ISO_7_MASK)
4620     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4621                CODING_CATEGORY_ISO_8_1_MASK |
4622                CODING_CATEGORY_ISO_8_2_MASK);
4623   return mask;
4624 }
4625
4626 /* If FLAGS is a null pointer or specifies right-to-left motion,
4627    output a switch-dir-to-left-to-right sequence to DST.
4628    Also update FLAGS if it is not a null pointer.
4629    If INTERNAL_P is set, we are outputting in internal format and
4630    need to handle the CSI differently. */
4631
4632 static void
4633 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4634                                  unsigned_char_dynarr *dst,
4635                                  unsigned int *flags,
4636                                  int internal_p)
4637 {
4638   if (!flags || (*flags & CODING_STATE_R2L))
4639     {
4640       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4641         {
4642           Dynarr_add (dst, ISO_CODE_ESC);
4643           Dynarr_add (dst, '[');
4644         }
4645       else if (internal_p)
4646         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4647       else
4648         Dynarr_add (dst, ISO_CODE_CSI);
4649       Dynarr_add (dst, '0');
4650       Dynarr_add (dst, ']');
4651       if (flags)
4652         *flags &= ~CODING_STATE_R2L;
4653     }
4654 }
4655
4656 /* If FLAGS is a null pointer or specifies a direction different from
4657    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4658    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4659    sequence to DST.  Also update FLAGS if it is not a null pointer.
4660    If INTERNAL_P is set, we are outputting in internal format and
4661    need to handle the CSI differently. */
4662
4663 static void
4664 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4665                           unsigned_char_dynarr *dst, unsigned int *flags,
4666                           int internal_p)
4667 {
4668   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4669       direction == CHARSET_LEFT_TO_RIGHT)
4670     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4671   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4672            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4673            direction == CHARSET_RIGHT_TO_LEFT)
4674     {
4675       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4676         {
4677           Dynarr_add (dst, ISO_CODE_ESC);
4678           Dynarr_add (dst, '[');
4679         }
4680       else if (internal_p)
4681         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4682       else
4683         Dynarr_add (dst, ISO_CODE_CSI);
4684       Dynarr_add (dst, '2');
4685       Dynarr_add (dst, ']');
4686       if (flags)
4687         *flags |= CODING_STATE_R2L;
4688     }
4689 }
4690
4691 /* Convert ISO2022-format data to internal format. */
4692
4693 static void
4694 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4695                        unsigned_char_dynarr *dst, unsigned int n)
4696 {
4697   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4698   unsigned int flags  = str->flags;
4699   unsigned int ch     = str->ch;
4700   eol_type_t eol_type = str->eol_type;
4701 #ifdef ENABLE_COMPOSITE_CHARS
4702   unsigned_char_dynarr *real_dst = dst;
4703 #endif
4704   Lisp_Object coding_system;
4705
4706   XSETCODING_SYSTEM (coding_system, str->codesys);
4707
4708 #ifdef ENABLE_COMPOSITE_CHARS
4709   if (flags & CODING_STATE_COMPOSITE)
4710     dst = str->iso2022.composite_chars;
4711 #endif /* ENABLE_COMPOSITE_CHARS */
4712
4713   while (n--)
4714     {
4715       unsigned char c = *src++;
4716       if (flags & CODING_STATE_ESCAPE)
4717         {       /* Within ESC sequence */
4718           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4719                                           c, &flags, 1);
4720
4721           if (retval)
4722             {
4723               switch (str->iso2022.esc)
4724                 {
4725 #ifdef ENABLE_COMPOSITE_CHARS
4726                 case ISO_ESC_START_COMPOSITE:
4727                   if (str->iso2022.composite_chars)
4728                     Dynarr_reset (str->iso2022.composite_chars);
4729                   else
4730                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4731                   dst = str->iso2022.composite_chars;
4732                   break;
4733                 case ISO_ESC_END_COMPOSITE:
4734                   {
4735                     Bufbyte comstr[MAX_EMCHAR_LEN];
4736                     Bytecount len;
4737                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4738                                                          Dynarr_length (dst));
4739                     dst = real_dst;
4740                     len = set_charptr_emchar (comstr, emch);
4741                     Dynarr_add_many (dst, comstr, len);
4742                     break;
4743                   }
4744 #endif /* ENABLE_COMPOSITE_CHARS */
4745
4746                 case ISO_ESC_LITERAL:
4747                   DECODE_ADD_BINARY_CHAR (c, dst);
4748                   break;
4749
4750                 default:
4751                   /* Everything else handled already */
4752                   break;
4753                 }
4754             }
4755
4756           /* Attempted error recovery. */
4757           if (str->iso2022.output_direction_sequence)
4758             ensure_correct_direction (flags & CODING_STATE_R2L ?
4759                                       CHARSET_RIGHT_TO_LEFT :
4760                                       CHARSET_LEFT_TO_RIGHT,
4761                                       str->codesys, dst, 0, 1);
4762           /* More error recovery. */
4763           if (!retval || str->iso2022.output_literally)
4764             {
4765               /* Output the (possibly invalid) sequence */
4766               int i;
4767               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4768                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4769               flags &= CODING_STATE_ISO2022_LOCK;
4770               if (!retval)
4771                 n++, src--;/* Repeat the loop with the same character. */
4772               else
4773                 {
4774                   /* No sense in reprocessing the final byte of the
4775                      escape sequence; it could mess things up anyway.
4776                      Just add it now. */
4777                   DECODE_ADD_BINARY_CHAR (c, dst);
4778                 }
4779             }
4780           ch = 0;
4781         }
4782       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4783         { /* Control characters */
4784
4785           /***** Error-handling *****/
4786
4787           /* If we were in the middle of a character, dump out the
4788              partial character. */
4789           DECODE_OUTPUT_PARTIAL_CHAR (ch);
4790
4791           /* If we just saw a single-shift character, dump it out.
4792              This may dump out the wrong sort of single-shift character,
4793              but least it will give an indication that something went
4794              wrong. */
4795           if (flags & CODING_STATE_SS2)
4796             {
4797               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4798               flags &= ~CODING_STATE_SS2;
4799             }
4800           if (flags & CODING_STATE_SS3)
4801             {
4802               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4803               flags &= ~CODING_STATE_SS3;
4804             }
4805
4806           /***** Now handle the control characters. *****/
4807
4808           /* Handle CR/LF */
4809           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4810
4811           flags &= CODING_STATE_ISO2022_LOCK;
4812
4813           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4814             DECODE_ADD_BINARY_CHAR (c, dst);
4815         }
4816       else
4817         {                       /* Graphic characters */
4818           Lisp_Object charset;
4819           int lb;
4820           int reg;
4821
4822           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4823
4824           /* Now determine the charset. */
4825           reg = ((flags & CODING_STATE_SS2) ? 2
4826                  : (flags & CODING_STATE_SS3) ? 3
4827                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4828                  : str->iso2022.register_left);
4829           charset = str->iso2022.charset[reg];
4830
4831           /* Error checking: */
4832           if (! CHARSETP (charset)
4833               || str->iso2022.invalid_designated[reg]
4834               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4835                   && XCHARSET_CHARS (charset) == 94))
4836             /* Mrmph.  We are trying to invoke a register that has no
4837                or an invalid charset in it, or trying to add a character
4838                outside the range of the charset.  Insert that char literally
4839                to preserve it for the output. */
4840             {
4841               DECODE_OUTPUT_PARTIAL_CHAR (ch);
4842               DECODE_ADD_BINARY_CHAR (c, dst);
4843             }
4844
4845           else
4846             {
4847               /* Things are probably hunky-dorey. */
4848
4849               /* Fetch reverse charset, maybe. */
4850               if (((flags & CODING_STATE_R2L) &&
4851                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4852                   ||
4853                   (!(flags & CODING_STATE_R2L) &&
4854                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4855                 {
4856                   Lisp_Object new_charset =
4857                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4858                   if (!NILP (new_charset))
4859                     charset = new_charset;
4860                 }
4861
4862               lb = XCHARSET_LEADING_BYTE (charset);
4863               switch (XCHARSET_REP_BYTES (charset))
4864                 {
4865                 case 1: /* ASCII */
4866                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4867                   Dynarr_add (dst, c & 0x7F);
4868                   break;
4869
4870                 case 2: /* one-byte official */
4871                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
4872                   Dynarr_add (dst, lb);
4873                   Dynarr_add (dst, c | 0x80);
4874                   break;
4875
4876                 case 3: /* one-byte private or two-byte official */
4877                   if (XCHARSET_PRIVATE_P (charset))
4878                     {
4879                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
4880                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4881                       Dynarr_add (dst, lb);
4882                       Dynarr_add (dst, c | 0x80);
4883                     }
4884                   else
4885                     {
4886                       if (ch)
4887                         {
4888                           Dynarr_add (dst, lb);
4889                           Dynarr_add (dst, ch | 0x80);
4890                           Dynarr_add (dst, c | 0x80);
4891                           ch = 0;
4892                         }
4893                       else
4894                         ch = c;
4895                     }
4896                   break;
4897
4898                 default:        /* two-byte private */
4899                   if (ch)
4900                     {
4901                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4902                       Dynarr_add (dst, lb);
4903                       Dynarr_add (dst, ch | 0x80);
4904                       Dynarr_add (dst, c | 0x80);
4905                       ch = 0;
4906                     }
4907                   else
4908                     ch = c;
4909                 }
4910             }
4911
4912           if (!ch)
4913             flags &= CODING_STATE_ISO2022_LOCK;
4914         }
4915
4916     label_continue_loop:;
4917     }
4918
4919   if (flags & CODING_STATE_END)
4920     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4921
4922   str->flags = flags;
4923   str->ch    = ch;
4924 }
4925
4926
4927 /***** ISO2022 encoder *****/
4928
4929 /* Designate CHARSET into register REG. */
4930
4931 static void
4932 iso2022_designate (Lisp_Object charset, unsigned char reg,
4933                    struct encoding_stream *str, unsigned_char_dynarr *dst)
4934 {
4935   static CONST char inter94[] = "()*+";
4936   static CONST char inter96[] = ",-./";
4937   unsigned int type;
4938   unsigned char final;
4939   Lisp_Object old_charset = str->iso2022.charset[reg];
4940
4941   str->iso2022.charset[reg] = charset;
4942   if (!CHARSETP (charset))
4943     /* charset might be an initial nil or t. */
4944     return;
4945   type = XCHARSET_TYPE (charset);
4946   final = XCHARSET_FINAL (charset);
4947   if (!str->iso2022.force_charset_on_output[reg] &&
4948       CHARSETP (old_charset) &&
4949       XCHARSET_TYPE (old_charset) == type &&
4950       XCHARSET_FINAL (old_charset) == final)
4951     return;
4952
4953   str->iso2022.force_charset_on_output[reg] = 0;
4954
4955   {
4956     charset_conversion_spec_dynarr *dyn =
4957       str->codesys->iso2022.output_conv;
4958
4959     if (dyn)
4960       {
4961         int i;
4962
4963         for (i = 0; i < Dynarr_length (dyn); i++)
4964           {
4965             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4966             if (EQ (charset, spec->from_charset))
4967                 charset = spec->to_charset;
4968           }
4969       }
4970   }
4971
4972   Dynarr_add (dst, ISO_CODE_ESC);
4973   switch (type)
4974     {
4975     case CHARSET_TYPE_94:
4976       Dynarr_add (dst, inter94[reg]);
4977       break;
4978     case CHARSET_TYPE_96:
4979       Dynarr_add (dst, inter96[reg]);
4980       break;
4981     case CHARSET_TYPE_94X94:
4982       Dynarr_add (dst, '$');
4983       if (reg != 0
4984           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4985           || final < '@'
4986           || final > 'B')
4987         Dynarr_add (dst, inter94[reg]);
4988       break;
4989     case CHARSET_TYPE_96X96:
4990       Dynarr_add (dst, '$');
4991       Dynarr_add (dst, inter96[reg]);
4992       break;
4993     }
4994   Dynarr_add (dst, final);
4995 }
4996
4997 static void
4998 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4999 {
5000   if (str->iso2022.register_left != 0)
5001     {
5002       Dynarr_add (dst, ISO_CODE_SI);
5003       str->iso2022.register_left = 0;
5004     }
5005 }
5006
5007 static void
5008 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5009 {
5010   if (str->iso2022.register_left != 1)
5011     {
5012       Dynarr_add (dst, ISO_CODE_SO);
5013       str->iso2022.register_left = 1;
5014     }
5015 }
5016
5017 /* Convert internally-formatted data to ISO2022 format. */
5018
5019 static void
5020 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5021                        unsigned_char_dynarr *dst, unsigned int n)
5022 {
5023   unsigned char charmask, c;
5024   unsigned char char_boundary;
5025   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5026   unsigned int flags          = str->flags;
5027   unsigned int ch             = str->ch;
5028   Lisp_Coding_System *codesys = str->codesys;
5029   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5030   int i;
5031   Lisp_Object charset;
5032   int half;
5033
5034 #ifdef ENABLE_COMPOSITE_CHARS
5035   /* flags for handling composite chars.  We do a little switcharoo
5036      on the source while we're outputting the composite char. */
5037   unsigned int saved_n = 0;
5038   CONST unsigned char *saved_src = NULL;
5039   int in_composite = 0;
5040 #endif /* ENABLE_COMPOSITE_CHARS */
5041
5042   char_boundary = str->iso2022.current_char_boundary;
5043   charset = str->iso2022.current_charset;
5044   half = str->iso2022.current_half;
5045
5046 #ifdef ENABLE_COMPOSITE_CHARS
5047  back_to_square_n:
5048 #endif
5049   while (n--)
5050     {
5051       c = *src++;
5052
5053       if (BYTE_ASCII_P (c))
5054         {               /* Processing ASCII character */
5055           ch = 0;
5056
5057           restore_left_to_right_direction (codesys, dst, &flags, 0);
5058
5059           /* Make sure G0 contains ASCII */
5060           if ((c > ' ' && c < ISO_CODE_DEL) ||
5061               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5062             {
5063               ensure_normal_shift (str, dst);
5064               iso2022_designate (Vcharset_ascii, 0, str, dst);
5065             }
5066
5067           /* If necessary, restore everything to the default state
5068              at end-of-line */
5069           if (c == '\n' &&
5070               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5071             {
5072               restore_left_to_right_direction (codesys, dst, &flags, 0);
5073
5074               ensure_normal_shift (str, dst);
5075
5076               for (i = 0; i < 4; i++)
5077                 {
5078                   Lisp_Object initial_charset =
5079                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5080                   iso2022_designate (initial_charset, i, str, dst);
5081                 }
5082             }
5083           if (c == '\n')
5084             {
5085               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5086                 Dynarr_add (dst, '\r');
5087               if (eol_type != EOL_CR)
5088                 Dynarr_add (dst, c);
5089             }
5090           else
5091             {
5092               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5093                   && fit_to_be_escape_quoted (c))
5094                 Dynarr_add (dst, ISO_CODE_ESC);
5095               Dynarr_add (dst, c);
5096             }
5097           char_boundary = 1;
5098         }
5099
5100       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5101         { /* Processing Leading Byte */
5102           ch = 0;
5103           charset = CHARSET_BY_LEADING_BYTE (c);
5104           if (LEADING_BYTE_PREFIX_P(c))
5105             ch = c;
5106           else if (!EQ (charset, Vcharset_control_1)
5107 #ifdef ENABLE_COMPOSITE_CHARS
5108                    && !EQ (charset, Vcharset_composite)
5109 #endif
5110                    )
5111             {
5112               int reg;
5113
5114               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5115                                         codesys, dst, &flags, 0);
5116
5117               /* Now determine which register to use. */
5118               reg = -1;
5119               for (i = 0; i < 4; i++)
5120                 {
5121                   if (EQ (charset, str->iso2022.charset[i]) ||
5122                       EQ (charset,
5123                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5124                     {
5125                       reg = i;
5126                       break;
5127                     }
5128                 }
5129
5130               if (reg == -1)
5131                 {
5132                   if (XCHARSET_GRAPHIC (charset) != 0)
5133                     {
5134                       if (!NILP (str->iso2022.charset[1]) &&
5135                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5136                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5137                         reg = 1;
5138                       else if (!NILP (str->iso2022.charset[2]))
5139                         reg = 2;
5140                       else if (!NILP (str->iso2022.charset[3]))
5141                         reg = 3;
5142                       else
5143                         reg = 0;
5144                     }
5145                   else
5146                     reg = 0;
5147                 }
5148
5149               iso2022_designate (charset, reg, str, dst);
5150
5151               /* Now invoke that register. */
5152               switch (reg)
5153                 {
5154                 case 0:
5155                   ensure_normal_shift (str, dst);
5156                   half = 0;
5157                   break;
5158
5159                 case 1:
5160                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5161                     {
5162                       ensure_shift_out (str, dst);
5163                       half = 0;
5164                     }
5165                   else
5166                     half = 1;
5167                   break;
5168
5169                 case 2:
5170                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5171                     {
5172                       Dynarr_add (dst, ISO_CODE_ESC);
5173                       Dynarr_add (dst, 'N');
5174                       half = 0;
5175                     }
5176                   else
5177                     {
5178                       Dynarr_add (dst, ISO_CODE_SS2);
5179                       half = 1;
5180                     }
5181                   break;
5182
5183                 case 3:
5184                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5185                     {
5186                       Dynarr_add (dst, ISO_CODE_ESC);
5187                       Dynarr_add (dst, 'O');
5188                       half = 0;
5189                     }
5190                   else
5191                     {
5192                       Dynarr_add (dst, ISO_CODE_SS3);
5193                       half = 1;
5194                     }
5195                   break;
5196
5197                 default:
5198                   abort ();
5199                 }
5200             }
5201           char_boundary = 0;
5202         }
5203       else
5204         {                       /* Processing Non-ASCII character */
5205           charmask = (half == 0 ? 0x7F : 0xFF);
5206           char_boundary = 1;
5207           if (EQ (charset, Vcharset_control_1))
5208             {
5209               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5210                   && fit_to_be_escape_quoted (c))
5211                 Dynarr_add (dst, ISO_CODE_ESC);
5212               /* you asked for it ... */
5213               Dynarr_add (dst, c - 0x20);
5214             }
5215           else
5216             {
5217               switch (XCHARSET_REP_BYTES (charset))
5218                 {
5219                 case 2:
5220                   Dynarr_add (dst, c & charmask);
5221                   break;
5222                 case 3:
5223                   if (XCHARSET_PRIVATE_P (charset))
5224                     {
5225                       Dynarr_add (dst, c & charmask);
5226                       ch = 0;
5227                     }
5228                   else if (ch)
5229                     {
5230 #ifdef ENABLE_COMPOSITE_CHARS
5231                       if (EQ (charset, Vcharset_composite))
5232                         {
5233                           if (in_composite)
5234                             {
5235                               /* #### Bother! We don't know how to
5236                                  handle this yet. */
5237                               Dynarr_add (dst, '~');
5238                             }
5239                           else
5240                             {
5241                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5242                                                        ch & 0x7F, c & 0x7F);
5243                               Lisp_Object lstr = composite_char_string (emch);
5244                               saved_n = n;
5245                               saved_src = src;
5246                               in_composite = 1;
5247                               src = XSTRING_DATA   (lstr);
5248                               n   = XSTRING_LENGTH (lstr);
5249                               Dynarr_add (dst, ISO_CODE_ESC);
5250                               Dynarr_add (dst, '0'); /* start composing */
5251                             }
5252                         }
5253                       else
5254 #endif /* ENABLE_COMPOSITE_CHARS */
5255                         {
5256                           Dynarr_add (dst, ch & charmask);
5257                           Dynarr_add (dst, c & charmask);
5258                         }
5259                       ch = 0;
5260                     }
5261                   else
5262                     {
5263                       ch = c;
5264                       char_boundary = 0;
5265                     }
5266                   break;
5267                 case 4:
5268                   if (ch)
5269                     {
5270                       Dynarr_add (dst, ch & charmask);
5271                       Dynarr_add (dst, c & charmask);
5272                       ch = 0;
5273                     }
5274                   else
5275                     {
5276                       ch = c;
5277                       char_boundary = 0;
5278                     }
5279                   break;
5280                 default:
5281                   abort ();
5282                 }
5283             }
5284         }
5285     }
5286
5287 #ifdef ENABLE_COMPOSITE_CHARS
5288   if (in_composite)
5289     {
5290       n = saved_n;
5291       src = saved_src;
5292       in_composite = 0;
5293       Dynarr_add (dst, ISO_CODE_ESC);
5294       Dynarr_add (dst, '1'); /* end composing */
5295       goto back_to_square_n; /* Wheeeeeeeee ..... */
5296     }
5297 #endif /* ENABLE_COMPOSITE_CHARS */
5298
5299   if (char_boundary && flags & CODING_STATE_END)
5300     {
5301       restore_left_to_right_direction (codesys, dst, &flags, 0);
5302       ensure_normal_shift (str, dst);
5303       for (i = 0; i < 4; i++)
5304         {
5305           Lisp_Object initial_charset =
5306             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5307           iso2022_designate (initial_charset, i, str, dst);
5308         }
5309     }
5310
5311   str->flags = flags;
5312   str->ch    = ch;
5313   str->iso2022.current_char_boundary = char_boundary;
5314   str->iso2022.current_charset = charset;
5315   str->iso2022.current_half = half;
5316
5317   /* Verbum caro factum est! */
5318 }
5319 #endif /* MULE */
5320 \f
5321 /************************************************************************/
5322 /*                     No-conversion methods                            */
5323 /************************************************************************/
5324
5325 /* This is used when reading in "binary" files -- i.e. files that may
5326    contain all 256 possible byte values and that are not to be
5327    interpreted as being in any particular decoding. */
5328 static void
5329 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5330                              unsigned_char_dynarr *dst, unsigned int n)
5331 {
5332   unsigned char c;
5333   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5334   unsigned int flags  = str->flags;
5335   unsigned int ch     = str->ch;
5336   eol_type_t eol_type = str->eol_type;
5337
5338   while (n--)
5339     {
5340       c = *src++;
5341
5342       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5343       DECODE_ADD_BINARY_CHAR (c, dst);
5344     label_continue_loop:;
5345     }
5346
5347   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5348
5349   str->flags = flags;
5350   str->ch    = ch;
5351 }
5352
5353 static void
5354 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5355                              unsigned_char_dynarr *dst, unsigned int n)
5356 {
5357   unsigned char c;
5358   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5359   unsigned int flags  = str->flags;
5360   unsigned int ch     = str->ch;
5361   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5362
5363   while (n--)
5364     {
5365       c = *src++;
5366       if (c == '\n')
5367         {
5368           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5369             Dynarr_add (dst, '\r');
5370           if (eol_type != EOL_CR)
5371             Dynarr_add (dst, '\n');
5372           ch = 0;
5373         }
5374       else if (BYTE_ASCII_P (c))
5375         {
5376           assert (ch == 0);
5377           Dynarr_add (dst, c);
5378         }
5379       else if (BUFBYTE_LEADING_BYTE_P (c))
5380         {
5381           assert (ch == 0);
5382           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5383               c == LEADING_BYTE_CONTROL_1)
5384             ch = c;
5385           else
5386             Dynarr_add (dst, '~'); /* untranslatable character */
5387         }
5388       else
5389         {
5390           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5391             Dynarr_add (dst, c);
5392           else if (ch == LEADING_BYTE_CONTROL_1)
5393             {
5394               assert (c < 0xC0);
5395               Dynarr_add (dst, c - 0x20);
5396             }
5397           /* else it should be the second or third byte of an
5398              untranslatable character, so ignore it */
5399           ch = 0;
5400         }
5401     }
5402
5403   str->flags = flags;
5404   str->ch    = ch;
5405 }
5406
5407 \f
5408 /************************************************************************/
5409 /*                   Simple internal/external functions                 */
5410 /************************************************************************/
5411
5412 static Extbyte_dynarr *conversion_out_dynarr;
5413 static Bufbyte_dynarr *conversion_in_dynarr;
5414
5415 /* Determine coding system from coding format */
5416
5417 /* #### not correct for all values of `fmt'! */
5418 static Lisp_Object
5419 external_data_format_to_coding_system (enum external_data_format fmt)
5420 {
5421   switch (fmt)
5422     {
5423     case FORMAT_FILENAME:
5424     case FORMAT_TERMINAL:
5425       if (EQ (Vfile_name_coding_system, Qnil) ||
5426           EQ (Vfile_name_coding_system, Qbinary))
5427         return Qnil;
5428       else
5429         return Fget_coding_system (Vfile_name_coding_system);
5430 #ifdef MULE
5431     case FORMAT_CTEXT:
5432       return Fget_coding_system (Qctext);
5433 #endif
5434     default:
5435       return Qnil;
5436     }
5437 }
5438
5439 Extbyte *
5440 convert_to_external_format (CONST Bufbyte *ptr,
5441                             Bytecount len,
5442                             Extcount *len_out,
5443                             enum external_data_format fmt)
5444 {
5445   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5446
5447   if (!conversion_out_dynarr)
5448     conversion_out_dynarr = Dynarr_new (Extbyte);
5449   else
5450     Dynarr_reset (conversion_out_dynarr);
5451
5452   if (NILP (coding_system))
5453     {
5454       CONST Bufbyte *end = ptr + len;
5455
5456       for (; ptr < end;)
5457         {
5458           Bufbyte c =
5459             (BYTE_ASCII_P (*ptr))                  ? *ptr :
5460             (*ptr == LEADING_BYTE_CONTROL_1)       ? (*(ptr+1) - 0x20) :
5461             (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5462             '~';
5463
5464           Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5465           INC_CHARPTR (ptr);
5466         }
5467
5468 #ifdef ERROR_CHECK_BUFPOS
5469       assert (ptr == end);
5470 #endif
5471     }
5472   else
5473     {
5474       Lisp_Object instream, outstream, da_outstream;
5475       Lstream *istr, *ostr;
5476       struct gcpro gcpro1, gcpro2, gcpro3;
5477       char tempbuf[1024]; /* some random amount */
5478
5479       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5480       da_outstream = make_dynarr_output_stream
5481         ((unsigned_char_dynarr *) conversion_out_dynarr);
5482       outstream =
5483         make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5484       istr = XLSTREAM (instream);
5485       ostr = XLSTREAM (outstream);
5486       GCPRO3 (instream, outstream, da_outstream);
5487       while (1)
5488         {
5489           ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5490           if (!size_in_bytes)
5491             break;
5492           Lstream_write (ostr, tempbuf, size_in_bytes);
5493         }
5494       Lstream_close (istr);
5495       Lstream_close (ostr);
5496       UNGCPRO;
5497       Lstream_delete (istr);
5498       Lstream_delete (ostr);
5499       Lstream_delete (XLSTREAM (da_outstream));
5500     }
5501
5502   *len_out = Dynarr_length (conversion_out_dynarr);
5503   Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5504   return Dynarr_atp (conversion_out_dynarr, 0);
5505 }
5506
5507 Bufbyte *
5508 convert_from_external_format (CONST Extbyte *ptr,
5509                               Extcount len,
5510                               Bytecount *len_out,
5511                               enum external_data_format fmt)
5512 {
5513   Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5514
5515   if (!conversion_in_dynarr)
5516     conversion_in_dynarr = Dynarr_new (Bufbyte);
5517   else
5518     Dynarr_reset (conversion_in_dynarr);
5519
5520   if (NILP (coding_system))
5521     {
5522       CONST Extbyte *end = ptr + len;
5523       for (; ptr < end; ptr++)
5524         {
5525           Extbyte c = *ptr;
5526           DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5527         }
5528     }
5529   else
5530     {
5531       Lisp_Object instream, outstream, da_outstream;
5532       Lstream *istr, *ostr;
5533       struct gcpro gcpro1, gcpro2, gcpro3;
5534       char tempbuf[1024]; /* some random amount */
5535
5536       instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5537       da_outstream = make_dynarr_output_stream
5538         ((unsigned_char_dynarr *) conversion_in_dynarr);
5539       outstream =
5540         make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5541       istr = XLSTREAM (instream);
5542       ostr = XLSTREAM (outstream);
5543       GCPRO3 (instream, outstream, da_outstream);
5544       while (1)
5545         {
5546           ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5547           if (!size_in_bytes)
5548             break;
5549           Lstream_write (ostr, tempbuf, size_in_bytes);
5550         }
5551       Lstream_close (istr);
5552       Lstream_close (ostr);
5553       UNGCPRO;
5554       Lstream_delete (istr);
5555       Lstream_delete (ostr);
5556       Lstream_delete (XLSTREAM (da_outstream));
5557     }
5558
5559   *len_out = Dynarr_length (conversion_in_dynarr);
5560   Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5561   return Dynarr_atp (conversion_in_dynarr, 0);
5562 }
5563
5564 \f
5565 /************************************************************************/
5566 /*                             Initialization                           */
5567 /************************************************************************/
5568
5569 void
5570 syms_of_file_coding (void)
5571 {
5572   deferror (&Qcoding_system_error, "coding-system-error",
5573             "Coding-system error", Qio_error);
5574
5575   DEFSUBR (Fcoding_system_p);
5576   DEFSUBR (Ffind_coding_system);
5577   DEFSUBR (Fget_coding_system);
5578   DEFSUBR (Fcoding_system_list);
5579   DEFSUBR (Fcoding_system_name);
5580   DEFSUBR (Fmake_coding_system);
5581   DEFSUBR (Fcopy_coding_system);
5582   DEFSUBR (Fdefine_coding_system_alias);
5583   DEFSUBR (Fsubsidiary_coding_system);
5584
5585   DEFSUBR (Fcoding_system_type);
5586   DEFSUBR (Fcoding_system_doc_string);
5587 #ifdef MULE
5588   DEFSUBR (Fcoding_system_charset);
5589 #endif
5590   DEFSUBR (Fcoding_system_property);
5591
5592   DEFSUBR (Fcoding_category_list);
5593   DEFSUBR (Fset_coding_priority_list);
5594   DEFSUBR (Fcoding_priority_list);
5595   DEFSUBR (Fset_coding_category_system);
5596   DEFSUBR (Fcoding_category_system);
5597
5598   DEFSUBR (Fdetect_coding_region);
5599   DEFSUBR (Fdecode_coding_region);
5600   DEFSUBR (Fencode_coding_region);
5601 #ifdef MULE
5602   DEFSUBR (Fdecode_shift_jis_char);
5603   DEFSUBR (Fencode_shift_jis_char);
5604   DEFSUBR (Fdecode_big5_char);
5605   DEFSUBR (Fencode_big5_char);
5606   DEFSUBR (Fset_ucs_char);
5607   DEFSUBR (Fucs_char);
5608   DEFSUBR (Fset_char_ucs);
5609   DEFSUBR (Fchar_ucs);
5610 #endif /* MULE */
5611   defsymbol (&Qcoding_systemp, "coding-system-p");
5612   defsymbol (&Qno_conversion, "no-conversion");
5613   defsymbol (&Qraw_text, "raw-text");
5614 #ifdef MULE
5615   defsymbol (&Qbig5, "big5");
5616   defsymbol (&Qshift_jis, "shift-jis");
5617   defsymbol (&Qucs4, "ucs-4");
5618   defsymbol (&Qutf8, "utf-8");
5619   defsymbol (&Qccl, "ccl");
5620   defsymbol (&Qiso2022, "iso2022");
5621 #endif /* MULE */
5622   defsymbol (&Qmnemonic, "mnemonic");
5623   defsymbol (&Qeol_type, "eol-type");
5624   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5625   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5626
5627   defsymbol (&Qcr, "cr");
5628   defsymbol (&Qlf, "lf");
5629   defsymbol (&Qcrlf, "crlf");
5630   defsymbol (&Qeol_cr, "eol-cr");
5631   defsymbol (&Qeol_lf, "eol-lf");
5632   defsymbol (&Qeol_crlf, "eol-crlf");
5633 #ifdef MULE
5634   defsymbol (&Qcharset_g0, "charset-g0");
5635   defsymbol (&Qcharset_g1, "charset-g1");
5636   defsymbol (&Qcharset_g2, "charset-g2");
5637   defsymbol (&Qcharset_g3, "charset-g3");
5638   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5639   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5640   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5641   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5642   defsymbol (&Qno_iso6429, "no-iso6429");
5643   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5644   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5645
5646   defsymbol (&Qshort, "short");
5647   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5648   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5649   defsymbol (&Qseven, "seven");
5650   defsymbol (&Qlock_shift, "lock-shift");
5651   defsymbol (&Qescape_quoted, "escape-quoted");
5652 #endif /* MULE */
5653   defsymbol (&Qencode, "encode");
5654   defsymbol (&Qdecode, "decode");
5655
5656 #ifdef MULE
5657   defsymbol (&Qctext, "ctext");
5658   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5659              "shift-jis");
5660   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5661              "big5");
5662   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5663              "ucs-4");
5664   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5665              "utf-8");
5666   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5667              "iso-7");
5668   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5669              "iso-8-designate");
5670   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5671              "iso-8-1");
5672   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5673              "iso-8-2");
5674   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5675              "iso-lock-shift");
5676 #endif /* MULE */
5677   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5678              "no-conversion");
5679 }
5680
5681 void
5682 lstream_type_create_file_coding (void)
5683 {
5684   LSTREAM_HAS_METHOD (decoding, reader);
5685   LSTREAM_HAS_METHOD (decoding, writer);
5686   LSTREAM_HAS_METHOD (decoding, rewinder);
5687   LSTREAM_HAS_METHOD (decoding, seekable_p);
5688   LSTREAM_HAS_METHOD (decoding, flusher);
5689   LSTREAM_HAS_METHOD (decoding, closer);
5690   LSTREAM_HAS_METHOD (decoding, marker);
5691
5692   LSTREAM_HAS_METHOD (encoding, reader);
5693   LSTREAM_HAS_METHOD (encoding, writer);
5694   LSTREAM_HAS_METHOD (encoding, rewinder);
5695   LSTREAM_HAS_METHOD (encoding, seekable_p);
5696   LSTREAM_HAS_METHOD (encoding, flusher);
5697   LSTREAM_HAS_METHOD (encoding, closer);
5698   LSTREAM_HAS_METHOD (encoding, marker);
5699 }
5700
5701 void
5702 vars_of_file_coding (void)
5703 {
5704   int i;
5705
5706   fcd = xnew (struct file_coding_dump);
5707   dumpstruct (&fcd, &fcd_description);
5708
5709   /* Initialize to something reasonable ... */
5710   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5711     {
5712       fcd->coding_category_system[i] = Qnil;
5713       fcd->coding_category_by_priority[i] = i;
5714     }
5715
5716   Fprovide (intern ("file-coding"));
5717
5718   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5719 Coding system used for TTY keyboard input.
5720 Not used under a windowing system.
5721 */ );
5722   Vkeyboard_coding_system = Qnil;
5723
5724   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5725 Coding system used for TTY display output.
5726 Not used under a windowing system.
5727 */ );
5728   Vterminal_coding_system = Qnil;
5729
5730   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5731 Overriding coding system used when writing a file or process.
5732 You should *bind* this, not set it.  If this is non-nil, it specifies
5733 the coding system that will be used when a file or process is read
5734 in, and overrides `buffer-file-coding-system-for-read',
5735 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5736 this one for permanent changes to the environment.
5737 */ );
5738   Vcoding_system_for_read = Qnil;
5739
5740   DEFVAR_LISP ("coding-system-for-write",
5741                &Vcoding_system_for_write /*
5742 Overriding coding system used when writing a file or process.
5743 You should *bind* this, not set it.  If this is non-nil, it specifies
5744 the coding system that will be used when a file or process is wrote
5745 in, and overrides `buffer-file-coding-system',
5746 `write-region-pre-hook', etc.  Use those variables instead of this one
5747 for permanent changes to the environment.
5748 */ );
5749   Vcoding_system_for_write = Qnil;
5750
5751   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5752 Coding system used to convert pathnames when accessing files.
5753 */ );
5754   Vfile_name_coding_system = Qnil;
5755
5756   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5757 Non-nil means the buffer contents are regarded as multi-byte form
5758 of characters, not a binary code.  This affects the display, file I/O,
5759 and behaviors of various editing commands.
5760
5761 Setting this to nil does not do anything.
5762 */ );
5763   enable_multibyte_characters = 1;
5764 }
5765
5766 void
5767 complex_vars_of_file_coding (void)
5768 {
5769   staticpro (&Vcoding_system_hash_table);
5770   Vcoding_system_hash_table =
5771     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5772
5773   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5774   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5775
5776 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5777 {                                               \
5778   struct codesys_prop csp;                      \
5779   csp.sym = (Sym);                              \
5780   csp.prop_type = (Prop_Type);                  \
5781   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5782 } while (0)
5783
5784   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5785   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5786   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5787   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5788   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5789   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5790   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5791 #ifdef MULE
5792   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5793   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5794   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5795   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5796   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5797   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5798   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5799   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5800   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5801   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5802   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5803   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5804   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5805   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5806   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5807   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5808   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5809
5810   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5811   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5812 #endif /* MULE */
5813   /* Need to create this here or we're really screwed. */
5814   Fmake_coding_system
5815     (Qraw_text, Qno_conversion,
5816      build_string ("Raw text, which means it converts only line-break-codes."),
5817      list2 (Qmnemonic, build_string ("Raw")));
5818
5819   Fmake_coding_system
5820     (Qbinary, Qno_conversion,
5821      build_string ("Binary, which means it does not convert anything."),
5822      list4 (Qeol_type, Qlf,
5823             Qmnemonic, build_string ("Binary")));
5824
5825   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5826
5827   /* Need this for bootstrapping */
5828   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5829     Fget_coding_system (Qraw_text);
5830
5831 #ifdef MULE
5832   {
5833     unsigned int i;
5834
5835     for (i = 0; i < 65536; i++)
5836       fcd->ucs_to_mule_table[i] = Qnil;
5837   }
5838   staticpro (&mule_to_ucs_table);
5839   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5840 #endif /* MULE */
5841 }