git.chise.org Git - chise/xemacs-chise.git-/blob - src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000,2001 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26
  27 #include <config.h>
  28 #include "lisp.h"
  29
  30 #include "buffer.h"
  31 #include "elhash.h"
  32 #include "insdel.h"
  33 #include "lstream.h"
  34 #include "opaque.h"
  35 #ifdef MULE
  36 #include "mule-ccl.h"
  37 #include "chartab.h"
  38 #endif
  39 #include "file-coding.h"
  40
  41 Lisp_Object Qcoding_system_error;
  42
  43 Lisp_Object Vkeyboard_coding_system;
  44 Lisp_Object Vterminal_coding_system;
  45 Lisp_Object Vcoding_system_for_read;
  46 Lisp_Object Vcoding_system_for_write;
  47 Lisp_Object Vfile_name_coding_system;
  48
  49 /* Table of symbols identifying each coding category. */
  50 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
  51
  52
  53
  54 struct file_coding_dump {
  55   /* Coding system currently associated with each coding category. */
  56   Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
  57
  58   /* Table of all coding categories in decreasing order of priority.
  59      This describes a permutation of the possible coding categories. */
  60   int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
  61
  62 #if defined(MULE) && !defined(UTF2000)
  63   Lisp_Object ucs_to_mule_table[65536];
  64 #endif
  65 } *fcd;
  66
  67 static const struct lrecord_description fcd_description_1[] = {
  68   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
  69 #if defined(MULE) && !defined(UTF2000)
  70   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  71 #endif
  72   { XD_END }
  73 };
  74
  75 static const struct struct_description fcd_description = {
  76   sizeof (struct file_coding_dump),
  77   fcd_description_1
  78 };
  79
  80 Lisp_Object mule_to_ucs_table;
  81
  82 Lisp_Object Qcoding_systemp;
  83
  84 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  85 /* Qinternal in general.c */
  86
  87 Lisp_Object Qmnemonic, Qeol_type;
  88 Lisp_Object Qcr, Qcrlf, Qlf;
  89 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  90 Lisp_Object Qpost_read_conversion;
  91 Lisp_Object Qpre_write_conversion;
  92
  93 #ifdef MULE
  94 Lisp_Object Qucs4, Qutf8;
  95 Lisp_Object Qbig5, Qshift_jis;
  96 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  97 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  98 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  99 Lisp_Object Qno_iso6429;
 100 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 101 Lisp_Object Qescape_quoted;
 102 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 103 #endif
 104 #ifdef UTF2000
 105 Lisp_Object Qdisable_composition;
 106 #endif
 107 Lisp_Object Qencode, Qdecode;
 108
 109 Lisp_Object Vcoding_system_hash_table;
 110
 111 int enable_multibyte_characters;
 112
 113 #ifdef MULE
 114 /* Additional information used by the ISO2022 decoder and detector. */
 115 struct iso2022_decoder
 116 {
 117   /* CHARSET holds the character sets currently assigned to the G0
 118      through G3 variables.  It is initialized from the array
 119      INITIAL_CHARSET in CODESYS. */
 120   Lisp_Object charset[4];
 121
 122   /* Which registers are currently invoked into the left (GL) and
 123      right (GR) halves of the 8-bit encoding space? */
 124   int register_left, register_right;
 125
 126   /* ISO_ESC holds a value indicating part of an escape sequence
 127      that has already been seen. */
 128   enum iso_esc_flag esc;
 129
 130   /* This records the bytes we've seen so far in an escape sequence,
 131      in case the sequence is invalid (we spit out the bytes unchanged). */
 132   unsigned char esc_bytes[8];
 133
 134   /* Index for next byte to store in ISO escape sequence. */
 135   int esc_bytes_index;
 136
 137 #ifdef ENABLE_COMPOSITE_CHARS
 138   /* Stuff seen so far when composing a string. */
 139   unsigned_char_dynarr *composite_chars;
 140 #endif
 141
 142   /* If we saw an invalid designation sequence for a particular
 143      register, we flag it here and switch to ASCII.  The next time we
 144      see a valid designation for this register, we turn off the flag
 145      and do the designation normally, but pretend the sequence was
 146      invalid.  The effect of all this is that (most of the time) the
 147      escape sequences for both the switch to the unknown charset, and
 148      the switch back to the known charset, get inserted literally into
 149      the buffer and saved out as such.  The hope is that we can
 150      preserve the escape sequences so that the resulting written out
 151      file makes sense.  If we don't do any of this, the designation
 152      to the invalid charset will be preserved but that switch back
 153      to the known charset will probably get eaten because it was
 154      the same charset that was already present in the register. */
 155   unsigned char invalid_designated[4];
 156
 157   /* We try to do similar things as above for direction-switching
 158      sequences.  If we encountered a direction switch while an
 159      invalid designation was present, or an invalid designation
 160      just after a direction switch (i.e. no valid designation
 161      encountered yet), we insert the direction-switch escape
 162      sequence literally into the output stream, and later on
 163      insert the corresponding direction-restoring escape sequence
 164      literally also. */
 165   unsigned int switched_dir_and_no_valid_charset_yet :1;
 166   unsigned int invalid_switch_dir :1;
 167
 168   /* Tells the decoder to output the escape sequence literally
 169      even though it was valid.  Used in the games we play to
 170      avoid lossage when we encounter invalid designations. */
 171   unsigned int output_literally :1;
 172   /* We encountered a direction switch followed by an invalid
 173      designation.  We didn't output the direction switch
 174      literally because we didn't know about the invalid designation;
 175      but we have to do so now. */
 176   unsigned int output_direction_sequence :1;
 177 };
 178 #endif /* MULE */
 179 EXFUN (Fcopy_coding_system, 2);
 180 #ifdef MULE
 181 struct detection_state;
 182
 183 static void
 184 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 185                      unsigned_char_dynarr *dst, size_t n);
 186
 187 static int detect_coding_sjis (struct detection_state *st,
 188                                const Extbyte *src, size_t n);
 189 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 190                                 unsigned_char_dynarr *dst, size_t n);
 191 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 192                             unsigned_char_dynarr *dst, unsigned int *flags);
 193 void char_finish_shift_jis (struct encoding_stream *str,
 194                             unsigned_char_dynarr *dst, unsigned int *flags);
 195
 196 static int detect_coding_big5 (struct detection_state *st,
 197                                const Extbyte *src, size_t n);
 198 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, size_t n);
 200 void char_encode_big5 (struct encoding_stream *str, Emchar c,
 201                        unsigned_char_dynarr *dst, unsigned int *flags);
 202 void char_finish_big5 (struct encoding_stream *str,
 203                        unsigned_char_dynarr *dst, unsigned int *flags);
 204
 205 static int detect_coding_ucs4 (struct detection_state *st,
 206                                const Extbyte *src, size_t n);
 207 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 208                                 unsigned_char_dynarr *dst, size_t n);
 209 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 210                        unsigned_char_dynarr *dst, unsigned int *flags);
 211 void char_finish_ucs4 (struct encoding_stream *str,
 212                        unsigned_char_dynarr *dst, unsigned int *flags);
 213
 214 static int detect_coding_utf8 (struct detection_state *st,
 215                                const Extbyte *src, size_t n);
 216 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 217                                 unsigned_char_dynarr *dst, size_t n);
 218 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 219                        unsigned_char_dynarr *dst, unsigned int *flags);
 220 void char_finish_utf8 (struct encoding_stream *str,
 221                        unsigned_char_dynarr *dst, unsigned int *flags);
 222
 223 static int postprocess_iso2022_mask (int mask);
 224 static void reset_iso2022 (Lisp_Object coding_system,
 225                            struct iso2022_decoder *iso);
 226 static int detect_coding_iso2022 (struct detection_state *st,
 227                                   const Extbyte *src, size_t n);
 228 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 229                                    unsigned_char_dynarr *dst, size_t n);
 230 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 231                           unsigned_char_dynarr *dst, unsigned int *flags);
 232 void char_finish_iso2022 (struct encoding_stream *str,
 233                           unsigned_char_dynarr *dst, unsigned int *flags);
 234 #endif /* MULE */
 235 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 236                                          unsigned_char_dynarr *dst, size_t n);
 237 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 238                                          unsigned_char_dynarr *dst, size_t n);
 239 static void mule_decode (Lstream *decoding, const Extbyte *src,
 240                          unsigned_char_dynarr *dst, size_t n);
 241 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 242                          unsigned_char_dynarr *dst, size_t n);
 243
 244 typedef struct codesys_prop codesys_prop;
 245 struct codesys_prop
 246 {
 247   Lisp_Object sym;
 248   int prop_type;
 249 };
 250
 251 typedef struct
 252 {
 253   Dynarr_declare (codesys_prop);
 254 } codesys_prop_dynarr;
 255
 256 static const struct lrecord_description codesys_prop_description_1[] = {
 257   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 258   { XD_END }
 259 };
 260
 261 static const struct struct_description codesys_prop_description = {
 262   sizeof (codesys_prop),
 263   codesys_prop_description_1
 264 };
 265
 266 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 267   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 268   { XD_END }
 269 };
 270
 271 static const struct struct_description codesys_prop_dynarr_description = {
 272   sizeof (codesys_prop_dynarr),
 273   codesys_prop_dynarr_description_1
 274 };
 275
 276 codesys_prop_dynarr *the_codesys_prop_dynarr;
 277
 278 enum codesys_prop_enum
 279 {
 280   CODESYS_PROP_ALL_OK,
 281   CODESYS_PROP_ISO2022,
 282   CODESYS_PROP_CCL
 283 };
 284
 285 \f
 286 /************************************************************************/
 287 /*                       Coding system functions                        */
 288 /************************************************************************/
 289
 290 static Lisp_Object mark_coding_system (Lisp_Object);
 291 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 292 static void finalize_coding_system (void *header, int for_disksave);
 293
 294 #ifdef MULE
 295 static const struct lrecord_description ccs_description_1[] = {
 296   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 297   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 298   { XD_END }
 299 };
 300
 301 static const struct struct_description ccs_description = {
 302   sizeof (charset_conversion_spec),
 303   ccs_description_1
 304 };
 305
 306 static const struct lrecord_description ccsd_description_1[] = {
 307   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 308   { XD_END }
 309 };
 310
 311 static const struct struct_description ccsd_description = {
 312   sizeof (charset_conversion_spec_dynarr),
 313   ccsd_description_1
 314 };
 315 #endif
 316
 317 static const struct lrecord_description coding_system_description[] = {
 318   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 319   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 320   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 321   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 322   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 323   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 324   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 325   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 326 #ifdef MULE
 327   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 328   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 329   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 330   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 332 #endif
 333   { XD_END }
 334 };
 335
 336 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 337                                mark_coding_system, print_coding_system,
 338                                finalize_coding_system,
 339                                0, 0, coding_system_description,
 340                                Lisp_Coding_System);
 341
 342 static Lisp_Object
 343 mark_coding_system (Lisp_Object obj)
 344 {
 345   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 346
 347   mark_object (CODING_SYSTEM_NAME (codesys));
 348   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 349   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 350   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 351   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 352   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 353
 354   switch (CODING_SYSTEM_TYPE (codesys))
 355     {
 356 #ifdef MULE
 357       int i;
 358     case CODESYS_ISO2022:
 359       for (i = 0; i < 4; i++)
 360         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 361       if (codesys->iso2022.input_conv)
 362         {
 363           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 364             {
 365               struct charset_conversion_spec *ccs =
 366                 Dynarr_atp (codesys->iso2022.input_conv, i);
 367               mark_object (ccs->from_charset);
 368               mark_object (ccs->to_charset);
 369             }
 370         }
 371       if (codesys->iso2022.output_conv)
 372         {
 373           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 374             {
 375               struct charset_conversion_spec *ccs =
 376                 Dynarr_atp (codesys->iso2022.output_conv, i);
 377               mark_object (ccs->from_charset);
 378               mark_object (ccs->to_charset);
 379             }
 380         }
 381       break;
 382
 383     case CODESYS_CCL:
 384       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 385       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 386       break;
 387 #endif /* MULE */
 388     default:
 389       break;
 390     }
 391
 392   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 393   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 394 }
 395
 396 static void
 397 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 398                      int escapeflag)
 399 {
 400   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 401   if (print_readably)
 402     error ("printing unreadable object #<coding_system 0x%x>",
 403            c->header.uid);
 404
 405   write_c_string ("#<coding_system ", printcharfun);
 406   print_internal (c->name, printcharfun, 1);
 407   write_c_string (">", printcharfun);
 408 }
 409
 410 static void
 411 finalize_coding_system (void *header, int for_disksave)
 412 {
 413   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 414   /* Since coding systems never go away, this function is not
 415      necessary.  But it would be necessary if we changed things
 416      so that coding systems could go away. */
 417   if (!for_disksave) /* see comment in lstream.c */
 418     {
 419       switch (CODING_SYSTEM_TYPE (c))
 420         {
 421 #ifdef MULE
 422         case CODESYS_ISO2022:
 423           if (c->iso2022.input_conv)
 424             {
 425               Dynarr_free (c->iso2022.input_conv);
 426               c->iso2022.input_conv = 0;
 427             }
 428           if (c->iso2022.output_conv)
 429             {
 430               Dynarr_free (c->iso2022.output_conv);
 431               c->iso2022.output_conv = 0;
 432             }
 433           break;
 434 #endif /* MULE */
 435         default:
 436           break;
 437         }
 438     }
 439 }
 440
 441 static eol_type_t
 442 symbol_to_eol_type (Lisp_Object symbol)
 443 {
 444   CHECK_SYMBOL (symbol);
 445   if (NILP (symbol))      return EOL_AUTODETECT;
 446   if (EQ (symbol, Qlf))   return EOL_LF;
 447   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 448   if (EQ (symbol, Qcr))   return EOL_CR;
 449
 450   signal_simple_error ("Unrecognized eol type", symbol);
 451   return EOL_AUTODETECT; /* not reached */
 452 }
 453
 454 static Lisp_Object
 455 eol_type_to_symbol (eol_type_t type)
 456 {
 457   switch (type)
 458     {
 459     default: abort ();
 460     case EOL_LF:         return Qlf;
 461     case EOL_CRLF:       return Qcrlf;
 462     case EOL_CR:         return Qcr;
 463     case EOL_AUTODETECT: return Qnil;
 464     }
 465 }
 466
 467 static void
 468 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 469 {
 470   Lisp_Object codesys_obj;
 471   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 472   char *codesys_name = (char *) alloca (len + 7);
 473   int mlen = -1;
 474   char *codesys_mnemonic=0;
 475
 476   Lisp_Object codesys_name_sym, sub_codesys_obj;
 477
 478   /* kludge */
 479
 480   XSETCODING_SYSTEM (codesys_obj, codesys);
 481
 482   memcpy (codesys_name,
 483           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 484
 485   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 486     {
 487       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 488       codesys_mnemonic = (char *) alloca (mlen + 7);
 489       memcpy (codesys_mnemonic,
 490               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 491     }
 492
 493 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 494   strcpy (codesys_name + len, "-" op_sys);                                      \
 495   if (mlen != -1)                                                               \
 496     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 497   codesys_name_sym = intern (codesys_name);                                     \
 498   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 499   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 500   if (mlen != -1)                                                               \
 501     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 502       build_string (codesys_mnemonic);                                          \
 503   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 504 } while (0)
 505
 506   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 507   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 508   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 509 }
 510
 511 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 512 Return t if OBJECT is a coding system.
 513 A coding system is an object that defines how text containing multiple
 514 character sets is encoded into a stream of (typically 8-bit) bytes.
 515 The coding system is used to decode the stream into a series of
 516 characters (which may be from multiple charsets) when the text is read
 517 from a file or process, and is used to encode the text back into the
 518 same format when it is written out to a file or process.
 519
 520 For example, many ISO2022-compliant coding systems (such as Compound
 521 Text, which is used for inter-client data under the X Window System)
 522 use escape sequences to switch between different charsets -- Japanese
 523 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 524 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 525 `make-coding-system' for more information.
 526
 527 Coding systems are normally identified using a symbol, and the
 528 symbol is accepted in place of the actual coding system object whenever
 529 a coding system is called for. (This is similar to how faces work.)
 530 */
 531        (object))
 532 {
 533   return CODING_SYSTEMP (object) ? Qt : Qnil;
 534 }
 535
 536 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 537 Retrieve the coding system of the given name.
 538
 539 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 540 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 541 If there is no such coding system, nil is returned.  Otherwise the
 542 associated coding system object is returned.
 543 */
 544        (coding_system_or_name))
 545 {
 546   if (NILP (coding_system_or_name))
 547     coding_system_or_name = Qbinary;
 548   else if (CODING_SYSTEMP (coding_system_or_name))
 549     return coding_system_or_name;
 550   else
 551     CHECK_SYMBOL (coding_system_or_name);
 552
 553   while (1)
 554     {
 555       coding_system_or_name =
 556         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 557
 558       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 559         return coding_system_or_name;
 560     }
 561 }
 562
 563 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 564 Retrieve the coding system of the given name.
 565 Same as `find-coding-system' except that if there is no such
 566 coding system, an error is signaled instead of returning nil.
 567 */
 568        (name))
 569 {
 570   Lisp_Object coding_system = Ffind_coding_system (name);
 571
 572   if (NILP (coding_system))
 573     signal_simple_error ("No such coding system", name);
 574   return coding_system;
 575 }
 576
 577 /* We store the coding systems in hash tables with the names as the key and the
 578    actual coding system object as the value.  Occasionally we need to use them
 579    in a list format.  These routines provide us with that. */
 580 struct coding_system_list_closure
 581 {
 582   Lisp_Object *coding_system_list;
 583 };
 584
 585 static int
 586 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 587                                   void *coding_system_list_closure)
 588 {
 589   /* This function can GC */
 590   struct coding_system_list_closure *cscl =
 591     (struct coding_system_list_closure *) coding_system_list_closure;
 592   Lisp_Object *coding_system_list = cscl->coding_system_list;
 593
 594   *coding_system_list = Fcons (key, *coding_system_list);
 595   return 0;
 596 }
 597
 598 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 599 Return a list of the names of all defined coding systems.
 600 */
 601        ())
 602 {
 603   Lisp_Object coding_system_list = Qnil;
 604   struct gcpro gcpro1;
 605   struct coding_system_list_closure coding_system_list_closure;
 606
 607   GCPRO1 (coding_system_list);
 608   coding_system_list_closure.coding_system_list = &coding_system_list;
 609   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 610                  &coding_system_list_closure);
 611   UNGCPRO;
 612
 613   return coding_system_list;
 614 }
 615
 616 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 617 Return the name of the given coding system.
 618 */
 619        (coding_system))
 620 {
 621   coding_system = Fget_coding_system (coding_system);
 622   return XCODING_SYSTEM_NAME (coding_system);
 623 }
 624
 625 static Lisp_Coding_System *
 626 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 627 {
 628   Lisp_Coding_System *codesys =
 629     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 630
 631   zero_lcrecord (codesys);
 632   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 633   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 634   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 635   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 636   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 637   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 638   CODING_SYSTEM_TYPE     (codesys) = type;
 639   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 640 #ifdef MULE
 641   if (type == CODESYS_ISO2022)
 642     {
 643       int i;
 644       for (i = 0; i < 4; i++)
 645         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 646     }
 647   else if (type == CODESYS_CCL)
 648     {
 649       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 650       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 651     }
 652 #endif /* MULE */
 653   CODING_SYSTEM_NAME (codesys) = name;
 654
 655   return codesys;
 656 }
 657
 658 #ifdef MULE
 659 /* Given a list of charset conversion specs as specified in a Lisp
 660    program, parse it into STORE_HERE. */
 661
 662 static void
 663 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 664                                 Lisp_Object spec_list)
 665 {
 666   Lisp_Object rest;
 667
 668   EXTERNAL_LIST_LOOP (rest, spec_list)
 669     {
 670       Lisp_Object car = XCAR (rest);
 671       Lisp_Object from, to;
 672       struct charset_conversion_spec spec;
 673
 674       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 675         signal_simple_error ("Invalid charset conversion spec", car);
 676       from = Fget_charset (XCAR (car));
 677       to = Fget_charset (XCAR (XCDR (car)));
 678       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 679            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 680         signal_simple_error_2
 681           ("Attempted conversion between different charset types",
 682            from, to);
 683       spec.from_charset = from;
 684       spec.to_charset = to;
 685
 686       Dynarr_add (store_here, spec);
 687     }
 688 }
 689
 690 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 691    specs, return the equivalent as the Lisp programmer would see it.
 692
 693    If LOAD_HERE is 0, return Qnil. */
 694
 695 static Lisp_Object
 696 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 697 {
 698   int i;
 699   Lisp_Object result;
 700
 701   if (!load_here)
 702     return Qnil;
 703   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 704     {
 705       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 706       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 707     }
 708
 709   return Fnreverse (result);
 710 }
 711
 712 #endif /* MULE */
 713
 714 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 715 Register symbol NAME as a coding system.
 716
 717 TYPE describes the conversion method used and should be one of
 718
 719 nil or 'undecided
 720      Automatic conversion.  XEmacs attempts to detect the coding system
 721      used in the file.
 722 'no-conversion
 723      No conversion.  Use this for binary files and such.  On output,
 724      graphic characters that are not in ASCII or Latin-1 will be
 725      replaced by a ?. (For a no-conversion-encoded buffer, these
 726      characters will only be present if you explicitly insert them.)
 727 'shift-jis
 728      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 729 'ucs-4
 730      ISO 10646 UCS-4 encoding.
 731 'utf-8
 732      ISO 10646 UTF-8 encoding.
 733 'iso2022
 734      Any ISO2022-compliant encoding.  Among other things, this includes
 735      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 736      standard Unix encoding for Japanese and other languages), and
 737      Compound Text (the encoding used in X11).  You can specify more
 738      specific information about the conversion with the PROPS argument.
 739 'big5
 740      Big5 (the encoding commonly used for Taiwanese).
 741 'ccl
 742      The conversion is performed using a user-written pseudo-code
 743      program.  CCL (Code Conversion Language) is the name of this
 744      pseudo-code.
 745 'internal
 746      Write out or read in the raw contents of the memory representing
 747      the buffer's text.  This is primarily useful for debugging
 748      purposes, and is only enabled when XEmacs has been compiled with
 749      DEBUG_XEMACS defined (via the --debug configure option).
 750      WARNING: Reading in a file using 'internal conversion can result
 751      in an internal inconsistency in the memory representing a
 752      buffer's text, which will produce unpredictable results and may
 753      cause XEmacs to crash.  Under normal circumstances you should
 754      never use 'internal conversion.
 755
 756 DOC-STRING is a string describing the coding system.
 757
 758 PROPS is a property list, describing the specific nature of the
 759 character set.  Recognized properties are:
 760
 761 'mnemonic
 762      String to be displayed in the modeline when this coding system is
 763      active.
 764
 765 'eol-type
 766      End-of-line conversion to be used.  It should be one of
 767
 768         nil
 769                 Automatically detect the end-of-line type (LF, CRLF,
 770                 or CR).  Also generate subsidiary coding systems named
 771                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 772                 identical to this coding system but have an EOL-TYPE
 773                 value of 'lf, 'crlf, and 'cr, respectively.
 774         'lf
 775                 The end of a line is marked externally using ASCII LF.
 776                 Since this is also the way that XEmacs represents an
 777                 end-of-line internally, specifying this option results
 778                 in no end-of-line conversion.  This is the standard
 779                 format for Unix text files.
 780         'crlf
 781                 The end of a line is marked externally using ASCII
 782                 CRLF.  This is the standard format for MS-DOS text
 783                 files.
 784         'cr
 785                 The end of a line is marked externally using ASCII CR.
 786                 This is the standard format for Macintosh text files.
 787         t
 788                 Automatically detect the end-of-line type but do not
 789                 generate subsidiary coding systems.  (This value is
 790                 converted to nil when stored internally, and
 791                 `coding-system-property' will return nil.)
 792
 793 'disable-composition
 794      If non-nil, composition/decomposition for combining characters
 795      are disabled.
 796
 797 'post-read-conversion
 798      Function called after a file has been read in, to perform the
 799      decoding.  Called with two arguments, START and END, denoting
 800      a region of the current buffer to be decoded.
 801
 802 'pre-write-conversion
 803      Function called before a file is written out, to perform the
 804      encoding.  Called with two arguments, START and END, denoting
 805      a region of the current buffer to be encoded.
 806
 807
 808 The following additional properties are recognized if TYPE is 'iso2022:
 809
 810 'charset-g0
 811 'charset-g1
 812 'charset-g2
 813 'charset-g3
 814      The character set initially designated to the G0 - G3 registers.
 815      The value should be one of
 816
 817           -- A charset object (designate that character set)
 818           -- nil (do not ever use this register)
 819           -- t (no character set is initially designated to
 820                 the register, but may be later on; this automatically
 821                 sets the corresponding `force-g*-on-output' property)
 822
 823 'force-g0-on-output
 824 'force-g1-on-output
 825 'force-g2-on-output
 826 'force-g2-on-output
 827      If non-nil, send an explicit designation sequence on output before
 828      using the specified register.
 829
 830 'short
 831      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 832      "ESC $ B" on output in place of the full designation sequences
 833      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 834
 835 'no-ascii-eol
 836      If non-nil, don't designate ASCII to G0 at each end of line on output.
 837      Setting this to non-nil also suppresses other state-resetting that
 838      normally happens at the end of a line.
 839
 840 'no-ascii-cntl
 841      If non-nil, don't designate ASCII to G0 before control chars on output.
 842
 843 'seven
 844      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 845      environment.
 846
 847 'lock-shift
 848      If non-nil, use locking-shift (SO/SI) instead of single-shift
 849      or designation by escape sequence.
 850
 851 'no-iso6429
 852      If non-nil, don't use ISO6429's direction specification.
 853
 854 'escape-quoted
 855      If non-nil, literal control characters that are the same as
 856      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 857      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 858      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 859      so that they can be properly distinguished from an escape sequence.
 860      (Note that doing this results in a non-portable encoding.) This
 861      encoding flag is used for byte-compiled files.  Note that ESC
 862      is a good choice for a quoting character because there are no
 863      escape sequences whose second byte is a character from the Control-0
 864      or Control-1 character sets; this is explicitly disallowed by the
 865      ISO2022 standard.
 866
 867 'input-charset-conversion
 868      A list of conversion specifications, specifying conversion of
 869      characters in one charset to another when decoding is performed.
 870      Each specification is a list of two elements: the source charset,
 871      and the destination charset.
 872
 873 'output-charset-conversion
 874      A list of conversion specifications, specifying conversion of
 875      characters in one charset to another when encoding is performed.
 876      The form of each specification is the same as for
 877      'input-charset-conversion.
 878
 879
 880 The following additional properties are recognized (and required)
 881 if TYPE is 'ccl:
 882
 883 'decode
 884      CCL program used for decoding (converting to internal format).
 885
 886 'encode
 887      CCL program used for encoding (converting to external format).
 888 */
 889        (name, type, doc_string, props))
 890 {
 891   Lisp_Coding_System *codesys;
 892   enum coding_system_type ty;
 893   int need_to_setup_eol_systems = 1;
 894
 895   /* Convert type to constant */
 896   if (NILP (type) || EQ (type, Qundecided))
 897                                       { ty = CODESYS_AUTODETECT; }
 898 #ifdef MULE
 899   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 900   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 901   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 902   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 903   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 904   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 905 #endif
 906   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 907 #ifdef DEBUG_XEMACS
 908   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 909 #endif
 910   else
 911     signal_simple_error ("Invalid coding system type", type);
 912
 913   CHECK_SYMBOL (name);
 914
 915   codesys = allocate_coding_system (ty, name);
 916
 917   if (NILP (doc_string))
 918     doc_string = build_string ("");
 919   else
 920     CHECK_STRING (doc_string);
 921   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 922
 923   {
 924     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 925       {
 926         if (EQ (key, Qmnemonic))
 927           {
 928             if (!NILP (value))
 929               CHECK_STRING (value);
 930             CODING_SYSTEM_MNEMONIC (codesys) = value;
 931           }
 932
 933         else if (EQ (key, Qeol_type))
 934           {
 935             need_to_setup_eol_systems = NILP (value);
 936             if (EQ (value, Qt))
 937               value = Qnil;
 938             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 939           }
 940
 941         else if (EQ (key, Qpost_read_conversion))
 942           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 943         else if (EQ (key, Qpre_write_conversion))
 944           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 945 #ifdef UTF2000
 946         else if (EQ (key, Qdisable_composition))
 947           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 948 #endif
 949 #ifdef MULE
 950         else if (ty == CODESYS_ISO2022)
 951           {
 952 #define FROB_INITIAL_CHARSET(charset_num) \
 953   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 954     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 955
 956             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 957             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 958             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 959             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 960
 961 #define FROB_FORCE_CHARSET(charset_num) \
 962   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 963
 964             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 965             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 966             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 967             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 968
 969 #define FROB_BOOLEAN_PROPERTY(prop) \
 970   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 971
 972             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 973             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 974             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 975             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 976             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 977             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 978             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 979
 980             else if (EQ (key, Qinput_charset_conversion))
 981               {
 982                 codesys->iso2022.input_conv =
 983                   Dynarr_new (charset_conversion_spec);
 984                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 985                                                 value);
 986               }
 987             else if (EQ (key, Qoutput_charset_conversion))
 988               {
 989                 codesys->iso2022.output_conv =
 990                   Dynarr_new (charset_conversion_spec);
 991                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 992                                                 value);
 993               }
 994             else
 995               signal_simple_error ("Unrecognized property", key);
 996           }
 997         else if (EQ (type, Qccl))
 998           {
 999             Lisp_Object sym;
1000             struct ccl_program test_ccl;
1001             Extbyte *suffix;
1002
1003             /* Check key first.  */
1004             if (EQ (key, Qdecode))
1005               suffix = "-ccl-decode";
1006             else if (EQ (key, Qencode))
1007               suffix = "-ccl-encode";
1008             else
1009               signal_simple_error ("Unrecognized property", key);
1010
1011             /* If value is vector, register it as a ccl program
1012                associated with an newly created symbol for
1013                backward compatibility.  */
1014             if (VECTORP (value))
1015               {
1016                 sym = Fintern (concat2 (Fsymbol_name (name),
1017                                         build_string (suffix)),
1018                                Qnil);
1019                 Fregister_ccl_program (sym, value);
1020               }
1021             else
1022               {
1023                 CHECK_SYMBOL (value);
1024                 sym = value;
1025               }
1026             /* check if the given ccl programs are valid.  */
1027             if (setup_ccl_program (&test_ccl, sym) < 0)
1028               signal_simple_error ("Invalid CCL program", value);
1029
1030             if (EQ (key, Qdecode))
1031               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1032             else if (EQ (key, Qencode))
1033               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1034
1035           }
1036 #endif /* MULE */
1037         else
1038           signal_simple_error ("Unrecognized property", key);
1039       }
1040   }
1041
1042   if (need_to_setup_eol_systems)
1043     setup_eol_coding_systems (codesys);
1044
1045   {
1046     Lisp_Object codesys_obj;
1047     XSETCODING_SYSTEM (codesys_obj, codesys);
1048     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1049     return codesys_obj;
1050   }
1051 }
1052
1053 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1054 Copy OLD-CODING-SYSTEM to NEW-NAME.
1055 If NEW-NAME does not name an existing coding system, a new one will
1056 be created.
1057 */
1058        (old_coding_system, new_name))
1059 {
1060   Lisp_Object new_coding_system;
1061   old_coding_system = Fget_coding_system (old_coding_system);
1062   new_coding_system = Ffind_coding_system (new_name);
1063   if (NILP (new_coding_system))
1064     {
1065       XSETCODING_SYSTEM (new_coding_system,
1066                          allocate_coding_system
1067                          (XCODING_SYSTEM_TYPE (old_coding_system),
1068                           new_name));
1069       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1070     }
1071
1072   {
1073     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1074     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1075     memcpy (((char *) to  ) + sizeof (to->header),
1076             ((char *) from) + sizeof (from->header),
1077             sizeof (*from) - sizeof (from->header));
1078     to->name = new_name;
1079   }
1080   return new_coding_system;
1081 }
1082
1083 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1084 Return t if OBJECT names a coding system, and is not a coding system alias.
1085 */
1086        (object))
1087 {
1088   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1089     ? Qt : Qnil;
1090 }
1091
1092 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1093 Return t if OBJECT is a coding system alias.
1094 All coding system aliases are created by `define-coding-system-alias'.
1095 */
1096        (object))
1097 {
1098   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1099     ? Qt : Qnil;
1100 }
1101
1102 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1103 Return the coding-system symbol for which symbol ALIAS is an alias.
1104 */
1105        (alias))
1106 {
1107   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1108   if (SYMBOLP (aliasee))
1109     return aliasee;
1110   else
1111     signal_simple_error ("Symbol is not a coding system alias", alias);
1112   return Qnil;          /* To keep the compiler happy */
1113 }
1114
1115 static Lisp_Object
1116 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1117 {
1118   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1119                   Qnil);
1120 }
1121
1122 /* A maphash function, for removing dangling coding system aliases. */
1123 static int
1124 dangling_coding_system_alias_p (Lisp_Object alias,
1125                                 Lisp_Object aliasee,
1126                                 void *dangling_aliases)
1127 {
1128   if (SYMBOLP (aliasee)
1129       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1130     {
1131       (*(int *) dangling_aliases)++;
1132       return 1;
1133     }
1134   else
1135     return 0;
1136 }
1137
1138 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1139 Define symbol ALIAS as an alias for coding system ALIASEE.
1140
1141 You can use this function to redefine an alias that has already been defined,
1142 but you cannot redefine a name which is the canonical name for a coding system.
1143 \(a canonical name of a coding system is what is returned when you call
1144 `coding-system-name' on a coding system).
1145
1146 ALIASEE itself can be an alias, which allows you to define nested aliases.
1147
1148 You are forbidden, however, from creating alias loops or `dangling' aliases.
1149 These will be detected, and an error will be signaled if you attempt to do so.
1150
1151 If ALIASEE is nil, then ALIAS will simply be undefined.
1152
1153 See also `coding-system-alias-p', `coding-system-aliasee',
1154 and `coding-system-canonical-name-p'.
1155 */
1156        (alias, aliasee))
1157 {
1158   Lisp_Object real_coding_system, probe;
1159
1160   CHECK_SYMBOL (alias);
1161
1162   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1163     signal_simple_error
1164       ("Symbol is the canonical name of a coding system and cannot be redefined",
1165        alias);
1166
1167   if (NILP (aliasee))
1168     {
1169       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1170       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1171       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1172
1173       Fremhash (alias, Vcoding_system_hash_table);
1174
1175       /* Undefine subsidiary aliases,
1176          presumably created by a previous call to this function */
1177       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1178           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1179           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1180         {
1181           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1182           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1183           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1184         }
1185
1186       /* Undefine dangling coding system aliases. */
1187       {
1188         int dangling_aliases;
1189
1190         do {
1191           dangling_aliases = 0;
1192           elisp_map_remhash (dangling_coding_system_alias_p,
1193                              Vcoding_system_hash_table,
1194                              &dangling_aliases);
1195         } while (dangling_aliases > 0);
1196       }
1197
1198       return Qnil;
1199     }
1200
1201   if (CODING_SYSTEMP (aliasee))
1202     aliasee = XCODING_SYSTEM_NAME (aliasee);
1203
1204   /* Checks that aliasee names a coding-system */
1205   real_coding_system = Fget_coding_system (aliasee);
1206
1207   /* Check for coding system alias loops */
1208   if (EQ (alias, aliasee))
1209     alias_loop: signal_simple_error_2
1210       ("Attempt to create a coding system alias loop", alias, aliasee);
1211
1212   for (probe = aliasee;
1213        SYMBOLP (probe);
1214        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1215     {
1216       if (EQ (probe, alias))
1217         goto alias_loop;
1218     }
1219
1220   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1221
1222   /* Set up aliases for subsidiaries.
1223      #### There must be a better way to handle subsidiary coding systems. */
1224   {
1225     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1226     int i;
1227     for (i = 0; i < countof (suffixes); i++)
1228       {
1229         Lisp_Object alias_subsidiary =
1230           append_suffix_to_symbol (alias, suffixes[i]);
1231         Lisp_Object aliasee_subsidiary =
1232           append_suffix_to_symbol (aliasee, suffixes[i]);
1233
1234         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1235           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1236       }
1237   }
1238   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1239      but it doesn't look intentional, so I'd rather return something
1240      meaningful or nothing at all. */
1241   return Qnil;
1242 }
1243
1244 static Lisp_Object
1245 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1246 {
1247   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1248   Lisp_Object new_coding_system;
1249
1250   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1251     return coding_system;
1252
1253   switch (type)
1254     {
1255     case EOL_AUTODETECT: return coding_system;
1256     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1257     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1258     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1259     default:       abort (); return Qnil;
1260     }
1261
1262   return NILP (new_coding_system) ? coding_system : new_coding_system;
1263 }
1264
1265 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1266 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1267 */
1268        (coding_system, eol_type))
1269 {
1270   coding_system = Fget_coding_system (coding_system);
1271
1272   return subsidiary_coding_system (coding_system,
1273                                    symbol_to_eol_type (eol_type));
1274 }
1275
1276 \f
1277 /************************************************************************/
1278 /*                         Coding system accessors                      */
1279 /************************************************************************/
1280
1281 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1282 Return the doc string for CODING-SYSTEM.
1283 */
1284        (coding_system))
1285 {
1286   coding_system = Fget_coding_system (coding_system);
1287   return XCODING_SYSTEM_DOC_STRING (coding_system);
1288 }
1289
1290 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1291 Return the type of CODING-SYSTEM.
1292 */
1293        (coding_system))
1294 {
1295   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1296     {
1297     default: abort ();
1298     case CODESYS_AUTODETECT:    return Qundecided;
1299 #ifdef MULE
1300     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1301     case CODESYS_ISO2022:       return Qiso2022;
1302     case CODESYS_BIG5:          return Qbig5;
1303     case CODESYS_UCS4:          return Qucs4;
1304     case CODESYS_UTF8:          return Qutf8;
1305     case CODESYS_CCL:           return Qccl;
1306 #endif
1307     case CODESYS_NO_CONVERSION: return Qno_conversion;
1308 #ifdef DEBUG_XEMACS
1309     case CODESYS_INTERNAL:      return Qinternal;
1310 #endif
1311     }
1312 }
1313
1314 #ifdef MULE
1315 static
1316 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1317 {
1318   Lisp_Object cs
1319     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1320
1321   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1322 }
1323
1324 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1325 Return initial charset of CODING-SYSTEM designated to GNUM.
1326 GNUM allows 0 .. 3.
1327 */
1328        (coding_system, gnum))
1329 {
1330   coding_system = Fget_coding_system (coding_system);
1331   CHECK_INT (gnum);
1332
1333   return coding_system_charset (coding_system, XINT (gnum));
1334 }
1335 #endif /* MULE */
1336
1337 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1338 Return the PROP property of CODING-SYSTEM.
1339 */
1340        (coding_system, prop))
1341 {
1342   int i, ok = 0;
1343   enum coding_system_type type;
1344
1345   coding_system = Fget_coding_system (coding_system);
1346   CHECK_SYMBOL (prop);
1347   type = XCODING_SYSTEM_TYPE (coding_system);
1348
1349   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1350     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1351       {
1352         ok = 1;
1353         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1354           {
1355           case CODESYS_PROP_ALL_OK:
1356             break;
1357 #ifdef MULE
1358           case CODESYS_PROP_ISO2022:
1359             if (type != CODESYS_ISO2022)
1360               signal_simple_error
1361                 ("Property only valid in ISO2022 coding systems",
1362                  prop);
1363             break;
1364
1365           case CODESYS_PROP_CCL:
1366             if (type != CODESYS_CCL)
1367               signal_simple_error
1368                 ("Property only valid in CCL coding systems",
1369                  prop);
1370             break;
1371 #endif /* MULE */
1372           default:
1373             abort ();
1374           }
1375       }
1376
1377   if (!ok)
1378     signal_simple_error ("Unrecognized property", prop);
1379
1380   if (EQ (prop, Qname))
1381     return XCODING_SYSTEM_NAME (coding_system);
1382   else if (EQ (prop, Qtype))
1383     return Fcoding_system_type (coding_system);
1384   else if (EQ (prop, Qdoc_string))
1385     return XCODING_SYSTEM_DOC_STRING (coding_system);
1386   else if (EQ (prop, Qmnemonic))
1387     return XCODING_SYSTEM_MNEMONIC (coding_system);
1388   else if (EQ (prop, Qeol_type))
1389     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1390   else if (EQ (prop, Qeol_lf))
1391     return XCODING_SYSTEM_EOL_LF (coding_system);
1392   else if (EQ (prop, Qeol_crlf))
1393     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1394   else if (EQ (prop, Qeol_cr))
1395     return XCODING_SYSTEM_EOL_CR (coding_system);
1396   else if (EQ (prop, Qpost_read_conversion))
1397     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1398   else if (EQ (prop, Qpre_write_conversion))
1399     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1400 #ifdef MULE
1401   else if (type == CODESYS_ISO2022)
1402     {
1403       if (EQ (prop, Qcharset_g0))
1404         return coding_system_charset (coding_system, 0);
1405       else if (EQ (prop, Qcharset_g1))
1406         return coding_system_charset (coding_system, 1);
1407       else if (EQ (prop, Qcharset_g2))
1408         return coding_system_charset (coding_system, 2);
1409       else if (EQ (prop, Qcharset_g3))
1410         return coding_system_charset (coding_system, 3);
1411
1412 #define FORCE_CHARSET(charset_num) \
1413   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1414    (coding_system, charset_num) ? Qt : Qnil)
1415
1416       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1417       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1418       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1419       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1420
1421 #define LISP_BOOLEAN(prop) \
1422   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1423
1424       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1425       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1426       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1427       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1428       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1429       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1430       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1431
1432       else if (EQ (prop, Qinput_charset_conversion))
1433         return
1434           unparse_charset_conversion_specs
1435             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1436       else if (EQ (prop, Qoutput_charset_conversion))
1437         return
1438           unparse_charset_conversion_specs
1439             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1440       else
1441         abort ();
1442     }
1443   else if (type == CODESYS_CCL)
1444     {
1445       if (EQ (prop, Qdecode))
1446         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1447       else if (EQ (prop, Qencode))
1448         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1449       else
1450         abort ();
1451     }
1452 #endif /* MULE */
1453   else
1454     abort ();
1455
1456   return Qnil; /* not reached */
1457 }
1458
1459 \f
1460 /************************************************************************/
1461 /*                       Coding category functions                      */
1462 /************************************************************************/
1463
1464 static int
1465 decode_coding_category (Lisp_Object symbol)
1466 {
1467   int i;
1468
1469   CHECK_SYMBOL (symbol);
1470   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1471     if (EQ (coding_category_symbol[i], symbol))
1472       return i;
1473
1474   signal_simple_error ("Unrecognized coding category", symbol);
1475   return 0; /* not reached */
1476 }
1477
1478 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1479 Return a list of all recognized coding categories.
1480 */
1481        ())
1482 {
1483   int i;
1484   Lisp_Object list = Qnil;
1485
1486   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1487     list = Fcons (coding_category_symbol[i], list);
1488   return list;
1489 }
1490
1491 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1492 Change the priority order of the coding categories.
1493 LIST should be list of coding categories, in descending order of
1494 priority.  Unspecified coding categories will be lower in priority
1495 than all specified ones, in the same relative order they were in
1496 previously.
1497 */
1498        (list))
1499 {
1500   int category_to_priority[CODING_CATEGORY_LAST + 1];
1501   int i, j;
1502   Lisp_Object rest;
1503
1504   /* First generate a list that maps coding categories to priorities. */
1505
1506   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1507     category_to_priority[i] = -1;
1508
1509   /* Highest priority comes from the specified list. */
1510   i = 0;
1511   EXTERNAL_LIST_LOOP (rest, list)
1512     {
1513       int cat = decode_coding_category (XCAR (rest));
1514
1515       if (category_to_priority[cat] >= 0)
1516         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1517       category_to_priority[cat] = i++;
1518     }
1519
1520   /* Now go through the existing categories by priority to retrieve
1521      the categories not yet specified and preserve their priority
1522      order. */
1523   for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1524     {
1525       int cat = fcd->coding_category_by_priority[j];
1526       if (category_to_priority[cat] < 0)
1527         category_to_priority[cat] = i++;
1528     }
1529
1530   /* Now we need to construct the inverse of the mapping we just
1531      constructed. */
1532
1533   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1534     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1535
1536   /* Phew!  That was confusing. */
1537   return Qnil;
1538 }
1539
1540 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1541 Return a list of coding categories in descending order of priority.
1542 */
1543        ())
1544 {
1545   int i;
1546   Lisp_Object list = Qnil;
1547
1548   for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1549     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1550                   list);
1551   return list;
1552 }
1553
1554 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1555 Change the coding system associated with a coding category.
1556 */
1557        (coding_category, coding_system))
1558 {
1559   int cat = decode_coding_category (coding_category);
1560
1561   coding_system = Fget_coding_system (coding_system);
1562   fcd->coding_category_system[cat] = coding_system;
1563   return Qnil;
1564 }
1565
1566 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1567 Return the coding system associated with a coding category.
1568 */
1569        (coding_category))
1570 {
1571   int cat = decode_coding_category (coding_category);
1572   Lisp_Object sys = fcd->coding_category_system[cat];
1573
1574   if (!NILP (sys))
1575     return XCODING_SYSTEM_NAME (sys);
1576   return Qnil;
1577 }
1578
1579 \f
1580 /************************************************************************/
1581 /*                     Detecting the encoding of data                   */
1582 /************************************************************************/
1583
1584 struct detection_state
1585 {
1586   eol_type_t eol_type;
1587   int seen_non_ascii;
1588   int mask;
1589 #ifdef MULE
1590   struct
1591     {
1592       int mask;
1593       int in_second_byte;
1594     }
1595   big5;
1596
1597   struct
1598     {
1599       int mask;
1600       int in_second_byte;
1601     }
1602   shift_jis;
1603
1604   struct
1605     {
1606       int mask;
1607       int in_byte;
1608   }
1609   ucs4;
1610
1611   struct
1612     {
1613       int mask;
1614       int in_byte;
1615     }
1616   utf8;
1617
1618   struct
1619     {
1620       int mask;
1621       int initted;
1622       struct iso2022_decoder iso;
1623       unsigned int flags;
1624       int high_byte_count;
1625       unsigned int saw_single_shift:1;
1626     }
1627   iso2022;
1628 #endif
1629   struct
1630     {
1631       int seen_anything;
1632       int just_saw_cr;
1633     }
1634   eol;
1635 };
1636
1637 static int
1638 acceptable_control_char_p (int c)
1639 {
1640   switch (c)
1641     {
1642       /* Allow and ignore control characters that you might
1643          reasonably see in a text file */
1644     case '\r':
1645     case '\n':
1646     case '\t':
1647     case  7: /* bell */
1648     case  8: /* backspace */
1649     case 11: /* vertical tab */
1650     case 12: /* form feed */
1651     case 26: /* MS-DOS C-z junk */
1652     case 31: /* '^_' -- for info */
1653       return 1;
1654     default:
1655       return 0;
1656     }
1657 }
1658
1659 static int
1660 mask_has_at_most_one_bit_p (int mask)
1661 {
1662   /* Perhaps the only thing useful you learn from intensive Microsoft
1663      technical interviews */
1664   return (mask & (mask - 1)) == 0;
1665 }
1666
1667 static eol_type_t
1668 detect_eol_type (struct detection_state *st, const Extbyte *src,
1669                  size_t n)
1670 {
1671   while (n--)
1672     {
1673       unsigned char c = *(unsigned char *)src++;
1674       if (c == '\n')
1675         {
1676           if (st->eol.just_saw_cr)
1677             return EOL_CRLF;
1678           else if (st->eol.seen_anything)
1679             return EOL_LF;
1680         }
1681       else if (st->eol.just_saw_cr)
1682         return EOL_CR;
1683       else if (c == '\r')
1684         st->eol.just_saw_cr = 1;
1685       else
1686         st->eol.just_saw_cr = 0;
1687       st->eol.seen_anything = 1;
1688     }
1689
1690   return EOL_AUTODETECT;
1691 }
1692
1693 /* Attempt to determine the encoding and EOL type of the given text.
1694    Before calling this function for the first type, you must initialize
1695    st->eol_type as appropriate and initialize st->mask to ~0.
1696
1697    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1698    not yet known.
1699
1700    st->mask holds the determined coding category mask, or ~0 if only
1701    ASCII has been seen so far.
1702
1703    Returns:
1704
1705    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1706         is present in st->mask
1707    1 == definitive answers are here for both st->eol_type and st->mask
1708 */
1709
1710 static int
1711 detect_coding_type (struct detection_state *st, const Extbyte *src,
1712                     size_t n, int just_do_eol)
1713 {
1714   if (st->eol_type == EOL_AUTODETECT)
1715     st->eol_type = detect_eol_type (st, src, n);
1716
1717   if (just_do_eol)
1718     return st->eol_type != EOL_AUTODETECT;
1719
1720   if (!st->seen_non_ascii)
1721     {
1722       for (; n; n--, src++)
1723         {
1724           unsigned char c = *(unsigned char *) src;
1725           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1726             {
1727               st->seen_non_ascii = 1;
1728 #ifdef MULE
1729               st->shift_jis.mask = ~0;
1730               st->big5.mask = ~0;
1731               st->ucs4.mask = ~0;
1732               st->utf8.mask = ~0;
1733               st->iso2022.mask = ~0;
1734 #endif
1735               break;
1736             }
1737         }
1738     }
1739
1740   if (!n)
1741     return 0;
1742 #ifdef MULE
1743   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1744     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1745   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1746     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1747   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1748     st->big5.mask = detect_coding_big5 (st, src, n);
1749   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1750     st->utf8.mask = detect_coding_utf8 (st, src, n);
1751   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1752     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1753
1754   st->mask
1755     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1756     | st->utf8.mask | st->ucs4.mask;
1757 #endif
1758   {
1759     int retval = mask_has_at_most_one_bit_p (st->mask);
1760     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1761     return retval && st->eol_type != EOL_AUTODETECT;
1762   }
1763 }
1764
1765 static Lisp_Object
1766 coding_system_from_mask (int mask)
1767 {
1768   if (mask == ~0)
1769     {
1770       /* If the file was entirely or basically ASCII, use the
1771          default value of `buffer-file-coding-system'. */
1772       Lisp_Object retval =
1773         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1774       if (!NILP (retval))
1775         {
1776           retval = Ffind_coding_system (retval);
1777           if (NILP (retval))
1778             {
1779               warn_when_safe
1780                 (Qbad_variable, Qwarning,
1781                  "Invalid `default-buffer-file-coding-system', set to nil");
1782               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1783             }
1784         }
1785       if (NILP (retval))
1786         retval = Fget_coding_system (Qraw_text);
1787       return retval;
1788     }
1789   else
1790     {
1791       int i;
1792       int cat = -1;
1793 #ifdef MULE
1794       mask = postprocess_iso2022_mask (mask);
1795 #endif
1796       /* Look through the coding categories by priority and find
1797          the first one that is allowed. */
1798       for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1799         {
1800           cat = fcd->coding_category_by_priority[i];
1801           if ((mask & (1 << cat)) &&
1802               !NILP (fcd->coding_category_system[cat]))
1803             break;
1804         }
1805       if (cat >= 0)
1806         return fcd->coding_category_system[cat];
1807       else
1808         return Fget_coding_system (Qraw_text);
1809     }
1810 }
1811
1812 /* Given a seekable read stream and potential coding system and EOL type
1813    as specified, do any autodetection that is called for.  If the
1814    coding system and/or EOL type are not `autodetect', they will be left
1815    alone; but this function will never return an autodetect coding system
1816    or EOL type.
1817
1818    This function does not automatically fetch subsidiary coding systems;
1819    that should be unnecessary with the explicit eol-type argument. */
1820
1821 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1822
1823 void
1824 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1825                               eol_type_t *eol_type_in_out)
1826 {
1827   struct detection_state decst;
1828
1829   if (*eol_type_in_out == EOL_AUTODETECT)
1830     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1831
1832   xzero (decst);
1833   decst.eol_type = *eol_type_in_out;
1834   decst.mask = ~0;
1835
1836   /* If autodetection is called for, do it now. */
1837   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1838       || *eol_type_in_out == EOL_AUTODETECT)
1839     {
1840       Extbyte buf[4096];
1841       Lisp_Object coding_system = Qnil;
1842       Extbyte *p;
1843       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1844       Extbyte *scan_end;
1845
1846       /* Look for initial "-*-"; mode line prefix */
1847       for (p = buf,
1848              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1849            p <= scan_end
1850              && *p != '\n'
1851              && *p != '\r';
1852            p++)
1853         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1854           {
1855             Extbyte *local_vars_beg = p + 3;
1856             /* Look for final "-*-"; mode line suffix */
1857             for (p = local_vars_beg,
1858                    scan_end = buf + nread - LENGTH ("-*-");
1859                  p <= scan_end
1860                    && *p != '\n'
1861                    && *p != '\r';
1862                  p++)
1863               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1864                 {
1865                   Extbyte *suffix = p;
1866                   /* Look for "coding:" */
1867                   for (p = local_vars_beg,
1868                          scan_end = suffix - LENGTH ("coding:?");
1869                        p <= scan_end;
1870                        p++)
1871                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1872                         && (p == local_vars_beg
1873                             || (*(p-1) == ' '  ||
1874                                 *(p-1) == '\t' ||
1875                                 *(p-1) == ';')))
1876                       {
1877                         Extbyte save;
1878                         int n;
1879                         p += LENGTH ("coding:");
1880                         while (*p == ' ' || *p == '\t') p++;
1881
1882                         /* Get coding system name */
1883                         save = *suffix; *suffix = '\0';
1884                         /* Characters valid in a MIME charset name (rfc 1521),
1885                            and in a Lisp symbol name. */
1886                         n = strspn ( (char *) p,
1887                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1888                                     "abcdefghijklmnopqrstuvwxyz"
1889                                     "0123456789"
1890                                     "!$%&*+-.^_{|}~");
1891                         *suffix = save;
1892                         if (n > 0)
1893                           {
1894                             save = p[n]; p[n] = '\0';
1895                             coding_system =
1896                               Ffind_coding_system (intern ((char *) p));
1897                             p[n] = save;
1898                           }
1899                         break;
1900                       }
1901                   break;
1902                 }
1903             break;
1904           }
1905
1906       if (NILP (coding_system))
1907         do
1908           {
1909             if (detect_coding_type (&decst, buf, nread,
1910                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1911                                     != CODESYS_AUTODETECT))
1912               break;
1913             nread = Lstream_read (stream, buf, sizeof (buf));
1914             if (nread == 0)
1915               break;
1916           }
1917         while (1);
1918
1919       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1920                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1921         do
1922           {
1923             if (detect_coding_type (&decst, buf, nread, 1))
1924               break;
1925             nread = Lstream_read (stream, buf, sizeof (buf));
1926             if (!nread)
1927               break;
1928           }
1929         while (1);
1930
1931       *eol_type_in_out = decst.eol_type;
1932       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1933         {
1934           if (NILP (coding_system))
1935             *codesys_in_out = coding_system_from_mask (decst.mask);
1936           else
1937             *codesys_in_out = coding_system;
1938         }
1939     }
1940
1941   /* If we absolutely can't determine the EOL type, just assume LF. */
1942   if (*eol_type_in_out == EOL_AUTODETECT)
1943     *eol_type_in_out = EOL_LF;
1944
1945   Lstream_rewind (stream);
1946 }
1947
1948 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1949 Detect coding system of the text in the region between START and END.
1950 Return a list of possible coding systems ordered by priority.
1951 If only ASCII characters are found, return 'undecided or one of
1952 its subsidiary coding systems according to a detected end-of-line
1953 type.  Optional arg BUFFER defaults to the current buffer.
1954 */
1955        (start, end, buffer))
1956 {
1957   Lisp_Object val = Qnil;
1958   struct buffer *buf = decode_buffer (buffer, 0);
1959   Bufpos b, e;
1960   Lisp_Object instream, lb_instream;
1961   Lstream *istr, *lb_istr;
1962   struct detection_state decst;
1963   struct gcpro gcpro1, gcpro2;
1964
1965   get_buffer_range_char (buf, start, end, &b, &e, 0);
1966   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1967   lb_istr = XLSTREAM (lb_instream);
1968   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1969   istr = XLSTREAM (instream);
1970   GCPRO2 (instream, lb_instream);
1971   xzero (decst);
1972   decst.eol_type = EOL_AUTODETECT;
1973   decst.mask = ~0;
1974   while (1)
1975     {
1976       Extbyte random_buffer[4096];
1977       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1978
1979       if (!nread)
1980         break;
1981       if (detect_coding_type (&decst, random_buffer, nread, 0))
1982         break;
1983     }
1984
1985   if (decst.mask == ~0)
1986     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1987                                     decst.eol_type);
1988   else
1989     {
1990       int i;
1991
1992       val = Qnil;
1993 #ifdef MULE
1994       decst.mask = postprocess_iso2022_mask (decst.mask);
1995 #endif
1996       for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1997         {
1998           int sys = fcd->coding_category_by_priority[i];
1999           if (decst.mask & (1 << sys))
2000             {
2001               Lisp_Object codesys = fcd->coding_category_system[sys];
2002               if (!NILP (codesys))
2003                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2004               val = Fcons (codesys, val);
2005             }
2006         }
2007     }
2008   Lstream_close (istr);
2009   UNGCPRO;
2010   Lstream_delete (istr);
2011   Lstream_delete (lb_istr);
2012   return val;
2013 }
2014
2015 \f
2016 /************************************************************************/
2017 /*           Converting to internal Mule format ("decoding")            */
2018 /************************************************************************/
2019
2020 /* A decoding stream is a stream used for decoding text (i.e.
2021    converting from some external format to internal format).
2022    The decoding-stream object keeps track of the actual coding
2023    stream, the stream that is at the other end, and data that
2024    needs to be persistent across the lifetime of the stream. */
2025
2026 /* Handle the EOL stuff related to just-read-in character C.
2027    EOL_TYPE is the EOL type of the coding stream.
2028    FLAGS is the current value of FLAGS in the coding stream, and may
2029    be modified by this macro.  (The macro only looks at the
2030    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2031    bytes are to be written.  You need to also define a local goto
2032    label "label_continue_loop" that is at the end of the main
2033    character-reading loop.
2034
2035    If C is a CR character, then this macro handles it entirely and
2036    jumps to label_continue_loop.  Otherwise, this macro does not add
2037    anything to DST, and continues normally.  You should continue
2038    processing C normally after this macro. */
2039
2040 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2041 do {                                                            \
2042   if (c == '\r')                                                \
2043     {                                                           \
2044       if (eol_type == EOL_CR)                                   \
2045         Dynarr_add (dst, '\n');                                 \
2046       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2047         Dynarr_add (dst, c);                                    \
2048       else                                                      \
2049         flags |= CODING_STATE_CR;                               \
2050       goto label_continue_loop;                                 \
2051     }                                                           \
2052   else if (flags & CODING_STATE_CR)                             \
2053     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2054       if (c != '\n')                                            \
2055         Dynarr_add (dst, '\r');                                 \
2056       flags &= ~CODING_STATE_CR;                                \
2057     }                                                           \
2058 } while (0)
2059
2060 /* C should be a binary character in the range 0 - 255; convert
2061    to internal format and add to Dynarr DST. */
2062
2063 #ifdef UTF2000
2064 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2065 do {                                            \
2066   if (BYTE_ASCII_P (c))                         \
2067     Dynarr_add (dst, c);                        \
2068   else                                          \
2069     {                                           \
2070       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2071       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2072     }                                           \
2073 } while (0)
2074
2075 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2076 INLINE_HEADER void
2077 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2078 {
2079   if ( c <= 0x7f )
2080     {
2081       Dynarr_add (dst, c);
2082     }
2083   else if ( c <= 0x7ff )
2084     {
2085       Dynarr_add (dst, (c >> 6) | 0xc0);
2086       Dynarr_add (dst, (c & 0x3f) | 0x80);
2087     }
2088   else if ( c <= 0xffff )
2089     {
2090       Dynarr_add (dst,  (c >> 12) | 0xe0);
2091       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2092       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2093     }
2094   else if ( c <= 0x1fffff )
2095     {
2096       Dynarr_add (dst,  (c >> 18) | 0xf0);
2097       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2098       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2099       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2100     }
2101   else if ( c <= 0x3ffffff )
2102     {
2103       Dynarr_add (dst,  (c >> 24) | 0xf8);
2104       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2105       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2106       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2107       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2108     }
2109   else
2110     {
2111       Dynarr_add (dst,  (c >> 30) | 0xfc);
2112       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2113       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2114       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2115       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2116       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2117     }
2118 }
2119 #else
2120 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2121 do {                                            \
2122   if (BYTE_ASCII_P (c))                         \
2123     Dynarr_add (dst, c);                        \
2124   else if (BYTE_C1_P (c))                       \
2125     {                                           \
2126       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2127       Dynarr_add (dst, c + 0x20);               \
2128     }                                           \
2129   else                                          \
2130     {                                           \
2131       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2132       Dynarr_add (dst, c);                      \
2133     }                                           \
2134 } while (0)
2135 #endif
2136
2137 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2138 do {                                    \
2139   if (ch)                               \
2140     {                                   \
2141       DECODE_ADD_BINARY_CHAR (ch, dst); \
2142       ch = 0;                           \
2143     }                                   \
2144 } while (0)
2145
2146 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2147 do {                                    \
2148   if (flags & CODING_STATE_END)         \
2149     {                                   \
2150       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2151       if (flags & CODING_STATE_CR)      \
2152         Dynarr_add (dst, '\r');         \
2153     }                                   \
2154 } while (0)
2155
2156 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2157
2158 struct decoding_stream
2159 {
2160   /* Coding system that governs the conversion. */
2161   Lisp_Coding_System *codesys;
2162
2163   /* Stream that we read the encoded data from or
2164      write the decoded data to. */
2165   Lstream *other_end;
2166
2167   /* If we are reading, then we can return only a fixed amount of
2168      data, so if the conversion resulted in too much data, we store it
2169      here for retrieval the next time around. */
2170   unsigned_char_dynarr *runoff;
2171
2172   /* FLAGS holds flags indicating the current state of the decoding.
2173      Some of these flags are dependent on the coding system. */
2174   unsigned int flags;
2175
2176   /* CPOS holds a partially built-up code-point of character. */
2177   unsigned int cpos;
2178
2179   /* EOL_TYPE specifies the type of end-of-line conversion that
2180      currently applies.  We need to keep this separate from the
2181      EOL type stored in CODESYS because the latter might indicate
2182      automatic EOL-type detection while the former will always
2183      indicate a particular EOL type. */
2184   eol_type_t eol_type;
2185 #ifdef MULE
2186   /* Additional ISO2022 information.  We define the structure above
2187      because it's also needed by the detection routines. */
2188   struct iso2022_decoder iso2022;
2189
2190   /* Additional information (the state of the running CCL program)
2191      used by the CCL decoder. */
2192   struct ccl_program ccl;
2193
2194   /* counter for UTF-8 or UCS-4 */
2195   unsigned char counter;
2196 #endif
2197 #ifdef UTF2000
2198   unsigned combined_char_count;
2199   Emchar combined_chars[16];
2200   Lisp_Object combining_table;
2201 #endif
2202   struct detection_state decst;
2203 };
2204
2205 #ifdef UTF2000
2206 extern Lisp_Object Vcharacter_composition_table;
2207
2208 INLINE_HEADER void
2209 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2210 INLINE_HEADER void
2211 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2212 {
2213   unsigned i;
2214
2215   for (i = 0; i < str->combined_char_count; i++)
2216     DECODE_ADD_UCS_CHAR (str->combined_chars[i], dst);
2217   str->combined_char_count = 0;
2218   str->combining_table = Qnil;
2219 }
2220
2221 void COMPOSE_ADD_CHAR(struct decoding_stream *str, Emchar character,
2222                       unsigned_char_dynarr* dst);
2223 void
2224 COMPOSE_ADD_CHAR(struct decoding_stream *str,
2225                  Emchar character, unsigned_char_dynarr* dst)
2226 {
2227   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2228     DECODE_ADD_UCS_CHAR (character, dst);
2229   else if (!CHAR_ID_TABLE_P (str->combining_table))
2230     {
2231       Lisp_Object ret
2232         = get_char_id_table (character, Vcharacter_composition_table);
2233
2234       if (NILP (ret))
2235         DECODE_ADD_UCS_CHAR (character, dst);
2236       else
2237         {
2238           str->combined_chars[0] = character;
2239           str->combined_char_count = 1;
2240           str->combining_table = ret;
2241         }
2242     }
2243   else
2244     {
2245       Lisp_Object ret
2246         = get_char_id_table (character, str->combining_table);
2247
2248       if (CHARP (ret))
2249         {
2250           Emchar char2 = XCHARVAL (ret);
2251           ret = get_char_id_table (char2, Vcharacter_composition_table);
2252           if (NILP (ret))
2253             {
2254               DECODE_ADD_UCS_CHAR (char2, dst);
2255               str->combined_char_count = 0;
2256               str->combining_table = Qnil;
2257             }
2258           else
2259             {
2260               str->combined_chars[0] = char2;
2261               str->combined_char_count = 1;
2262               str->combining_table = ret;
2263             }
2264         }
2265       else if (CHAR_ID_TABLE_P (ret))
2266         {
2267           str->combined_chars[str->combined_char_count++] = character;
2268           str->combining_table = ret;
2269         }
2270       else
2271         {
2272           COMPOSE_FLUSH_CHARS (str, dst);
2273           DECODE_ADD_UCS_CHAR (character, dst);
2274         }
2275     }
2276 }
2277 #else /* not UTF2000 */
2278 #define COMPOSE_FLUSH_CHARS(str, dst)
2279 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2280 #endif /* UTF2000 */
2281
2282 static ssize_t decoding_reader (Lstream *stream,
2283                                 unsigned char *data, size_t size);
2284 static ssize_t decoding_writer (Lstream *stream,
2285                                 const unsigned char *data, size_t size);
2286 static int decoding_rewinder   (Lstream *stream);
2287 static int decoding_seekable_p (Lstream *stream);
2288 static int decoding_flusher    (Lstream *stream);
2289 static int decoding_closer     (Lstream *stream);
2290
2291 static Lisp_Object decoding_marker (Lisp_Object stream);
2292
2293 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2294                                sizeof (struct decoding_stream));
2295
2296 static Lisp_Object
2297 decoding_marker (Lisp_Object stream)
2298 {
2299   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2300   Lisp_Object str_obj;
2301
2302   /* We do not need to mark the coding systems or charsets stored
2303      within the stream because they are stored in a global list
2304      and automatically marked. */
2305
2306   XSETLSTREAM (str_obj, str);
2307   mark_object (str_obj);
2308   if (str->imp->marker)
2309     return (str->imp->marker) (str_obj);
2310   else
2311     return Qnil;
2312 }
2313
2314 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2315    so we read data from the other end, decode it, and store it into DATA. */
2316
2317 static ssize_t
2318 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2319 {
2320   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2321   unsigned char *orig_data = data;
2322   ssize_t read_size;
2323   int error_occurred = 0;
2324
2325   /* We need to interface to mule_decode(), which expects to take some
2326      amount of data and store the result into a Dynarr.  We have
2327      mule_decode() store into str->runoff, and take data from there
2328      as necessary. */
2329
2330   /* We loop until we have enough data, reading chunks from the other
2331      end and decoding it. */
2332   while (1)
2333     {
2334       /* Take data from the runoff if we can.  Make sure to take at
2335          most SIZE bytes, and delete the data from the runoff. */
2336       if (Dynarr_length (str->runoff) > 0)
2337         {
2338           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2339           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2340           Dynarr_delete_many (str->runoff, 0, chunk);
2341           data += chunk;
2342           size -= chunk;
2343         }
2344
2345       if (size == 0)
2346         break; /* No more room for data */
2347
2348       if (str->flags & CODING_STATE_END)
2349         /* This means that on the previous iteration, we hit the EOF on
2350            the other end.  We loop once more so that mule_decode() can
2351            output any final stuff it may be holding, or any "go back
2352            to a sane state" escape sequences. (This latter makes sense
2353            during encoding.) */
2354         break;
2355
2356       /* Exhausted the runoff, so get some more.  DATA has at least
2357          SIZE bytes left of storage in it, so it's OK to read directly
2358          into it.  (We'll be overwriting above, after we've decoded it
2359          into the runoff.) */
2360       read_size = Lstream_read (str->other_end, data, size);
2361       if (read_size < 0)
2362         {
2363           error_occurred = 1;
2364           break;
2365         }
2366       if (read_size == 0)
2367         /* There might be some more end data produced in the translation.
2368            See the comment above. */
2369         str->flags |= CODING_STATE_END;
2370       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2371     }
2372
2373   if (data - orig_data == 0)
2374     return error_occurred ? -1 : 0;
2375   else
2376     return data - orig_data;
2377 }
2378
2379 static ssize_t
2380 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2381 {
2382   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2383   ssize_t retval;
2384
2385   /* Decode all our data into the runoff, and then attempt to write
2386      it all out to the other end.  Remove whatever chunk we succeeded
2387      in writing. */
2388   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2389   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2390                           Dynarr_length (str->runoff));
2391   if (retval > 0)
2392     Dynarr_delete_many (str->runoff, 0, retval);
2393   /* Do NOT return retval.  The return value indicates how much
2394      of the incoming data was written, not how many bytes were
2395      written. */
2396   return size;
2397 }
2398
2399 static void
2400 reset_decoding_stream (struct decoding_stream *str)
2401 {
2402 #ifdef MULE
2403   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2404     {
2405       Lisp_Object coding_system;
2406       XSETCODING_SYSTEM (coding_system, str->codesys);
2407       reset_iso2022 (coding_system, &str->iso2022);
2408     }
2409   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2410     {
2411       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2412     }
2413   str->counter = 0;
2414 #endif /* MULE */
2415 #ifdef UTF2000
2416   str->combined_char_count = 0;
2417   str->combining_table = Qnil;
2418 #endif
2419   str->flags = str->cpos = 0;
2420 }
2421
2422 static int
2423 decoding_rewinder (Lstream *stream)
2424 {
2425   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2426   reset_decoding_stream (str);
2427   Dynarr_reset (str->runoff);
2428   return Lstream_rewind (str->other_end);
2429 }
2430
2431 static int
2432 decoding_seekable_p (Lstream *stream)
2433 {
2434   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2435   return Lstream_seekable_p (str->other_end);
2436 }
2437
2438 static int
2439 decoding_flusher (Lstream *stream)
2440 {
2441   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2442   return Lstream_flush (str->other_end);
2443 }
2444
2445 static int
2446 decoding_closer (Lstream *stream)
2447 {
2448   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2449   if (stream->flags & LSTREAM_FL_WRITE)
2450     {
2451       str->flags |= CODING_STATE_END;
2452       decoding_writer (stream, 0, 0);
2453     }
2454   Dynarr_free (str->runoff);
2455 #ifdef MULE
2456 #ifdef ENABLE_COMPOSITE_CHARS
2457   if (str->iso2022.composite_chars)
2458     Dynarr_free (str->iso2022.composite_chars);
2459 #endif
2460 #endif
2461   return Lstream_close (str->other_end);
2462 }
2463
2464 Lisp_Object
2465 decoding_stream_coding_system (Lstream *stream)
2466 {
2467   Lisp_Object coding_system;
2468   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2469
2470   XSETCODING_SYSTEM (coding_system, str->codesys);
2471   return subsidiary_coding_system (coding_system, str->eol_type);
2472 }
2473
2474 void
2475 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2476 {
2477   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2478   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2479   str->codesys = cs;
2480   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2481     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2482   reset_decoding_stream (str);
2483 }
2484
2485 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2486    stream for writing, no automatic code detection will be performed.
2487    The reason for this is that automatic code detection requires a
2488    seekable input.  Things will also fail if you open a decoding
2489    stream for reading using a non-fully-specified coding system and
2490    a non-seekable input stream. */
2491
2492 static Lisp_Object
2493 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2494                         const char *mode)
2495 {
2496   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2497   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2498   Lisp_Object obj;
2499
2500   xzero (*str);
2501   str->other_end = stream;
2502   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2503   str->eol_type = EOL_AUTODETECT;
2504   if (!strcmp (mode, "r")
2505       && Lstream_seekable_p (stream))
2506     /* We can determine the coding system now. */
2507     determine_real_coding_system (stream, &codesys, &str->eol_type);
2508   set_decoding_stream_coding_system (lstr, codesys);
2509   str->decst.eol_type = str->eol_type;
2510   str->decst.mask = ~0;
2511   XSETLSTREAM (obj, lstr);
2512   return obj;
2513 }
2514
2515 Lisp_Object
2516 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2517 {
2518   return make_decoding_stream_1 (stream, codesys, "r");
2519 }
2520
2521 Lisp_Object
2522 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2523 {
2524   return make_decoding_stream_1 (stream, codesys, "w");
2525 }
2526
2527 /* Note: the decode_coding_* functions all take the same
2528    arguments as mule_decode(), which is to say some SRC data of
2529    size N, which is to be stored into dynamic array DST.
2530    DECODING is the stream within which the decoding is
2531    taking place, but no data is actually read from or
2532    written to that stream; that is handled in decoding_reader()
2533    or decoding_writer().  This allows the same functions to
2534    be used for both reading and writing. */
2535
2536 static void
2537 mule_decode (Lstream *decoding, const Extbyte *src,
2538              unsigned_char_dynarr *dst, size_t n)
2539 {
2540   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2541
2542   /* If necessary, do encoding-detection now.  We do this when
2543      we're a writing stream or a non-seekable reading stream,
2544      meaning that we can't just process the whole input,
2545      rewind, and start over. */
2546
2547   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2548       str->eol_type == EOL_AUTODETECT)
2549     {
2550       Lisp_Object codesys;
2551
2552       XSETCODING_SYSTEM (codesys, str->codesys);
2553       detect_coding_type (&str->decst, src, n,
2554                           CODING_SYSTEM_TYPE (str->codesys) !=
2555                           CODESYS_AUTODETECT);
2556       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2557           str->decst.mask != ~0)
2558         /* #### This is cheesy.  What we really ought to do is
2559            buffer up a certain amount of data so as to get a
2560            less random result. */
2561         codesys = coding_system_from_mask (str->decst.mask);
2562       str->eol_type = str->decst.eol_type;
2563       if (XCODING_SYSTEM (codesys) != str->codesys)
2564         {
2565           /* Preserve the CODING_STATE_END flag in case it was set.
2566              If we erase it, bad things might happen. */
2567           int was_end = str->flags & CODING_STATE_END;
2568           set_decoding_stream_coding_system (decoding, codesys);
2569           if (was_end)
2570             str->flags |= CODING_STATE_END;
2571         }
2572     }
2573
2574   switch (CODING_SYSTEM_TYPE (str->codesys))
2575     {
2576 #ifdef DEBUG_XEMACS
2577     case CODESYS_INTERNAL:
2578       Dynarr_add_many (dst, src, n);
2579       break;
2580 #endif
2581     case CODESYS_AUTODETECT:
2582       /* If we got this far and still haven't decided on the coding
2583          system, then do no conversion. */
2584     case CODESYS_NO_CONVERSION:
2585       decode_coding_no_conversion (decoding, src, dst, n);
2586       break;
2587 #ifdef MULE
2588     case CODESYS_SHIFT_JIS:
2589       decode_coding_sjis (decoding, src, dst, n);
2590       break;
2591     case CODESYS_BIG5:
2592       decode_coding_big5 (decoding, src, dst, n);
2593       break;
2594     case CODESYS_UCS4:
2595       decode_coding_ucs4 (decoding, src, dst, n);
2596       break;
2597     case CODESYS_UTF8:
2598       decode_coding_utf8 (decoding, src, dst, n);
2599       break;
2600     case CODESYS_CCL:
2601       str->ccl.last_block = str->flags & CODING_STATE_END;
2602       /* When applying ccl program to stream, MUST NOT set NULL
2603          pointer to src.  */
2604       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2605                   dst, n, 0, CCL_MODE_DECODING);
2606       break;
2607     case CODESYS_ISO2022:
2608       decode_coding_iso2022 (decoding, src, dst, n);
2609       break;
2610 #endif /* MULE */
2611     default:
2612       abort ();
2613     }
2614 }
2615
2616 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2617 Decode the text between START and END which is encoded in CODING-SYSTEM.
2618 This is useful if you've read in encoded text from a file without decoding
2619 it (e.g. you read in a JIS-formatted file but used the `binary' or
2620 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2621 Return length of decoded text.
2622 BUFFER defaults to the current buffer if unspecified.
2623 */
2624        (start, end, coding_system, buffer))
2625 {
2626   Bufpos b, e;
2627   struct buffer *buf = decode_buffer (buffer, 0);
2628   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2629   Lstream *istr, *ostr;
2630   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2631
2632   get_buffer_range_char (buf, start, end, &b, &e, 0);
2633
2634   barf_if_buffer_read_only (buf, b, e);
2635
2636   coding_system = Fget_coding_system (coding_system);
2637   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2638   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2639   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2640                                               coding_system);
2641   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2642                                            Fget_coding_system (Qbinary));
2643   istr = XLSTREAM (instream);
2644   ostr = XLSTREAM (outstream);
2645   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2646
2647   /* The chain of streams looks like this:
2648
2649      [BUFFER] <----- send through
2650                      ------> [ENCODE AS BINARY]
2651                              ------> [DECODE AS SPECIFIED]
2652                                      ------> [BUFFER]
2653    */
2654
2655   while (1)
2656     {
2657       char tempbuf[1024]; /* some random amount */
2658       Bufpos newpos, even_newer_pos;
2659       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2660       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2661
2662       if (!size_in_bytes)
2663         break;
2664       newpos = lisp_buffer_stream_startpos (istr);
2665       Lstream_write (ostr, tempbuf, size_in_bytes);
2666       even_newer_pos = lisp_buffer_stream_startpos (istr);
2667       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2668                            even_newer_pos, 0);
2669     }
2670   Lstream_close (istr);
2671   Lstream_close (ostr);
2672   UNGCPRO;
2673   Lstream_delete (istr);
2674   Lstream_delete (ostr);
2675   Lstream_delete (XLSTREAM (de_outstream));
2676   Lstream_delete (XLSTREAM (lb_outstream));
2677   return Qnil;
2678 }
2679
2680 \f
2681 /************************************************************************/
2682 /*           Converting to an external encoding ("encoding")            */
2683 /************************************************************************/
2684
2685 /* An encoding stream is an output stream.  When you create the
2686    stream, you specify the coding system that governs the encoding
2687    and another stream that the resulting encoded data is to be
2688    sent to, and then start sending data to it. */
2689
2690 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2691
2692 struct encoding_stream
2693 {
2694   /* Coding system that governs the conversion. */
2695   Lisp_Coding_System *codesys;
2696
2697   /* Stream that we read the encoded data from or
2698      write the decoded data to. */
2699   Lstream *other_end;
2700
2701   /* If we are reading, then we can return only a fixed amount of
2702      data, so if the conversion resulted in too much data, we store it
2703      here for retrieval the next time around. */
2704   unsigned_char_dynarr *runoff;
2705
2706   /* FLAGS holds flags indicating the current state of the encoding.
2707      Some of these flags are dependent on the coding system. */
2708   unsigned int flags;
2709
2710   /* CH holds a partially built-up character.  Since we only deal
2711      with one- and two-byte characters at the moment, we only use
2712      this to store the first byte of a two-byte character. */
2713   unsigned int ch;
2714 #ifdef MULE
2715   /* Additional information used by the ISO2022 encoder. */
2716   struct
2717     {
2718       /* CHARSET holds the character sets currently assigned to the G0
2719          through G3 registers.  It is initialized from the array
2720          INITIAL_CHARSET in CODESYS. */
2721       Lisp_Object charset[4];
2722
2723       /* Which registers are currently invoked into the left (GL) and
2724          right (GR) halves of the 8-bit encoding space? */
2725       int register_left, register_right;
2726
2727       /* Whether we need to explicitly designate the charset in the
2728          G? register before using it.  It is initialized from the
2729          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2730       unsigned char force_charset_on_output[4];
2731
2732       /* Other state variables that need to be preserved across
2733          invocations. */
2734       Lisp_Object current_charset;
2735       int current_half;
2736       int current_char_boundary;
2737     } iso2022;
2738
2739   void (*encode_char) (struct encoding_stream *str, Emchar c,
2740                        unsigned_char_dynarr *dst, unsigned int *flags);
2741   void (*finish) (struct encoding_stream *str,
2742                   unsigned_char_dynarr *dst, unsigned int *flags);
2743
2744   /* Additional information (the state of the running CCL program)
2745      used by the CCL encoder. */
2746   struct ccl_program ccl;
2747 #endif /* MULE */
2748 };
2749
2750 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2751 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2752                                 size_t size);
2753 static int encoding_rewinder   (Lstream *stream);
2754 static int encoding_seekable_p (Lstream *stream);
2755 static int encoding_flusher    (Lstream *stream);
2756 static int encoding_closer     (Lstream *stream);
2757
2758 static Lisp_Object encoding_marker (Lisp_Object stream);
2759
2760 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2761                                sizeof (struct encoding_stream));
2762
2763 static Lisp_Object
2764 encoding_marker (Lisp_Object stream)
2765 {
2766   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2767   Lisp_Object str_obj;
2768
2769   /* We do not need to mark the coding systems or charsets stored
2770      within the stream because they are stored in a global list
2771      and automatically marked. */
2772
2773   XSETLSTREAM (str_obj, str);
2774   mark_object (str_obj);
2775   if (str->imp->marker)
2776     return (str->imp->marker) (str_obj);
2777   else
2778     return Qnil;
2779 }
2780
2781 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2782    so we read data from the other end, encode it, and store it into DATA. */
2783
2784 static ssize_t
2785 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2786 {
2787   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2788   unsigned char *orig_data = data;
2789   ssize_t read_size;
2790   int error_occurred = 0;
2791
2792   /* We need to interface to mule_encode(), which expects to take some
2793      amount of data and store the result into a Dynarr.  We have
2794      mule_encode() store into str->runoff, and take data from there
2795      as necessary. */
2796
2797   /* We loop until we have enough data, reading chunks from the other
2798      end and encoding it. */
2799   while (1)
2800     {
2801       /* Take data from the runoff if we can.  Make sure to take at
2802          most SIZE bytes, and delete the data from the runoff. */
2803       if (Dynarr_length (str->runoff) > 0)
2804         {
2805           int chunk = min ((int) size, Dynarr_length (str->runoff));
2806           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2807           Dynarr_delete_many (str->runoff, 0, chunk);
2808           data += chunk;
2809           size -= chunk;
2810         }
2811
2812       if (size == 0)
2813         break; /* No more room for data */
2814
2815       if (str->flags & CODING_STATE_END)
2816         /* This means that on the previous iteration, we hit the EOF on
2817            the other end.  We loop once more so that mule_encode() can
2818            output any final stuff it may be holding, or any "go back
2819            to a sane state" escape sequences. (This latter makes sense
2820            during encoding.) */
2821         break;
2822
2823       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2824          left of storage in it, so it's OK to read directly into it.
2825          (We'll be overwriting above, after we've encoded it into the
2826          runoff.) */
2827       read_size = Lstream_read (str->other_end, data, size);
2828       if (read_size < 0)
2829         {
2830           error_occurred = 1;
2831           break;
2832         }
2833       if (read_size == 0)
2834         /* There might be some more end data produced in the translation.
2835            See the comment above. */
2836         str->flags |= CODING_STATE_END;
2837       mule_encode (stream, data, str->runoff, read_size);
2838     }
2839
2840   if (data == orig_data)
2841     return error_occurred ? -1 : 0;
2842   else
2843     return data - orig_data;
2844 }
2845
2846 static ssize_t
2847 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2848 {
2849   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2850   ssize_t retval;
2851
2852   /* Encode all our data into the runoff, and then attempt to write
2853      it all out to the other end.  Remove whatever chunk we succeeded
2854      in writing. */
2855   mule_encode (stream, data, str->runoff, size);
2856   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2857                           Dynarr_length (str->runoff));
2858   if (retval > 0)
2859     Dynarr_delete_many (str->runoff, 0, retval);
2860   /* Do NOT return retval.  The return value indicates how much
2861      of the incoming data was written, not how many bytes were
2862      written. */
2863   return size;
2864 }
2865
2866 static void
2867 reset_encoding_stream (struct encoding_stream *str)
2868 {
2869 #ifdef MULE
2870   switch (CODING_SYSTEM_TYPE (str->codesys))
2871     {
2872     case CODESYS_ISO2022:
2873       {
2874         int i;
2875
2876         str->encode_char = &char_encode_iso2022;
2877         str->finish = &char_finish_iso2022;
2878         for (i = 0; i < 4; i++)
2879           {
2880             str->iso2022.charset[i] =
2881               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2882             str->iso2022.force_charset_on_output[i] =
2883               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2884           }
2885         str->iso2022.register_left = 0;
2886         str->iso2022.register_right = 1;
2887         str->iso2022.current_charset = Qnil;
2888         str->iso2022.current_half = 0;
2889         break;
2890       }
2891     case CODESYS_CCL:
2892       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2893       break;
2894     case CODESYS_UTF8:
2895       str->encode_char = &char_encode_utf8;
2896       str->finish = &char_finish_utf8;
2897       break;
2898     case CODESYS_UCS4:
2899       str->encode_char = &char_encode_ucs4;
2900       str->finish = &char_finish_ucs4;
2901       break;
2902     case CODESYS_SHIFT_JIS:
2903       str->encode_char = &char_encode_shift_jis;
2904       str->finish = &char_finish_shift_jis;
2905       break;
2906     case CODESYS_BIG5:
2907       str->encode_char = &char_encode_big5;
2908       str->finish = &char_finish_big5;
2909       break;
2910     default:
2911       break;
2912     }
2913 #endif /* MULE */
2914   str->iso2022.current_char_boundary = 0;
2915   str->flags = str->ch = 0;
2916 }
2917
2918 static int
2919 encoding_rewinder (Lstream *stream)
2920 {
2921   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2922   reset_encoding_stream (str);
2923   Dynarr_reset (str->runoff);
2924   return Lstream_rewind (str->other_end);
2925 }
2926
2927 static int
2928 encoding_seekable_p (Lstream *stream)
2929 {
2930   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2931   return Lstream_seekable_p (str->other_end);
2932 }
2933
2934 static int
2935 encoding_flusher (Lstream *stream)
2936 {
2937   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2938   return Lstream_flush (str->other_end);
2939 }
2940
2941 static int
2942 encoding_closer (Lstream *stream)
2943 {
2944   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2945   if (stream->flags & LSTREAM_FL_WRITE)
2946     {
2947       str->flags |= CODING_STATE_END;
2948       encoding_writer (stream, 0, 0);
2949     }
2950   Dynarr_free (str->runoff);
2951   return Lstream_close (str->other_end);
2952 }
2953
2954 Lisp_Object
2955 encoding_stream_coding_system (Lstream *stream)
2956 {
2957   Lisp_Object coding_system;
2958   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2959
2960   XSETCODING_SYSTEM (coding_system, str->codesys);
2961   return coding_system;
2962 }
2963
2964 void
2965 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2966 {
2967   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2968   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2969   str->codesys = cs;
2970   reset_encoding_stream (str);
2971 }
2972
2973 static Lisp_Object
2974 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2975                         const char *mode)
2976 {
2977   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2978   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2979   Lisp_Object obj;
2980
2981   xzero (*str);
2982   str->runoff = Dynarr_new (unsigned_char);
2983   str->other_end = stream;
2984   set_encoding_stream_coding_system (lstr, codesys);
2985   XSETLSTREAM (obj, lstr);
2986   return obj;
2987 }
2988
2989 Lisp_Object
2990 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2991 {
2992   return make_encoding_stream_1 (stream, codesys, "r");
2993 }
2994
2995 Lisp_Object
2996 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2997 {
2998   return make_encoding_stream_1 (stream, codesys, "w");
2999 }
3000
3001 /* Convert N bytes of internally-formatted data stored in SRC to an
3002    external format, according to the encoding stream ENCODING.
3003    Store the encoded data into DST. */
3004
3005 static void
3006 mule_encode (Lstream *encoding, const Bufbyte *src,
3007              unsigned_char_dynarr *dst, size_t n)
3008 {
3009   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3010
3011   switch (CODING_SYSTEM_TYPE (str->codesys))
3012     {
3013 #ifdef DEBUG_XEMACS
3014     case CODESYS_INTERNAL:
3015       Dynarr_add_many (dst, src, n);
3016       break;
3017 #endif
3018     case CODESYS_AUTODETECT:
3019       /* If we got this far and still haven't decided on the coding
3020          system, then do no conversion. */
3021     case CODESYS_NO_CONVERSION:
3022       encode_coding_no_conversion (encoding, src, dst, n);
3023       break;
3024 #ifdef MULE
3025     case CODESYS_CCL:
3026       str->ccl.last_block = str->flags & CODING_STATE_END;
3027       /* When applying ccl program to stream, MUST NOT set NULL
3028          pointer to src.  */
3029       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3030                   dst, n, 0, CCL_MODE_ENCODING);
3031       break;
3032 #endif /* MULE */
3033     default:
3034       text_encode_generic (encoding, src, dst, n);
3035     }
3036 }
3037
3038 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3039 Encode the text between START and END using CODING-SYSTEM.
3040 This will, for example, convert Japanese characters into stuff such as
3041 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3042 text.  BUFFER defaults to the current buffer if unspecified.
3043 */
3044        (start, end, coding_system, buffer))
3045 {
3046   Bufpos b, e;
3047   struct buffer *buf = decode_buffer (buffer, 0);
3048   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3049   Lstream *istr, *ostr;
3050   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3051
3052   get_buffer_range_char (buf, start, end, &b, &e, 0);
3053
3054   barf_if_buffer_read_only (buf, b, e);
3055
3056   coding_system = Fget_coding_system (coding_system);
3057   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3058   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3059   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3060                                               Fget_coding_system (Qbinary));
3061   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3062                                            coding_system);
3063   istr = XLSTREAM (instream);
3064   ostr = XLSTREAM (outstream);
3065   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3066   /* The chain of streams looks like this:
3067
3068      [BUFFER] <----- send through
3069                      ------> [ENCODE AS SPECIFIED]
3070                              ------> [DECODE AS BINARY]
3071                                      ------> [BUFFER]
3072    */
3073   while (1)
3074     {
3075       char tempbuf[1024]; /* some random amount */
3076       Bufpos newpos, even_newer_pos;
3077       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3078       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3079
3080       if (!size_in_bytes)
3081         break;
3082       newpos = lisp_buffer_stream_startpos (istr);
3083       Lstream_write (ostr, tempbuf, size_in_bytes);
3084       even_newer_pos = lisp_buffer_stream_startpos (istr);
3085       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3086                            even_newer_pos, 0);
3087     }
3088
3089   {
3090     Charcount retlen =
3091       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3092     Lstream_close (istr);
3093     Lstream_close (ostr);
3094     UNGCPRO;
3095     Lstream_delete (istr);
3096     Lstream_delete (ostr);
3097     Lstream_delete (XLSTREAM (de_outstream));
3098     Lstream_delete (XLSTREAM (lb_outstream));
3099     return make_int (retlen);
3100   }
3101 }
3102
3103 #ifdef MULE
3104 \f
3105 static void
3106 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3107                      unsigned_char_dynarr *dst, size_t n)
3108 {
3109   unsigned char c;
3110   unsigned char char_boundary;
3111   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3112   unsigned int flags          = str->flags;
3113   Emchar ch                   = str->ch;
3114
3115   char_boundary = str->iso2022.current_char_boundary;
3116
3117   while (n--)
3118     {
3119       c = *src++;
3120
3121       if (char_boundary == 0)
3122         {
3123           if (c >= 0xfc)
3124             {
3125               ch = c & 0x01;
3126               char_boundary = 5;
3127             }
3128           else if (c >= 0xf8)
3129             {
3130               ch = c & 0x03;
3131               char_boundary = 4;
3132             }
3133           else if (c >= 0xf0)
3134             {
3135               ch = c & 0x07;
3136               char_boundary = 3;
3137             }
3138           else if (c >= 0xe0)
3139             {
3140               ch = c & 0x0f;
3141               char_boundary = 2;
3142             }
3143           else if (c >= 0xc0)
3144             {
3145               ch = c & 0x1f;
3146               char_boundary = 1;
3147             }
3148           else
3149             (*str->encode_char) (str, c, dst, &flags);
3150         }
3151       else if (char_boundary == 1)
3152         {
3153           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3154           ch =0;
3155           char_boundary = 0;
3156         }
3157       else
3158         {
3159           ch = (ch << 6) | (c & 0x3f);
3160           char_boundary--;
3161         }
3162     }
3163
3164   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3165     {
3166       (*str->finish) (str, dst, &flags);
3167     }
3168
3169   str->flags = flags;
3170   str->ch    = ch;
3171   str->iso2022.current_char_boundary = char_boundary;
3172 }
3173
3174 \f
3175 /************************************************************************/
3176 /*                          Shift-JIS methods                           */
3177 /************************************************************************/
3178
3179 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3180    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3181    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3182    encoded by "position-code + 0x80".  A character of JISX0208
3183    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3184    position-codes are divided and shifted so that it fit in the range
3185    below.
3186
3187    --- CODE RANGE of Shift-JIS ---
3188    (character set)      (range)
3189    ASCII                0x00 .. 0x7F
3190    JISX0201-Kana        0xA0 .. 0xDF
3191    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3192             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3193    -------------------------------
3194
3195 */
3196
3197 /* Is this the first byte of a Shift-JIS two-byte char? */
3198
3199 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3200   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3201
3202 /* Is this the second byte of a Shift-JIS two-byte char? */
3203
3204 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3205   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3206
3207 #define BYTE_SJIS_KATAKANA_P(c) \
3208   ((c) >= 0xA1 && (c) <= 0xDF)
3209
3210 static int
3211 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3212 {
3213   while (n--)
3214     {
3215       unsigned char c = *(unsigned char *)src++;
3216       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3217         return 0;
3218       if (st->shift_jis.in_second_byte)
3219         {
3220           st->shift_jis.in_second_byte = 0;
3221           if (c < 0x40)
3222             return 0;
3223         }
3224       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3225         st->shift_jis.in_second_byte = 1;
3226     }
3227   return CODING_CATEGORY_SHIFT_JIS_MASK;
3228 }
3229
3230 /* Convert Shift-JIS data to internal format. */
3231
3232 static void
3233 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3234                     unsigned_char_dynarr *dst, size_t n)
3235 {
3236   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3237   unsigned int flags  = str->flags;
3238   unsigned int cpos   = str->cpos;
3239   eol_type_t eol_type = str->eol_type;
3240
3241   while (n--)
3242     {
3243       unsigned char c = *(unsigned char *)src++;
3244
3245       if (cpos)
3246         {
3247           /* Previous character was first byte of Shift-JIS Kanji char. */
3248           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3249             {
3250               unsigned char e1, e2;
3251
3252               DECODE_SJIS (cpos, c, e1, e2);
3253 #ifdef UTF2000
3254               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3255                                             e1 & 0x7F,
3256                                             e2 & 0x7F), dst);
3257 #else
3258               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3259               Dynarr_add (dst, e1);
3260               Dynarr_add (dst, e2);
3261 #endif
3262             }
3263           else
3264             {
3265               DECODE_ADD_BINARY_CHAR (cpos, dst);
3266               DECODE_ADD_BINARY_CHAR (c, dst);
3267             }
3268           cpos = 0;
3269         }
3270       else
3271         {
3272           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3273           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3274             cpos = c;
3275           else if (BYTE_SJIS_KATAKANA_P (c))
3276             {
3277 #ifdef UTF2000
3278               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3279                                             c & 0x7F, 0), dst);
3280 #else
3281               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3282               Dynarr_add (dst, c);
3283 #endif
3284             }
3285 #ifdef UTF2000
3286           else if (c > 32)
3287             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3288                                           c, 0), dst);
3289 #endif
3290           else
3291             DECODE_ADD_BINARY_CHAR (c, dst);
3292         }
3293     label_continue_loop:;
3294     }
3295
3296   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3297
3298   str->flags = flags;
3299   str->cpos  = cpos;
3300 }
3301
3302 /* Convert internal character representation to Shift_JIS. */
3303
3304 void
3305 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3306                        unsigned_char_dynarr *dst, unsigned int *flags)
3307 {
3308   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3309
3310   if (ch == '\n')
3311     {
3312       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3313         Dynarr_add (dst, '\r');
3314       if (eol_type != EOL_CR)
3315         Dynarr_add (dst, ch);
3316     }
3317   else
3318     {
3319       unsigned int s1, s2;
3320 #ifdef UTF2000
3321       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3322
3323       if (code_point >= 0)
3324         Dynarr_add (dst, code_point);
3325       else if ((code_point
3326                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3327                >= 0)
3328         {
3329           ENCODE_SJIS ((code_point >> 8) | 0x80,
3330                        (code_point & 0xFF) | 0x80, s1, s2);
3331           Dynarr_add (dst, s1);
3332           Dynarr_add (dst, s2);
3333         }
3334       else if ((code_point
3335                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3336                >= 0)
3337         Dynarr_add (dst, code_point | 0x80);
3338       else if ((code_point
3339                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3340                >= 0)
3341         {
3342           ENCODE_SJIS ((code_point >> 8) | 0x80,
3343                        (code_point & 0xFF) | 0x80, s1, s2);
3344           Dynarr_add (dst, s1);
3345           Dynarr_add (dst, s2);
3346         }
3347       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3348                >= 0)
3349         Dynarr_add (dst, code_point);
3350       else
3351         Dynarr_add (dst, '?');
3352 #else
3353       Lisp_Object charset;
3354       unsigned int c1, c2;
3355
3356       BREAKUP_CHAR (ch, charset, c1, c2);
3357
3358       if (EQ(charset, Vcharset_katakana_jisx0201))
3359         {
3360           Dynarr_add (dst, c1 | 0x80);
3361         }
3362       else if (c2 == 0)
3363         {
3364           Dynarr_add (dst, c1);
3365         }
3366       else if (EQ(charset, Vcharset_japanese_jisx0208))
3367         {
3368           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3369           Dynarr_add (dst, s1);
3370           Dynarr_add (dst, s2);
3371         }
3372       else
3373         Dynarr_add (dst, '?');
3374 #endif
3375     }
3376 }
3377
3378 void
3379 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3380                        unsigned int *flags)
3381 {
3382 }
3383
3384 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3385 Decode a JISX0208 character of Shift-JIS coding-system.
3386 CODE is the character code in Shift-JIS as a cons of type bytes.
3387 Return the corresponding character.
3388 */
3389        (code))
3390 {
3391   unsigned char c1, c2, s1, s2;
3392
3393   CHECK_CONS (code);
3394   CHECK_INT (XCAR (code));
3395   CHECK_INT (XCDR (code));
3396   s1 = XINT (XCAR (code));
3397   s2 = XINT (XCDR (code));
3398   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3399       BYTE_SJIS_TWO_BYTE_2_P (s2))
3400     {
3401       DECODE_SJIS (s1, s2, c1, c2);
3402       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3403                                    c1 & 0x7F, c2 & 0x7F));
3404     }
3405   else
3406     return Qnil;
3407 }
3408
3409 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3410 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3411 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3412 */
3413        (character))
3414 {
3415   Lisp_Object charset;
3416   int c1, c2, s1, s2;
3417
3418   CHECK_CHAR_COERCE_INT (character);
3419   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3420   if (EQ (charset, Vcharset_japanese_jisx0208))
3421     {
3422       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3423       return Fcons (make_int (s1), make_int (s2));
3424     }
3425   else
3426     return Qnil;
3427 }
3428
3429 \f
3430 /************************************************************************/
3431 /*                            Big5 methods                              */
3432 /************************************************************************/
3433
3434 /* BIG5 is a coding system encoding two character sets: ASCII and
3435    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3436    character set and is encoded in two-byte.
3437
3438    --- CODE RANGE of BIG5 ---
3439    (character set)      (range)
3440    ASCII                0x00 .. 0x7F
3441    Big5 (1st byte)      0xA1 .. 0xFE
3442         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3443    --------------------------
3444
3445    Since the number of characters in Big5 is larger than maximum
3446    characters in Emacs' charset (96x96), it can't be handled as one
3447    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3448    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3449    contains frequently used characters and the latter contains less
3450    frequently used characters.  */
3451
3452 #ifdef UTF2000
3453 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3454   ((c) >= 0x81 && (c) <= 0xFE)
3455 #else
3456 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3457   ((c) >= 0xA1 && (c) <= 0xFE)
3458 #endif
3459
3460 /* Is this the second byte of a Shift-JIS two-byte char? */
3461
3462 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3463   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3464
3465 /* Number of Big5 characters which have the same code in 1st byte.  */
3466
3467 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3468
3469 /* Code conversion macros.  These are macros because they are used in
3470    inner loops during code conversion.
3471
3472    Note that temporary variables in macros introduce the classic
3473    dynamic-scoping problems with variable names.  We use capital-
3474    lettered variables in the assumption that XEmacs does not use
3475    capital letters in variables except in a very formalized way
3476    (e.g. Qstring). */
3477
3478 /* Convert Big5 code (b1, b2) into its internal string representation
3479    (lb, c1, c2). */
3480
3481 /* There is a much simpler way to split the Big5 charset into two.
3482    For the moment I'm going to leave the algorithm as-is because it
3483    claims to separate out the most-used characters into a single
3484    charset, which perhaps will lead to optimizations in various
3485    places.
3486
3487    The way the algorithm works is something like this:
3488
3489    Big5 can be viewed as a 94x157 charset, where the row is
3490    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3491    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3492    the split between low and high column numbers is apparently
3493    meaningless; ascending rows produce less and less frequent chars.
3494    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3495    the first charset, and the upper half (0xC9 .. 0xFE) to the
3496    second.  To do the conversion, we convert the character into
3497    a single number where 0 .. 156 is the first row, 157 .. 313
3498    is the second, etc.  That way, the characters are ordered by
3499    decreasing frequency.  Then we just chop the space in two
3500    and coerce the result into a 94x94 space.
3501    */
3502
3503 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3504 {                                                                       \
3505   int B1 = b1, B2 = b2;                                                 \
3506   unsigned int I                                                        \
3507     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3508                                                                         \
3509   if (B1 < 0xC9)                                                        \
3510     {                                                                   \
3511       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3512     }                                                                   \
3513   else                                                                  \
3514     {                                                                   \
3515       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3516       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3517     }                                                                   \
3518   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3519   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3520 } while (0)
3521
3522 /* Convert the internal string representation of a Big5 character
3523    (lb, c1, c2) into Big5 code (b1, b2). */
3524
3525 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3526 {                                                                       \
3527   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3528                                                                         \
3529   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3530     {                                                                   \
3531       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3532     }                                                                   \
3533   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3534   b2 = I % BIG5_SAME_ROW;                                               \
3535   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3536 } while (0)
3537
3538 static int
3539 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3540 {
3541   while (n--)
3542     {
3543       unsigned char c = *(unsigned char *)src++;
3544       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3545 #ifndef UTF2000
3546           || (c >= 0x80 && c <= 0xA0)
3547 #endif
3548           )
3549         return 0;
3550       if (st->big5.in_second_byte)
3551         {
3552           st->big5.in_second_byte = 0;
3553           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3554             return 0;
3555         }
3556       else if (
3557 #ifdef UTF2000
3558                c >= 0x81
3559 #else
3560                c >= 0xA1
3561 #endif
3562                )
3563         st->big5.in_second_byte = 1;
3564     }
3565   return CODING_CATEGORY_BIG5_MASK;
3566 }
3567
3568 /* Convert Big5 data to internal format. */
3569
3570 static void
3571 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3572                     unsigned_char_dynarr *dst, size_t n)
3573 {
3574   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3575   unsigned int flags  = str->flags;
3576   unsigned int cpos   = str->cpos;
3577   eol_type_t eol_type = str->eol_type;
3578
3579   while (n--)
3580     {
3581       unsigned char c = *(unsigned char *)src++;
3582       if (cpos)
3583         {
3584           /* Previous character was first byte of Big5 char. */
3585           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3586             {
3587 #ifdef UTF2000
3588               DECODE_ADD_UCS_CHAR
3589                 (DECODE_CHAR (Vcharset_chinese_big5, (cpos << 8) | c),
3590                  dst);
3591 #else
3592               unsigned char b1, b2, b3;
3593               DECODE_BIG5 (cpos, c, b1, b2, b3);
3594               Dynarr_add (dst, b1);
3595               Dynarr_add (dst, b2);
3596               Dynarr_add (dst, b3);
3597 #endif
3598             }
3599           else
3600             {
3601               DECODE_ADD_BINARY_CHAR (cpos, dst);
3602               DECODE_ADD_BINARY_CHAR (c, dst);
3603             }
3604           cpos = 0;
3605         }
3606       else
3607         {
3608           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3609           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3610             cpos = c;
3611           else
3612             DECODE_ADD_BINARY_CHAR (c, dst);
3613         }
3614     label_continue_loop:;
3615     }
3616
3617   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3618
3619   str->flags = flags;
3620   str->cpos  = cpos;
3621 }
3622
3623 /* Convert internally-formatted data to Big5. */
3624
3625 void
3626 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3627                   unsigned_char_dynarr *dst, unsigned int *flags)
3628 {
3629   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3630
3631   if (ch == '\n')
3632     {
3633       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3634         Dynarr_add (dst, '\r');
3635       if (eol_type != EOL_CR)
3636         Dynarr_add (dst, ch);
3637     }
3638   else
3639     {
3640 #ifdef UTF2000
3641       int code_point;
3642
3643       if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3644         Dynarr_add (dst, code_point);
3645       else if ((code_point
3646                 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3647         {
3648           Dynarr_add (dst, code_point >> 8);
3649           Dynarr_add (dst, code_point & 0xFF);
3650         }
3651       else if ((code_point
3652                 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3653         {
3654           unsigned int I
3655             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3656             + ((code_point & 0xFF) - 33);
3657           unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3658           unsigned char b2 = I % BIG5_SAME_ROW;
3659
3660           b2 += b2 < 0x3F ? 0x40 : 0x62;
3661           Dynarr_add (dst, b1);
3662           Dynarr_add (dst, b2);
3663         }
3664       else if ((code_point
3665                 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3666         {
3667           unsigned int I
3668             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3669             + ((code_point & 0xFF) - 33);
3670           unsigned char b1, b2;
3671
3672           I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3673           b1 = I / BIG5_SAME_ROW + 0xA1;
3674           b2 = I % BIG5_SAME_ROW;
3675           b2 += b2 < 0x3F ? 0x40 : 0x62;
3676           Dynarr_add (dst, b1);
3677           Dynarr_add (dst, b2);
3678         }
3679       else
3680         Dynarr_add (dst, '?');
3681 #else
3682 #endif
3683     }
3684 }
3685
3686 void
3687 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3688                   unsigned int *flags)
3689 {
3690 }
3691
3692
3693 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3694 Decode a Big5 character CODE of BIG5 coding-system.
3695 CODE is the character code in BIG5, a cons of two integers.
3696 Return the corresponding character.
3697 */
3698        (code))
3699 {
3700   unsigned char c1, c2, b1, b2;
3701
3702   CHECK_CONS (code);
3703   CHECK_INT (XCAR (code));
3704   CHECK_INT (XCDR (code));
3705   b1 = XINT (XCAR (code));
3706   b2 = XINT (XCDR (code));
3707   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3708       BYTE_BIG5_TWO_BYTE_2_P (b2))
3709     {
3710       Charset_ID leading_byte;
3711       Lisp_Object charset;
3712       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3713       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3714       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3715     }
3716   else
3717     return Qnil;
3718 }
3719
3720 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3721 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3722 Return the corresponding character code in Big5.
3723 */
3724        (character))
3725 {
3726   Lisp_Object charset;
3727   int c1, c2, b1, b2;
3728
3729   CHECK_CHAR_COERCE_INT (character);
3730   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3731   if (EQ (charset, Vcharset_chinese_big5_1) ||
3732       EQ (charset, Vcharset_chinese_big5_2))
3733     {
3734       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3735                    b1, b2);
3736       return Fcons (make_int (b1), make_int (b2));
3737     }
3738   else
3739     return Qnil;
3740 }
3741
3742 \f
3743 /************************************************************************/
3744 /*                           UCS-4 methods                              */
3745 /************************************************************************/
3746
3747 static int
3748 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3749 {
3750   while (n--)
3751     {
3752       unsigned char c = *(unsigned char *)src++;
3753       switch (st->ucs4.in_byte)
3754         {
3755         case 0:
3756           if (c >= 128)
3757             return 0;
3758           else
3759             st->ucs4.in_byte++;
3760           break;
3761         case 3:
3762           st->ucs4.in_byte = 0;
3763           break;
3764         default:
3765           st->ucs4.in_byte++;
3766         }
3767     }
3768   return CODING_CATEGORY_UCS4_MASK;
3769 }
3770
3771 static void
3772 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3773                     unsigned_char_dynarr *dst, size_t n)
3774 {
3775   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3776   unsigned int flags = str->flags;
3777   unsigned int cpos  = str->cpos;
3778   unsigned char counter = str->counter;
3779
3780   while (n--)
3781     {
3782       unsigned char c = *(unsigned char *)src++;
3783       switch (counter)
3784         {
3785         case 0:
3786           cpos = c;
3787           counter = 3;
3788           break;
3789         case 1:
3790           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
3791           cpos = 0;
3792           counter = 0;
3793           break;
3794         default:
3795           cpos = ( cpos << 8 ) | c;
3796           counter--;
3797         }
3798     }
3799   if (counter & CODING_STATE_END)
3800     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3801
3802   str->flags    = flags;
3803   str->cpos     = cpos;
3804   str->counter  = counter;
3805 }
3806
3807 void
3808 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
3809                   unsigned_char_dynarr *dst, unsigned int *flags)
3810 {
3811   Dynarr_add (dst, ch >> 24);
3812   Dynarr_add (dst, ch >> 16);
3813   Dynarr_add (dst, ch >>  8);
3814   Dynarr_add (dst, ch      );
3815 }
3816
3817 void
3818 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3819                   unsigned int *flags)
3820 {
3821 }
3822
3823 \f
3824 /************************************************************************/
3825 /*                           UTF-8 methods                              */
3826 /************************************************************************/
3827
3828 static int
3829 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3830 {
3831   while (n--)
3832     {
3833       unsigned char c = *(unsigned char *)src++;
3834       switch (st->utf8.in_byte)
3835         {
3836         case 0:
3837           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3838             return 0;
3839           else if (c >= 0xfc)
3840             st->utf8.in_byte = 5;
3841           else if (c >= 0xf8)
3842             st->utf8.in_byte = 4;
3843           else if (c >= 0xf0)
3844             st->utf8.in_byte = 3;
3845           else if (c >= 0xe0)
3846             st->utf8.in_byte = 2;
3847           else if (c >= 0xc0)
3848             st->utf8.in_byte = 1;
3849           else if (c >= 0x80)
3850             return 0;
3851           break;
3852         default:
3853           if ((c & 0xc0) != 0x80)
3854             return 0;
3855           else
3856             st->utf8.in_byte--;
3857         }
3858     }
3859   return CODING_CATEGORY_UTF8_MASK;
3860 }
3861
3862 static void
3863 decode_output_utf8_partial_char (unsigned char counter,
3864                                  unsigned int cpos,
3865                                  unsigned_char_dynarr *dst)
3866 {
3867   if (counter == 5)
3868     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
3869   else if (counter == 4)
3870     {
3871       if (cpos < (1 << 6))
3872         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
3873       else
3874         {
3875           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
3876           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3877         }
3878     }
3879   else if (counter == 3)
3880     {
3881       if (cpos < (1 << 6))
3882         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
3883       else if (cpos < (1 << 12))
3884         {
3885           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
3886           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3887         }
3888       else
3889         {
3890           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
3891           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3892           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3893         }
3894     }
3895   else if (counter == 2)
3896     {
3897       if (cpos < (1 << 6))
3898         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
3899       else if (cpos < (1 << 12))
3900         {
3901           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
3902           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3903         }
3904       else if (cpos < (1 << 18))
3905         {
3906           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
3907           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
3908           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
3909         }
3910       else
3911         {
3912           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
3913           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3914           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3915           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3916         }
3917     }
3918   else
3919     {
3920       if (cpos < (1 << 6))
3921         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
3922       else if (cpos < (1 << 12))
3923         {
3924           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
3925           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
3926         }
3927       else if (cpos < (1 << 18))
3928         {
3929           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
3930           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3931           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3932         }
3933       else if (cpos < (1 << 24))
3934         {
3935           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
3936           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3937           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3938           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3939         }
3940       else
3941         {
3942           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
3943           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
3944           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
3945           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
3946           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
3947         }
3948     }
3949 }
3950
3951 static void
3952 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3953                     unsigned_char_dynarr *dst, size_t n)
3954 {
3955   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3956   unsigned int flags    = str->flags;
3957   unsigned int cpos     = str->cpos;
3958   eol_type_t eol_type   = str->eol_type;
3959   unsigned char counter = str->counter;
3960
3961   while (n--)
3962     {
3963       unsigned char c = *(unsigned char *)src++;
3964       if (counter == 0)
3965         {
3966           if ( c < 0xC0 )
3967             {
3968               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3969               DECODE_ADD_UCS_CHAR (c, dst);
3970             }
3971           else if ( c < 0xE0 )
3972             {
3973               cpos = c & 0x1f;
3974               counter = 1;
3975             }
3976           else if ( c < 0xF0 )
3977             {
3978               cpos = c & 0x0f;
3979               counter = 2;
3980             }
3981           else if ( c < 0xF8 )
3982             {
3983               cpos = c & 0x07;
3984               counter = 3;
3985             }
3986           else if ( c < 0xFC )
3987             {
3988               cpos = c & 0x03;
3989               counter = 4;
3990             }
3991           else
3992             {
3993               cpos = c & 0x01;
3994               counter = 5;
3995             }
3996         }
3997       else if ( (c & 0xC0) == 0x80 )
3998         {
3999           cpos = ( cpos << 6 ) | ( c & 0x3f );
4000           if (counter == 1)
4001             {
4002               DECODE_ADD_UCS_CHAR (cpos, dst);
4003               cpos = 0;
4004               counter = 0;
4005             }
4006           else
4007             counter--;
4008         }
4009       else
4010         {
4011           decode_output_utf8_partial_char (counter, cpos, dst);
4012           DECODE_ADD_BINARY_CHAR (c, dst);
4013           cpos = 0;
4014           counter = 0;
4015         }
4016     label_continue_loop:;
4017     }
4018
4019   if (flags & CODING_STATE_END)
4020     if (counter > 0)
4021       {
4022         decode_output_utf8_partial_char (counter, cpos, dst);
4023         cpos = 0;
4024         counter = 0;
4025       }
4026   str->flags    = flags;
4027   str->cpos     = cpos;
4028   str->counter  = counter;
4029 }
4030
4031 void
4032 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4033                   unsigned_char_dynarr *dst, unsigned int *flags)
4034 {
4035   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4036
4037   if (ch == '\n')
4038     {
4039       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4040         Dynarr_add (dst, '\r');
4041       if (eol_type != EOL_CR)
4042         Dynarr_add (dst, ch);
4043     }
4044   else if (ch <= 0x7f)
4045     {
4046       Dynarr_add (dst, ch);
4047     }
4048   else if (ch <= 0x7ff)
4049     {
4050       Dynarr_add (dst, (ch >> 6) | 0xc0);
4051       Dynarr_add (dst, (ch & 0x3f) | 0x80);
4052     }
4053   else if (ch <= 0xffff)
4054     {
4055       Dynarr_add (dst,  (ch >> 12) | 0xe0);
4056       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4057       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4058     }
4059   else if (ch <= 0x1fffff)
4060     {
4061       Dynarr_add (dst,  (ch >> 18) | 0xf0);
4062       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4063       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4064       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4065     }
4066   else if (ch <= 0x3ffffff)
4067     {
4068       Dynarr_add (dst,  (ch >> 24) | 0xf8);
4069       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4070       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4071       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4072       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4073     }
4074   else
4075     {
4076       Dynarr_add (dst,  (ch >> 30) | 0xfc);
4077       Dynarr_add (dst, ((ch >> 24) & 0x3f) | 0x80);
4078       Dynarr_add (dst, ((ch >> 18) & 0x3f) | 0x80);
4079       Dynarr_add (dst, ((ch >> 12) & 0x3f) | 0x80);
4080       Dynarr_add (dst, ((ch >>  6) & 0x3f) | 0x80);
4081       Dynarr_add (dst,  (ch        & 0x3f) | 0x80);
4082     }
4083 }
4084
4085 void
4086 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4087                   unsigned int *flags)
4088 {
4089 }
4090
4091 \f
4092 /************************************************************************/
4093 /*                           ISO2022 methods                            */
4094 /************************************************************************/
4095
4096 /* The following note describes the coding system ISO2022 briefly.
4097    Since the intention of this note is to help understand the
4098    functions in this file, some parts are NOT ACCURATE or OVERLY
4099    SIMPLIFIED.  For thorough understanding, please refer to the
4100    original document of ISO2022.
4101
4102    ISO2022 provides many mechanisms to encode several character sets
4103    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4104    is encoded using bytes less than 128.  This may make the encoded
4105    text a little bit longer, but the text passes more easily through
4106    several gateways, some of which strip off MSB (Most Signigant Bit).
4107
4108    There are two kinds of character sets: control character set and
4109    graphic character set.  The former contains control characters such
4110    as `newline' and `escape' to provide control functions (control
4111    functions are also provided by escape sequences).  The latter
4112    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4113    two control character sets and many graphic character sets.
4114
4115    Graphic character sets are classified into one of the following
4116    four classes, according to the number of bytes (DIMENSION) and
4117    number of characters in one dimension (CHARS) of the set:
4118    - DIMENSION1_CHARS94
4119    - DIMENSION1_CHARS96
4120    - DIMENSION2_CHARS94
4121    - DIMENSION2_CHARS96
4122
4123    In addition, each character set is assigned an identification tag,
4124    unique for each set, called "final character" (denoted as <F>
4125    hereafter).  The <F> of each character set is decided by ECMA(*)
4126    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4127    (0x30..0x3F are for private use only).
4128
4129    Note (*): ECMA = European Computer Manufacturers Association
4130
4131    Here are examples of graphic character set [NAME(<F>)]:
4132         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4133         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4134         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4135         o DIMENSION2_CHARS96 -- none for the moment
4136
4137    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4138         C0 [0x00..0x1F] -- control character plane 0
4139         GL [0x20..0x7F] -- graphic character plane 0
4140         C1 [0x80..0x9F] -- control character plane 1
4141         GR [0xA0..0xFF] -- graphic character plane 1
4142
4143    A control character set is directly designated and invoked to C0 or
4144    C1 by an escape sequence.  The most common case is that:
4145    - ISO646's  control character set is designated/invoked to C0, and
4146    - ISO6429's control character set is designated/invoked to C1,
4147    and usually these designations/invocations are omitted in encoded
4148    text.  In a 7-bit environment, only C0 can be used, and a control
4149    character for C1 is encoded by an appropriate escape sequence to
4150    fit into the environment.  All control characters for C1 are
4151    defined to have corresponding escape sequences.
4152
4153    A graphic character set is at first designated to one of four
4154    graphic registers (G0 through G3), then these graphic registers are
4155    invoked to GL or GR.  These designations and invocations can be
4156    done independently.  The most common case is that G0 is invoked to
4157    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4158    these invocations and designations are omitted in encoded text.
4159    In a 7-bit environment, only GL can be used.
4160
4161    When a graphic character set of CHARS94 is invoked to GL, codes
4162    0x20 and 0x7F of the GL area work as control characters SPACE and
4163    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4164    be used.
4165
4166    There are two ways of invocation: locking-shift and single-shift.
4167    With locking-shift, the invocation lasts until the next different
4168    invocation, whereas with single-shift, the invocation affects the
4169    following character only and doesn't affect the locking-shift
4170    state.  Invocations are done by the following control characters or
4171    escape sequences:
4172
4173    ----------------------------------------------------------------------
4174    abbrev  function                  cntrl escape seq   description
4175    ----------------------------------------------------------------------
4176    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4177    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4178    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4179    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4180    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4181    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4182    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4183    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4184    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4185    ----------------------------------------------------------------------
4186    (*) These are not used by any known coding system.
4187
4188    Control characters for these functions are defined by macros
4189    ISO_CODE_XXX in `coding.h'.
4190
4191    Designations are done by the following escape sequences:
4192    ----------------------------------------------------------------------
4193    escape sequence      description
4194    ----------------------------------------------------------------------
4195    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4196    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4197    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4198    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4199    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4200    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4201    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4202    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4203    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4204    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4205    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4206    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4207    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4208    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4209    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4210    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4211    ----------------------------------------------------------------------
4212
4213    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4214    of dimension 1, chars 94, and final character <F>, etc...
4215
4216    Note (*): Although these designations are not allowed in ISO2022,
4217    Emacs accepts them on decoding, and produces them on encoding
4218    CHARS96 character sets in a coding system which is characterized as
4219    7-bit environment, non-locking-shift, and non-single-shift.
4220
4221    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4222    '(' can be omitted.  We refer to this as "short-form" hereafter.
4223
4224    Now you may notice that there are a lot of ways for encoding the
4225    same multilingual text in ISO2022.  Actually, there exist many
4226    coding systems such as Compound Text (used in X11's inter client
4227    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4228    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4229    localized platforms), and all of these are variants of ISO2022.
4230
4231    In addition to the above, Emacs handles two more kinds of escape
4232    sequences: ISO6429's direction specification and Emacs' private
4233    sequence for specifying character composition.
4234
4235    ISO6429's direction specification takes the following form:
4236         o CSI ']'      -- end of the current direction
4237         o CSI '0' ']'  -- end of the current direction
4238         o CSI '1' ']'  -- start of left-to-right text
4239         o CSI '2' ']'  -- start of right-to-left text
4240    The control character CSI (0x9B: control sequence introducer) is
4241    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4242
4243    Character composition specification takes the following form:
4244         o ESC '0' -- start character composition
4245         o ESC '1' -- end character composition
4246    Since these are not standard escape sequences of any ISO standard,
4247    their use with these meanings is restricted to Emacs only.  */
4248
4249 static void
4250 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4251 {
4252   int i;
4253
4254   for (i = 0; i < 4; i++)
4255     {
4256       if (!NILP (coding_system))
4257         iso->charset[i] =
4258           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4259       else
4260         iso->charset[i] = Qt;
4261       iso->invalid_designated[i] = 0;
4262     }
4263   iso->esc = ISO_ESC_NOTHING;
4264   iso->esc_bytes_index = 0;
4265   iso->register_left = 0;
4266   iso->register_right = 1;
4267   iso->switched_dir_and_no_valid_charset_yet = 0;
4268   iso->invalid_switch_dir = 0;
4269   iso->output_direction_sequence = 0;
4270   iso->output_literally = 0;
4271 #ifdef ENABLE_COMPOSITE_CHARS
4272   if (iso->composite_chars)
4273     Dynarr_reset (iso->composite_chars);
4274 #endif
4275 }
4276
4277 static int
4278 fit_to_be_escape_quoted (unsigned char c)
4279 {
4280   switch (c)
4281     {
4282     case ISO_CODE_ESC:
4283     case ISO_CODE_CSI:
4284     case ISO_CODE_SS2:
4285     case ISO_CODE_SS3:
4286     case ISO_CODE_SO:
4287     case ISO_CODE_SI:
4288       return 1;
4289
4290     default:
4291       return 0;
4292     }
4293 }
4294
4295 /* Parse one byte of an ISO2022 escape sequence.
4296    If the result is an invalid escape sequence, return 0 and
4297    do not change anything in STR.  Otherwise, if the result is
4298    an incomplete escape sequence, update ISO2022.ESC and
4299    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4300    all the state variables (but not ISO2022.ESC_BYTES) and
4301    return 1.
4302
4303    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4304    or invocation of an invalid character set and treat that as
4305    an unrecognized escape sequence. */
4306
4307 static int
4308 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4309                    unsigned char c, unsigned int *flags,
4310                    int check_invalid_charsets)
4311 {
4312   /* (1) If we're at the end of a designation sequence, CS is the
4313      charset being designated and REG is the register to designate
4314      it to.
4315
4316      (2) If we're at the end of a locking-shift sequence, REG is
4317      the register to invoke and HALF (0 == left, 1 == right) is
4318      the half to invoke it into.
4319
4320      (3) If we're at the end of a single-shift sequence, REG is
4321      the register to invoke. */
4322   Lisp_Object cs = Qnil;
4323   int reg, half;
4324
4325   /* NOTE: This code does goto's all over the fucking place.
4326      The reason for this is that we're basically implementing
4327      a state machine here, and hierarchical languages like C
4328      don't really provide a clean way of doing this. */
4329
4330   if (! (*flags & CODING_STATE_ESCAPE))
4331     /* At beginning of escape sequence; we need to reset our
4332        escape-state variables. */
4333     iso->esc = ISO_ESC_NOTHING;
4334
4335   iso->output_literally = 0;
4336   iso->output_direction_sequence = 0;
4337
4338   switch (iso->esc)
4339     {
4340     case ISO_ESC_NOTHING:
4341       iso->esc_bytes_index = 0;
4342       switch (c)
4343         {
4344         case ISO_CODE_ESC:      /* Start escape sequence */
4345           *flags |= CODING_STATE_ESCAPE;
4346           iso->esc = ISO_ESC;
4347           goto not_done;
4348
4349         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4350           *flags |= CODING_STATE_ESCAPE;
4351           iso->esc = ISO_ESC_5_11;
4352           goto not_done;
4353
4354         case ISO_CODE_SO:       /* locking shift 1 */
4355           reg = 1; half = 0;
4356           goto locking_shift;
4357         case ISO_CODE_SI:       /* locking shift 0 */
4358           reg = 0; half = 0;
4359           goto locking_shift;
4360
4361         case ISO_CODE_SS2:      /* single shift */
4362           reg = 2;
4363           goto single_shift;
4364         case ISO_CODE_SS3:      /* single shift */
4365           reg = 3;
4366           goto single_shift;
4367
4368         default:                        /* Other control characters */
4369           return 0;
4370         }
4371
4372     case ISO_ESC:
4373       switch (c)
4374         {
4375           /**** single shift ****/
4376
4377         case 'N':       /* single shift 2 */
4378           reg = 2;
4379           goto single_shift;
4380         case 'O':       /* single shift 3 */
4381           reg = 3;
4382           goto single_shift;
4383
4384           /**** locking shift ****/
4385
4386         case '~':       /* locking shift 1 right */
4387           reg = 1; half = 1;
4388           goto locking_shift;
4389         case 'n':       /* locking shift 2 */
4390           reg = 2; half = 0;
4391           goto locking_shift;
4392         case '}':       /* locking shift 2 right */
4393           reg = 2; half = 1;
4394           goto locking_shift;
4395         case 'o':       /* locking shift 3 */
4396           reg = 3; half = 0;
4397           goto locking_shift;
4398         case '|':       /* locking shift 3 right */
4399           reg = 3; half = 1;
4400           goto locking_shift;
4401
4402 #ifdef ENABLE_COMPOSITE_CHARS
4403           /**** composite ****/
4404
4405         case '0':
4406           iso->esc = ISO_ESC_START_COMPOSITE;
4407           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4408             CODING_STATE_COMPOSITE;
4409           return 1;
4410
4411         case '1':
4412           iso->esc = ISO_ESC_END_COMPOSITE;
4413           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4414             ~CODING_STATE_COMPOSITE;
4415           return 1;
4416 #endif /* ENABLE_COMPOSITE_CHARS */
4417
4418           /**** directionality ****/
4419
4420         case '[':
4421           iso->esc = ISO_ESC_5_11;
4422           goto not_done;
4423
4424           /**** designation ****/
4425
4426         case '$':       /* multibyte charset prefix */
4427           iso->esc = ISO_ESC_2_4;
4428           goto not_done;
4429
4430         default:
4431           if (0x28 <= c && c <= 0x2F)
4432             {
4433               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4434               goto not_done;
4435             }
4436
4437           /* This function is called with CODESYS equal to nil when
4438              doing coding-system detection. */
4439           if (!NILP (codesys)
4440               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4441               && fit_to_be_escape_quoted (c))
4442             {
4443               iso->esc = ISO_ESC_LITERAL;
4444               *flags &= CODING_STATE_ISO2022_LOCK;
4445               return 1;
4446             }
4447
4448           /* bzzzt! */
4449           return 0;
4450         }
4451
4452
4453
4454       /**** directionality ****/
4455
4456     case ISO_ESC_5_11:          /* ISO6429 direction control */
4457       if (c == ']')
4458         {
4459           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4460           goto directionality;
4461         }
4462       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4463       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4464       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4465       else               return 0;
4466       goto not_done;
4467
4468     case ISO_ESC_5_11_0:
4469       if (c == ']')
4470         {
4471           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4472           goto directionality;
4473         }
4474       return 0;
4475
4476     case ISO_ESC_5_11_1:
4477       if (c == ']')
4478         {
4479           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4480           goto directionality;
4481         }
4482       return 0;
4483
4484     case ISO_ESC_5_11_2:
4485       if (c == ']')
4486         {
4487           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4488           goto directionality;
4489         }
4490       return 0;
4491
4492     directionality:
4493       iso->esc = ISO_ESC_DIRECTIONALITY;
4494       /* Various junk here to attempt to preserve the direction sequences
4495          literally in the text if they would otherwise be swallowed due
4496          to invalid designations that don't show up as actual charset
4497          changes in the text. */
4498       if (iso->invalid_switch_dir)
4499         {
4500           /* We already inserted a direction switch literally into the
4501              text.  We assume (#### this may not be right) that the
4502              next direction switch is the one going the other way,
4503              and we need to output that literally as well. */
4504           iso->output_literally = 1;
4505           iso->invalid_switch_dir = 0;
4506         }
4507       else
4508         {
4509           int jj;
4510
4511           /* If we are in the thrall of an invalid designation,
4512            then stick the directionality sequence literally into the
4513            output stream so it ends up in the original text again. */
4514           for (jj = 0; jj < 4; jj++)
4515             if (iso->invalid_designated[jj])
4516               break;
4517           if (jj < 4)
4518             {
4519               iso->output_literally = 1;
4520               iso->invalid_switch_dir = 1;
4521             }
4522           else
4523             /* Indicate that we haven't yet seen a valid designation,
4524                so that if a switch-dir is directly followed by an
4525                invalid designation, both get inserted literally. */
4526             iso->switched_dir_and_no_valid_charset_yet = 1;
4527         }
4528       return 1;
4529
4530
4531       /**** designation ****/
4532
4533     case ISO_ESC_2_4:
4534       if (0x28 <= c && c <= 0x2F)
4535         {
4536           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4537           goto not_done;
4538         }
4539       if (0x40 <= c && c <= 0x42)
4540         {
4541           /* 94^n-set */
4542           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4543                                       *flags & CODING_STATE_R2L ?
4544                                       CHARSET_RIGHT_TO_LEFT :
4545                                       CHARSET_LEFT_TO_RIGHT);
4546           reg = 0;
4547           goto designated;
4548         }
4549       return 0;
4550
4551     default:
4552       {
4553         int chars = 0;
4554         int single = 0;
4555
4556         if (c < '0' || c > '~')
4557           return 0; /* bad final byte */
4558
4559         if (iso->esc >= ISO_ESC_2_8 &&
4560             iso->esc <= ISO_ESC_2_15)
4561           {
4562             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4563             single = 1; /* single-byte */
4564             reg = (iso->esc - ISO_ESC_2_8) & 3;
4565           }
4566         else if (iso->esc >= ISO_ESC_2_4_8 &&
4567                  iso->esc <= ISO_ESC_2_4_15)
4568           {
4569             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4570             single = -1; /* multi-byte */
4571             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4572           }
4573         else
4574           {
4575             /* Can this ever be reached? -slb */
4576             abort();
4577           }
4578
4579         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4580                                     *flags & CODING_STATE_R2L ?
4581                                     CHARSET_RIGHT_TO_LEFT :
4582                                     CHARSET_LEFT_TO_RIGHT);
4583         goto designated;
4584       }
4585     }
4586
4587  not_done:
4588   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4589   return -1;
4590
4591  single_shift:
4592   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4593     /* can't invoke something that ain't there. */
4594     return 0;
4595   iso->esc = ISO_ESC_SINGLE_SHIFT;
4596   *flags &= CODING_STATE_ISO2022_LOCK;
4597   if (reg == 2)
4598     *flags |= CODING_STATE_SS2;
4599   else
4600     *flags |= CODING_STATE_SS3;
4601   return 1;
4602
4603  locking_shift:
4604   if (check_invalid_charsets &&
4605       !CHARSETP (iso->charset[reg]))
4606     /* can't invoke something that ain't there. */
4607     return 0;
4608   if (half)
4609     iso->register_right = reg;
4610   else
4611     iso->register_left = reg;
4612   *flags &= CODING_STATE_ISO2022_LOCK;
4613   iso->esc = ISO_ESC_LOCKING_SHIFT;
4614   return 1;
4615
4616  designated:
4617   if (NILP (cs) && check_invalid_charsets)
4618     {
4619       iso->invalid_designated[reg] = 1;
4620       iso->charset[reg] = Vcharset_ascii;
4621       iso->esc = ISO_ESC_DESIGNATE;
4622       *flags &= CODING_STATE_ISO2022_LOCK;
4623       iso->output_literally = 1;
4624       if (iso->switched_dir_and_no_valid_charset_yet)
4625         {
4626           /* We encountered a switch-direction followed by an
4627              invalid designation.  Ensure that the switch-direction
4628              gets outputted; otherwise it will probably get eaten
4629              when the text is written out again. */
4630           iso->switched_dir_and_no_valid_charset_yet = 0;
4631           iso->output_direction_sequence = 1;
4632           /* And make sure that the switch-dir going the other
4633              way gets outputted, as well. */
4634           iso->invalid_switch_dir = 1;
4635         }
4636       return 1;
4637     }
4638   /* This function is called with CODESYS equal to nil when
4639      doing coding-system detection. */
4640   if (!NILP (codesys))
4641     {
4642       charset_conversion_spec_dynarr *dyn =
4643         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4644
4645       if (dyn)
4646         {
4647           int i;
4648
4649           for (i = 0; i < Dynarr_length (dyn); i++)
4650             {
4651               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4652               if (EQ (cs, spec->from_charset))
4653                 cs = spec->to_charset;
4654             }
4655         }
4656     }
4657
4658   iso->charset[reg] = cs;
4659   iso->esc = ISO_ESC_DESIGNATE;
4660   *flags &= CODING_STATE_ISO2022_LOCK;
4661   if (iso->invalid_designated[reg])
4662     {
4663       iso->invalid_designated[reg] = 0;
4664       iso->output_literally = 1;
4665     }
4666   if (iso->switched_dir_and_no_valid_charset_yet)
4667     iso->switched_dir_and_no_valid_charset_yet = 0;
4668   return 1;
4669 }
4670
4671 static int
4672 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4673 {
4674   int mask;
4675
4676   /* #### There are serious deficiencies in the recognition mechanism
4677      here.  This needs to be much smarter if it's going to cut it.
4678      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4679      it should be detected as Latin-1.
4680      All the ISO2022 stuff in this file should be synced up with the
4681      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4682      Perhaps we should wait till R2L works in FSF Emacs? */
4683
4684   if (!st->iso2022.initted)
4685     {
4686       reset_iso2022 (Qnil, &st->iso2022.iso);
4687       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4688                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4689                           CODING_CATEGORY_ISO_8_1_MASK |
4690                           CODING_CATEGORY_ISO_8_2_MASK |
4691                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4692       st->iso2022.flags = 0;
4693       st->iso2022.high_byte_count = 0;
4694       st->iso2022.saw_single_shift = 0;
4695       st->iso2022.initted = 1;
4696     }
4697
4698   mask = st->iso2022.mask;
4699
4700   while (n--)
4701     {
4702       unsigned char c = *(unsigned char *)src++;
4703       if (c >= 0xA0)
4704         {
4705           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4706           st->iso2022.high_byte_count++;
4707         }
4708       else
4709         {
4710           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4711             {
4712               if (st->iso2022.high_byte_count & 1)
4713                 /* odd number of high bytes; assume not iso-8-2 */
4714                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4715             }
4716           st->iso2022.high_byte_count = 0;
4717           st->iso2022.saw_single_shift = 0;
4718           if (c > 0x80)
4719             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4720         }
4721       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4722           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4723         { /* control chars */
4724           switch (c)
4725             {
4726               /* Allow and ignore control characters that you might
4727                  reasonably see in a text file */
4728             case '\r':
4729             case '\n':
4730             case '\t':
4731             case  7: /* bell */
4732             case  8: /* backspace */
4733             case 11: /* vertical tab */
4734             case 12: /* form feed */
4735             case 26: /* MS-DOS C-z junk */
4736             case 31: /* '^_' -- for info */
4737               goto label_continue_loop;
4738
4739             default:
4740               break;
4741             }
4742         }
4743
4744       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4745           || BYTE_C1_P (c))
4746         {
4747           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4748                                  &st->iso2022.flags, 0))
4749             {
4750               switch (st->iso2022.iso.esc)
4751                 {
4752                 case ISO_ESC_DESIGNATE:
4753                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4754                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4755                   break;
4756                 case ISO_ESC_LOCKING_SHIFT:
4757                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4758                   goto ran_out_of_chars;
4759                 case ISO_ESC_SINGLE_SHIFT:
4760                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4761                   st->iso2022.saw_single_shift = 1;
4762                   break;
4763                 default:
4764                   break;
4765                 }
4766             }
4767           else
4768             {
4769               mask = 0;
4770               goto ran_out_of_chars;
4771             }
4772         }
4773     label_continue_loop:;
4774     }
4775
4776  ran_out_of_chars:
4777
4778   return mask;
4779 }
4780
4781 static int
4782 postprocess_iso2022_mask (int mask)
4783 {
4784   /* #### kind of cheesy */
4785   /* If seven-bit ISO is allowed, then assume that the encoding is
4786      entirely seven-bit and turn off the eight-bit ones. */
4787   if (mask & CODING_CATEGORY_ISO_7_MASK)
4788     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4789                CODING_CATEGORY_ISO_8_1_MASK |
4790                CODING_CATEGORY_ISO_8_2_MASK);
4791   return mask;
4792 }
4793
4794 /* If FLAGS is a null pointer or specifies right-to-left motion,
4795    output a switch-dir-to-left-to-right sequence to DST.
4796    Also update FLAGS if it is not a null pointer.
4797    If INTERNAL_P is set, we are outputting in internal format and
4798    need to handle the CSI differently. */
4799
4800 static void
4801 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4802                                  unsigned_char_dynarr *dst,
4803                                  unsigned int *flags,
4804                                  int internal_p)
4805 {
4806   if (!flags || (*flags & CODING_STATE_R2L))
4807     {
4808       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4809         {
4810           Dynarr_add (dst, ISO_CODE_ESC);
4811           Dynarr_add (dst, '[');
4812         }
4813       else if (internal_p)
4814         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4815       else
4816         Dynarr_add (dst, ISO_CODE_CSI);
4817       Dynarr_add (dst, '0');
4818       Dynarr_add (dst, ']');
4819       if (flags)
4820         *flags &= ~CODING_STATE_R2L;
4821     }
4822 }
4823
4824 /* If FLAGS is a null pointer or specifies a direction different from
4825    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4826    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4827    sequence to DST.  Also update FLAGS if it is not a null pointer.
4828    If INTERNAL_P is set, we are outputting in internal format and
4829    need to handle the CSI differently. */
4830
4831 static void
4832 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4833                           unsigned_char_dynarr *dst, unsigned int *flags,
4834                           int internal_p)
4835 {
4836   if ((!flags || (*flags & CODING_STATE_R2L)) &&
4837       direction == CHARSET_LEFT_TO_RIGHT)
4838     restore_left_to_right_direction (codesys, dst, flags, internal_p);
4839   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4840            && (!flags || !(*flags & CODING_STATE_R2L)) &&
4841            direction == CHARSET_RIGHT_TO_LEFT)
4842     {
4843       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4844         {
4845           Dynarr_add (dst, ISO_CODE_ESC);
4846           Dynarr_add (dst, '[');
4847         }
4848       else if (internal_p)
4849         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4850       else
4851         Dynarr_add (dst, ISO_CODE_CSI);
4852       Dynarr_add (dst, '2');
4853       Dynarr_add (dst, ']');
4854       if (flags)
4855         *flags |= CODING_STATE_R2L;
4856     }
4857 }
4858
4859 /* Convert ISO2022-format data to internal format. */
4860
4861 static void
4862 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
4863                        unsigned_char_dynarr *dst, size_t n)
4864 {
4865   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4866   unsigned int flags    = str->flags;
4867   unsigned int cpos     = str->cpos;
4868   unsigned char counter = str->counter;
4869   eol_type_t eol_type   = str->eol_type;
4870 #ifdef ENABLE_COMPOSITE_CHARS
4871   unsigned_char_dynarr *real_dst = dst;
4872 #endif
4873   Lisp_Object coding_system;
4874
4875   XSETCODING_SYSTEM (coding_system, str->codesys);
4876
4877 #ifdef ENABLE_COMPOSITE_CHARS
4878   if (flags & CODING_STATE_COMPOSITE)
4879     dst = str->iso2022.composite_chars;
4880 #endif /* ENABLE_COMPOSITE_CHARS */
4881
4882   while (n--)
4883     {
4884       unsigned char c = *(unsigned char *)src++;
4885       if (flags & CODING_STATE_ESCAPE)
4886         {       /* Within ESC sequence */
4887           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4888                                           c, &flags, 1);
4889
4890           if (retval)
4891             {
4892               switch (str->iso2022.esc)
4893                 {
4894 #ifdef ENABLE_COMPOSITE_CHARS
4895                 case ISO_ESC_START_COMPOSITE:
4896                   if (str->iso2022.composite_chars)
4897                     Dynarr_reset (str->iso2022.composite_chars);
4898                   else
4899                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4900                   dst = str->iso2022.composite_chars;
4901                   break;
4902                 case ISO_ESC_END_COMPOSITE:
4903                   {
4904                     Bufbyte comstr[MAX_EMCHAR_LEN];
4905                     Bytecount len;
4906                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4907                                                          Dynarr_length (dst));
4908                     dst = real_dst;
4909                     len = set_charptr_emchar (comstr, emch);
4910                     Dynarr_add_many (dst, comstr, len);
4911                     break;
4912                   }
4913 #endif /* ENABLE_COMPOSITE_CHARS */
4914
4915                 case ISO_ESC_LITERAL:
4916                   COMPOSE_FLUSH_CHARS (str, dst);
4917                   DECODE_ADD_BINARY_CHAR (c, dst);
4918                   break;
4919
4920                 default:
4921                   /* Everything else handled already */
4922                   break;
4923                 }
4924             }
4925
4926           /* Attempted error recovery. */
4927           if (str->iso2022.output_direction_sequence)
4928             ensure_correct_direction (flags & CODING_STATE_R2L ?
4929                                       CHARSET_RIGHT_TO_LEFT :
4930                                       CHARSET_LEFT_TO_RIGHT,
4931                                       str->codesys, dst, 0, 1);
4932           /* More error recovery. */
4933           if (!retval || str->iso2022.output_literally)
4934             {
4935               /* Output the (possibly invalid) sequence */
4936               int i;
4937               COMPOSE_FLUSH_CHARS (str, dst);
4938               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4939                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4940               flags &= CODING_STATE_ISO2022_LOCK;
4941               if (!retval)
4942                 n++, src--;/* Repeat the loop with the same character. */
4943               else
4944                 {
4945                   /* No sense in reprocessing the final byte of the
4946                      escape sequence; it could mess things up anyway.
4947                      Just add it now. */
4948                   COMPOSE_FLUSH_CHARS (str, dst);
4949                   DECODE_ADD_BINARY_CHAR (c, dst);
4950                 }
4951             }
4952           cpos = 0;
4953           counter = 0;
4954         }
4955       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4956         { /* Control characters */
4957
4958           /***** Error-handling *****/
4959
4960           /* If we were in the middle of a character, dump out the
4961              partial character. */
4962           if (counter)
4963             {
4964               COMPOSE_FLUSH_CHARS (str, dst);
4965               while (counter > 0)
4966                 {
4967                   counter--;
4968                   DECODE_ADD_BINARY_CHAR
4969                     ((unsigned char)(cpos >> (counter * 8)), dst);
4970                 }
4971               cpos = 0;
4972             }
4973
4974           /* If we just saw a single-shift character, dump it out.
4975              This may dump out the wrong sort of single-shift character,
4976              but least it will give an indication that something went
4977              wrong. */
4978           if (flags & CODING_STATE_SS2)
4979             {
4980               COMPOSE_FLUSH_CHARS (str, dst);
4981               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4982               flags &= ~CODING_STATE_SS2;
4983             }
4984           if (flags & CODING_STATE_SS3)
4985             {
4986               COMPOSE_FLUSH_CHARS (str, dst);
4987               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4988               flags &= ~CODING_STATE_SS3;
4989             }
4990
4991           /***** Now handle the control characters. *****/
4992
4993           /* Handle CR/LF */
4994 #ifdef UTF2000
4995           if (c == '\r')
4996             {
4997               COMPOSE_FLUSH_CHARS (str, dst);
4998               if (eol_type == EOL_CR)
4999                 Dynarr_add (dst, '\n');
5000               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5001                 Dynarr_add (dst, c);
5002               else
5003                 flags |= CODING_STATE_CR;
5004               goto label_continue_loop;
5005             }
5006           else if (flags & CODING_STATE_CR)
5007             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5008               if (c != '\n')
5009                 Dynarr_add (dst, '\r');
5010               flags &= ~CODING_STATE_CR;
5011             }
5012 #else
5013           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5014 #endif
5015
5016           flags &= CODING_STATE_ISO2022_LOCK;
5017
5018           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5019             {
5020               COMPOSE_FLUSH_CHARS (str, dst);
5021               DECODE_ADD_BINARY_CHAR (c, dst);
5022             }
5023         }
5024       else
5025         {                       /* Graphic characters */
5026           Lisp_Object charset;
5027 #ifndef UTF2000
5028           Charset_ID lb;
5029 #endif
5030           int reg;
5031
5032 #ifdef UTF2000
5033           if (c == '\r')
5034             {
5035               COMPOSE_FLUSH_CHARS (str, dst);
5036               if (eol_type == EOL_CR)
5037                 Dynarr_add (dst, '\n');
5038               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5039                 Dynarr_add (dst, c);
5040               else
5041                 flags |= CODING_STATE_CR;
5042               goto label_continue_loop;
5043             }
5044           else if (flags & CODING_STATE_CR)
5045             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5046               if (c != '\n')
5047                 Dynarr_add (dst, '\r');
5048               flags &= ~CODING_STATE_CR;
5049             }
5050 #else
5051           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5052 #endif
5053
5054           /* Now determine the charset. */
5055           reg = ((flags & CODING_STATE_SS2) ? 2
5056                  : (flags & CODING_STATE_SS3) ? 3
5057                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5058                  : str->iso2022.register_left);
5059           charset = str->iso2022.charset[reg];
5060
5061           /* Error checking: */
5062           if (! CHARSETP (charset)
5063               || str->iso2022.invalid_designated[reg]
5064               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5065                   && XCHARSET_CHARS (charset) == 94))
5066             /* Mrmph.  We are trying to invoke a register that has no
5067                or an invalid charset in it, or trying to add a character
5068                outside the range of the charset.  Insert that char literally
5069                to preserve it for the output. */
5070             {
5071               COMPOSE_FLUSH_CHARS (str, dst);
5072               while (counter > 0)
5073                 {
5074                   counter--;
5075                   DECODE_ADD_BINARY_CHAR
5076                     ((unsigned char)(cpos >> (counter * 8)), dst);
5077                 }
5078               cpos = 0;
5079               DECODE_ADD_BINARY_CHAR (c, dst);
5080             }
5081
5082           else
5083             {
5084               /* Things are probably hunky-dorey. */
5085
5086               /* Fetch reverse charset, maybe. */
5087               if (((flags & CODING_STATE_R2L) &&
5088                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5089                   ||
5090                   (!(flags & CODING_STATE_R2L) &&
5091                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5092                 {
5093                   Lisp_Object new_charset =
5094                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5095                   if (!NILP (new_charset))
5096                     charset = new_charset;
5097                 }
5098
5099 #ifdef UTF2000
5100               counter++;
5101               if (XCHARSET_DIMENSION (charset) == counter)
5102                 {
5103                   COMPOSE_ADD_CHAR (str,
5104                                     DECODE_CHAR (charset,
5105                                                  ((cpos & 0x7F7F7F) << 8)
5106                                                  | (c & 0x7F)),
5107                                     dst);
5108                   cpos = 0;
5109                   counter = 0;
5110                 }
5111               else
5112                 cpos = (cpos << 8) | c;
5113 #else
5114               lb = XCHARSET_LEADING_BYTE (charset);
5115               switch (XCHARSET_REP_BYTES (charset))
5116                 {
5117                 case 1: /* ASCII */
5118                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5119                   Dynarr_add (dst, c & 0x7F);
5120                   break;
5121
5122                 case 2: /* one-byte official */
5123                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5124                   Dynarr_add (dst, lb);
5125                   Dynarr_add (dst, c | 0x80);
5126                   break;
5127
5128                 case 3: /* one-byte private or two-byte official */
5129                   if (XCHARSET_PRIVATE_P (charset))
5130                     {
5131                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5132                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5133                       Dynarr_add (dst, lb);
5134                       Dynarr_add (dst, c | 0x80);
5135                     }
5136                   else
5137                     {
5138                       if (ch)
5139                         {
5140                           Dynarr_add (dst, lb);
5141                           Dynarr_add (dst, ch | 0x80);
5142                           Dynarr_add (dst, c | 0x80);
5143                           ch = 0;
5144                         }
5145                       else
5146                         ch = c;
5147                     }
5148                   break;
5149
5150                 default:        /* two-byte private */
5151                   if (ch)
5152                     {
5153                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5154                       Dynarr_add (dst, lb);
5155                       Dynarr_add (dst, ch | 0x80);
5156                       Dynarr_add (dst, c | 0x80);
5157                       ch = 0;
5158                     }
5159                   else
5160                     ch = c;
5161                 }
5162 #endif
5163             }
5164
5165           if (!cpos)
5166             flags &= CODING_STATE_ISO2022_LOCK;
5167         }
5168
5169     label_continue_loop:;
5170     }
5171
5172   if (flags & CODING_STATE_END)
5173     {
5174       COMPOSE_FLUSH_CHARS (str, dst);
5175       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5176     }
5177   str->flags   = flags;
5178   str->cpos    = cpos;
5179   str->counter = counter;
5180 }
5181
5182
5183 /***** ISO2022 encoder *****/
5184
5185 /* Designate CHARSET into register REG. */
5186
5187 static void
5188 iso2022_designate (Lisp_Object charset, unsigned char reg,
5189                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5190 {
5191   static const char inter94[] = "()*+";
5192   static const char inter96[] = ",-./";
5193   unsigned short chars;
5194   unsigned char dimension;
5195   unsigned char final;
5196   Lisp_Object old_charset = str->iso2022.charset[reg];
5197
5198   str->iso2022.charset[reg] = charset;
5199   if (!CHARSETP (charset))
5200     /* charset might be an initial nil or t. */
5201     return;
5202   chars = XCHARSET_CHARS (charset);
5203   dimension = XCHARSET_DIMENSION (charset);
5204   final = XCHARSET_FINAL (charset);
5205   if (!str->iso2022.force_charset_on_output[reg] &&
5206       CHARSETP (old_charset) &&
5207       XCHARSET_CHARS (old_charset) == chars &&
5208       XCHARSET_DIMENSION (old_charset) == dimension &&
5209       XCHARSET_FINAL (old_charset) == final)
5210     return;
5211
5212   str->iso2022.force_charset_on_output[reg] = 0;
5213
5214   {
5215     charset_conversion_spec_dynarr *dyn =
5216       str->codesys->iso2022.output_conv;
5217
5218     if (dyn)
5219       {
5220         int i;
5221
5222         for (i = 0; i < Dynarr_length (dyn); i++)
5223           {
5224             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5225             if (EQ (charset, spec->from_charset))
5226                 charset = spec->to_charset;
5227           }
5228       }
5229   }
5230
5231   Dynarr_add (dst, ISO_CODE_ESC);
5232   switch (chars)
5233     {
5234     case 94:
5235       if (dimension == 1)
5236         Dynarr_add (dst, inter94[reg]);
5237       else
5238         {
5239           Dynarr_add (dst, '$');
5240           if (reg != 0
5241               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5242               || final < '@'
5243               || final > 'B')
5244             Dynarr_add (dst, inter94[reg]);
5245         }
5246       break;
5247     case 96:
5248       if (dimension == 1)
5249         Dynarr_add (dst, inter96[reg]);
5250       else
5251         {
5252           Dynarr_add (dst, '$');
5253           Dynarr_add (dst, inter96[reg]);
5254         }
5255       break;
5256     }
5257   Dynarr_add (dst, final);
5258 }
5259
5260 static void
5261 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5262 {
5263   if (str->iso2022.register_left != 0)
5264     {
5265       Dynarr_add (dst, ISO_CODE_SI);
5266       str->iso2022.register_left = 0;
5267     }
5268 }
5269
5270 static void
5271 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5272 {
5273   if (str->iso2022.register_left != 1)
5274     {
5275       Dynarr_add (dst, ISO_CODE_SO);
5276       str->iso2022.register_left = 1;
5277     }
5278 }
5279
5280 void
5281 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5282                      unsigned_char_dynarr *dst, unsigned int *flags)
5283 {
5284   unsigned char charmask;
5285   Lisp_Coding_System* codesys = str->codesys;
5286   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5287   int i;
5288   Lisp_Object charset = str->iso2022.current_charset;
5289   int half = str->iso2022.current_half;
5290   int code_point = -1;
5291
5292   if (ch <= 0x7F)
5293     {
5294       restore_left_to_right_direction (codesys, dst, flags, 0);
5295
5296       /* Make sure G0 contains ASCII */
5297       if ((ch > ' ' && ch < ISO_CODE_DEL)
5298           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5299         {
5300           ensure_normal_shift (str, dst);
5301           iso2022_designate (Vcharset_ascii, 0, str, dst);
5302         }
5303
5304       /* If necessary, restore everything to the default state
5305          at end-of-line */
5306       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5307         {
5308           restore_left_to_right_direction (codesys, dst, flags, 0);
5309
5310           ensure_normal_shift (str, dst);
5311
5312           for (i = 0; i < 4; i++)
5313             {
5314               Lisp_Object initial_charset =
5315                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5316               iso2022_designate (initial_charset, i, str, dst);
5317             }
5318         }
5319       if (ch == '\n')
5320         {
5321           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5322             Dynarr_add (dst, '\r');
5323           if (eol_type != EOL_CR)
5324             Dynarr_add (dst, ch);
5325         }
5326       else
5327         {
5328           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5329               && fit_to_be_escape_quoted (ch))
5330             Dynarr_add (dst, ISO_CODE_ESC);
5331           Dynarr_add (dst, ch);
5332         }
5333     }
5334   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5335     {
5336       charmask = (half == 0 ? 0x00 : 0x80);
5337
5338       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5339           && fit_to_be_escape_quoted (ch))
5340         Dynarr_add (dst, ISO_CODE_ESC);
5341       /* you asked for it ... */
5342       Dynarr_add (dst, ch);
5343     }
5344   else
5345     {
5346       int reg;
5347
5348       /* Now determine which register to use. */
5349       reg = -1;
5350       for (i = 0; i < 4; i++)
5351         {
5352           if ((CHARSETP (charset = str->iso2022.charset[i])
5353                && ((code_point = charset_code_point (charset, ch)) >= 0))
5354               ||
5355               (CHARSETP
5356                (charset
5357                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5358                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5359             {
5360               reg = i;
5361               break;
5362             }
5363         }
5364       if (reg == -1)
5365         {
5366           Lisp_Object original_default_coded_charset_priority_list
5367             = Vdefault_coded_charset_priority_list;
5368
5369           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5370             {
5371               code_point = ENCODE_CHAR (ch, charset);
5372               if (XCHARSET_FINAL (charset))
5373                 goto found;
5374               Vdefault_coded_charset_priority_list
5375                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5376                                Vdefault_coded_charset_priority_list));
5377             }
5378           code_point = ENCODE_CHAR (ch, charset);
5379           if (!XCHARSET_FINAL (charset))
5380             {
5381               charset = Vcharset_ascii;
5382               code_point = '~';
5383             }
5384         found:
5385           Vdefault_coded_charset_priority_list
5386             = original_default_coded_charset_priority_list;
5387         }
5388       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5389                                 codesys, dst, flags, 0);
5390
5391       if (reg == -1)
5392         {
5393           if (XCHARSET_GRAPHIC (charset) != 0)
5394             {
5395               if (!NILP (str->iso2022.charset[1]) &&
5396                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5397                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5398                 reg = 1;
5399               else if (!NILP (str->iso2022.charset[2]))
5400                 reg = 2;
5401               else if (!NILP (str->iso2022.charset[3]))
5402                 reg = 3;
5403               else
5404                 reg = 0;
5405             }
5406           else
5407             reg = 0;
5408         }
5409
5410       iso2022_designate (charset, reg, str, dst);
5411
5412       /* Now invoke that register. */
5413       switch (reg)
5414         {
5415         case 0:
5416           ensure_normal_shift (str, dst);
5417           half = 0;
5418           break;
5419         case 1:
5420           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5421             {
5422               ensure_shift_out (str, dst);
5423               half = 0;
5424             }
5425           else
5426             half = 1;
5427           break;
5428         case 2:
5429           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5430             {
5431               Dynarr_add (dst, ISO_CODE_ESC);
5432               Dynarr_add (dst, 'N');
5433               half = 0;
5434             }
5435           else
5436             {
5437               Dynarr_add (dst, ISO_CODE_SS2);
5438               half = 1;
5439             }
5440           break;
5441         case 3:
5442           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5443             {
5444               Dynarr_add (dst, ISO_CODE_ESC);
5445               Dynarr_add (dst, 'O');
5446               half = 0;
5447             }
5448           else
5449             {
5450               Dynarr_add (dst, ISO_CODE_SS3);
5451               half = 1;
5452             }
5453           break;
5454         default:
5455           abort ();
5456         }
5457
5458       charmask = (half == 0 ? 0x00 : 0x80);
5459
5460       switch (XCHARSET_DIMENSION (charset))
5461         {
5462         case 1:
5463           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5464           break;
5465         case 2:
5466           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5467           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5468           break;
5469         case 3:
5470           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5471           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5472           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5473           break;
5474         case 4:
5475           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5476           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5477           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5478           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5479           break;
5480         default:
5481           abort ();
5482         }
5483     }
5484   str->iso2022.current_charset = charset;
5485   str->iso2022.current_half = half;
5486 }
5487
5488 void
5489 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5490                      unsigned int *flags)
5491 {
5492   Lisp_Coding_System* codesys = str->codesys;
5493   int i;
5494
5495   restore_left_to_right_direction (codesys, dst, flags, 0);
5496   ensure_normal_shift (str, dst);
5497   for (i = 0; i < 4; i++)
5498     {
5499       Lisp_Object initial_charset
5500         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5501       iso2022_designate (initial_charset, i, str, dst);
5502     }
5503 }
5504 #endif /* MULE */
5505 \f
5506 /************************************************************************/
5507 /*                     No-conversion methods                            */
5508 /************************************************************************/
5509
5510 /* This is used when reading in "binary" files -- i.e. files that may
5511    contain all 256 possible byte values and that are not to be
5512    interpreted as being in any particular decoding. */
5513 static void
5514 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5515                              unsigned_char_dynarr *dst, size_t n)
5516 {
5517   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5518   unsigned int flags  = str->flags;
5519   unsigned int cpos   = str->cpos;
5520   eol_type_t eol_type = str->eol_type;
5521
5522   while (n--)
5523     {
5524       unsigned char c = *(unsigned char *)src++;
5525
5526       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5527       DECODE_ADD_BINARY_CHAR (c, dst);
5528     label_continue_loop:;
5529     }
5530
5531   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5532
5533   str->flags = flags;
5534   str->cpos  = cpos;
5535 }
5536
5537 static void
5538 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5539                              unsigned_char_dynarr *dst, size_t n)
5540 {
5541   unsigned char c;
5542   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5543   unsigned int flags  = str->flags;
5544   unsigned int ch     = str->ch;
5545   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5546 #ifdef UTF2000
5547   unsigned char char_boundary = str->iso2022.current_char_boundary;
5548 #endif
5549
5550   while (n--)
5551     {
5552       c = *src++;
5553 #ifdef UTF2000
5554       if (char_boundary == 0)
5555         if ( c >= 0xfc )
5556           {
5557             ch = c & 0x01;
5558             char_boundary = 5;
5559           }
5560         else if ( c >= 0xf8 )
5561           {
5562             ch = c & 0x03;
5563             char_boundary = 4;
5564           }
5565         else if ( c >= 0xf0 )
5566           {
5567             ch = c & 0x07;
5568             char_boundary = 3;
5569           }
5570         else if ( c >= 0xe0 )
5571           {
5572             ch = c & 0x0f;
5573             char_boundary = 2;
5574           }
5575         else if ( c >= 0xc0 )
5576           {
5577             ch = c & 0x1f;
5578             char_boundary = 1;
5579           }
5580         else
5581           {
5582             ch = 0;
5583             if (c == '\n')
5584               {
5585                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5586                   Dynarr_add (dst, '\r');
5587                 if (eol_type != EOL_CR)
5588                   Dynarr_add (dst, c);
5589               }
5590             else
5591               Dynarr_add (dst, c);
5592             char_boundary = 0;
5593           }
5594       else if (char_boundary == 1)
5595         {
5596           ch = ( ch << 6 ) | ( c & 0x3f );
5597           Dynarr_add (dst, ch & 0xff);
5598           char_boundary = 0;
5599         }
5600       else
5601         {
5602           ch = ( ch << 6 ) | ( c & 0x3f );
5603           char_boundary--;
5604         }
5605 #else /* not UTF2000 */
5606       if (c == '\n')
5607         {
5608           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5609             Dynarr_add (dst, '\r');
5610           if (eol_type != EOL_CR)
5611             Dynarr_add (dst, '\n');
5612           ch = 0;
5613         }
5614       else if (BYTE_ASCII_P (c))
5615         {
5616           assert (ch == 0);
5617           Dynarr_add (dst, c);
5618         }
5619       else if (BUFBYTE_LEADING_BYTE_P (c))
5620         {
5621           assert (ch == 0);
5622           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5623               c == LEADING_BYTE_CONTROL_1)
5624             ch = c;
5625           else
5626             Dynarr_add (dst, '~'); /* untranslatable character */
5627         }
5628       else
5629         {
5630           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5631             Dynarr_add (dst, c);
5632           else if (ch == LEADING_BYTE_CONTROL_1)
5633             {
5634               assert (c < 0xC0);
5635               Dynarr_add (dst, c - 0x20);
5636             }
5637           /* else it should be the second or third byte of an
5638              untranslatable character, so ignore it */
5639           ch = 0;
5640         }
5641 #endif /* not UTF2000 */
5642     }
5643
5644   str->flags = flags;
5645   str->ch    = ch;
5646 #ifdef UTF2000
5647   str->iso2022.current_char_boundary = char_boundary;
5648 #endif
5649 }
5650
5651 \f
5652
5653 /************************************************************************/
5654 /*                             Initialization                           */
5655 /************************************************************************/
5656
5657 void
5658 syms_of_file_coding (void)
5659 {
5660   INIT_LRECORD_IMPLEMENTATION (coding_system);
5661
5662   deferror (&Qcoding_system_error, "coding-system-error",
5663             "Coding-system error", Qio_error);
5664
5665   DEFSUBR (Fcoding_system_p);
5666   DEFSUBR (Ffind_coding_system);
5667   DEFSUBR (Fget_coding_system);
5668   DEFSUBR (Fcoding_system_list);
5669   DEFSUBR (Fcoding_system_name);
5670   DEFSUBR (Fmake_coding_system);
5671   DEFSUBR (Fcopy_coding_system);
5672   DEFSUBR (Fcoding_system_canonical_name_p);
5673   DEFSUBR (Fcoding_system_alias_p);
5674   DEFSUBR (Fcoding_system_aliasee);
5675   DEFSUBR (Fdefine_coding_system_alias);
5676   DEFSUBR (Fsubsidiary_coding_system);
5677
5678   DEFSUBR (Fcoding_system_type);
5679   DEFSUBR (Fcoding_system_doc_string);
5680 #ifdef MULE
5681   DEFSUBR (Fcoding_system_charset);
5682 #endif
5683   DEFSUBR (Fcoding_system_property);
5684
5685   DEFSUBR (Fcoding_category_list);
5686   DEFSUBR (Fset_coding_priority_list);
5687   DEFSUBR (Fcoding_priority_list);
5688   DEFSUBR (Fset_coding_category_system);
5689   DEFSUBR (Fcoding_category_system);
5690
5691   DEFSUBR (Fdetect_coding_region);
5692   DEFSUBR (Fdecode_coding_region);
5693   DEFSUBR (Fencode_coding_region);
5694 #ifdef MULE
5695   DEFSUBR (Fdecode_shift_jis_char);
5696   DEFSUBR (Fencode_shift_jis_char);
5697   DEFSUBR (Fdecode_big5_char);
5698   DEFSUBR (Fencode_big5_char);
5699 #endif /* MULE */
5700   defsymbol (&Qcoding_systemp, "coding-system-p");
5701   defsymbol (&Qno_conversion, "no-conversion");
5702   defsymbol (&Qraw_text, "raw-text");
5703 #ifdef MULE
5704   defsymbol (&Qbig5, "big5");
5705   defsymbol (&Qshift_jis, "shift-jis");
5706   defsymbol (&Qucs4, "ucs-4");
5707   defsymbol (&Qutf8, "utf-8");
5708   defsymbol (&Qccl, "ccl");
5709   defsymbol (&Qiso2022, "iso2022");
5710 #endif /* MULE */
5711   defsymbol (&Qmnemonic, "mnemonic");
5712   defsymbol (&Qeol_type, "eol-type");
5713   defsymbol (&Qpost_read_conversion, "post-read-conversion");
5714   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5715
5716   defsymbol (&Qcr, "cr");
5717   defsymbol (&Qlf, "lf");
5718   defsymbol (&Qcrlf, "crlf");
5719   defsymbol (&Qeol_cr, "eol-cr");
5720   defsymbol (&Qeol_lf, "eol-lf");
5721   defsymbol (&Qeol_crlf, "eol-crlf");
5722 #ifdef MULE
5723   defsymbol (&Qcharset_g0, "charset-g0");
5724   defsymbol (&Qcharset_g1, "charset-g1");
5725   defsymbol (&Qcharset_g2, "charset-g2");
5726   defsymbol (&Qcharset_g3, "charset-g3");
5727   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5728   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5729   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5730   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5731   defsymbol (&Qno_iso6429, "no-iso6429");
5732   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5733   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5734
5735   defsymbol (&Qshort, "short");
5736   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5737   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5738   defsymbol (&Qseven, "seven");
5739   defsymbol (&Qlock_shift, "lock-shift");
5740   defsymbol (&Qescape_quoted, "escape-quoted");
5741 #endif /* MULE */
5742 #ifdef UTF2000
5743   defsymbol (&Qdisable_composition, "disable-composition");
5744 #endif
5745   defsymbol (&Qencode, "encode");
5746   defsymbol (&Qdecode, "decode");
5747
5748 #ifdef MULE
5749   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5750              "shift-jis");
5751   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5752              "big5");
5753   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5754              "ucs-4");
5755   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5756              "utf-8");
5757   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5758              "iso-7");
5759   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5760              "iso-8-designate");
5761   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5762              "iso-8-1");
5763   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5764              "iso-8-2");
5765   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5766              "iso-lock-shift");
5767 #endif /* MULE */
5768   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5769              "no-conversion");
5770 }
5771
5772 void
5773 lstream_type_create_file_coding (void)
5774 {
5775   LSTREAM_HAS_METHOD (decoding, reader);
5776   LSTREAM_HAS_METHOD (decoding, writer);
5777   LSTREAM_HAS_METHOD (decoding, rewinder);
5778   LSTREAM_HAS_METHOD (decoding, seekable_p);
5779   LSTREAM_HAS_METHOD (decoding, flusher);
5780   LSTREAM_HAS_METHOD (decoding, closer);
5781   LSTREAM_HAS_METHOD (decoding, marker);
5782
5783   LSTREAM_HAS_METHOD (encoding, reader);
5784   LSTREAM_HAS_METHOD (encoding, writer);
5785   LSTREAM_HAS_METHOD (encoding, rewinder);
5786   LSTREAM_HAS_METHOD (encoding, seekable_p);
5787   LSTREAM_HAS_METHOD (encoding, flusher);
5788   LSTREAM_HAS_METHOD (encoding, closer);
5789   LSTREAM_HAS_METHOD (encoding, marker);
5790 }
5791
5792 void
5793 vars_of_file_coding (void)
5794 {
5795   int i;
5796
5797   fcd = xnew (struct file_coding_dump);
5798   dumpstruct (&fcd, &fcd_description);
5799
5800   /* Initialize to something reasonable ... */
5801   for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5802     {
5803       fcd->coding_category_system[i] = Qnil;
5804       fcd->coding_category_by_priority[i] = i;
5805     }
5806
5807   Fprovide (intern ("file-coding"));
5808
5809   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5810 Coding system used for TTY keyboard input.
5811 Not used under a windowing system.
5812 */ );
5813   Vkeyboard_coding_system = Qnil;
5814
5815   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5816 Coding system used for TTY display output.
5817 Not used under a windowing system.
5818 */ );
5819   Vterminal_coding_system = Qnil;
5820
5821   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5822 Overriding coding system used when reading from a file or process.
5823 You should bind this variable with `let', but do not set it globally.
5824 If this is non-nil, it specifies the coding system that will be used
5825 to decode input on read operations, such as from a file or process.
5826 It overrides `buffer-file-coding-system-for-read',
5827 `insert-file-contents-pre-hook', etc.  Use those variables instead of
5828 this one for permanent changes to the environment.  */ );
5829   Vcoding_system_for_read = Qnil;
5830
5831   DEFVAR_LISP ("coding-system-for-write",
5832                &Vcoding_system_for_write /*
5833 Overriding coding system used when writing to a file or process.
5834 You should bind this variable with `let', but do not set it globally.
5835 If this is non-nil, it specifies the coding system that will be used
5836 to encode output for write operations, such as to a file or process.
5837 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
5838 Use those variables instead of this one for permanent changes to the
5839 environment.  */ );
5840   Vcoding_system_for_write = Qnil;
5841
5842   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5843 Coding system used to convert pathnames when accessing files.
5844 */ );
5845   Vfile_name_coding_system = Qnil;
5846
5847   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5848 Non-nil means the buffer contents are regarded as multi-byte form
5849 of characters, not a binary code.  This affects the display, file I/O,
5850 and behaviors of various editing commands.
5851
5852 Setting this to nil does not do anything.
5853 */ );
5854   enable_multibyte_characters = 1;
5855 }
5856
5857 void
5858 complex_vars_of_file_coding (void)
5859 {
5860   staticpro (&Vcoding_system_hash_table);
5861   Vcoding_system_hash_table =
5862     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5863
5864   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5865   dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5866
5867 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
5868 {                                               \
5869   struct codesys_prop csp;                      \
5870   csp.sym = (Sym);                              \
5871   csp.prop_type = (Prop_Type);                  \
5872   Dynarr_add (the_codesys_prop_dynarr, csp);    \
5873 } while (0)
5874
5875   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
5876   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
5877   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
5878   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
5879   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
5880   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
5881   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
5882 #ifdef MULE
5883   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5884   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5885   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5886   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5887   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5888   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5889   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5890   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5891   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5892   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5893   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5894   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5895   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5896   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5897   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5898   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5899   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5900
5901   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
5902   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
5903 #endif /* MULE */
5904   /* Need to create this here or we're really screwed. */
5905   Fmake_coding_system
5906     (Qraw_text, Qno_conversion,
5907      build_string ("Raw text, which means it converts only line-break-codes."),
5908      list2 (Qmnemonic, build_string ("Raw")));
5909
5910   Fmake_coding_system
5911     (Qbinary, Qno_conversion,
5912      build_string ("Binary, which means it does not convert anything."),
5913      list4 (Qeol_type, Qlf,
5914             Qmnemonic, build_string ("Binary")));
5915
5916 #ifdef UTF2000
5917   Fmake_coding_system
5918     (Qutf8, Qutf8,
5919      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
5920      list2 (Qmnemonic, build_string ("UTF8")));
5921 #endif
5922
5923   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5924
5925   Fdefine_coding_system_alias (Qfile_name, Qbinary);
5926
5927   Fdefine_coding_system_alias (Qterminal, Qbinary);
5928   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
5929
5930   /* Need this for bootstrapping */
5931   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5932     Fget_coding_system (Qraw_text);
5933
5934 #ifdef UTF2000
5935   fcd->coding_category_system[CODING_CATEGORY_UTF8]
5936    = Fget_coding_system (Qutf8);
5937 #endif
5938
5939 #if defined(MULE) && !defined(UTF2000)
5940   {
5941     unsigned int i;
5942
5943     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
5944       fcd->ucs_to_mule_table[i] = Qnil;
5945   }
5946   staticpro (&mule_to_ucs_table);
5947   mule_to_ucs_table = Fmake_char_table(Qgeneric);
5948 #endif /* defined(MULE) && !defined(UTF2000) */
5949 }