src/text-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4    Copyright (C) 1999,2000,2001,2002 MORIOKA Tomohiko
   5
   6 This file is part of XEmacs.
   7
   8 XEmacs is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with XEmacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /* Synched up with: Mule 2.3.   Not in FSF. */
  24
  25 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  26 /* Rewritten by MORIOKA Tomohiko <tomo@m17n.org> for XEmacs UTF-2000. */
  27
  28 #include <config.h>
  29 #include "lisp.h"
  30
  31 #include "buffer.h"
  32 #include "elhash.h"
  33 #include "insdel.h"
  34 #include "lstream.h"
  35 #include "opaque.h"
  36 #ifdef MULE
  37 #include "mule-ccl.h"
  38 #include "chartab.h"
  39 #endif
  40 #include "file-coding.h"
  41
  42 Lisp_Object Qcoding_system_error;
  43
  44 Lisp_Object Vkeyboard_coding_system;
  45 Lisp_Object Vterminal_coding_system;
  46 Lisp_Object Vcoding_system_for_read;
  47 Lisp_Object Vcoding_system_for_write;
  48 Lisp_Object Vfile_name_coding_system;
  49
  50 Lisp_Object Vcoded_charset_entity_reference_alist;
  51
  52 /* Table of symbols identifying each coding category. */
  53 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  54
  55
  56
  57 struct file_coding_dump {
  58   /* Coding system currently associated with each coding category. */
  59   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  60
  61   /* Table of all coding categories in decreasing order of priority.
  62      This describes a permutation of the possible coding categories. */
  63   int coding_category_by_priority[CODING_CATEGORY_LAST];
  64
  65 #if defined(MULE) && !defined(UTF2000)
  66   Lisp_Object ucs_to_mule_table[65536];
  67 #endif
  68 } *fcd;
  69
  70 static const struct lrecord_description fcd_description_1[] = {
  71   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  72 #if defined(MULE) && !defined(UTF2000)
  73   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  74 #endif
  75   { XD_END }
  76 };
  77
  78 static const struct struct_description fcd_description = {
  79   sizeof (struct file_coding_dump),
  80   fcd_description_1
  81 };
  82
  83 Lisp_Object mule_to_ucs_table;
  84
  85 Lisp_Object Qcoding_systemp;
  86
  87 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  88 /* Qinternal in general.c */
  89
  90 Lisp_Object Qmnemonic, Qeol_type;
  91 Lisp_Object Qcr, Qcrlf, Qlf;
  92 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  93 Lisp_Object Qpost_read_conversion;
  94 Lisp_Object Qpre_write_conversion;
  95
  96 #ifdef MULE
  97 Lisp_Object Qucs4, Qutf8;
  98 Lisp_Object Qbig5, Qshift_jis;
  99 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
 100 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
 101 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
 102 Lisp_Object Qno_iso6429;
 103 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 104 Lisp_Object Qescape_quoted;
 105 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 106 #endif
 107 #ifdef UTF2000
 108 Lisp_Object Qutf_8_mcs;
 109 Lisp_Object Qdisable_composition;
 110 Lisp_Object Quse_entity_reference;
 111 Lisp_Object Qd, Qx, QX;
 112 #endif
 113 Lisp_Object Qencode, Qdecode;
 114
 115 Lisp_Object Vcoding_system_hash_table;
 116
 117 int enable_multibyte_characters;
 118
 119 #ifdef MULE
 120 /* Additional information used by the ISO2022 decoder and detector. */
 121 struct iso2022_decoder
 122 {
 123   /* CHARSET holds the character sets currently assigned to the G0
 124      through G3 variables.  It is initialized from the array
 125      INITIAL_CHARSET in CODESYS. */
 126   Lisp_Object charset[4];
 127
 128   /* Which registers are currently invoked into the left (GL) and
 129      right (GR) halves of the 8-bit encoding space? */
 130   int register_left, register_right;
 131
 132   /* ISO_ESC holds a value indicating part of an escape sequence
 133      that has already been seen. */
 134   enum iso_esc_flag esc;
 135
 136   /* This records the bytes we've seen so far in an escape sequence,
 137      in case the sequence is invalid (we spit out the bytes unchanged). */
 138   unsigned char esc_bytes[8];
 139
 140   /* Index for next byte to store in ISO escape sequence. */
 141   int esc_bytes_index;
 142
 143 #ifdef ENABLE_COMPOSITE_CHARS
 144   /* Stuff seen so far when composing a string. */
 145   unsigned_char_dynarr *composite_chars;
 146 #endif
 147
 148   /* If we saw an invalid designation sequence for a particular
 149      register, we flag it here and switch to ASCII.  The next time we
 150      see a valid designation for this register, we turn off the flag
 151      and do the designation normally, but pretend the sequence was
 152      invalid.  The effect of all this is that (most of the time) the
 153      escape sequences for both the switch to the unknown charset, and
 154      the switch back to the known charset, get inserted literally into
 155      the buffer and saved out as such.  The hope is that we can
 156      preserve the escape sequences so that the resulting written out
 157      file makes sense.  If we don't do any of this, the designation
 158      to the invalid charset will be preserved but that switch back
 159      to the known charset will probably get eaten because it was
 160      the same charset that was already present in the register. */
 161   unsigned char invalid_designated[4];
 162
 163   /* We try to do similar things as above for direction-switching
 164      sequences.  If we encountered a direction switch while an
 165      invalid designation was present, or an invalid designation
 166      just after a direction switch (i.e. no valid designation
 167      encountered yet), we insert the direction-switch escape
 168      sequence literally into the output stream, and later on
 169      insert the corresponding direction-restoring escape sequence
 170      literally also. */
 171   unsigned int switched_dir_and_no_valid_charset_yet :1;
 172   unsigned int invalid_switch_dir :1;
 173
 174   /* Tells the decoder to output the escape sequence literally
 175      even though it was valid.  Used in the games we play to
 176      avoid lossage when we encounter invalid designations. */
 177   unsigned int output_literally :1;
 178   /* We encountered a direction switch followed by an invalid
 179      designation.  We didn't output the direction switch
 180      literally because we didn't know about the invalid designation;
 181      but we have to do so now. */
 182   unsigned int output_direction_sequence :1;
 183 };
 184 #endif /* MULE */
 185 EXFUN (Fcopy_coding_system, 2);
 186 #ifdef MULE
 187 struct detection_state;
 188
 189 static void
 190 text_encode_generic (Lstream *encoding, const Bufbyte *src,
 191                      unsigned_char_dynarr *dst, size_t n);
 192
 193 static int detect_coding_sjis (struct detection_state *st,
 194                                const Extbyte *src, size_t n);
 195 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 196                                 unsigned_char_dynarr *dst, size_t n);
 197 void char_encode_shift_jis (struct encoding_stream *str, Emchar c,
 198                             unsigned_char_dynarr *dst, unsigned int *flags);
 199 void char_finish_shift_jis (struct encoding_stream *str,
 200                             unsigned_char_dynarr *dst, unsigned int *flags);
 201
 202 static int detect_coding_big5 (struct detection_state *st,
 203                                const Extbyte *src, size_t n);
 204 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 205                                 unsigned_char_dynarr *dst, size_t n);
 206 void char_encode_big5 (struct encoding_stream *str, Emchar c,
 207                        unsigned_char_dynarr *dst, unsigned int *flags);
 208 void char_finish_big5 (struct encoding_stream *str,
 209                        unsigned_char_dynarr *dst, unsigned int *flags);
 210
 211 static int detect_coding_ucs4 (struct detection_state *st,
 212                                const Extbyte *src, size_t n);
 213 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 214                                 unsigned_char_dynarr *dst, size_t n);
 215 void char_encode_ucs4 (struct encoding_stream *str, Emchar c,
 216                        unsigned_char_dynarr *dst, unsigned int *flags);
 217 void char_finish_ucs4 (struct encoding_stream *str,
 218                        unsigned_char_dynarr *dst, unsigned int *flags);
 219
 220 static int detect_coding_utf8 (struct detection_state *st,
 221                                const Extbyte *src, size_t n);
 222 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 223                                 unsigned_char_dynarr *dst, size_t n);
 224 void char_encode_utf8 (struct encoding_stream *str, Emchar c,
 225                        unsigned_char_dynarr *dst, unsigned int *flags);
 226 void char_finish_utf8 (struct encoding_stream *str,
 227                        unsigned_char_dynarr *dst, unsigned int *flags);
 228
 229 static int postprocess_iso2022_mask (int mask);
 230 static void reset_iso2022 (Lisp_Object coding_system,
 231                            struct iso2022_decoder *iso);
 232 static int detect_coding_iso2022 (struct detection_state *st,
 233                                   const Extbyte *src, size_t n);
 234 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 235                                    unsigned_char_dynarr *dst, size_t n);
 236 void char_encode_iso2022 (struct encoding_stream *str, Emchar c,
 237                           unsigned_char_dynarr *dst, unsigned int *flags);
 238 void char_finish_iso2022 (struct encoding_stream *str,
 239                           unsigned_char_dynarr *dst, unsigned int *flags);
 240 #endif /* MULE */
 241 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 242                                          unsigned_char_dynarr *dst, size_t n);
 243 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 244                                          unsigned_char_dynarr *dst, size_t n);
 245 static void mule_decode (Lstream *decoding, const Extbyte *src,
 246                          unsigned_char_dynarr *dst, size_t n);
 247 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 248                          unsigned_char_dynarr *dst, size_t n);
 249
 250 typedef struct codesys_prop codesys_prop;
 251 struct codesys_prop
 252 {
 253   Lisp_Object sym;
 254   int prop_type;
 255 };
 256
 257 typedef struct
 258 {
 259   Dynarr_declare (codesys_prop);
 260 } codesys_prop_dynarr;
 261
 262 static const struct lrecord_description codesys_prop_description_1[] = {
 263   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 264   { XD_END }
 265 };
 266
 267 static const struct struct_description codesys_prop_description = {
 268   sizeof (codesys_prop),
 269   codesys_prop_description_1
 270 };
 271
 272 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 273   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 274   { XD_END }
 275 };
 276
 277 static const struct struct_description codesys_prop_dynarr_description = {
 278   sizeof (codesys_prop_dynarr),
 279   codesys_prop_dynarr_description_1
 280 };
 281
 282 codesys_prop_dynarr *the_codesys_prop_dynarr;
 283
 284 enum codesys_prop_enum
 285 {
 286   CODESYS_PROP_ALL_OK,
 287   CODESYS_PROP_ISO2022,
 288   CODESYS_PROP_CCL
 289 };
 290
 291 \f
 292 /************************************************************************/
 293 /*                       Coding system functions                        */
 294 /************************************************************************/
 295
 296 static Lisp_Object mark_coding_system (Lisp_Object);
 297 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 298 static void finalize_coding_system (void *header, int for_disksave);
 299
 300 #ifdef MULE
 301 static const struct lrecord_description ccs_description_1[] = {
 302   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 303   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 304   { XD_END }
 305 };
 306
 307 static const struct struct_description ccs_description = {
 308   sizeof (charset_conversion_spec),
 309   ccs_description_1
 310 };
 311
 312 static const struct lrecord_description ccsd_description_1[] = {
 313   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 314   { XD_END }
 315 };
 316
 317 static const struct struct_description ccsd_description = {
 318   sizeof (charset_conversion_spec_dynarr),
 319   ccsd_description_1
 320 };
 321 #endif
 322
 323 static const struct lrecord_description coding_system_description[] = {
 324   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 325   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 326   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 327   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 328   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 329   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 330   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 331   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 332 #ifdef MULE
 333   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 334   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 335   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 336   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 337   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 338 #ifdef UTF2000
 339   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccs_priority_list) },
 340 #endif
 341 #endif
 342   { XD_END }
 343 };
 344
 345 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 346                                mark_coding_system, print_coding_system,
 347                                finalize_coding_system,
 348                                0, 0, coding_system_description,
 349                                Lisp_Coding_System);
 350
 351 static Lisp_Object
 352 mark_coding_system (Lisp_Object obj)
 353 {
 354   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 355
 356   mark_object (CODING_SYSTEM_NAME (codesys));
 357   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 358   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 359   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 360   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 361   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 362
 363   switch (CODING_SYSTEM_TYPE (codesys))
 364     {
 365 #ifdef MULE
 366       int i;
 367     case CODESYS_ISO2022:
 368       for (i = 0; i < 4; i++)
 369         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 370       if (codesys->iso2022.input_conv)
 371         {
 372           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 373             {
 374               struct charset_conversion_spec *ccs =
 375                 Dynarr_atp (codesys->iso2022.input_conv, i);
 376               mark_object (ccs->from_charset);
 377               mark_object (ccs->to_charset);
 378             }
 379         }
 380       if (codesys->iso2022.output_conv)
 381         {
 382           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 383             {
 384               struct charset_conversion_spec *ccs =
 385                 Dynarr_atp (codesys->iso2022.output_conv, i);
 386               mark_object (ccs->from_charset);
 387               mark_object (ccs->to_charset);
 388             }
 389         }
 390       break;
 391 #ifdef UTF2000
 392
 393     case CODESYS_BIG5:
 394       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0));
 395       mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1));
 396       break;
 397 #endif
 398
 399     case CODESYS_CCL:
 400       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 401       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 402       break;
 403 #endif /* MULE */
 404     default:
 405       break;
 406     }
 407
 408   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 409 #ifdef UTF2000
 410   mark_object (CODING_SYSTEM_CCS_PRIORITY_LIST (codesys));
 411 #endif
 412   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 413 }
 414
 415 static void
 416 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 417                      int escapeflag)
 418 {
 419   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 420   if (print_readably)
 421     error ("printing unreadable object #<coding_system 0x%x>",
 422            c->header.uid);
 423
 424   write_c_string ("#<coding_system ", printcharfun);
 425   print_internal (c->name, printcharfun, 1);
 426   write_c_string (">", printcharfun);
 427 }
 428
 429 static void
 430 finalize_coding_system (void *header, int for_disksave)
 431 {
 432   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 433   /* Since coding systems never go away, this function is not
 434      necessary.  But it would be necessary if we changed things
 435      so that coding systems could go away. */
 436   if (!for_disksave) /* see comment in lstream.c */
 437     {
 438       switch (CODING_SYSTEM_TYPE (c))
 439         {
 440 #ifdef MULE
 441         case CODESYS_ISO2022:
 442           if (c->iso2022.input_conv)
 443             {
 444               Dynarr_free (c->iso2022.input_conv);
 445               c->iso2022.input_conv = 0;
 446             }
 447           if (c->iso2022.output_conv)
 448             {
 449               Dynarr_free (c->iso2022.output_conv);
 450               c->iso2022.output_conv = 0;
 451             }
 452           break;
 453 #endif /* MULE */
 454         default:
 455           break;
 456         }
 457     }
 458 }
 459
 460 static eol_type_t
 461 symbol_to_eol_type (Lisp_Object symbol)
 462 {
 463   CHECK_SYMBOL (symbol);
 464   if (NILP (symbol))      return EOL_AUTODETECT;
 465   if (EQ (symbol, Qlf))   return EOL_LF;
 466   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 467   if (EQ (symbol, Qcr))   return EOL_CR;
 468
 469   signal_simple_error ("Unrecognized eol type", symbol);
 470   return EOL_AUTODETECT; /* not reached */
 471 }
 472
 473 static Lisp_Object
 474 eol_type_to_symbol (eol_type_t type)
 475 {
 476   switch (type)
 477     {
 478     default: abort ();
 479     case EOL_LF:         return Qlf;
 480     case EOL_CRLF:       return Qcrlf;
 481     case EOL_CR:         return Qcr;
 482     case EOL_AUTODETECT: return Qnil;
 483     }
 484 }
 485
 486 static void
 487 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 488 {
 489   Lisp_Object codesys_obj;
 490   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 491   char *codesys_name = (char *) alloca (len + 7);
 492   int mlen = -1;
 493   char *codesys_mnemonic=0;
 494
 495   Lisp_Object codesys_name_sym, sub_codesys_obj;
 496
 497   /* kludge */
 498
 499   XSETCODING_SYSTEM (codesys_obj, codesys);
 500
 501   memcpy (codesys_name,
 502           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 503
 504   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 505     {
 506       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 507       codesys_mnemonic = (char *) alloca (mlen + 7);
 508       memcpy (codesys_mnemonic,
 509               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 510     }
 511
 512 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 513   strcpy (codesys_name + len, "-" op_sys);                                      \
 514   if (mlen != -1)                                                               \
 515     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 516   codesys_name_sym = intern (codesys_name);                                     \
 517   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 518   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 519   if (mlen != -1)                                                               \
 520     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 521       build_string (codesys_mnemonic);                                          \
 522   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 523 } while (0)
 524
 525   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 526   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 527   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 528 }
 529
 530 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 531 Return t if OBJECT is a coding system.
 532 A coding system is an object that defines how text containing multiple
 533 character sets is encoded into a stream of (typically 8-bit) bytes.
 534 The coding system is used to decode the stream into a series of
 535 characters (which may be from multiple charsets) when the text is read
 536 from a file or process, and is used to encode the text back into the
 537 same format when it is written out to a file or process.
 538
 539 For example, many ISO2022-compliant coding systems (such as Compound
 540 Text, which is used for inter-client data under the X Window System)
 541 use escape sequences to switch between different charsets -- Japanese
 542 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 543 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 544 `make-coding-system' for more information.
 545
 546 Coding systems are normally identified using a symbol, and the
 547 symbol is accepted in place of the actual coding system object whenever
 548 a coding system is called for. (This is similar to how faces work.)
 549 */
 550        (object))
 551 {
 552   return CODING_SYSTEMP (object) ? Qt : Qnil;
 553 }
 554
 555 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 556 Retrieve the coding system of the given name.
 557
 558 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 559 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 560 If there is no such coding system, nil is returned.  Otherwise the
 561 associated coding system object is returned.
 562 */
 563        (coding_system_or_name))
 564 {
 565   if (NILP (coding_system_or_name))
 566     coding_system_or_name = Qbinary;
 567   else if (CODING_SYSTEMP (coding_system_or_name))
 568     return coding_system_or_name;
 569   else
 570     CHECK_SYMBOL (coding_system_or_name);
 571
 572   while (1)
 573     {
 574       coding_system_or_name =
 575         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 576
 577       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 578         return coding_system_or_name;
 579     }
 580 }
 581
 582 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 583 Retrieve the coding system of the given name.
 584 Same as `find-coding-system' except that if there is no such
 585 coding system, an error is signaled instead of returning nil.
 586 */
 587        (name))
 588 {
 589   Lisp_Object coding_system = Ffind_coding_system (name);
 590
 591   if (NILP (coding_system))
 592     signal_simple_error ("No such coding system", name);
 593   return coding_system;
 594 }
 595
 596 /* We store the coding systems in hash tables with the names as the key and the
 597    actual coding system object as the value.  Occasionally we need to use them
 598    in a list format.  These routines provide us with that. */
 599 struct coding_system_list_closure
 600 {
 601   Lisp_Object *coding_system_list;
 602 };
 603
 604 static int
 605 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 606                                   void *coding_system_list_closure)
 607 {
 608   /* This function can GC */
 609   struct coding_system_list_closure *cscl =
 610     (struct coding_system_list_closure *) coding_system_list_closure;
 611   Lisp_Object *coding_system_list = cscl->coding_system_list;
 612
 613   *coding_system_list = Fcons (key, *coding_system_list);
 614   return 0;
 615 }
 616
 617 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 618 Return a list of the names of all defined coding systems.
 619 */
 620        ())
 621 {
 622   Lisp_Object coding_system_list = Qnil;
 623   struct gcpro gcpro1;
 624   struct coding_system_list_closure coding_system_list_closure;
 625
 626   GCPRO1 (coding_system_list);
 627   coding_system_list_closure.coding_system_list = &coding_system_list;
 628   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 629                  &coding_system_list_closure);
 630   UNGCPRO;
 631
 632   return coding_system_list;
 633 }
 634
 635 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 636 Return the name of the given coding system.
 637 */
 638        (coding_system))
 639 {
 640   coding_system = Fget_coding_system (coding_system);
 641   return XCODING_SYSTEM_NAME (coding_system);
 642 }
 643
 644 static Lisp_Coding_System *
 645 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 646 {
 647   Lisp_Coding_System *codesys =
 648     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 649
 650   zero_lcrecord (codesys);
 651   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 652   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 653   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 654   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 655   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 656   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 657   CODING_SYSTEM_TYPE     (codesys) = type;
 658   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 659 #ifdef MULE
 660 #ifdef UTF2000
 661   CODING_SYSTEM_CCS_PRIORITY_LIST (codesys) = Qnil;
 662 #endif
 663   if (type == CODESYS_ISO2022)
 664     {
 665       int i;
 666       for (i = 0; i < 4; i++)
 667         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 668     }
 669 #ifdef UTF2000
 670   if (type == CODESYS_UTF8)
 671     {
 672       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 673         = Vcharset_ucs;
 674       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 675         = Qnil;
 676       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 677         = Qnil;
 678       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 679         = Qnil;
 680     }
 681   else if (type == CODESYS_BIG5)
 682     {
 683       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 0)
 684         = Vcharset_ascii;
 685       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1)
 686         = Vcharset_chinese_big5;
 687       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2)
 688         = Qnil;
 689       CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 3)
 690         = Qnil;
 691     }
 692 #endif
 693   else if (type == CODESYS_CCL)
 694     {
 695       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 696       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 697     }
 698 #endif /* MULE */
 699   CODING_SYSTEM_NAME (codesys) = name;
 700
 701   return codesys;
 702 }
 703
 704 #ifdef MULE
 705 /* Given a list of charset conversion specs as specified in a Lisp
 706    program, parse it into STORE_HERE. */
 707
 708 static void
 709 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 710                                 Lisp_Object spec_list)
 711 {
 712   Lisp_Object rest;
 713
 714   EXTERNAL_LIST_LOOP (rest, spec_list)
 715     {
 716       Lisp_Object car = XCAR (rest);
 717       Lisp_Object from, to;
 718       struct charset_conversion_spec spec;
 719
 720       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 721         signal_simple_error ("Invalid charset conversion spec", car);
 722       from = Fget_charset (XCAR (car));
 723       to = Fget_charset (XCAR (XCDR (car)));
 724       if ( (XCHARSET_CHARS (from) != XCHARSET_CHARS (to)) ||
 725            (XCHARSET_DIMENSION (from) != XCHARSET_DIMENSION (to)) )
 726         signal_simple_error_2
 727           ("Attempted conversion between different charset types",
 728            from, to);
 729       spec.from_charset = from;
 730       spec.to_charset = to;
 731
 732       Dynarr_add (store_here, spec);
 733     }
 734 }
 735
 736 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 737    specs, return the equivalent as the Lisp programmer would see it.
 738
 739    If LOAD_HERE is 0, return Qnil. */
 740
 741 static Lisp_Object
 742 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 743 {
 744   int i;
 745   Lisp_Object result;
 746
 747   if (!load_here)
 748     return Qnil;
 749   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 750     {
 751       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 752       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 753     }
 754
 755   return Fnreverse (result);
 756 }
 757
 758 #endif /* MULE */
 759
 760 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 761 Register symbol NAME as a coding system.
 762
 763 TYPE describes the conversion method used and should be one of
 764
 765 nil or 'undecided
 766      Automatic conversion.  XEmacs attempts to detect the coding system
 767      used in the file.
 768 'no-conversion
 769      No conversion.  Use this for binary files and such.  On output,
 770      graphic characters that are not in ASCII or Latin-1 will be
 771      replaced by a ?. (For a no-conversion-encoded buffer, these
 772      characters will only be present if you explicitly insert them.)
 773 'shift-jis
 774      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 775 'ucs-4
 776      ISO 10646 UCS-4 encoding.
 777 'utf-8
 778      ISO 10646 UTF-8 encoding.
 779 'iso2022
 780      Any ISO2022-compliant encoding.  Among other things, this includes
 781      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 782      standard Unix encoding for Japanese and other languages), and
 783      Compound Text (the encoding used in X11).  You can specify more
 784      specific information about the conversion with the PROPS argument.
 785 'big5
 786      Big5 (the encoding commonly used for Taiwanese).
 787 'ccl
 788      The conversion is performed using a user-written pseudo-code
 789      program.  CCL (Code Conversion Language) is the name of this
 790      pseudo-code.
 791 'internal
 792      Write out or read in the raw contents of the memory representing
 793      the buffer's text.  This is primarily useful for debugging
 794      purposes, and is only enabled when XEmacs has been compiled with
 795      DEBUG_XEMACS defined (via the --debug configure option).
 796      WARNING: Reading in a file using 'internal conversion can result
 797      in an internal inconsistency in the memory representing a
 798      buffer's text, which will produce unpredictable results and may
 799      cause XEmacs to crash.  Under normal circumstances you should
 800      never use 'internal conversion.
 801
 802 DOC-STRING is a string describing the coding system.
 803
 804 PROPS is a property list, describing the specific nature of the
 805 character set.  Recognized properties are:
 806
 807 'mnemonic
 808      String to be displayed in the modeline when this coding system is
 809      active.
 810
 811 'eol-type
 812      End-of-line conversion to be used.  It should be one of
 813
 814         nil
 815                 Automatically detect the end-of-line type (LF, CRLF,
 816                 or CR).  Also generate subsidiary coding systems named
 817                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 818                 identical to this coding system but have an EOL-TYPE
 819                 value of 'lf, 'crlf, and 'cr, respectively.
 820         'lf
 821                 The end of a line is marked externally using ASCII LF.
 822                 Since this is also the way that XEmacs represents an
 823                 end-of-line internally, specifying this option results
 824                 in no end-of-line conversion.  This is the standard
 825                 format for Unix text files.
 826         'crlf
 827                 The end of a line is marked externally using ASCII
 828                 CRLF.  This is the standard format for MS-DOS text
 829                 files.
 830         'cr
 831                 The end of a line is marked externally using ASCII CR.
 832                 This is the standard format for Macintosh text files.
 833         t
 834                 Automatically detect the end-of-line type but do not
 835                 generate subsidiary coding systems.  (This value is
 836                 converted to nil when stored internally, and
 837                 `coding-system-property' will return nil.)
 838
 839 'disable-composition
 840      If non-nil, composition/decomposition for combining characters
 841      are disabled.
 842
 843 'use-entity-reference
 844      If non-nil, SGML style entity-reference is used for non-system-characters.
 845
 846 'post-read-conversion
 847      Function called after a file has been read in, to perform the
 848      decoding.  Called with two arguments, START and END, denoting
 849      a region of the current buffer to be decoded.
 850
 851 'pre-write-conversion
 852      Function called before a file is written out, to perform the
 853      encoding.  Called with two arguments, START and END, denoting
 854      a region of the current buffer to be encoded.
 855
 856
 857 The following additional properties are recognized if TYPE is 'iso2022:
 858
 859 'charset-g0
 860 'charset-g1
 861 'charset-g2
 862 'charset-g3
 863      The character set initially designated to the G0 - G3 registers.
 864      The value should be one of
 865
 866           -- A charset object (designate that character set)
 867           -- nil (do not ever use this register)
 868           -- t (no character set is initially designated to
 869                 the register, but may be later on; this automatically
 870                 sets the corresponding `force-g*-on-output' property)
 871
 872 'force-g0-on-output
 873 'force-g1-on-output
 874 'force-g2-on-output
 875 'force-g2-on-output
 876      If non-nil, send an explicit designation sequence on output before
 877      using the specified register.
 878
 879 'short
 880      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 881      "ESC $ B" on output in place of the full designation sequences
 882      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 883
 884 'no-ascii-eol
 885      If non-nil, don't designate ASCII to G0 at each end of line on output.
 886      Setting this to non-nil also suppresses other state-resetting that
 887      normally happens at the end of a line.
 888
 889 'no-ascii-cntl
 890      If non-nil, don't designate ASCII to G0 before control chars on output.
 891
 892 'seven
 893      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 894      environment.
 895
 896 'lock-shift
 897      If non-nil, use locking-shift (SO/SI) instead of single-shift
 898      or designation by escape sequence.
 899
 900 'no-iso6429
 901      If non-nil, don't use ISO6429's direction specification.
 902
 903 'escape-quoted
 904      If non-nil, literal control characters that are the same as
 905      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 906      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 907      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 908      so that they can be properly distinguished from an escape sequence.
 909      (Note that doing this results in a non-portable encoding.) This
 910      encoding flag is used for byte-compiled files.  Note that ESC
 911      is a good choice for a quoting character because there are no
 912      escape sequences whose second byte is a character from the Control-0
 913      or Control-1 character sets; this is explicitly disallowed by the
 914      ISO2022 standard.
 915
 916 'input-charset-conversion
 917      A list of conversion specifications, specifying conversion of
 918      characters in one charset to another when decoding is performed.
 919      Each specification is a list of two elements: the source charset,
 920      and the destination charset.
 921
 922 'output-charset-conversion
 923      A list of conversion specifications, specifying conversion of
 924      characters in one charset to another when encoding is performed.
 925      The form of each specification is the same as for
 926      'input-charset-conversion.
 927
 928
 929 The following additional properties are recognized (and required)
 930 if TYPE is 'ccl:
 931
 932 'decode
 933      CCL program used for decoding (converting to internal format).
 934
 935 'encode
 936      CCL program used for encoding (converting to external format).
 937 */
 938        (name, type, doc_string, props))
 939 {
 940   Lisp_Coding_System *codesys;
 941   enum coding_system_type ty;
 942   int need_to_setup_eol_systems = 1;
 943
 944   /* Convert type to constant */
 945   if (NILP (type) || EQ (type, Qundecided))
 946                                       { ty = CODESYS_AUTODETECT; }
 947 #ifdef MULE
 948   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 949   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 950   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 951   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 952   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 953   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 954 #endif
 955   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 956 #ifdef DEBUG_XEMACS
 957   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 958 #endif
 959   else
 960     signal_simple_error ("Invalid coding system type", type);
 961
 962   CHECK_SYMBOL (name);
 963
 964   codesys = allocate_coding_system (ty, name);
 965
 966   if (NILP (doc_string))
 967     doc_string = build_string ("");
 968   else
 969     CHECK_STRING (doc_string);
 970   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 971
 972   {
 973     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 974       {
 975         if (EQ (key, Qmnemonic))
 976           {
 977             if (!NILP (value))
 978               CHECK_STRING (value);
 979             CODING_SYSTEM_MNEMONIC (codesys) = value;
 980           }
 981
 982         else if (EQ (key, Qeol_type))
 983           {
 984             need_to_setup_eol_systems = NILP (value);
 985             if (EQ (value, Qt))
 986               value = Qnil;
 987             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 988           }
 989
 990         else if (EQ (key, Qpost_read_conversion))
 991           CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 992         else if (EQ (key, Qpre_write_conversion))
 993           CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 994 #ifdef UTF2000
 995         else if (EQ (key, Qdisable_composition))
 996           CODING_SYSTEM_DISABLE_COMPOSITION (codesys) = !NILP (value);
 997         else if (EQ (key, Quse_entity_reference))
 998           CODING_SYSTEM_USE_ENTITY_REFERENCE (codesys) = !NILP (value);
 999 #endif
1000 #ifdef MULE
1001         else if (ty == CODESYS_ISO2022)
1002           {
1003 #define FROB_INITIAL_CHARSET(charset_num) \
1004   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
1005     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
1006
1007             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1008             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1009             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
1010             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
1011
1012 #define FROB_FORCE_CHARSET(charset_num) \
1013   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
1014
1015             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
1016             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
1017             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
1018             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
1019
1020 #define FROB_BOOLEAN_PROPERTY(prop) \
1021   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
1022
1023             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
1024             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
1025             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
1026             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
1027             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
1028             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
1029             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
1030
1031             else if (EQ (key, Qinput_charset_conversion))
1032               {
1033                 codesys->iso2022.input_conv =
1034                   Dynarr_new (charset_conversion_spec);
1035                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
1036                                                 value);
1037               }
1038             else if (EQ (key, Qoutput_charset_conversion))
1039               {
1040                 codesys->iso2022.output_conv =
1041                   Dynarr_new (charset_conversion_spec);
1042                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
1043                                                 value);
1044               }
1045             else
1046               signal_simple_error ("Unrecognized property", key);
1047           }
1048 #ifdef UTF2000
1049         else if (ty == CODESYS_UTF8)
1050           {
1051             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1052             else if (EQ (key, Qcharset_g1))
1053               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 1) = value;
1054             else if (EQ (key, Qcharset_g2))
1055               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, 2) = value;
1056             else
1057               signal_simple_error ("Unrecognized property", key);
1058           }
1059         else if (ty == CODESYS_BIG5)
1060           {
1061             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
1062             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
1063             else
1064               signal_simple_error ("Unrecognized property", key);
1065           }
1066 #endif
1067         else if (EQ (type, Qccl))
1068           {
1069             Lisp_Object sym;
1070             struct ccl_program test_ccl;
1071             Extbyte *suffix;
1072
1073             /* Check key first.  */
1074             if (EQ (key, Qdecode))
1075               suffix = "-ccl-decode";
1076             else if (EQ (key, Qencode))
1077               suffix = "-ccl-encode";
1078             else
1079               signal_simple_error ("Unrecognized property", key);
1080
1081             /* If value is vector, register it as a ccl program
1082                associated with an newly created symbol for
1083                backward compatibility.  */
1084             if (VECTORP (value))
1085               {
1086                 sym = Fintern (concat2 (Fsymbol_name (name),
1087                                         build_string (suffix)),
1088                                Qnil);
1089                 Fregister_ccl_program (sym, value);
1090               }
1091             else
1092               {
1093                 CHECK_SYMBOL (value);
1094                 sym = value;
1095               }
1096             /* check if the given ccl programs are valid.  */
1097             if (setup_ccl_program (&test_ccl, sym) < 0)
1098               signal_simple_error ("Invalid CCL program", value);
1099
1100             if (EQ (key, Qdecode))
1101               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
1102             else if (EQ (key, Qencode))
1103               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1104
1105           }
1106 #endif /* MULE */
1107         else
1108           signal_simple_error ("Unrecognized property", key);
1109       }
1110   }
1111
1112   if (need_to_setup_eol_systems)
1113     setup_eol_coding_systems (codesys);
1114
1115   {
1116     Lisp_Object codesys_obj;
1117     XSETCODING_SYSTEM (codesys_obj, codesys);
1118     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1119     return codesys_obj;
1120   }
1121 }
1122
1123 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1124 Copy OLD-CODING-SYSTEM to NEW-NAME.
1125 If NEW-NAME does not name an existing coding system, a new one will
1126 be created.
1127 */
1128        (old_coding_system, new_name))
1129 {
1130   Lisp_Object new_coding_system;
1131   old_coding_system = Fget_coding_system (old_coding_system);
1132   new_coding_system = Ffind_coding_system (new_name);
1133   if (NILP (new_coding_system))
1134     {
1135       XSETCODING_SYSTEM (new_coding_system,
1136                          allocate_coding_system
1137                          (XCODING_SYSTEM_TYPE (old_coding_system),
1138                           new_name));
1139       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1140     }
1141
1142   {
1143     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1144     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1145     memcpy (((char *) to  ) + sizeof (to->header),
1146             ((char *) from) + sizeof (from->header),
1147             sizeof (*from) - sizeof (from->header));
1148     to->name = new_name;
1149   }
1150   return new_coding_system;
1151 }
1152
1153 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1154 Return t if OBJECT names a coding system, and is not a coding system alias.
1155 */
1156        (object))
1157 {
1158   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1159     ? Qt : Qnil;
1160 }
1161
1162 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1163 Return t if OBJECT is a coding system alias.
1164 All coding system aliases are created by `define-coding-system-alias'.
1165 */
1166        (object))
1167 {
1168   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1169     ? Qt : Qnil;
1170 }
1171
1172 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1173 Return the coding-system symbol for which symbol ALIAS is an alias.
1174 */
1175        (alias))
1176 {
1177   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1178   if (SYMBOLP (aliasee))
1179     return aliasee;
1180   else
1181     signal_simple_error ("Symbol is not a coding system alias", alias);
1182   return Qnil;          /* To keep the compiler happy */
1183 }
1184
1185 static Lisp_Object
1186 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1187 {
1188   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1189                   Qnil);
1190 }
1191
1192 /* A maphash function, for removing dangling coding system aliases. */
1193 static int
1194 dangling_coding_system_alias_p (Lisp_Object alias,
1195                                 Lisp_Object aliasee,
1196                                 void *dangling_aliases)
1197 {
1198   if (SYMBOLP (aliasee)
1199       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1200     {
1201       (*(int *) dangling_aliases)++;
1202       return 1;
1203     }
1204   else
1205     return 0;
1206 }
1207
1208 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1209 Define symbol ALIAS as an alias for coding system ALIASEE.
1210
1211 You can use this function to redefine an alias that has already been defined,
1212 but you cannot redefine a name which is the canonical name for a coding system.
1213 \(a canonical name of a coding system is what is returned when you call
1214 `coding-system-name' on a coding system).
1215
1216 ALIASEE itself can be an alias, which allows you to define nested aliases.
1217
1218 You are forbidden, however, from creating alias loops or `dangling' aliases.
1219 These will be detected, and an error will be signaled if you attempt to do so.
1220
1221 If ALIASEE is nil, then ALIAS will simply be undefined.
1222
1223 See also `coding-system-alias-p', `coding-system-aliasee',
1224 and `coding-system-canonical-name-p'.
1225 */
1226        (alias, aliasee))
1227 {
1228   Lisp_Object real_coding_system, probe;
1229
1230   CHECK_SYMBOL (alias);
1231
1232   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1233     signal_simple_error
1234       ("Symbol is the canonical name of a coding system and cannot be redefined",
1235        alias);
1236
1237   if (NILP (aliasee))
1238     {
1239       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1240       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1241       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1242
1243       Fremhash (alias, Vcoding_system_hash_table);
1244
1245       /* Undefine subsidiary aliases,
1246          presumably created by a previous call to this function */
1247       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1248           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1249           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1250         {
1251           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1252           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1253           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1254         }
1255
1256       /* Undefine dangling coding system aliases. */
1257       {
1258         int dangling_aliases;
1259
1260         do {
1261           dangling_aliases = 0;
1262           elisp_map_remhash (dangling_coding_system_alias_p,
1263                              Vcoding_system_hash_table,
1264                              &dangling_aliases);
1265         } while (dangling_aliases > 0);
1266       }
1267
1268       return Qnil;
1269     }
1270
1271   if (CODING_SYSTEMP (aliasee))
1272     aliasee = XCODING_SYSTEM_NAME (aliasee);
1273
1274   /* Checks that aliasee names a coding-system */
1275   real_coding_system = Fget_coding_system (aliasee);
1276
1277   /* Check for coding system alias loops */
1278   if (EQ (alias, aliasee))
1279     alias_loop: signal_simple_error_2
1280       ("Attempt to create a coding system alias loop", alias, aliasee);
1281
1282   for (probe = aliasee;
1283        SYMBOLP (probe);
1284        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1285     {
1286       if (EQ (probe, alias))
1287         goto alias_loop;
1288     }
1289
1290   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1291
1292   /* Set up aliases for subsidiaries.
1293      #### There must be a better way to handle subsidiary coding systems. */
1294   {
1295     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1296     int i;
1297     for (i = 0; i < countof (suffixes); i++)
1298       {
1299         Lisp_Object alias_subsidiary =
1300           append_suffix_to_symbol (alias, suffixes[i]);
1301         Lisp_Object aliasee_subsidiary =
1302           append_suffix_to_symbol (aliasee, suffixes[i]);
1303
1304         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1305           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1306       }
1307   }
1308   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1309      but it doesn't look intentional, so I'd rather return something
1310      meaningful or nothing at all. */
1311   return Qnil;
1312 }
1313
1314 static Lisp_Object
1315 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1316 {
1317   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1318   Lisp_Object new_coding_system;
1319
1320   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1321     return coding_system;
1322
1323   switch (type)
1324     {
1325     case EOL_AUTODETECT: return coding_system;
1326     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1327     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1328     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1329     default:       abort (); return Qnil;
1330     }
1331
1332   return NILP (new_coding_system) ? coding_system : new_coding_system;
1333 }
1334
1335 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1336 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1337 */
1338        (coding_system, eol_type))
1339 {
1340   coding_system = Fget_coding_system (coding_system);
1341
1342   return subsidiary_coding_system (coding_system,
1343                                    symbol_to_eol_type (eol_type));
1344 }
1345
1346 \f
1347 /************************************************************************/
1348 /*                         Coding system accessors                      */
1349 /************************************************************************/
1350
1351 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1352 Return the doc string for CODING-SYSTEM.
1353 */
1354        (coding_system))
1355 {
1356   coding_system = Fget_coding_system (coding_system);
1357   return XCODING_SYSTEM_DOC_STRING (coding_system);
1358 }
1359
1360 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1361 Return the type of CODING-SYSTEM.
1362 */
1363        (coding_system))
1364 {
1365   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1366     {
1367     default: abort ();
1368     case CODESYS_AUTODETECT:    return Qundecided;
1369 #ifdef MULE
1370     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1371     case CODESYS_ISO2022:       return Qiso2022;
1372     case CODESYS_BIG5:          return Qbig5;
1373     case CODESYS_UCS4:          return Qucs4;
1374     case CODESYS_UTF8:          return Qutf8;
1375     case CODESYS_CCL:           return Qccl;
1376 #endif
1377     case CODESYS_NO_CONVERSION: return Qno_conversion;
1378 #ifdef DEBUG_XEMACS
1379     case CODESYS_INTERNAL:      return Qinternal;
1380 #endif
1381     }
1382 }
1383
1384 #ifdef MULE
1385 static
1386 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1387 {
1388   Lisp_Object cs
1389     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1390
1391   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1392 }
1393
1394 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1395 Return initial charset of CODING-SYSTEM designated to GNUM.
1396 GNUM allows 0 .. 3.
1397 */
1398        (coding_system, gnum))
1399 {
1400   coding_system = Fget_coding_system (coding_system);
1401   CHECK_INT (gnum);
1402
1403   return coding_system_charset (coding_system, XINT (gnum));
1404 }
1405 #endif /* MULE */
1406
1407 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1408 Return the PROP property of CODING-SYSTEM.
1409 */
1410        (coding_system, prop))
1411 {
1412   int i, ok = 0;
1413   enum coding_system_type type;
1414
1415   coding_system = Fget_coding_system (coding_system);
1416   CHECK_SYMBOL (prop);
1417   type = XCODING_SYSTEM_TYPE (coding_system);
1418
1419   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1420     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1421       {
1422         ok = 1;
1423         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1424           {
1425           case CODESYS_PROP_ALL_OK:
1426             break;
1427 #ifdef MULE
1428           case CODESYS_PROP_ISO2022:
1429             if (type != CODESYS_ISO2022)
1430               signal_simple_error
1431                 ("Property only valid in ISO2022 coding systems",
1432                  prop);
1433             break;
1434
1435           case CODESYS_PROP_CCL:
1436             if (type != CODESYS_CCL)
1437               signal_simple_error
1438                 ("Property only valid in CCL coding systems",
1439                  prop);
1440             break;
1441 #endif /* MULE */
1442           default:
1443             abort ();
1444           }
1445       }
1446
1447   if (!ok)
1448     signal_simple_error ("Unrecognized property", prop);
1449
1450   if (EQ (prop, Qname))
1451     return XCODING_SYSTEM_NAME (coding_system);
1452   else if (EQ (prop, Qtype))
1453     return Fcoding_system_type (coding_system);
1454   else if (EQ (prop, Qdoc_string))
1455     return XCODING_SYSTEM_DOC_STRING (coding_system);
1456   else if (EQ (prop, Qmnemonic))
1457     return XCODING_SYSTEM_MNEMONIC (coding_system);
1458   else if (EQ (prop, Qeol_type))
1459     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1460   else if (EQ (prop, Qeol_lf))
1461     return XCODING_SYSTEM_EOL_LF (coding_system);
1462   else if (EQ (prop, Qeol_crlf))
1463     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1464   else if (EQ (prop, Qeol_cr))
1465     return XCODING_SYSTEM_EOL_CR (coding_system);
1466   else if (EQ (prop, Qpost_read_conversion))
1467     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1468   else if (EQ (prop, Qpre_write_conversion))
1469     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1470 #ifdef MULE
1471 #ifdef UTF2000
1472   else if (EQ (prop, Qdisable_composition))
1473     return XCODING_SYSTEM_DISABLE_COMPOSITION (coding_system) ? Qt : Qnil;
1474   else if (EQ (prop, Quse_entity_reference))
1475     return XCODING_SYSTEM_USE_ENTITY_REFERENCE (coding_system) ? Qt : Qnil;
1476 #endif
1477   else if (type == CODESYS_ISO2022)
1478     {
1479       if (EQ (prop, Qcharset_g0))
1480         return coding_system_charset (coding_system, 0);
1481       else if (EQ (prop, Qcharset_g1))
1482         return coding_system_charset (coding_system, 1);
1483       else if (EQ (prop, Qcharset_g2))
1484         return coding_system_charset (coding_system, 2);
1485       else if (EQ (prop, Qcharset_g3))
1486         return coding_system_charset (coding_system, 3);
1487
1488 #define FORCE_CHARSET(charset_num) \
1489   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1490    (coding_system, charset_num) ? Qt : Qnil)
1491
1492       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1493       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1494       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1495       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1496
1497 #define LISP_BOOLEAN(prop) \
1498   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1499
1500       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1501       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1502       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1503       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1504       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1505       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1506       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1507
1508       else if (EQ (prop, Qinput_charset_conversion))
1509         return
1510           unparse_charset_conversion_specs
1511             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1512       else if (EQ (prop, Qoutput_charset_conversion))
1513         return
1514           unparse_charset_conversion_specs
1515             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1516       else
1517         abort ();
1518     }
1519   else if (type == CODESYS_CCL)
1520     {
1521       if (EQ (prop, Qdecode))
1522         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1523       else if (EQ (prop, Qencode))
1524         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1525       else
1526         abort ();
1527     }
1528 #endif /* MULE */
1529   else
1530     abort ();
1531
1532   return Qnil; /* not reached */
1533 }
1534
1535 \f
1536 /************************************************************************/
1537 /*                       Coding category functions                      */
1538 /************************************************************************/
1539
1540 static int
1541 decode_coding_category (Lisp_Object symbol)
1542 {
1543   int i;
1544
1545   CHECK_SYMBOL (symbol);
1546   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1547     if (EQ (coding_category_symbol[i], symbol))
1548       return i;
1549
1550   signal_simple_error ("Unrecognized coding category", symbol);
1551   return 0; /* not reached */
1552 }
1553
1554 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1555 Return a list of all recognized coding categories.
1556 */
1557        ())
1558 {
1559   int i;
1560   Lisp_Object list = Qnil;
1561
1562   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1563     list = Fcons (coding_category_symbol[i], list);
1564   return list;
1565 }
1566
1567 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1568 Change the priority order of the coding categories.
1569 LIST should be list of coding categories, in descending order of
1570 priority.  Unspecified coding categories will be lower in priority
1571 than all specified ones, in the same relative order they were in
1572 previously.
1573 */
1574        (list))
1575 {
1576   int category_to_priority[CODING_CATEGORY_LAST];
1577   int i, j;
1578   Lisp_Object rest;
1579
1580   /* First generate a list that maps coding categories to priorities. */
1581
1582   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1583     category_to_priority[i] = -1;
1584
1585   /* Highest priority comes from the specified list. */
1586   i = 0;
1587   EXTERNAL_LIST_LOOP (rest, list)
1588     {
1589       int cat = decode_coding_category (XCAR (rest));
1590
1591       if (category_to_priority[cat] >= 0)
1592         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1593       category_to_priority[cat] = i++;
1594     }
1595
1596   /* Now go through the existing categories by priority to retrieve
1597      the categories not yet specified and preserve their priority
1598      order. */
1599   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1600     {
1601       int cat = fcd->coding_category_by_priority[j];
1602       if (category_to_priority[cat] < 0)
1603         category_to_priority[cat] = i++;
1604     }
1605
1606   /* Now we need to construct the inverse of the mapping we just
1607      constructed. */
1608
1609   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1610     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1611
1612   /* Phew!  That was confusing. */
1613   return Qnil;
1614 }
1615
1616 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1617 Return a list of coding categories in descending order of priority.
1618 */
1619        ())
1620 {
1621   int i;
1622   Lisp_Object list = Qnil;
1623
1624   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1625     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1626                   list);
1627   return list;
1628 }
1629
1630 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1631 Change the coding system associated with a coding category.
1632 */
1633        (coding_category, coding_system))
1634 {
1635   int cat = decode_coding_category (coding_category);
1636
1637   coding_system = Fget_coding_system (coding_system);
1638   fcd->coding_category_system[cat] = coding_system;
1639   return Qnil;
1640 }
1641
1642 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1643 Return the coding system associated with a coding category.
1644 */
1645        (coding_category))
1646 {
1647   int cat = decode_coding_category (coding_category);
1648   Lisp_Object sys = fcd->coding_category_system[cat];
1649
1650   if (!NILP (sys))
1651     return XCODING_SYSTEM_NAME (sys);
1652   return Qnil;
1653 }
1654
1655 \f
1656 /************************************************************************/
1657 /*                     Detecting the encoding of data                   */
1658 /************************************************************************/
1659
1660 struct detection_state
1661 {
1662   eol_type_t eol_type;
1663   int seen_non_ascii;
1664   int mask;
1665 #ifdef MULE
1666   struct
1667     {
1668       int mask;
1669       int in_second_byte;
1670     }
1671   big5;
1672
1673   struct
1674     {
1675       int mask;
1676       int in_second_byte;
1677     }
1678   shift_jis;
1679
1680   struct
1681     {
1682       int mask;
1683       int in_byte;
1684   }
1685   ucs4;
1686
1687   struct
1688     {
1689       int mask;
1690       int in_byte;
1691     }
1692   utf8;
1693
1694   struct
1695     {
1696       int mask;
1697       int initted;
1698       struct iso2022_decoder iso;
1699       unsigned int flags;
1700       int high_byte_count;
1701       unsigned int saw_single_shift:1;
1702     }
1703   iso2022;
1704 #endif
1705   struct
1706     {
1707       int seen_anything;
1708       int just_saw_cr;
1709     }
1710   eol;
1711 };
1712
1713 static int
1714 acceptable_control_char_p (int c)
1715 {
1716   switch (c)
1717     {
1718       /* Allow and ignore control characters that you might
1719          reasonably see in a text file */
1720     case '\r':
1721     case '\n':
1722     case '\t':
1723     case  7: /* bell */
1724     case  8: /* backspace */
1725     case 11: /* vertical tab */
1726     case 12: /* form feed */
1727     case 26: /* MS-DOS C-z junk */
1728     case 31: /* '^_' -- for info */
1729       return 1;
1730     default:
1731       return 0;
1732     }
1733 }
1734
1735 static int
1736 mask_has_at_most_one_bit_p (int mask)
1737 {
1738   /* Perhaps the only thing useful you learn from intensive Microsoft
1739      technical interviews */
1740   return (mask & (mask - 1)) == 0;
1741 }
1742
1743 static eol_type_t
1744 detect_eol_type (struct detection_state *st, const Extbyte *src,
1745                  size_t n)
1746 {
1747   while (n--)
1748     {
1749       unsigned char c = *(unsigned char *)src++;
1750       if (c == '\n')
1751         {
1752           if (st->eol.just_saw_cr)
1753             return EOL_CRLF;
1754           else if (st->eol.seen_anything)
1755             return EOL_LF;
1756         }
1757       else if (st->eol.just_saw_cr)
1758         return EOL_CR;
1759       else if (c == '\r')
1760         st->eol.just_saw_cr = 1;
1761       else
1762         st->eol.just_saw_cr = 0;
1763       st->eol.seen_anything = 1;
1764     }
1765
1766   return EOL_AUTODETECT;
1767 }
1768
1769 /* Attempt to determine the encoding and EOL type of the given text.
1770    Before calling this function for the first type, you must initialize
1771    st->eol_type as appropriate and initialize st->mask to ~0.
1772
1773    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1774    not yet known.
1775
1776    st->mask holds the determined coding category mask, or ~0 if only
1777    ASCII has been seen so far.
1778
1779    Returns:
1780
1781    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1782         is present in st->mask
1783    1 == definitive answers are here for both st->eol_type and st->mask
1784 */
1785
1786 static int
1787 detect_coding_type (struct detection_state *st, const Extbyte *src,
1788                     size_t n, int just_do_eol)
1789 {
1790   if (st->eol_type == EOL_AUTODETECT)
1791     st->eol_type = detect_eol_type (st, src, n);
1792
1793   if (just_do_eol)
1794     return st->eol_type != EOL_AUTODETECT;
1795
1796   if (!st->seen_non_ascii)
1797     {
1798       for (; n; n--, src++)
1799         {
1800           unsigned char c = *(unsigned char *) src;
1801           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1802             {
1803               st->seen_non_ascii = 1;
1804 #ifdef MULE
1805               st->shift_jis.mask = ~0;
1806               st->big5.mask = ~0;
1807               st->ucs4.mask = ~0;
1808               st->utf8.mask = ~0;
1809               st->iso2022.mask = ~0;
1810 #endif
1811               break;
1812             }
1813         }
1814     }
1815
1816   if (!n)
1817     return 0;
1818 #ifdef MULE
1819   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1820     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1821   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1822     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1823   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1824     st->big5.mask = detect_coding_big5 (st, src, n);
1825   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1826     st->utf8.mask = detect_coding_utf8 (st, src, n);
1827   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1828     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1829
1830   st->mask
1831     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1832     | st->utf8.mask | st->ucs4.mask;
1833 #endif
1834   {
1835     int retval = mask_has_at_most_one_bit_p (st->mask);
1836     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1837     return retval && st->eol_type != EOL_AUTODETECT;
1838   }
1839 }
1840
1841 static Lisp_Object
1842 coding_system_from_mask (int mask)
1843 {
1844   if (mask == ~0)
1845     {
1846       /* If the file was entirely or basically ASCII, use the
1847          default value of `buffer-file-coding-system'. */
1848       Lisp_Object retval =
1849         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1850       if (!NILP (retval))
1851         {
1852           retval = Ffind_coding_system (retval);
1853           if (NILP (retval))
1854             {
1855               warn_when_safe
1856                 (Qbad_variable, Qwarning,
1857                  "Invalid `default-buffer-file-coding-system', set to nil");
1858               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1859             }
1860         }
1861       if (NILP (retval))
1862         retval = Fget_coding_system (Qraw_text);
1863       return retval;
1864     }
1865   else
1866     {
1867       int i;
1868       int cat = -1;
1869 #ifdef MULE
1870       mask = postprocess_iso2022_mask (mask);
1871 #endif
1872       /* Look through the coding categories by priority and find
1873          the first one that is allowed. */
1874       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1875         {
1876           cat = fcd->coding_category_by_priority[i];
1877           if ((mask & (1 << cat)) &&
1878               !NILP (fcd->coding_category_system[cat]))
1879             break;
1880         }
1881       if (cat >= 0)
1882         return fcd->coding_category_system[cat];
1883       else
1884         return Fget_coding_system (Qraw_text);
1885     }
1886 }
1887
1888 /* Given a seekable read stream and potential coding system and EOL type
1889    as specified, do any autodetection that is called for.  If the
1890    coding system and/or EOL type are not `autodetect', they will be left
1891    alone; but this function will never return an autodetect coding system
1892    or EOL type.
1893
1894    This function does not automatically fetch subsidiary coding systems;
1895    that should be unnecessary with the explicit eol-type argument. */
1896
1897 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1898
1899 void
1900 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1901                               eol_type_t *eol_type_in_out)
1902 {
1903   struct detection_state decst;
1904
1905   if (*eol_type_in_out == EOL_AUTODETECT)
1906     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1907
1908   xzero (decst);
1909   decst.eol_type = *eol_type_in_out;
1910   decst.mask = ~0;
1911
1912   /* If autodetection is called for, do it now. */
1913   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1914       || *eol_type_in_out == EOL_AUTODETECT)
1915     {
1916       Extbyte buf[4096];
1917       Lisp_Object coding_system = Qnil;
1918       Extbyte *p;
1919       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1920       Extbyte *scan_end;
1921
1922       /* Look for initial "-*-"; mode line prefix */
1923       for (p = buf,
1924              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1925            p <= scan_end
1926              && *p != '\n'
1927              && *p != '\r';
1928            p++)
1929         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1930           {
1931             Extbyte *local_vars_beg = p + 3;
1932             /* Look for final "-*-"; mode line suffix */
1933             for (p = local_vars_beg,
1934                    scan_end = buf + nread - LENGTH ("-*-");
1935                  p <= scan_end
1936                    && *p != '\n'
1937                    && *p != '\r';
1938                  p++)
1939               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1940                 {
1941                   Extbyte *suffix = p;
1942                   /* Look for "coding:" */
1943                   for (p = local_vars_beg,
1944                          scan_end = suffix - LENGTH ("coding:?");
1945                        p <= scan_end;
1946                        p++)
1947                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1948                         && (p == local_vars_beg
1949                             || (*(p-1) == ' '  ||
1950                                 *(p-1) == '\t' ||
1951                                 *(p-1) == ';')))
1952                       {
1953                         Extbyte save;
1954                         int n;
1955                         p += LENGTH ("coding:");
1956                         while (*p == ' ' || *p == '\t') p++;
1957
1958                         /* Get coding system name */
1959                         save = *suffix; *suffix = '\0';
1960                         /* Characters valid in a MIME charset name (rfc 1521),
1961                            and in a Lisp symbol name. */
1962                         n = strspn ( (char *) p,
1963                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1964                                     "abcdefghijklmnopqrstuvwxyz"
1965                                     "0123456789"
1966                                     "!$%&*+-.^_{|}~");
1967                         *suffix = save;
1968                         if (n > 0)
1969                           {
1970                             save = p[n]; p[n] = '\0';
1971                             coding_system =
1972                               Ffind_coding_system (intern ((char *) p));
1973                             p[n] = save;
1974                           }
1975                         break;
1976                       }
1977                   break;
1978                 }
1979             break;
1980           }
1981
1982       if (NILP (coding_system))
1983         do
1984           {
1985             if (detect_coding_type (&decst, buf, nread,
1986                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1987                                     != CODESYS_AUTODETECT))
1988               break;
1989             nread = Lstream_read (stream, buf, sizeof (buf));
1990             if (nread == 0)
1991               break;
1992           }
1993         while (1);
1994
1995       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1996                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1997         do
1998           {
1999             if (detect_coding_type (&decst, buf, nread, 1))
2000               break;
2001             nread = Lstream_read (stream, buf, sizeof (buf));
2002             if (!nread)
2003               break;
2004           }
2005         while (1);
2006
2007       *eol_type_in_out = decst.eol_type;
2008       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
2009         {
2010           if (NILP (coding_system))
2011             *codesys_in_out = coding_system_from_mask (decst.mask);
2012           else
2013             *codesys_in_out = coding_system;
2014         }
2015     }
2016
2017   /* If we absolutely can't determine the EOL type, just assume LF. */
2018   if (*eol_type_in_out == EOL_AUTODETECT)
2019     *eol_type_in_out = EOL_LF;
2020
2021   Lstream_rewind (stream);
2022 }
2023
2024 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
2025 Detect coding system of the text in the region between START and END.
2026 Return a list of possible coding systems ordered by priority.
2027 If only ASCII characters are found, return 'undecided or one of
2028 its subsidiary coding systems according to a detected end-of-line
2029 type.  Optional arg BUFFER defaults to the current buffer.
2030 */
2031        (start, end, buffer))
2032 {
2033   Lisp_Object val = Qnil;
2034   struct buffer *buf = decode_buffer (buffer, 0);
2035   Bufpos b, e;
2036   Lisp_Object instream, lb_instream;
2037   Lstream *istr, *lb_istr;
2038   struct detection_state decst;
2039   struct gcpro gcpro1, gcpro2;
2040
2041   get_buffer_range_char (buf, start, end, &b, &e, 0);
2042   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2043   lb_istr = XLSTREAM (lb_instream);
2044   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
2045   istr = XLSTREAM (instream);
2046   GCPRO2 (instream, lb_instream);
2047   xzero (decst);
2048   decst.eol_type = EOL_AUTODETECT;
2049   decst.mask = ~0;
2050   while (1)
2051     {
2052       Extbyte random_buffer[4096];
2053       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
2054
2055       if (!nread)
2056         break;
2057       if (detect_coding_type (&decst, random_buffer, nread, 0))
2058         break;
2059     }
2060
2061   if (decst.mask == ~0)
2062     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
2063                                     decst.eol_type);
2064   else
2065     {
2066       int i;
2067
2068       val = Qnil;
2069 #ifdef MULE
2070       decst.mask = postprocess_iso2022_mask (decst.mask);
2071 #endif
2072       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
2073         {
2074           int sys = fcd->coding_category_by_priority[i];
2075           if (decst.mask & (1 << sys))
2076             {
2077               Lisp_Object codesys = fcd->coding_category_system[sys];
2078               if (!NILP (codesys))
2079                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
2080               val = Fcons (codesys, val);
2081             }
2082         }
2083     }
2084   Lstream_close (istr);
2085   UNGCPRO;
2086   Lstream_delete (istr);
2087   Lstream_delete (lb_istr);
2088   return val;
2089 }
2090
2091 \f
2092 /************************************************************************/
2093 /*           Converting to internal Mule format ("decoding")            */
2094 /************************************************************************/
2095
2096 /* A decoding stream is a stream used for decoding text (i.e.
2097    converting from some external format to internal format).
2098    The decoding-stream object keeps track of the actual coding
2099    stream, the stream that is at the other end, and data that
2100    needs to be persistent across the lifetime of the stream. */
2101
2102 /* Handle the EOL stuff related to just-read-in character C.
2103    EOL_TYPE is the EOL type of the coding stream.
2104    FLAGS is the current value of FLAGS in the coding stream, and may
2105    be modified by this macro.  (The macro only looks at the
2106    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
2107    bytes are to be written.  You need to also define a local goto
2108    label "label_continue_loop" that is at the end of the main
2109    character-reading loop.
2110
2111    If C is a CR character, then this macro handles it entirely and
2112    jumps to label_continue_loop.  Otherwise, this macro does not add
2113    anything to DST, and continues normally.  You should continue
2114    processing C normally after this macro. */
2115
2116 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2117 do {                                                            \
2118   if (c == '\r')                                                \
2119     {                                                           \
2120       if (eol_type == EOL_CR)                                   \
2121         Dynarr_add (dst, '\n');                                 \
2122       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2123         Dynarr_add (dst, c);                                    \
2124       else                                                      \
2125         flags |= CODING_STATE_CR;                               \
2126       goto label_continue_loop;                                 \
2127     }                                                           \
2128   else if (flags & CODING_STATE_CR)                             \
2129     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2130       if (c != '\n')                                            \
2131         Dynarr_add (dst, '\r');                                 \
2132       flags &= ~CODING_STATE_CR;                                \
2133     }                                                           \
2134 } while (0)
2135
2136 /* C should be a binary character in the range 0 - 255; convert
2137    to internal format and add to Dynarr DST. */
2138
2139 #ifdef UTF2000
2140 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2141 do {                                            \
2142   if (BYTE_ASCII_P (c))                         \
2143     Dynarr_add (dst, c);                        \
2144   else                                          \
2145     {                                           \
2146       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2147       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2148     }                                           \
2149 } while (0)
2150
2151 INLINE_HEADER void DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst);
2152 INLINE_HEADER void
2153 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2154 {
2155   if ( c <= 0x7f )
2156     {
2157       Dynarr_add (dst, c);
2158     }
2159   else if ( c <= 0x7ff )
2160     {
2161       Dynarr_add (dst, (c >> 6) | 0xc0);
2162       Dynarr_add (dst, (c & 0x3f) | 0x80);
2163     }
2164   else if ( c <= 0xffff )
2165     {
2166       Dynarr_add (dst,  (c >> 12) | 0xe0);
2167       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2168       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2169     }
2170   else if ( c <= 0x1fffff )
2171     {
2172       Dynarr_add (dst,  (c >> 18) | 0xf0);
2173       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2174       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2175       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2176     }
2177   else if ( c <= 0x3ffffff )
2178     {
2179       Dynarr_add (dst,  (c >> 24) | 0xf8);
2180       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2181       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2182       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2183       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2184     }
2185   else
2186     {
2187       Dynarr_add (dst,  (c >> 30) | 0xfc);
2188       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2189       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2190       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2191       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2192       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2193     }
2194 }
2195 #else
2196 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2197 do {                                            \
2198   if (BYTE_ASCII_P (c))                         \
2199     Dynarr_add (dst, c);                        \
2200   else if (BYTE_C1_P (c))                       \
2201     {                                           \
2202       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2203       Dynarr_add (dst, c + 0x20);               \
2204     }                                           \
2205   else                                          \
2206     {                                           \
2207       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2208       Dynarr_add (dst, c);                      \
2209     }                                           \
2210 } while (0)
2211 #endif
2212
2213 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2214 do {                                    \
2215   if (ch)                               \
2216     {                                   \
2217       DECODE_ADD_BINARY_CHAR (ch, dst); \
2218       ch = 0;                           \
2219     }                                   \
2220 } while (0)
2221
2222 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2223 do {                                    \
2224   if (flags & CODING_STATE_END)         \
2225     {                                   \
2226       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2227       if (flags & CODING_STATE_CR)      \
2228         Dynarr_add (dst, '\r');         \
2229     }                                   \
2230 } while (0)
2231
2232 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2233
2234 struct decoding_stream
2235 {
2236   /* Coding system that governs the conversion. */
2237   Lisp_Coding_System *codesys;
2238
2239   /* Stream that we read the encoded data from or
2240      write the decoded data to. */
2241   Lstream *other_end;
2242
2243   /* If we are reading, then we can return only a fixed amount of
2244      data, so if the conversion resulted in too much data, we store it
2245      here for retrieval the next time around. */
2246   unsigned_char_dynarr *runoff;
2247
2248   /* FLAGS holds flags indicating the current state of the decoding.
2249      Some of these flags are dependent on the coding system. */
2250   unsigned int flags;
2251
2252   /* CPOS holds a partially built-up code-point of character. */
2253   unsigned int cpos;
2254
2255   /* EOL_TYPE specifies the type of end-of-line conversion that
2256      currently applies.  We need to keep this separate from the
2257      EOL type stored in CODESYS because the latter might indicate
2258      automatic EOL-type detection while the former will always
2259      indicate a particular EOL type. */
2260   eol_type_t eol_type;
2261 #ifdef MULE
2262   /* Additional ISO2022 information.  We define the structure above
2263      because it's also needed by the detection routines. */
2264   struct iso2022_decoder iso2022;
2265
2266   /* Additional information (the state of the running CCL program)
2267      used by the CCL decoder. */
2268   struct ccl_program ccl;
2269
2270   /* counter for UTF-8 or UCS-4 */
2271   unsigned char counter;
2272 #endif
2273 #ifdef UTF2000
2274   unsigned char er_counter;
2275   unsigned char er_buf[16];
2276
2277   unsigned combined_char_count;
2278   Emchar combined_chars[16];
2279   Lisp_Object combining_table;
2280 #endif
2281   struct detection_state decst;
2282 };
2283
2284 #ifdef UTF2000
2285 extern Lisp_Object Qcomposition;
2286
2287 INLINE_HEADER void
2288 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst);
2289 INLINE_HEADER void
2290 decode_flush_er_chars (struct decoding_stream *str, unsigned_char_dynarr* dst)
2291 {
2292   if ( str->er_counter > 0)
2293     {
2294       Dynarr_add_many (dst, str->er_buf, str->er_counter);
2295       str->er_counter = 0;
2296     }
2297 }
2298
2299 void decode_add_er_char (struct decoding_stream *str, Emchar character,
2300                          unsigned_char_dynarr* dst);
2301 void
2302 decode_add_er_char (struct decoding_stream *str, Emchar c,
2303                     unsigned_char_dynarr* dst)
2304 {
2305   if (str->er_counter == 0)
2306     {
2307       if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys)
2308           && (c == '&') )
2309         {
2310           str->er_buf[0] = '&';
2311           str->er_counter++;
2312         }
2313       else
2314         DECODE_ADD_UCS_CHAR (c, dst);
2315     }
2316   else if (c == ';')
2317     {
2318       Lisp_Object string = make_string (str->er_buf,
2319                                         str->er_counter);
2320       Lisp_Object rest = Vcoded_charset_entity_reference_alist;
2321       Lisp_Object cell;
2322       Lisp_Object ret;
2323       Lisp_Object pat;
2324       Lisp_Object ccs;
2325       int base;
2326
2327       while (!NILP (rest))
2328         {
2329           cell = Fcar (rest);
2330           ccs = Fcar (cell);
2331           if (NILP (ccs = Ffind_charset (ccs)))
2332             continue;
2333
2334           cell = Fcdr (cell);
2335           ret = Fcar (cell);
2336           if (STRINGP (ret))
2337             pat = ret;
2338           else
2339             continue;
2340
2341           cell = Fcdr (cell);
2342           cell = Fcdr (cell);
2343           ret = Fcar (cell);
2344           if (EQ (ret, Qd))
2345             {
2346               pat = concat3 (build_string ("^&"),
2347                              pat, build_string ("\\([0-9]+\\)$"));
2348               base = 10;
2349             }
2350           else if (EQ (ret, Qx))
2351             {
2352               pat = concat3 (build_string ("^&"),
2353                              pat, build_string ("\\([0-9a-f]+\\)$"));
2354               base = 16;
2355             }
2356           else if (EQ (ret, QX))
2357             {
2358               pat = concat3 (build_string ("^&"),
2359                              pat, build_string ("\\([0-9A-F]+\\)$"));
2360               base = 16;
2361             }
2362           else
2363             continue;
2364
2365           if (!NILP (Fstring_match (pat, string, Qnil, Qnil)))
2366             {
2367               int code
2368                 = XINT (Fstring_to_number
2369                         (Fsubstring (string,
2370                                      Fmatch_beginning (make_int (1)),
2371                                      Fmatch_end (make_int (1))),
2372                          make_int (base)));
2373
2374               DECODE_ADD_UCS_CHAR (DECODE_CHAR (ccs, code), dst);
2375               goto decoded;
2376             }
2377           rest = Fcdr (rest);
2378         }
2379       if (!NILP (Fstring_match (build_string ("^&MCS-\\([0-9A-F]+\\)$"),
2380                                 string, Qnil, Qnil)))
2381         {
2382           int code
2383             = XINT (Fstring_to_number
2384                     (Fsubstring (string,
2385                                  Fmatch_beginning (make_int (1)),
2386                                  Fmatch_end (make_int (1))),
2387                      make_int (16)));
2388
2389           DECODE_ADD_UCS_CHAR (code, dst);
2390         }
2391       else
2392         {
2393           Dynarr_add_many (dst, str->er_buf, str->er_counter);
2394           Dynarr_add (dst, ';');
2395         }
2396     decoded:
2397       str->er_counter = 0;
2398     }
2399   else if ( (str->er_counter >= 16) || (c >= 0x7F) )
2400     {
2401       Dynarr_add_many (dst, str->er_buf, str->er_counter);
2402       str->er_counter = 0;
2403       DECODE_ADD_UCS_CHAR (c, dst);
2404     }
2405   else
2406     str->er_buf[str->er_counter++] = c;
2407 }
2408
2409 INLINE_HEADER void
2410 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst);
2411 INLINE_HEADER void
2412 COMPOSE_FLUSH_CHARS (struct decoding_stream *str, unsigned_char_dynarr* dst)
2413 {
2414   unsigned i;
2415
2416   for (i = 0; i < str->combined_char_count; i++)
2417     decode_add_er_char (str, str->combined_chars[i], dst);
2418   str->combined_char_count = 0;
2419   str->combining_table = Qnil;
2420 }
2421
2422 void COMPOSE_ADD_CHAR (struct decoding_stream *str, Emchar character,
2423                        unsigned_char_dynarr* dst);
2424 void
2425 COMPOSE_ADD_CHAR (struct decoding_stream *str,
2426                   Emchar character, unsigned_char_dynarr* dst)
2427 {
2428   if (CODING_SYSTEM_DISABLE_COMPOSITION (str->codesys))
2429     decode_add_er_char (str, character, dst);
2430   else if (!CONSP (str->combining_table))
2431     {
2432       Lisp_Object ret
2433         = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2434
2435       if (NILP (ret))
2436         decode_add_er_char (str, character, dst);
2437       else
2438         {
2439           str->combined_chars[0] = character;
2440           str->combined_char_count = 1;
2441           str->combining_table = ret;
2442         }
2443     }
2444   else
2445     {
2446       Lisp_Object ret = Fcdr (Fassq (make_char (character), str->combining_table));
2447
2448       if (CHARP (ret))
2449         {
2450           Emchar char2 = XCHARVAL (ret);
2451           ret = Fget_char_attribute (make_char (character), Qcomposition, Qnil);
2452           if (NILP (ret))
2453             {
2454               decode_add_er_char (str, character, dst);
2455               str->combined_char_count = 0;
2456               str->combining_table = Qnil;
2457             }
2458           else
2459             {
2460               str->combined_chars[0] = char2;
2461               str->combined_char_count = 1;
2462               str->combining_table = ret;
2463             }
2464         }
2465       else
2466         {
2467           COMPOSE_FLUSH_CHARS (str, dst);
2468           decode_add_er_char (str, character, dst);
2469         }
2470     }
2471 }
2472 #else /* not UTF2000 */
2473 #define COMPOSE_FLUSH_CHARS(str, dst)
2474 #define COMPOSE_ADD_CHAR(str, ch, dst) DECODE_ADD_UCS_CHAR (ch, dst)
2475 #endif /* UTF2000 */
2476
2477 static ssize_t decoding_reader (Lstream *stream,
2478                                 unsigned char *data, size_t size);
2479 static ssize_t decoding_writer (Lstream *stream,
2480                                 const unsigned char *data, size_t size);
2481 static int decoding_rewinder   (Lstream *stream);
2482 static int decoding_seekable_p (Lstream *stream);
2483 static int decoding_flusher    (Lstream *stream);
2484 static int decoding_closer     (Lstream *stream);
2485
2486 static Lisp_Object decoding_marker (Lisp_Object stream);
2487
2488 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2489                                sizeof (struct decoding_stream));
2490
2491 static Lisp_Object
2492 decoding_marker (Lisp_Object stream)
2493 {
2494   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2495   Lisp_Object str_obj;
2496
2497   /* We do not need to mark the coding systems or charsets stored
2498      within the stream because they are stored in a global list
2499      and automatically marked. */
2500
2501   XSETLSTREAM (str_obj, str);
2502   mark_object (str_obj);
2503   if (str->imp->marker)
2504     return (str->imp->marker) (str_obj);
2505   else
2506     return Qnil;
2507 }
2508
2509 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2510    so we read data from the other end, decode it, and store it into DATA. */
2511
2512 static ssize_t
2513 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2514 {
2515   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2516   unsigned char *orig_data = data;
2517   ssize_t read_size;
2518   int error_occurred = 0;
2519
2520   /* We need to interface to mule_decode(), which expects to take some
2521      amount of data and store the result into a Dynarr.  We have
2522      mule_decode() store into str->runoff, and take data from there
2523      as necessary. */
2524
2525   /* We loop until we have enough data, reading chunks from the other
2526      end and decoding it. */
2527   while (1)
2528     {
2529       /* Take data from the runoff if we can.  Make sure to take at
2530          most SIZE bytes, and delete the data from the runoff. */
2531       if (Dynarr_length (str->runoff) > 0)
2532         {
2533           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2534           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2535           Dynarr_delete_many (str->runoff, 0, chunk);
2536           data += chunk;
2537           size -= chunk;
2538         }
2539
2540       if (size == 0)
2541         break; /* No more room for data */
2542
2543       if (str->flags & CODING_STATE_END)
2544         /* This means that on the previous iteration, we hit the EOF on
2545            the other end.  We loop once more so that mule_decode() can
2546            output any final stuff it may be holding, or any "go back
2547            to a sane state" escape sequences. (This latter makes sense
2548            during encoding.) */
2549         break;
2550
2551       /* Exhausted the runoff, so get some more.  DATA has at least
2552          SIZE bytes left of storage in it, so it's OK to read directly
2553          into it.  (We'll be overwriting above, after we've decoded it
2554          into the runoff.) */
2555       read_size = Lstream_read (str->other_end, data, size);
2556       if (read_size < 0)
2557         {
2558           error_occurred = 1;
2559           break;
2560         }
2561       if (read_size == 0)
2562         /* There might be some more end data produced in the translation.
2563            See the comment above. */
2564         str->flags |= CODING_STATE_END;
2565       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2566     }
2567
2568   if (data - orig_data == 0)
2569     return error_occurred ? -1 : 0;
2570   else
2571     return data - orig_data;
2572 }
2573
2574 static ssize_t
2575 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2576 {
2577   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2578   ssize_t retval;
2579
2580   /* Decode all our data into the runoff, and then attempt to write
2581      it all out to the other end.  Remove whatever chunk we succeeded
2582      in writing. */
2583   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2584   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2585                           Dynarr_length (str->runoff));
2586   if (retval > 0)
2587     Dynarr_delete_many (str->runoff, 0, retval);
2588   /* Do NOT return retval.  The return value indicates how much
2589      of the incoming data was written, not how many bytes were
2590      written. */
2591   return size;
2592 }
2593
2594 static void
2595 reset_decoding_stream (struct decoding_stream *str)
2596 {
2597 #ifdef MULE
2598   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2599     {
2600       Lisp_Object coding_system;
2601       XSETCODING_SYSTEM (coding_system, str->codesys);
2602       reset_iso2022 (coding_system, &str->iso2022);
2603     }
2604   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2605     {
2606       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2607     }
2608   str->counter = 0;
2609 #endif /* MULE */
2610 #ifdef UTF2000
2611   str->er_counter = 0;
2612   str->combined_char_count = 0;
2613   str->combining_table = Qnil;
2614 #endif
2615   str->flags = str->cpos = 0;
2616 }
2617
2618 static int
2619 decoding_rewinder (Lstream *stream)
2620 {
2621   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2622   reset_decoding_stream (str);
2623   Dynarr_reset (str->runoff);
2624   return Lstream_rewind (str->other_end);
2625 }
2626
2627 static int
2628 decoding_seekable_p (Lstream *stream)
2629 {
2630   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2631   return Lstream_seekable_p (str->other_end);
2632 }
2633
2634 static int
2635 decoding_flusher (Lstream *stream)
2636 {
2637   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2638   return Lstream_flush (str->other_end);
2639 }
2640
2641 static int
2642 decoding_closer (Lstream *stream)
2643 {
2644   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2645   if (stream->flags & LSTREAM_FL_WRITE)
2646     {
2647       str->flags |= CODING_STATE_END;
2648       decoding_writer (stream, 0, 0);
2649     }
2650   Dynarr_free (str->runoff);
2651 #ifdef MULE
2652 #ifdef ENABLE_COMPOSITE_CHARS
2653   if (str->iso2022.composite_chars)
2654     Dynarr_free (str->iso2022.composite_chars);
2655 #endif
2656 #endif
2657   return Lstream_close (str->other_end);
2658 }
2659
2660 Lisp_Object
2661 decoding_stream_coding_system (Lstream *stream)
2662 {
2663   Lisp_Object coding_system;
2664   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2665
2666   XSETCODING_SYSTEM (coding_system, str->codesys);
2667   return subsidiary_coding_system (coding_system, str->eol_type);
2668 }
2669
2670 void
2671 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2672 {
2673   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2674   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2675   str->codesys = cs;
2676   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2677     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2678   reset_decoding_stream (str);
2679 }
2680
2681 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2682    stream for writing, no automatic code detection will be performed.
2683    The reason for this is that automatic code detection requires a
2684    seekable input.  Things will also fail if you open a decoding
2685    stream for reading using a non-fully-specified coding system and
2686    a non-seekable input stream. */
2687
2688 static Lisp_Object
2689 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2690                         const char *mode)
2691 {
2692   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2693   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2694   Lisp_Object obj;
2695
2696   xzero (*str);
2697   str->other_end = stream;
2698   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2699   str->eol_type = EOL_AUTODETECT;
2700   if (!strcmp (mode, "r")
2701       && Lstream_seekable_p (stream))
2702     /* We can determine the coding system now. */
2703     determine_real_coding_system (stream, &codesys, &str->eol_type);
2704   set_decoding_stream_coding_system (lstr, codesys);
2705   str->decst.eol_type = str->eol_type;
2706   str->decst.mask = ~0;
2707   XSETLSTREAM (obj, lstr);
2708   return obj;
2709 }
2710
2711 Lisp_Object
2712 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2713 {
2714   return make_decoding_stream_1 (stream, codesys, "r");
2715 }
2716
2717 Lisp_Object
2718 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2719 {
2720   return make_decoding_stream_1 (stream, codesys, "w");
2721 }
2722
2723 /* Note: the decode_coding_* functions all take the same
2724    arguments as mule_decode(), which is to say some SRC data of
2725    size N, which is to be stored into dynamic array DST.
2726    DECODING is the stream within which the decoding is
2727    taking place, but no data is actually read from or
2728    written to that stream; that is handled in decoding_reader()
2729    or decoding_writer().  This allows the same functions to
2730    be used for both reading and writing. */
2731
2732 static void
2733 mule_decode (Lstream *decoding, const Extbyte *src,
2734              unsigned_char_dynarr *dst, size_t n)
2735 {
2736   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2737
2738   /* If necessary, do encoding-detection now.  We do this when
2739      we're a writing stream or a non-seekable reading stream,
2740      meaning that we can't just process the whole input,
2741      rewind, and start over. */
2742
2743   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2744       str->eol_type == EOL_AUTODETECT)
2745     {
2746       Lisp_Object codesys;
2747
2748       XSETCODING_SYSTEM (codesys, str->codesys);
2749       detect_coding_type (&str->decst, src, n,
2750                           CODING_SYSTEM_TYPE (str->codesys) !=
2751                           CODESYS_AUTODETECT);
2752       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2753           str->decst.mask != ~0)
2754         /* #### This is cheesy.  What we really ought to do is
2755            buffer up a certain amount of data so as to get a
2756            less random result. */
2757         codesys = coding_system_from_mask (str->decst.mask);
2758       str->eol_type = str->decst.eol_type;
2759       if (XCODING_SYSTEM (codesys) != str->codesys)
2760         {
2761           /* Preserve the CODING_STATE_END flag in case it was set.
2762              If we erase it, bad things might happen. */
2763           int was_end = str->flags & CODING_STATE_END;
2764           set_decoding_stream_coding_system (decoding, codesys);
2765           if (was_end)
2766             str->flags |= CODING_STATE_END;
2767         }
2768     }
2769
2770   switch (CODING_SYSTEM_TYPE (str->codesys))
2771     {
2772 #ifdef DEBUG_XEMACS
2773     case CODESYS_INTERNAL:
2774       Dynarr_add_many (dst, src, n);
2775       break;
2776 #endif
2777     case CODESYS_AUTODETECT:
2778       /* If we got this far and still haven't decided on the coding
2779          system, then do no conversion. */
2780     case CODESYS_NO_CONVERSION:
2781       decode_coding_no_conversion (decoding, src, dst, n);
2782       break;
2783 #ifdef MULE
2784     case CODESYS_SHIFT_JIS:
2785       decode_coding_sjis (decoding, src, dst, n);
2786       break;
2787     case CODESYS_BIG5:
2788       decode_coding_big5 (decoding, src, dst, n);
2789       break;
2790     case CODESYS_UCS4:
2791       decode_coding_ucs4 (decoding, src, dst, n);
2792       break;
2793     case CODESYS_UTF8:
2794       decode_coding_utf8 (decoding, src, dst, n);
2795       break;
2796     case CODESYS_CCL:
2797       str->ccl.last_block = str->flags & CODING_STATE_END;
2798       /* When applying ccl program to stream, MUST NOT set NULL
2799          pointer to src.  */
2800       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2801                   dst, n, 0, CCL_MODE_DECODING);
2802       break;
2803     case CODESYS_ISO2022:
2804       decode_coding_iso2022 (decoding, src, dst, n);
2805       break;
2806 #endif /* MULE */
2807     default:
2808       abort ();
2809     }
2810 }
2811
2812 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2813 Decode the text between START and END which is encoded in CODING-SYSTEM.
2814 This is useful if you've read in encoded text from a file without decoding
2815 it (e.g. you read in a JIS-formatted file but used the `binary' or
2816 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2817 Return length of decoded text.
2818 BUFFER defaults to the current buffer if unspecified.
2819 */
2820        (start, end, coding_system, buffer))
2821 {
2822   Bufpos b, e;
2823   struct buffer *buf = decode_buffer (buffer, 0);
2824   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2825   Lstream *istr, *ostr;
2826   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2827
2828   get_buffer_range_char (buf, start, end, &b, &e, 0);
2829
2830   barf_if_buffer_read_only (buf, b, e);
2831
2832   coding_system = Fget_coding_system (coding_system);
2833   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2834   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2835   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2836                                               coding_system);
2837   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2838                                            Fget_coding_system (Qbinary));
2839   istr = XLSTREAM (instream);
2840   ostr = XLSTREAM (outstream);
2841   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2842
2843   /* The chain of streams looks like this:
2844
2845      [BUFFER] <----- send through
2846                      ------> [ENCODE AS BINARY]
2847                              ------> [DECODE AS SPECIFIED]
2848                                      ------> [BUFFER]
2849    */
2850
2851   while (1)
2852     {
2853       char tempbuf[1024]; /* some random amount */
2854       Bufpos newpos, even_newer_pos;
2855       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2856       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2857
2858       if (!size_in_bytes)
2859         break;
2860       newpos = lisp_buffer_stream_startpos (istr);
2861       Lstream_write (ostr, tempbuf, size_in_bytes);
2862       even_newer_pos = lisp_buffer_stream_startpos (istr);
2863       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2864                            even_newer_pos, 0);
2865     }
2866   Lstream_close (istr);
2867   Lstream_close (ostr);
2868   UNGCPRO;
2869   Lstream_delete (istr);
2870   Lstream_delete (ostr);
2871   Lstream_delete (XLSTREAM (de_outstream));
2872   Lstream_delete (XLSTREAM (lb_outstream));
2873   return Qnil;
2874 }
2875
2876 \f
2877 /************************************************************************/
2878 /*           Converting to an external encoding ("encoding")            */
2879 /************************************************************************/
2880
2881 /* An encoding stream is an output stream.  When you create the
2882    stream, you specify the coding system that governs the encoding
2883    and another stream that the resulting encoded data is to be
2884    sent to, and then start sending data to it. */
2885
2886 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2887
2888 struct encoding_stream
2889 {
2890   /* Coding system that governs the conversion. */
2891   Lisp_Coding_System *codesys;
2892
2893   /* Stream that we read the encoded data from or
2894      write the decoded data to. */
2895   Lstream *other_end;
2896
2897   /* If we are reading, then we can return only a fixed amount of
2898      data, so if the conversion resulted in too much data, we store it
2899      here for retrieval the next time around. */
2900   unsigned_char_dynarr *runoff;
2901
2902   /* FLAGS holds flags indicating the current state of the encoding.
2903      Some of these flags are dependent on the coding system. */
2904   unsigned int flags;
2905
2906   /* CH holds a partially built-up character.  Since we only deal
2907      with one- and two-byte characters at the moment, we only use
2908      this to store the first byte of a two-byte character. */
2909   unsigned int ch;
2910 #ifdef MULE
2911   /* Additional information used by the ISO2022 encoder. */
2912   struct
2913     {
2914       /* CHARSET holds the character sets currently assigned to the G0
2915          through G3 registers.  It is initialized from the array
2916          INITIAL_CHARSET in CODESYS. */
2917       Lisp_Object charset[4];
2918
2919       /* Which registers are currently invoked into the left (GL) and
2920          right (GR) halves of the 8-bit encoding space? */
2921       int register_left, register_right;
2922
2923       /* Whether we need to explicitly designate the charset in the
2924          G? register before using it.  It is initialized from the
2925          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2926       unsigned char force_charset_on_output[4];
2927
2928       /* Other state variables that need to be preserved across
2929          invocations. */
2930       Lisp_Object current_charset;
2931       int current_half;
2932       int current_char_boundary;
2933     } iso2022;
2934
2935   void (*encode_char) (struct encoding_stream *str, Emchar c,
2936                        unsigned_char_dynarr *dst, unsigned int *flags);
2937   void (*finish) (struct encoding_stream *str,
2938                   unsigned_char_dynarr *dst, unsigned int *flags);
2939
2940   /* Additional information (the state of the running CCL program)
2941      used by the CCL encoder. */
2942   struct ccl_program ccl;
2943 #endif /* MULE */
2944 };
2945
2946 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2947 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2948                                 size_t size);
2949 static int encoding_rewinder   (Lstream *stream);
2950 static int encoding_seekable_p (Lstream *stream);
2951 static int encoding_flusher    (Lstream *stream);
2952 static int encoding_closer     (Lstream *stream);
2953
2954 static Lisp_Object encoding_marker (Lisp_Object stream);
2955
2956 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2957                                sizeof (struct encoding_stream));
2958
2959 static Lisp_Object
2960 encoding_marker (Lisp_Object stream)
2961 {
2962   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2963   Lisp_Object str_obj;
2964
2965   /* We do not need to mark the coding systems or charsets stored
2966      within the stream because they are stored in a global list
2967      and automatically marked. */
2968
2969   XSETLSTREAM (str_obj, str);
2970   mark_object (str_obj);
2971   if (str->imp->marker)
2972     return (str->imp->marker) (str_obj);
2973   else
2974     return Qnil;
2975 }
2976
2977 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2978    so we read data from the other end, encode it, and store it into DATA. */
2979
2980 static ssize_t
2981 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2982 {
2983   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2984   unsigned char *orig_data = data;
2985   ssize_t read_size;
2986   int error_occurred = 0;
2987
2988   /* We need to interface to mule_encode(), which expects to take some
2989      amount of data and store the result into a Dynarr.  We have
2990      mule_encode() store into str->runoff, and take data from there
2991      as necessary. */
2992
2993   /* We loop until we have enough data, reading chunks from the other
2994      end and encoding it. */
2995   while (1)
2996     {
2997       /* Take data from the runoff if we can.  Make sure to take at
2998          most SIZE bytes, and delete the data from the runoff. */
2999       if (Dynarr_length (str->runoff) > 0)
3000         {
3001           int chunk = min ((int) size, Dynarr_length (str->runoff));
3002           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
3003           Dynarr_delete_many (str->runoff, 0, chunk);
3004           data += chunk;
3005           size -= chunk;
3006         }
3007
3008       if (size == 0)
3009         break; /* No more room for data */
3010
3011       if (str->flags & CODING_STATE_END)
3012         /* This means that on the previous iteration, we hit the EOF on
3013            the other end.  We loop once more so that mule_encode() can
3014            output any final stuff it may be holding, or any "go back
3015            to a sane state" escape sequences. (This latter makes sense
3016            during encoding.) */
3017         break;
3018
3019       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
3020          left of storage in it, so it's OK to read directly into it.
3021          (We'll be overwriting above, after we've encoded it into the
3022          runoff.) */
3023       read_size = Lstream_read (str->other_end, data, size);
3024       if (read_size < 0)
3025         {
3026           error_occurred = 1;
3027           break;
3028         }
3029       if (read_size == 0)
3030         /* There might be some more end data produced in the translation.
3031            See the comment above. */
3032         str->flags |= CODING_STATE_END;
3033       mule_encode (stream, data, str->runoff, read_size);
3034     }
3035
3036   if (data == orig_data)
3037     return error_occurred ? -1 : 0;
3038   else
3039     return data - orig_data;
3040 }
3041
3042 static ssize_t
3043 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
3044 {
3045   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3046   ssize_t retval;
3047
3048   /* Encode all our data into the runoff, and then attempt to write
3049      it all out to the other end.  Remove whatever chunk we succeeded
3050      in writing. */
3051   mule_encode (stream, data, str->runoff, size);
3052   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
3053                           Dynarr_length (str->runoff));
3054   if (retval > 0)
3055     Dynarr_delete_many (str->runoff, 0, retval);
3056   /* Do NOT return retval.  The return value indicates how much
3057      of the incoming data was written, not how many bytes were
3058      written. */
3059   return size;
3060 }
3061
3062 static void
3063 reset_encoding_stream (struct encoding_stream *str)
3064 {
3065 #ifdef MULE
3066   switch (CODING_SYSTEM_TYPE (str->codesys))
3067     {
3068     case CODESYS_ISO2022:
3069       {
3070         int i;
3071
3072         str->encode_char = &char_encode_iso2022;
3073         str->finish = &char_finish_iso2022;
3074         for (i = 0; i < 4; i++)
3075           {
3076             str->iso2022.charset[i] =
3077               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
3078             str->iso2022.force_charset_on_output[i] =
3079               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
3080           }
3081         str->iso2022.register_left = 0;
3082         str->iso2022.register_right = 1;
3083         str->iso2022.current_charset = Qnil;
3084         str->iso2022.current_half = 0;
3085         break;
3086       }
3087     case CODESYS_CCL:
3088       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
3089       break;
3090     case CODESYS_UTF8:
3091       str->encode_char = &char_encode_utf8;
3092       str->finish = &char_finish_utf8;
3093       break;
3094     case CODESYS_UCS4:
3095       str->encode_char = &char_encode_ucs4;
3096       str->finish = &char_finish_ucs4;
3097       break;
3098     case CODESYS_SHIFT_JIS:
3099       str->encode_char = &char_encode_shift_jis;
3100       str->finish = &char_finish_shift_jis;
3101       break;
3102     case CODESYS_BIG5:
3103       str->encode_char = &char_encode_big5;
3104       str->finish = &char_finish_big5;
3105       break;
3106     default:
3107       break;
3108     }
3109 #endif /* MULE */
3110   str->iso2022.current_char_boundary = 0;
3111   str->flags = str->ch = 0;
3112 }
3113
3114 static int
3115 encoding_rewinder (Lstream *stream)
3116 {
3117   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3118   reset_encoding_stream (str);
3119   Dynarr_reset (str->runoff);
3120   return Lstream_rewind (str->other_end);
3121 }
3122
3123 static int
3124 encoding_seekable_p (Lstream *stream)
3125 {
3126   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3127   return Lstream_seekable_p (str->other_end);
3128 }
3129
3130 static int
3131 encoding_flusher (Lstream *stream)
3132 {
3133   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3134   return Lstream_flush (str->other_end);
3135 }
3136
3137 static int
3138 encoding_closer (Lstream *stream)
3139 {
3140   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3141   if (stream->flags & LSTREAM_FL_WRITE)
3142     {
3143       str->flags |= CODING_STATE_END;
3144       encoding_writer (stream, 0, 0);
3145     }
3146   Dynarr_free (str->runoff);
3147   return Lstream_close (str->other_end);
3148 }
3149
3150 Lisp_Object
3151 encoding_stream_coding_system (Lstream *stream)
3152 {
3153   Lisp_Object coding_system;
3154   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
3155
3156   XSETCODING_SYSTEM (coding_system, str->codesys);
3157   return coding_system;
3158 }
3159
3160 void
3161 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
3162 {
3163   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
3164   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3165   str->codesys = cs;
3166   reset_encoding_stream (str);
3167 }
3168
3169 static Lisp_Object
3170 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
3171                         const char *mode)
3172 {
3173   Lstream *lstr = Lstream_new (lstream_encoding, mode);
3174   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
3175   Lisp_Object obj;
3176
3177   xzero (*str);
3178   str->runoff = Dynarr_new (unsigned_char);
3179   str->other_end = stream;
3180   set_encoding_stream_coding_system (lstr, codesys);
3181   XSETLSTREAM (obj, lstr);
3182   return obj;
3183 }
3184
3185 Lisp_Object
3186 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
3187 {
3188   return make_encoding_stream_1 (stream, codesys, "r");
3189 }
3190
3191 Lisp_Object
3192 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
3193 {
3194   return make_encoding_stream_1 (stream, codesys, "w");
3195 }
3196
3197 /* Convert N bytes of internally-formatted data stored in SRC to an
3198    external format, according to the encoding stream ENCODING.
3199    Store the encoded data into DST. */
3200
3201 static void
3202 mule_encode (Lstream *encoding, const Bufbyte *src,
3203              unsigned_char_dynarr *dst, size_t n)
3204 {
3205   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3206
3207   switch (CODING_SYSTEM_TYPE (str->codesys))
3208     {
3209 #ifdef DEBUG_XEMACS
3210     case CODESYS_INTERNAL:
3211       Dynarr_add_many (dst, src, n);
3212       break;
3213 #endif
3214     case CODESYS_AUTODETECT:
3215       /* If we got this far and still haven't decided on the coding
3216          system, then do no conversion. */
3217     case CODESYS_NO_CONVERSION:
3218       encode_coding_no_conversion (encoding, src, dst, n);
3219       break;
3220 #ifdef MULE
3221     case CODESYS_CCL:
3222       str->ccl.last_block = str->flags & CODING_STATE_END;
3223       /* When applying ccl program to stream, MUST NOT set NULL
3224          pointer to src.  */
3225       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
3226                   dst, n, 0, CCL_MODE_ENCODING);
3227       break;
3228 #endif /* MULE */
3229     default:
3230       text_encode_generic (encoding, src, dst, n);
3231     }
3232 }
3233
3234 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
3235 Encode the text between START and END using CODING-SYSTEM.
3236 This will, for example, convert Japanese characters into stuff such as
3237 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
3238 text.  BUFFER defaults to the current buffer if unspecified.
3239 */
3240        (start, end, coding_system, buffer))
3241 {
3242   Bufpos b, e;
3243   struct buffer *buf = decode_buffer (buffer, 0);
3244   Lisp_Object instream, lb_outstream, de_outstream, outstream;
3245   Lstream *istr, *ostr;
3246   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
3247
3248   get_buffer_range_char (buf, start, end, &b, &e, 0);
3249
3250   barf_if_buffer_read_only (buf, b, e);
3251
3252   coding_system = Fget_coding_system (coding_system);
3253   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
3254   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
3255   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
3256                                               Fget_coding_system (Qbinary));
3257   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
3258                                            coding_system);
3259   istr = XLSTREAM (instream);
3260   ostr = XLSTREAM (outstream);
3261   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
3262   /* The chain of streams looks like this:
3263
3264      [BUFFER] <----- send through
3265                      ------> [ENCODE AS SPECIFIED]
3266                              ------> [DECODE AS BINARY]
3267                                      ------> [BUFFER]
3268    */
3269   while (1)
3270     {
3271       char tempbuf[1024]; /* some random amount */
3272       Bufpos newpos, even_newer_pos;
3273       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
3274       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
3275
3276       if (!size_in_bytes)
3277         break;
3278       newpos = lisp_buffer_stream_startpos (istr);
3279       Lstream_write (ostr, tempbuf, size_in_bytes);
3280       even_newer_pos = lisp_buffer_stream_startpos (istr);
3281       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
3282                            even_newer_pos, 0);
3283     }
3284
3285   {
3286     Charcount retlen =
3287       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
3288     Lstream_close (istr);
3289     Lstream_close (ostr);
3290     UNGCPRO;
3291     Lstream_delete (istr);
3292     Lstream_delete (ostr);
3293     Lstream_delete (XLSTREAM (de_outstream));
3294     Lstream_delete (XLSTREAM (lb_outstream));
3295     return make_int (retlen);
3296   }
3297 }
3298
3299 #ifdef MULE
3300 \f
3301 static void
3302 text_encode_generic (Lstream *encoding, const Bufbyte *src,
3303                      unsigned_char_dynarr *dst, size_t n)
3304 {
3305   unsigned char c;
3306   unsigned char char_boundary;
3307   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3308   unsigned int flags          = str->flags;
3309   Emchar ch                   = str->ch;
3310
3311   char_boundary = str->iso2022.current_char_boundary;
3312
3313   while (n--)
3314     {
3315       c = *src++;
3316
3317       if (char_boundary == 0)
3318         {
3319           if (c >= 0xfc)
3320             {
3321               ch = c & 0x01;
3322               char_boundary = 5;
3323             }
3324           else if (c >= 0xf8)
3325             {
3326               ch = c & 0x03;
3327               char_boundary = 4;
3328             }
3329           else if (c >= 0xf0)
3330             {
3331               ch = c & 0x07;
3332               char_boundary = 3;
3333             }
3334           else if (c >= 0xe0)
3335             {
3336               ch = c & 0x0f;
3337               char_boundary = 2;
3338             }
3339           else if (c >= 0xc0)
3340             {
3341               ch = c & 0x1f;
3342               char_boundary = 1;
3343             }
3344           else
3345             (*str->encode_char) (str, c, dst, &flags);
3346         }
3347       else if (char_boundary == 1)
3348         {
3349           (*str->encode_char) (str, (ch << 6) | (c & 0x3f), dst, &flags);
3350           ch =0;
3351           char_boundary = 0;
3352         }
3353       else
3354         {
3355           ch = (ch << 6) | (c & 0x3f);
3356           char_boundary--;
3357         }
3358     }
3359
3360   if ((char_boundary == 0) && (flags & CODING_STATE_END))
3361     {
3362       (*str->finish) (str, dst, &flags);
3363     }
3364
3365   str->flags = flags;
3366   str->ch    = ch;
3367   str->iso2022.current_char_boundary = char_boundary;
3368 }
3369
3370 \f
3371 /************************************************************************/
3372 /*                          Shift-JIS methods                           */
3373 /************************************************************************/
3374
3375 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
3376    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3377    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
3378    encoded by "position-code + 0x80".  A character of JISX0208
3379    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
3380    position-codes are divided and shifted so that it fit in the range
3381    below.
3382
3383    --- CODE RANGE of Shift-JIS ---
3384    (character set)      (range)
3385    ASCII                0x00 .. 0x7F
3386    JISX0201-Kana        0xA0 .. 0xDF
3387    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3388             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3389    -------------------------------
3390
3391 */
3392
3393 /* Is this the first byte of a Shift-JIS two-byte char? */
3394
3395 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3396   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3397
3398 /* Is this the second byte of a Shift-JIS two-byte char? */
3399
3400 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3401   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3402
3403 #define BYTE_SJIS_KATAKANA_P(c) \
3404   ((c) >= 0xA1 && (c) <= 0xDF)
3405
3406 static int
3407 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3408 {
3409   while (n--)
3410     {
3411       unsigned char c = *(unsigned char *)src++;
3412       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3413         return 0;
3414       if (st->shift_jis.in_second_byte)
3415         {
3416           st->shift_jis.in_second_byte = 0;
3417           if (c < 0x40)
3418             return 0;
3419         }
3420       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3421         st->shift_jis.in_second_byte = 1;
3422     }
3423   return CODING_CATEGORY_SHIFT_JIS_MASK;
3424 }
3425
3426 /* Convert Shift-JIS data to internal format. */
3427
3428 static void
3429 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3430                     unsigned_char_dynarr *dst, size_t n)
3431 {
3432   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3433   unsigned int flags  = str->flags;
3434   unsigned int cpos   = str->cpos;
3435   eol_type_t eol_type = str->eol_type;
3436
3437   while (n--)
3438     {
3439       unsigned char c = *(unsigned char *)src++;
3440
3441       if (cpos)
3442         {
3443           /* Previous character was first byte of Shift-JIS Kanji char. */
3444           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3445             {
3446               unsigned char e1, e2;
3447
3448               DECODE_SJIS (cpos, c, e1, e2);
3449 #ifdef UTF2000
3450               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3451                                             e1 & 0x7F,
3452                                             e2 & 0x7F), dst);
3453 #else
3454               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3455               Dynarr_add (dst, e1);
3456               Dynarr_add (dst, e2);
3457 #endif
3458             }
3459           else
3460             {
3461               DECODE_ADD_BINARY_CHAR (cpos, dst);
3462               DECODE_ADD_BINARY_CHAR (c, dst);
3463             }
3464           cpos = 0;
3465         }
3466       else
3467         {
3468           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3469           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3470             cpos = c;
3471           else if (BYTE_SJIS_KATAKANA_P (c))
3472             {
3473 #ifdef UTF2000
3474               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3475                                             c & 0x7F, 0), dst);
3476 #else
3477               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3478               Dynarr_add (dst, c);
3479 #endif
3480             }
3481 #ifdef UTF2000
3482           else if (c > 32)
3483             DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_latin_jisx0201,
3484                                           c, 0), dst);
3485 #endif
3486           else
3487             DECODE_ADD_BINARY_CHAR (c, dst);
3488         }
3489     label_continue_loop:;
3490     }
3491
3492   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
3493
3494   str->flags = flags;
3495   str->cpos  = cpos;
3496 }
3497
3498 /* Convert internal character representation to Shift_JIS. */
3499
3500 void
3501 char_encode_shift_jis (struct encoding_stream *str, Emchar ch,
3502                        unsigned_char_dynarr *dst, unsigned int *flags)
3503 {
3504   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3505
3506   if (ch == '\n')
3507     {
3508       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3509         Dynarr_add (dst, '\r');
3510       if (eol_type != EOL_CR)
3511         Dynarr_add (dst, ch);
3512     }
3513   else
3514     {
3515       unsigned int s1, s2;
3516 #ifdef UTF2000
3517       int code_point = charset_code_point (Vcharset_latin_jisx0201, ch);
3518
3519       if (code_point >= 0)
3520         Dynarr_add (dst, code_point);
3521       else if ((code_point
3522                 = charset_code_point (Vcharset_japanese_jisx0208_1990, ch))
3523                >= 0)
3524         {
3525           ENCODE_SJIS ((code_point >> 8) | 0x80,
3526                        (code_point & 0xFF) | 0x80, s1, s2);
3527           Dynarr_add (dst, s1);
3528           Dynarr_add (dst, s2);
3529         }
3530       else if ((code_point
3531                 = charset_code_point (Vcharset_katakana_jisx0201, ch))
3532                >= 0)
3533         Dynarr_add (dst, code_point | 0x80);
3534       else if ((code_point
3535                 = charset_code_point (Vcharset_japanese_jisx0208, ch))
3536                >= 0)
3537         {
3538           ENCODE_SJIS ((code_point >> 8) | 0x80,
3539                        (code_point & 0xFF) | 0x80, s1, s2);
3540           Dynarr_add (dst, s1);
3541           Dynarr_add (dst, s2);
3542         }
3543       else if ((code_point = charset_code_point (Vcharset_ascii, ch))
3544                >= 0)
3545         Dynarr_add (dst, code_point);
3546       else
3547         Dynarr_add (dst, '?');
3548 #else
3549       Lisp_Object charset;
3550       unsigned int c1, c2;
3551
3552       BREAKUP_CHAR (ch, charset, c1, c2);
3553
3554       if (EQ(charset, Vcharset_katakana_jisx0201))
3555         {
3556           Dynarr_add (dst, c1 | 0x80);
3557         }
3558       else if (c2 == 0)
3559         {
3560           Dynarr_add (dst, c1);
3561         }
3562       else if (EQ(charset, Vcharset_japanese_jisx0208))
3563         {
3564           ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3565           Dynarr_add (dst, s1);
3566           Dynarr_add (dst, s2);
3567         }
3568       else
3569         Dynarr_add (dst, '?');
3570 #endif
3571     }
3572 }
3573
3574 void
3575 char_finish_shift_jis (struct encoding_stream *str, unsigned_char_dynarr *dst,
3576                        unsigned int *flags)
3577 {
3578 }
3579
3580 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3581 Decode a JISX0208 character of Shift-JIS coding-system.
3582 CODE is the character code in Shift-JIS as a cons of type bytes.
3583 Return the corresponding character.
3584 */
3585        (code))
3586 {
3587   unsigned char c1, c2, s1, s2;
3588
3589   CHECK_CONS (code);
3590   CHECK_INT (XCAR (code));
3591   CHECK_INT (XCDR (code));
3592   s1 = XINT (XCAR (code));
3593   s2 = XINT (XCDR (code));
3594   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3595       BYTE_SJIS_TWO_BYTE_2_P (s2))
3596     {
3597       DECODE_SJIS (s1, s2, c1, c2);
3598       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3599                                    c1 & 0x7F, c2 & 0x7F));
3600     }
3601   else
3602     return Qnil;
3603 }
3604
3605 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3606 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3607 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3608 */
3609        (character))
3610 {
3611   Lisp_Object charset;
3612   int c1, c2, s1, s2;
3613
3614   CHECK_CHAR_COERCE_INT (character);
3615   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3616   if (EQ (charset, Vcharset_japanese_jisx0208))
3617     {
3618       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3619       return Fcons (make_int (s1), make_int (s2));
3620     }
3621   else
3622     return Qnil;
3623 }
3624
3625 \f
3626 /************************************************************************/
3627 /*                            Big5 methods                              */
3628 /************************************************************************/
3629
3630 /* BIG5 is a coding system encoding two character sets: ASCII and
3631    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3632    character set and is encoded in two-byte.
3633
3634    --- CODE RANGE of BIG5 ---
3635    (character set)      (range)
3636    ASCII                0x00 .. 0x7F
3637    Big5 (1st byte)      0xA1 .. 0xFE
3638         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3639    --------------------------
3640
3641    Since the number of characters in Big5 is larger than maximum
3642    characters in Emacs' charset (96x96), it can't be handled as one
3643    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3644    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3645    contains frequently used characters and the latter contains less
3646    frequently used characters.  */
3647
3648 #ifdef UTF2000
3649 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3650   ((c) >= 0x81 && (c) <= 0xFE)
3651 #else
3652 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3653   ((c) >= 0xA1 && (c) <= 0xFE)
3654 #endif
3655
3656 /* Is this the second byte of a Shift-JIS two-byte char? */
3657
3658 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3659   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3660
3661 /* Number of Big5 characters which have the same code in 1st byte.  */
3662
3663 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3664
3665 /* Code conversion macros.  These are macros because they are used in
3666    inner loops during code conversion.
3667
3668    Note that temporary variables in macros introduce the classic
3669    dynamic-scoping problems with variable names.  We use capital-
3670    lettered variables in the assumption that XEmacs does not use
3671    capital letters in variables except in a very formalized way
3672    (e.g. Qstring). */
3673
3674 /* Convert Big5 code (b1, b2) into its internal string representation
3675    (lb, c1, c2). */
3676
3677 /* There is a much simpler way to split the Big5 charset into two.
3678    For the moment I'm going to leave the algorithm as-is because it
3679    claims to separate out the most-used characters into a single
3680    charset, which perhaps will lead to optimizations in various
3681    places.
3682
3683    The way the algorithm works is something like this:
3684
3685    Big5 can be viewed as a 94x157 charset, where the row is
3686    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3687    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3688    the split between low and high column numbers is apparently
3689    meaningless; ascending rows produce less and less frequent chars.
3690    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3691    the first charset, and the upper half (0xC9 .. 0xFE) to the
3692    second.  To do the conversion, we convert the character into
3693    a single number where 0 .. 156 is the first row, 157 .. 313
3694    is the second, etc.  That way, the characters are ordered by
3695    decreasing frequency.  Then we just chop the space in two
3696    and coerce the result into a 94x94 space.
3697    */
3698
3699 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3700 {                                                                       \
3701   int B1 = b1, B2 = b2;                                                 \
3702   unsigned int I                                                        \
3703     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3704                                                                         \
3705   if (B1 < 0xC9)                                                        \
3706     {                                                                   \
3707       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3708     }                                                                   \
3709   else                                                                  \
3710     {                                                                   \
3711       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3712       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3713     }                                                                   \
3714   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3715   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3716 } while (0)
3717
3718 /* Convert the internal string representation of a Big5 character
3719    (lb, c1, c2) into Big5 code (b1, b2). */
3720
3721 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3722 {                                                                       \
3723   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3724                                                                         \
3725   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3726     {                                                                   \
3727       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3728     }                                                                   \
3729   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3730   b2 = I % BIG5_SAME_ROW;                                               \
3731   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3732 } while (0)
3733
3734 static int
3735 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3736 {
3737   while (n--)
3738     {
3739       unsigned char c = *(unsigned char *)src++;
3740       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO
3741 #ifndef UTF2000
3742           || (c >= 0x80 && c <= 0xA0)
3743 #endif
3744           )
3745         return 0;
3746       if (st->big5.in_second_byte)
3747         {
3748           st->big5.in_second_byte = 0;
3749           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3750             return 0;
3751         }
3752       else if (
3753 #ifdef UTF2000
3754                c >= 0x81
3755 #else
3756                c >= 0xA1
3757 #endif
3758                )
3759         st->big5.in_second_byte = 1;
3760     }
3761   return CODING_CATEGORY_BIG5_MASK;
3762 }
3763
3764 /* Convert Big5 data to internal format. */
3765
3766 static void
3767 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3768                     unsigned_char_dynarr *dst, size_t n)
3769 {
3770   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3771   unsigned int flags  = str->flags;
3772   unsigned int cpos   = str->cpos;
3773   eol_type_t eol_type = str->eol_type;
3774 #ifdef UTF2000
3775   Lisp_Object ccs
3776     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
3777                                              (decoding)->codesys, 1);
3778 #endif
3779
3780   while (n--)
3781     {
3782       unsigned char c = *(unsigned char *)src++;
3783       if (cpos)
3784         {
3785           /* Previous character was first byte of Big5 char. */
3786           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3787             {
3788 #ifdef UTF2000
3789               int code_point = (cpos << 8) | c;
3790               Emchar char_id = decode_defined_char (ccs, code_point);
3791
3792               if (char_id < 0)
3793                 char_id = DECODE_CHAR (Vcharset_chinese_big5, code_point);
3794               DECODE_ADD_UCS_CHAR (char_id, dst);
3795 #else
3796               unsigned char b1, b2, b3;
3797               DECODE_BIG5 (cpos, c, b1, b2, b3);
3798               Dynarr_add (dst, b1);
3799               Dynarr_add (dst, b2);
3800               Dynarr_add (dst, b3);
3801 #endif
3802             }
3803           else
3804             {
3805               DECODE_ADD_BINARY_CHAR (cpos, dst);
3806               DECODE_ADD_BINARY_CHAR (c, dst);
3807             }
3808           cpos = 0;
3809         }
3810       else
3811         {
3812           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3813           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3814             {
3815               decode_flush_er_chars (str, dst);
3816               cpos = c;
3817             }
3818           else if ( c >= ' ' )
3819             {
3820               /* DECODE_ADD_BINARY_CHAR (c, dst); */
3821               decode_add_er_char (str, c, dst);
3822             }
3823           else
3824             {
3825               decode_flush_er_chars (str, dst);
3826               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3827               DECODE_ADD_BINARY_CHAR (c, dst);
3828             }
3829         }
3830     label_continue_loop:;
3831     }
3832
3833   /* DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst); */
3834   if (flags & CODING_STATE_END)
3835     {
3836       decode_flush_er_chars (str, dst);
3837       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
3838       if (flags & CODING_STATE_CR)
3839         Dynarr_add (dst, '\r');
3840     }
3841
3842   str->flags = flags;
3843   str->cpos  = cpos;
3844 }
3845
3846 /* Convert internally-formatted data to Big5. */
3847
3848 void
3849 char_encode_big5 (struct encoding_stream *str, Emchar ch,
3850                   unsigned_char_dynarr *dst, unsigned int *flags)
3851 {
3852   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3853
3854   if (ch == '\n')
3855     {
3856       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3857         Dynarr_add (dst, '\r');
3858       if (eol_type != EOL_CR)
3859         Dynarr_add (dst, ch);
3860     }
3861   else
3862     {
3863 #ifdef UTF2000
3864       int code_point;
3865       Lisp_Object ccs
3866         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
3867
3868       if ((code_point = charset_code_point (Vcharset_ascii, ch)) >= 0)
3869         Dynarr_add (dst, code_point);
3870       else if ((code_point = charset_code_point (ccs, ch)) >= 0)
3871         {
3872           Dynarr_add (dst, code_point >> 8);
3873           Dynarr_add (dst, code_point & 0xFF);
3874         }
3875       else if ((code_point
3876                 = charset_code_point (Vcharset_chinese_big5, ch)) >= 0)
3877         {
3878           Dynarr_add (dst, code_point >> 8);
3879           Dynarr_add (dst, code_point & 0xFF);
3880         }
3881       else if ((code_point
3882                 = charset_code_point (Vcharset_chinese_big5_1, ch)) >= 0)
3883         {
3884           unsigned int I
3885             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3886             + ((code_point & 0xFF) - 33);
3887           unsigned char b1 = I / BIG5_SAME_ROW + 0xA1;
3888           unsigned char b2 = I % BIG5_SAME_ROW;
3889
3890           b2 += b2 < 0x3F ? 0x40 : 0x62;
3891           Dynarr_add (dst, b1);
3892           Dynarr_add (dst, b2);
3893         }
3894       else if ((code_point
3895                 = charset_code_point (Vcharset_chinese_big5_2, ch)) >= 0)
3896         {
3897           unsigned int I
3898             = ((code_point >> 8) - 33) * (0xFF - 0xA1)
3899             + ((code_point & 0xFF) - 33);
3900           unsigned char b1, b2;
3901
3902           I += BIG5_SAME_ROW * (0xC9 - 0xA1);
3903           b1 = I / BIG5_SAME_ROW + 0xA1;
3904           b2 = I % BIG5_SAME_ROW;
3905           b2 += b2 < 0x3F ? 0x40 : 0x62;
3906           Dynarr_add (dst, b1);
3907           Dynarr_add (dst, b2);
3908         }
3909       else
3910         Dynarr_add (dst, '?');
3911 #else
3912 #endif
3913     }
3914 }
3915
3916 void
3917 char_finish_big5 (struct encoding_stream *str, unsigned_char_dynarr *dst,
3918                   unsigned int *flags)
3919 {
3920 }
3921
3922
3923 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3924 Decode a Big5 character CODE of BIG5 coding-system.
3925 CODE is the character code in BIG5, a cons of two integers.
3926 Return the corresponding character.
3927 */
3928        (code))
3929 {
3930   unsigned char c1, c2, b1, b2;
3931
3932   CHECK_CONS (code);
3933   CHECK_INT (XCAR (code));
3934   CHECK_INT (XCDR (code));
3935   b1 = XINT (XCAR (code));
3936   b2 = XINT (XCDR (code));
3937   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3938       BYTE_BIG5_TWO_BYTE_2_P (b2))
3939     {
3940       Charset_ID leading_byte;
3941       Lisp_Object charset;
3942       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3943       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3944       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3945     }
3946   else
3947     return Qnil;
3948 }
3949
3950 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3951 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3952 Return the corresponding character code in Big5.
3953 */
3954        (character))
3955 {
3956   Lisp_Object charset;
3957   int c1, c2, b1, b2;
3958
3959   CHECK_CHAR_COERCE_INT (character);
3960   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3961   if (EQ (charset, Vcharset_chinese_big5_1) ||
3962       EQ (charset, Vcharset_chinese_big5_2))
3963     {
3964       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3965                    b1, b2);
3966       return Fcons (make_int (b1), make_int (b2));
3967     }
3968   else
3969     return Qnil;
3970 }
3971
3972 \f
3973 /************************************************************************/
3974 /*                           UCS-4 methods                              */
3975 /************************************************************************/
3976
3977 static int
3978 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3979 {
3980   while (n--)
3981     {
3982       unsigned char c = *(unsigned char *)src++;
3983       switch (st->ucs4.in_byte)
3984         {
3985         case 0:
3986           if (c >= 128)
3987             return 0;
3988           else
3989             st->ucs4.in_byte++;
3990           break;
3991         case 3:
3992           st->ucs4.in_byte = 0;
3993           break;
3994         default:
3995           st->ucs4.in_byte++;
3996         }
3997     }
3998   return CODING_CATEGORY_UCS4_MASK;
3999 }
4000
4001 static void
4002 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
4003                     unsigned_char_dynarr *dst, size_t n)
4004 {
4005   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4006   unsigned int flags = str->flags;
4007   unsigned int cpos  = str->cpos;
4008   unsigned char counter = str->counter;
4009
4010   while (n--)
4011     {
4012       unsigned char c = *(unsigned char *)src++;
4013       switch (counter)
4014         {
4015         case 0:
4016           cpos = c;
4017           counter = 3;
4018           break;
4019         case 1:
4020           DECODE_ADD_UCS_CHAR ((cpos << 8) | c, dst);
4021           cpos = 0;
4022           counter = 0;
4023           break;
4024         default:
4025           cpos = ( cpos << 8 ) | c;
4026           counter--;
4027         }
4028     }
4029   if (counter & CODING_STATE_END)
4030     DECODE_OUTPUT_PARTIAL_CHAR (cpos);
4031
4032   str->flags    = flags;
4033   str->cpos     = cpos;
4034   str->counter  = counter;
4035 }
4036
4037 void
4038 char_encode_ucs4 (struct encoding_stream *str, Emchar ch,
4039                   unsigned_char_dynarr *dst, unsigned int *flags)
4040 {
4041   Dynarr_add (dst, ch >> 24);
4042   Dynarr_add (dst, ch >> 16);
4043   Dynarr_add (dst, ch >>  8);
4044   Dynarr_add (dst, ch      );
4045 }
4046
4047 void
4048 char_finish_ucs4 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4049                   unsigned int *flags)
4050 {
4051 }
4052
4053 \f
4054 /************************************************************************/
4055 /*                           UTF-8 methods                              */
4056 /************************************************************************/
4057
4058 static int
4059 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
4060 {
4061   while (n--)
4062     {
4063       unsigned char c = *(unsigned char *)src++;
4064       switch (st->utf8.in_byte)
4065         {
4066         case 0:
4067           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
4068             return 0;
4069           else if (c >= 0xfc)
4070             st->utf8.in_byte = 5;
4071           else if (c >= 0xf8)
4072             st->utf8.in_byte = 4;
4073           else if (c >= 0xf0)
4074             st->utf8.in_byte = 3;
4075           else if (c >= 0xe0)
4076             st->utf8.in_byte = 2;
4077           else if (c >= 0xc0)
4078             st->utf8.in_byte = 1;
4079           else if (c >= 0x80)
4080             return 0;
4081           break;
4082         default:
4083           if ((c & 0xc0) != 0x80)
4084             return 0;
4085           else
4086             st->utf8.in_byte--;
4087         }
4088     }
4089   return CODING_CATEGORY_UTF8_MASK;
4090 }
4091
4092 static void
4093 decode_output_utf8_partial_char (unsigned char counter,
4094                                  unsigned int cpos,
4095                                  unsigned_char_dynarr *dst)
4096 {
4097   if (counter == 5)
4098     DECODE_ADD_BINARY_CHAR ( (cpos|0xFC), dst);
4099   else if (counter == 4)
4100     {
4101       if (cpos < (1 << 6))
4102         DECODE_ADD_BINARY_CHAR ( (cpos|0xF8), dst);
4103       else
4104         {
4105           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xFC), dst);
4106           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4107         }
4108     }
4109   else if (counter == 3)
4110     {
4111       if (cpos < (1 << 6))
4112         DECODE_ADD_BINARY_CHAR ( (cpos|0xF0), dst);
4113       else if (cpos < (1 << 12))
4114         {
4115           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF8), dst);
4116           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4117         }
4118       else
4119         {
4120           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xFC), dst);
4121           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4122           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4123         }
4124     }
4125   else if (counter == 2)
4126     {
4127       if (cpos < (1 << 6))
4128         DECODE_ADD_BINARY_CHAR ( (cpos|0xE0), dst);
4129       else if (cpos < (1 << 12))
4130         {
4131           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xF0), dst);
4132           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4133         }
4134       else if (cpos < (1 << 18))
4135         {
4136           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF8), dst);
4137           DECODE_ADD_BINARY_CHAR ( (((cpos >> 6)&0x3F)|0x80), dst);
4138           DECODE_ADD_BINARY_CHAR ( ( (cpos      &0x3F)|0x80), dst);
4139         }
4140       else
4141         {
4142           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xFC), dst);
4143           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4144           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4145           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4146         }
4147     }
4148   else
4149     {
4150       if (cpos < (1 << 6))
4151         DECODE_ADD_BINARY_CHAR ( (cpos|0xC0), dst);
4152       else if (cpos < (1 << 12))
4153         {
4154           DECODE_ADD_BINARY_CHAR ( ((cpos >> 6)|0xE0), dst);
4155           DECODE_ADD_BINARY_CHAR ( ((cpos&0x3F)|0x80), dst);
4156         }
4157       else if (cpos < (1 << 18))
4158         {
4159           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 12)|0xF0), dst);
4160           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4161           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4162         }
4163       else if (cpos < (1 << 24))
4164         {
4165           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 18)|0xF8), dst);
4166           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4167           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4168           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4169         }
4170       else
4171         {
4172           DECODE_ADD_BINARY_CHAR ( ( (cpos >> 24)|0xFC), dst);
4173           DECODE_ADD_BINARY_CHAR ( (((cpos >> 18)&0x3F)|0x80), dst);
4174           DECODE_ADD_BINARY_CHAR ( (((cpos >> 12)&0x3F)|0x80), dst);
4175           DECODE_ADD_BINARY_CHAR ( (((cpos >>  6)&0x3F)|0x80), dst);
4176           DECODE_ADD_BINARY_CHAR ( ( (cpos       &0x3F)|0x80), dst);
4177         }
4178     }
4179 }
4180
4181 static void
4182 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
4183                     unsigned_char_dynarr *dst, size_t n)
4184 {
4185   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4186   unsigned int flags    = str->flags;
4187   unsigned int cpos     = str->cpos;
4188   eol_type_t eol_type   = str->eol_type;
4189   unsigned char counter = str->counter;
4190 #ifdef UTF2000
4191   Lisp_Object ccs
4192     = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (DECODING_STREAM_DATA
4193                                              (decoding)->codesys, 0);
4194 #endif
4195
4196   while (n--)
4197     {
4198       unsigned char c = *(unsigned char *)src++;
4199       if (counter == 0)
4200         {
4201           if ( c < ' ' )
4202             {
4203               COMPOSE_FLUSH_CHARS (str, dst);
4204               decode_flush_er_chars (str, dst);
4205               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4206               DECODE_ADD_UCS_CHAR (c, dst);
4207             }
4208           else if ( c < 0xC0 )
4209             /* decode_add_er_char (str, c, dst); */
4210             COMPOSE_ADD_CHAR (str, c, dst);
4211           else
4212             {
4213               /* decode_flush_er_chars (str, dst); */
4214               if ( c < 0xE0 )
4215                 {
4216                   cpos = c & 0x1f;
4217                   counter = 1;
4218                 }
4219               else if ( c < 0xF0 )
4220                 {
4221                   cpos = c & 0x0f;
4222                   counter = 2;
4223                 }
4224               else if ( c < 0xF8 )
4225                 {
4226                   cpos = c & 0x07;
4227                   counter = 3;
4228                 }
4229               else if ( c < 0xFC )
4230                 {
4231                   cpos = c & 0x03;
4232                   counter = 4;
4233                 }
4234               else
4235                 {
4236                   cpos = c & 0x01;
4237                   counter = 5;
4238                 }
4239             }
4240         }
4241       else if ( (c & 0xC0) == 0x80 )
4242         {
4243           cpos = ( cpos << 6 ) | ( c & 0x3f );
4244           if (counter == 1)
4245             {
4246               Emchar char_id = decode_defined_char (ccs, cpos);
4247
4248               if (char_id < 0)
4249                 char_id = cpos;
4250               COMPOSE_ADD_CHAR (str, char_id, dst);
4251               cpos = 0;
4252               counter = 0;
4253             }
4254           else
4255             counter--;
4256         }
4257       else
4258         {
4259           COMPOSE_FLUSH_CHARS (str, dst);
4260           decode_flush_er_chars (str, dst);
4261           decode_output_utf8_partial_char (counter, cpos, dst);
4262           DECODE_ADD_BINARY_CHAR (c, dst);
4263           cpos = 0;
4264           counter = 0;
4265         }
4266     label_continue_loop:;
4267     }
4268
4269   if (flags & CODING_STATE_END)
4270     {
4271       COMPOSE_FLUSH_CHARS (str, dst);
4272       decode_flush_er_chars (str, dst);
4273       if (counter > 0)
4274         {
4275           decode_output_utf8_partial_char (counter, cpos, dst);
4276           cpos = 0;
4277           counter = 0;
4278         }
4279     }
4280   str->flags    = flags;
4281   str->cpos     = cpos;
4282   str->counter  = counter;
4283 }
4284
4285 void
4286 char_encode_utf8 (struct encoding_stream *str, Emchar ch,
4287                   unsigned_char_dynarr *dst, unsigned int *flags)
4288 {
4289   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4290
4291   if (ch == '\n')
4292     {
4293       if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4294         Dynarr_add (dst, '\r');
4295       if (eol_type != EOL_CR)
4296         Dynarr_add (dst, ch);
4297     }
4298   else if (ch <= 0x7f)
4299     {
4300       Dynarr_add (dst, ch);
4301     }
4302   else
4303     {
4304       Lisp_Object ucs_ccs
4305         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 0);
4306       int code_point = charset_code_point (ucs_ccs, ch);
4307
4308       if ( (code_point < 0) || (code_point > 0x10FFFF) )
4309         {
4310           Lisp_Object map
4311             = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, 1);
4312           Lisp_Object ret;
4313
4314           if ( !NILP (map)
4315                && INTP (ret = Fget_char_attribute (make_char (ch),
4316                                                    map, Qnil)) )
4317             code_point = XINT (ret);
4318           else if ( !NILP (map =
4319                            CODING_SYSTEM_ISO2022_INITIAL_CHARSET
4320                            (str->codesys, 2))
4321                     && INTP (ret = Fget_char_attribute (make_char (ch),
4322                                                         map, Qnil)) )
4323             code_point = XINT (ret);
4324           else if (CODING_SYSTEM_USE_ENTITY_REFERENCE (str->codesys))
4325             {
4326               Lisp_Object rest = Vcoded_charset_entity_reference_alist;
4327               Lisp_Object cell;
4328               Lisp_Object ccs;
4329               int format_columns, idx;
4330               char buf[16], format[16];
4331
4332               while (!NILP (rest))
4333                 {
4334                   cell = Fcar (rest);
4335                   ccs = Fcar (cell);
4336                   if (!NILP (ccs = Ffind_charset (ccs)))
4337                     {
4338                       if ( (code_point
4339                             = charset_code_point (ccs, ch)) >= 0 )
4340                         {
4341                           cell = Fcdr (cell);
4342                           ret = Fcar (cell);
4343                           if (STRINGP (ret)
4344                               && ((idx =XSTRING_LENGTH (ret)) <= 6))
4345                             {
4346                               strncpy (format, XSTRING_DATA (ret), idx);
4347                             }
4348                           else
4349                             continue;
4350
4351                           cell = Fcdr (cell);
4352                           ret = Fcar (cell);
4353                           if (INTP (ret))
4354                             {
4355                               format [idx++] = '%';
4356                               format_columns = XINT (ret);
4357                               if ( (2 <= format_columns)
4358                                    && (format_columns <= 8) )
4359                                 {
4360                                   format [idx++] = '0';
4361                                   format [idx++] = '0' + format_columns;
4362                                 }
4363                             }
4364
4365                           cell = Fcdr (cell);
4366                           ret = Fcar (cell);
4367                           if (EQ (ret, Qd))
4368                             format [idx++] = 'd';
4369                           else if (EQ (ret, Qx))
4370                             format [idx++] = 'x';
4371                           else if (EQ (ret, QX))
4372                             format [idx++] = 'X';
4373                           else
4374                             continue;
4375                           format [idx++] = 0;
4376
4377                           sprintf (buf, format, code_point);
4378                           Dynarr_add (dst, '&');
4379                           Dynarr_add_many (dst, buf, strlen (buf));
4380                           Dynarr_add (dst, ';');
4381                           return;
4382                         }
4383                     }
4384                   rest = Fcdr (rest);
4385                 }
4386               sprintf (buf, "&MCS-%08X;", ch);
4387               Dynarr_add_many (dst, buf, strlen (buf));
4388               return;
4389             }
4390           else
4391             code_point = ch;
4392         }
4393       if (code_point <= 0x7ff)
4394         {
4395           Dynarr_add (dst, (code_point >> 6) | 0xc0);
4396           Dynarr_add (dst, (code_point & 0x3f) | 0x80);
4397         }
4398       else if (code_point <= 0xffff)
4399         {
4400           Dynarr_add (dst,  (code_point >> 12) | 0xe0);
4401           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4402           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4403         }
4404       else if (code_point <= 0x1fffff)
4405         {
4406           Dynarr_add (dst,  (code_point >> 18) | 0xf0);
4407           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4408           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4409           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4410         }
4411       else if (code_point <= 0x3ffffff)
4412         {
4413           Dynarr_add (dst,  (code_point >> 24) | 0xf8);
4414           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4415           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4416           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4417           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4418         }
4419       else
4420         {
4421           Dynarr_add (dst,  (code_point >> 30) | 0xfc);
4422           Dynarr_add (dst, ((code_point >> 24) & 0x3f) | 0x80);
4423           Dynarr_add (dst, ((code_point >> 18) & 0x3f) | 0x80);
4424           Dynarr_add (dst, ((code_point >> 12) & 0x3f) | 0x80);
4425           Dynarr_add (dst, ((code_point >>  6) & 0x3f) | 0x80);
4426           Dynarr_add (dst,  (code_point        & 0x3f) | 0x80);
4427         }
4428     }
4429 }
4430
4431 void
4432 char_finish_utf8 (struct encoding_stream *str, unsigned_char_dynarr *dst,
4433                   unsigned int *flags)
4434 {
4435 }
4436
4437 \f
4438 /************************************************************************/
4439 /*                           ISO2022 methods                            */
4440 /************************************************************************/
4441
4442 /* The following note describes the coding system ISO2022 briefly.
4443    Since the intention of this note is to help understand the
4444    functions in this file, some parts are NOT ACCURATE or OVERLY
4445    SIMPLIFIED.  For thorough understanding, please refer to the
4446    original document of ISO2022.
4447
4448    ISO2022 provides many mechanisms to encode several character sets
4449    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4450    is encoded using bytes less than 128.  This may make the encoded
4451    text a little bit longer, but the text passes more easily through
4452    several gateways, some of which strip off MSB (Most Signigant Bit).
4453
4454    There are two kinds of character sets: control character set and
4455    graphic character set.  The former contains control characters such
4456    as `newline' and `escape' to provide control functions (control
4457    functions are also provided by escape sequences).  The latter
4458    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4459    two control character sets and many graphic character sets.
4460
4461    Graphic character sets are classified into one of the following
4462    four classes, according to the number of bytes (DIMENSION) and
4463    number of characters in one dimension (CHARS) of the set:
4464    - DIMENSION1_CHARS94
4465    - DIMENSION1_CHARS96
4466    - DIMENSION2_CHARS94
4467    - DIMENSION2_CHARS96
4468
4469    In addition, each character set is assigned an identification tag,
4470    unique for each set, called "final character" (denoted as <F>
4471    hereafter).  The <F> of each character set is decided by ECMA(*)
4472    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4473    (0x30..0x3F are for private use only).
4474
4475    Note (*): ECMA = European Computer Manufacturers Association
4476
4477    Here are examples of graphic character set [NAME(<F>)]:
4478         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4479         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4480         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4481         o DIMENSION2_CHARS96 -- none for the moment
4482
4483    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4484         C0 [0x00..0x1F] -- control character plane 0
4485         GL [0x20..0x7F] -- graphic character plane 0
4486         C1 [0x80..0x9F] -- control character plane 1
4487         GR [0xA0..0xFF] -- graphic character plane 1
4488
4489    A control character set is directly designated and invoked to C0 or
4490    C1 by an escape sequence.  The most common case is that:
4491    - ISO646's  control character set is designated/invoked to C0, and
4492    - ISO6429's control character set is designated/invoked to C1,
4493    and usually these designations/invocations are omitted in encoded
4494    text.  In a 7-bit environment, only C0 can be used, and a control
4495    character for C1 is encoded by an appropriate escape sequence to
4496    fit into the environment.  All control characters for C1 are
4497    defined to have corresponding escape sequences.
4498
4499    A graphic character set is at first designated to one of four
4500    graphic registers (G0 through G3), then these graphic registers are
4501    invoked to GL or GR.  These designations and invocations can be
4502    done independently.  The most common case is that G0 is invoked to
4503    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4504    these invocations and designations are omitted in encoded text.
4505    In a 7-bit environment, only GL can be used.
4506
4507    When a graphic character set of CHARS94 is invoked to GL, codes
4508    0x20 and 0x7F of the GL area work as control characters SPACE and
4509    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4510    be used.
4511
4512    There are two ways of invocation: locking-shift and single-shift.
4513    With locking-shift, the invocation lasts until the next different
4514    invocation, whereas with single-shift, the invocation affects the
4515    following character only and doesn't affect the locking-shift
4516    state.  Invocations are done by the following control characters or
4517    escape sequences:
4518
4519    ----------------------------------------------------------------------
4520    abbrev  function                  cntrl escape seq   description
4521    ----------------------------------------------------------------------
4522    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4523    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4524    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4525    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4526    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4527    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4528    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4529    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4530    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4531    ----------------------------------------------------------------------
4532    (*) These are not used by any known coding system.
4533
4534    Control characters for these functions are defined by macros
4535    ISO_CODE_XXX in `coding.h'.
4536
4537    Designations are done by the following escape sequences:
4538    ----------------------------------------------------------------------
4539    escape sequence      description
4540    ----------------------------------------------------------------------
4541    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4542    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4543    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4544    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4545    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4546    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4547    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4548    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4549    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4550    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4551    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4552    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4553    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4554    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4555    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4556    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4557    ----------------------------------------------------------------------
4558
4559    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4560    of dimension 1, chars 94, and final character <F>, etc...
4561
4562    Note (*): Although these designations are not allowed in ISO2022,
4563    Emacs accepts them on decoding, and produces them on encoding
4564    CHARS96 character sets in a coding system which is characterized as
4565    7-bit environment, non-locking-shift, and non-single-shift.
4566
4567    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4568    '(' can be omitted.  We refer to this as "short-form" hereafter.
4569
4570    Now you may notice that there are a lot of ways for encoding the
4571    same multilingual text in ISO2022.  Actually, there exist many
4572    coding systems such as Compound Text (used in X11's inter client
4573    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4574    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4575    localized platforms), and all of these are variants of ISO2022.
4576
4577    In addition to the above, Emacs handles two more kinds of escape
4578    sequences: ISO6429's direction specification and Emacs' private
4579    sequence for specifying character composition.
4580
4581    ISO6429's direction specification takes the following form:
4582         o CSI ']'      -- end of the current direction
4583         o CSI '0' ']'  -- end of the current direction
4584         o CSI '1' ']'  -- start of left-to-right text
4585         o CSI '2' ']'  -- start of right-to-left text
4586    The control character CSI (0x9B: control sequence introducer) is
4587    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4588
4589    Character composition specification takes the following form:
4590         o ESC '0' -- start character composition
4591         o ESC '1' -- end character composition
4592    Since these are not standard escape sequences of any ISO standard,
4593    their use with these meanings is restricted to Emacs only.  */
4594
4595 static void
4596 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4597 {
4598   int i;
4599
4600   for (i = 0; i < 4; i++)
4601     {
4602       if (!NILP (coding_system))
4603         iso->charset[i] =
4604           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4605       else
4606         iso->charset[i] = Qt;
4607       iso->invalid_designated[i] = 0;
4608     }
4609   iso->esc = ISO_ESC_NOTHING;
4610   iso->esc_bytes_index = 0;
4611   iso->register_left = 0;
4612   iso->register_right = 1;
4613   iso->switched_dir_and_no_valid_charset_yet = 0;
4614   iso->invalid_switch_dir = 0;
4615   iso->output_direction_sequence = 0;
4616   iso->output_literally = 0;
4617 #ifdef ENABLE_COMPOSITE_CHARS
4618   if (iso->composite_chars)
4619     Dynarr_reset (iso->composite_chars);
4620 #endif
4621 }
4622
4623 static int
4624 fit_to_be_escape_quoted (unsigned char c)
4625 {
4626   switch (c)
4627     {
4628     case ISO_CODE_ESC:
4629     case ISO_CODE_CSI:
4630     case ISO_CODE_SS2:
4631     case ISO_CODE_SS3:
4632     case ISO_CODE_SO:
4633     case ISO_CODE_SI:
4634       return 1;
4635
4636     default:
4637       return 0;
4638     }
4639 }
4640
4641 /* Parse one byte of an ISO2022 escape sequence.
4642    If the result is an invalid escape sequence, return 0 and
4643    do not change anything in STR.  Otherwise, if the result is
4644    an incomplete escape sequence, update ISO2022.ESC and
4645    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4646    all the state variables (but not ISO2022.ESC_BYTES) and
4647    return 1.
4648
4649    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4650    or invocation of an invalid character set and treat that as
4651    an unrecognized escape sequence. */
4652
4653 static int
4654 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4655                    unsigned char c, unsigned int *flags,
4656                    int check_invalid_charsets)
4657 {
4658   /* (1) If we're at the end of a designation sequence, CS is the
4659      charset being designated and REG is the register to designate
4660      it to.
4661
4662      (2) If we're at the end of a locking-shift sequence, REG is
4663      the register to invoke and HALF (0 == left, 1 == right) is
4664      the half to invoke it into.
4665
4666      (3) If we're at the end of a single-shift sequence, REG is
4667      the register to invoke. */
4668   Lisp_Object cs = Qnil;
4669   int reg, half;
4670
4671   /* NOTE: This code does goto's all over the fucking place.
4672      The reason for this is that we're basically implementing
4673      a state machine here, and hierarchical languages like C
4674      don't really provide a clean way of doing this. */
4675
4676   if (! (*flags & CODING_STATE_ESCAPE))
4677     /* At beginning of escape sequence; we need to reset our
4678        escape-state variables. */
4679     iso->esc = ISO_ESC_NOTHING;
4680
4681   iso->output_literally = 0;
4682   iso->output_direction_sequence = 0;
4683
4684   switch (iso->esc)
4685     {
4686     case ISO_ESC_NOTHING:
4687       iso->esc_bytes_index = 0;
4688       switch (c)
4689         {
4690         case ISO_CODE_ESC:      /* Start escape sequence */
4691           *flags |= CODING_STATE_ESCAPE;
4692           iso->esc = ISO_ESC;
4693           goto not_done;
4694
4695         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4696           *flags |= CODING_STATE_ESCAPE;
4697           iso->esc = ISO_ESC_5_11;
4698           goto not_done;
4699
4700         case ISO_CODE_SO:       /* locking shift 1 */
4701           reg = 1; half = 0;
4702           goto locking_shift;
4703         case ISO_CODE_SI:       /* locking shift 0 */
4704           reg = 0; half = 0;
4705           goto locking_shift;
4706
4707         case ISO_CODE_SS2:      /* single shift */
4708           reg = 2;
4709           goto single_shift;
4710         case ISO_CODE_SS3:      /* single shift */
4711           reg = 3;
4712           goto single_shift;
4713
4714         default:                        /* Other control characters */
4715           return 0;
4716         }
4717
4718     case ISO_ESC:
4719       switch (c)
4720         {
4721           /**** single shift ****/
4722
4723         case 'N':       /* single shift 2 */
4724           reg = 2;
4725           goto single_shift;
4726         case 'O':       /* single shift 3 */
4727           reg = 3;
4728           goto single_shift;
4729
4730           /**** locking shift ****/
4731
4732         case '~':       /* locking shift 1 right */
4733           reg = 1; half = 1;
4734           goto locking_shift;
4735         case 'n':       /* locking shift 2 */
4736           reg = 2; half = 0;
4737           goto locking_shift;
4738         case '}':       /* locking shift 2 right */
4739           reg = 2; half = 1;
4740           goto locking_shift;
4741         case 'o':       /* locking shift 3 */
4742           reg = 3; half = 0;
4743           goto locking_shift;
4744         case '|':       /* locking shift 3 right */
4745           reg = 3; half = 1;
4746           goto locking_shift;
4747
4748 #ifdef ENABLE_COMPOSITE_CHARS
4749           /**** composite ****/
4750
4751         case '0':
4752           iso->esc = ISO_ESC_START_COMPOSITE;
4753           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4754             CODING_STATE_COMPOSITE;
4755           return 1;
4756
4757         case '1':
4758           iso->esc = ISO_ESC_END_COMPOSITE;
4759           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4760             ~CODING_STATE_COMPOSITE;
4761           return 1;
4762 #endif /* ENABLE_COMPOSITE_CHARS */
4763
4764           /**** directionality ****/
4765
4766         case '[':
4767           iso->esc = ISO_ESC_5_11;
4768           goto not_done;
4769
4770           /**** designation ****/
4771
4772         case '$':       /* multibyte charset prefix */
4773           iso->esc = ISO_ESC_2_4;
4774           goto not_done;
4775
4776         default:
4777           if (0x28 <= c && c <= 0x2F)
4778             {
4779               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4780               goto not_done;
4781             }
4782
4783           /* This function is called with CODESYS equal to nil when
4784              doing coding-system detection. */
4785           if (!NILP (codesys)
4786               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4787               && fit_to_be_escape_quoted (c))
4788             {
4789               iso->esc = ISO_ESC_LITERAL;
4790               *flags &= CODING_STATE_ISO2022_LOCK;
4791               return 1;
4792             }
4793
4794           /* bzzzt! */
4795           return 0;
4796         }
4797
4798
4799
4800       /**** directionality ****/
4801
4802     case ISO_ESC_5_11:          /* ISO6429 direction control */
4803       if (c == ']')
4804         {
4805           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4806           goto directionality;
4807         }
4808       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4809       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4810       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4811       else               return 0;
4812       goto not_done;
4813
4814     case ISO_ESC_5_11_0:
4815       if (c == ']')
4816         {
4817           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4818           goto directionality;
4819         }
4820       return 0;
4821
4822     case ISO_ESC_5_11_1:
4823       if (c == ']')
4824         {
4825           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4826           goto directionality;
4827         }
4828       return 0;
4829
4830     case ISO_ESC_5_11_2:
4831       if (c == ']')
4832         {
4833           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4834           goto directionality;
4835         }
4836       return 0;
4837
4838     directionality:
4839       iso->esc = ISO_ESC_DIRECTIONALITY;
4840       /* Various junk here to attempt to preserve the direction sequences
4841          literally in the text if they would otherwise be swallowed due
4842          to invalid designations that don't show up as actual charset
4843          changes in the text. */
4844       if (iso->invalid_switch_dir)
4845         {
4846           /* We already inserted a direction switch literally into the
4847              text.  We assume (#### this may not be right) that the
4848              next direction switch is the one going the other way,
4849              and we need to output that literally as well. */
4850           iso->output_literally = 1;
4851           iso->invalid_switch_dir = 0;
4852         }
4853       else
4854         {
4855           int jj;
4856
4857           /* If we are in the thrall of an invalid designation,
4858            then stick the directionality sequence literally into the
4859            output stream so it ends up in the original text again. */
4860           for (jj = 0; jj < 4; jj++)
4861             if (iso->invalid_designated[jj])
4862               break;
4863           if (jj < 4)
4864             {
4865               iso->output_literally = 1;
4866               iso->invalid_switch_dir = 1;
4867             }
4868           else
4869             /* Indicate that we haven't yet seen a valid designation,
4870                so that if a switch-dir is directly followed by an
4871                invalid designation, both get inserted literally. */
4872             iso->switched_dir_and_no_valid_charset_yet = 1;
4873         }
4874       return 1;
4875
4876
4877       /**** designation ****/
4878
4879     case ISO_ESC_2_4:
4880       if (0x28 <= c && c <= 0x2F)
4881         {
4882           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4883           goto not_done;
4884         }
4885       if (0x40 <= c && c <= 0x42)
4886         {
4887           /* 94^n-set */
4888           cs = CHARSET_BY_ATTRIBUTES (94, -1, c,
4889                                       *flags & CODING_STATE_R2L ?
4890                                       CHARSET_RIGHT_TO_LEFT :
4891                                       CHARSET_LEFT_TO_RIGHT);
4892           reg = 0;
4893           goto designated;
4894         }
4895       return 0;
4896
4897     default:
4898       {
4899         int chars = 0;
4900         int single = 0;
4901
4902         if (c < '0' || c > '~')
4903           return 0; /* bad final byte */
4904
4905         if (iso->esc >= ISO_ESC_2_8 &&
4906             iso->esc <= ISO_ESC_2_15)
4907           {
4908             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4909             single = 1; /* single-byte */
4910             reg = (iso->esc - ISO_ESC_2_8) & 3;
4911           }
4912         else if (iso->esc >= ISO_ESC_2_4_8 &&
4913                  iso->esc <= ISO_ESC_2_4_15)
4914           {
4915             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4916             single = -1; /* multi-byte */
4917             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4918           }
4919         else
4920           {
4921             /* Can this ever be reached? -slb */
4922             abort();
4923           }
4924
4925         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4926                                     *flags & CODING_STATE_R2L ?
4927                                     CHARSET_RIGHT_TO_LEFT :
4928                                     CHARSET_LEFT_TO_RIGHT);
4929         goto designated;
4930       }
4931     }
4932
4933  not_done:
4934   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4935   return -1;
4936
4937  single_shift:
4938   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4939     /* can't invoke something that ain't there. */
4940     return 0;
4941   iso->esc = ISO_ESC_SINGLE_SHIFT;
4942   *flags &= CODING_STATE_ISO2022_LOCK;
4943   if (reg == 2)
4944     *flags |= CODING_STATE_SS2;
4945   else
4946     *flags |= CODING_STATE_SS3;
4947   return 1;
4948
4949  locking_shift:
4950   if (check_invalid_charsets &&
4951       !CHARSETP (iso->charset[reg]))
4952     /* can't invoke something that ain't there. */
4953     return 0;
4954   if (half)
4955     iso->register_right = reg;
4956   else
4957     iso->register_left = reg;
4958   *flags &= CODING_STATE_ISO2022_LOCK;
4959   iso->esc = ISO_ESC_LOCKING_SHIFT;
4960   return 1;
4961
4962  designated:
4963   if (NILP (cs) && check_invalid_charsets)
4964     {
4965       iso->invalid_designated[reg] = 1;
4966       iso->charset[reg] = Vcharset_ascii;
4967       iso->esc = ISO_ESC_DESIGNATE;
4968       *flags &= CODING_STATE_ISO2022_LOCK;
4969       iso->output_literally = 1;
4970       if (iso->switched_dir_and_no_valid_charset_yet)
4971         {
4972           /* We encountered a switch-direction followed by an
4973              invalid designation.  Ensure that the switch-direction
4974              gets outputted; otherwise it will probably get eaten
4975              when the text is written out again. */
4976           iso->switched_dir_and_no_valid_charset_yet = 0;
4977           iso->output_direction_sequence = 1;
4978           /* And make sure that the switch-dir going the other
4979              way gets outputted, as well. */
4980           iso->invalid_switch_dir = 1;
4981         }
4982       return 1;
4983     }
4984   /* This function is called with CODESYS equal to nil when
4985      doing coding-system detection. */
4986   if (!NILP (codesys))
4987     {
4988       charset_conversion_spec_dynarr *dyn =
4989         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4990
4991       if (dyn)
4992         {
4993           int i;
4994
4995           for (i = 0; i < Dynarr_length (dyn); i++)
4996             {
4997               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4998               if (EQ (cs, spec->from_charset))
4999                 cs = spec->to_charset;
5000             }
5001         }
5002     }
5003
5004   iso->charset[reg] = cs;
5005   iso->esc = ISO_ESC_DESIGNATE;
5006   *flags &= CODING_STATE_ISO2022_LOCK;
5007   if (iso->invalid_designated[reg])
5008     {
5009       iso->invalid_designated[reg] = 0;
5010       iso->output_literally = 1;
5011     }
5012   if (iso->switched_dir_and_no_valid_charset_yet)
5013     iso->switched_dir_and_no_valid_charset_yet = 0;
5014   return 1;
5015 }
5016
5017 static int
5018 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
5019 {
5020   int mask;
5021
5022   /* #### There are serious deficiencies in the recognition mechanism
5023      here.  This needs to be much smarter if it's going to cut it.
5024      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
5025      it should be detected as Latin-1.
5026      All the ISO2022 stuff in this file should be synced up with the
5027      code from FSF Emacs-20.4, in which Mule should be more or less stable.
5028      Perhaps we should wait till R2L works in FSF Emacs? */
5029
5030   if (!st->iso2022.initted)
5031     {
5032       reset_iso2022 (Qnil, &st->iso2022.iso);
5033       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
5034                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5035                           CODING_CATEGORY_ISO_8_1_MASK |
5036                           CODING_CATEGORY_ISO_8_2_MASK |
5037                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
5038       st->iso2022.flags = 0;
5039       st->iso2022.high_byte_count = 0;
5040       st->iso2022.saw_single_shift = 0;
5041       st->iso2022.initted = 1;
5042     }
5043
5044   mask = st->iso2022.mask;
5045
5046   while (n--)
5047     {
5048       unsigned char c = *(unsigned char *)src++;
5049       if (c >= 0xA0)
5050         {
5051           mask &= ~CODING_CATEGORY_ISO_7_MASK;
5052           st->iso2022.high_byte_count++;
5053         }
5054       else
5055         {
5056           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
5057             {
5058               if (st->iso2022.high_byte_count & 1)
5059                 /* odd number of high bytes; assume not iso-8-2 */
5060                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5061             }
5062           st->iso2022.high_byte_count = 0;
5063           st->iso2022.saw_single_shift = 0;
5064           if (c > 0x80)
5065             mask &= ~CODING_CATEGORY_ISO_7_MASK;
5066         }
5067       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
5068           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
5069         { /* control chars */
5070           switch (c)
5071             {
5072               /* Allow and ignore control characters that you might
5073                  reasonably see in a text file */
5074             case '\r':
5075             case '\n':
5076             case '\t':
5077             case  7: /* bell */
5078             case  8: /* backspace */
5079             case 11: /* vertical tab */
5080             case 12: /* form feed */
5081             case 26: /* MS-DOS C-z junk */
5082             case 31: /* '^_' -- for info */
5083               goto label_continue_loop;
5084
5085             default:
5086               break;
5087             }
5088         }
5089
5090       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
5091           || BYTE_C1_P (c))
5092         {
5093           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
5094                                  &st->iso2022.flags, 0))
5095             {
5096               switch (st->iso2022.iso.esc)
5097                 {
5098                 case ISO_ESC_DESIGNATE:
5099                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
5100                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
5101                   break;
5102                 case ISO_ESC_LOCKING_SHIFT:
5103                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5104                   goto ran_out_of_chars;
5105                 case ISO_ESC_SINGLE_SHIFT:
5106                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5107                   st->iso2022.saw_single_shift = 1;
5108                   break;
5109                 default:
5110                   break;
5111                 }
5112             }
5113           else
5114             {
5115               mask = 0;
5116               goto ran_out_of_chars;
5117             }
5118         }
5119     label_continue_loop:;
5120     }
5121
5122  ran_out_of_chars:
5123
5124   return mask;
5125 }
5126
5127 static int
5128 postprocess_iso2022_mask (int mask)
5129 {
5130   /* #### kind of cheesy */
5131   /* If seven-bit ISO is allowed, then assume that the encoding is
5132      entirely seven-bit and turn off the eight-bit ones. */
5133   if (mask & CODING_CATEGORY_ISO_7_MASK)
5134     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5135                CODING_CATEGORY_ISO_8_1_MASK |
5136                CODING_CATEGORY_ISO_8_2_MASK);
5137   return mask;
5138 }
5139
5140 /* If FLAGS is a null pointer or specifies right-to-left motion,
5141    output a switch-dir-to-left-to-right sequence to DST.
5142    Also update FLAGS if it is not a null pointer.
5143    If INTERNAL_P is set, we are outputting in internal format and
5144    need to handle the CSI differently. */
5145
5146 static void
5147 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5148                                  unsigned_char_dynarr *dst,
5149                                  unsigned int *flags,
5150                                  int internal_p)
5151 {
5152   if (!flags || (*flags & CODING_STATE_R2L))
5153     {
5154       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5155         {
5156           Dynarr_add (dst, ISO_CODE_ESC);
5157           Dynarr_add (dst, '[');
5158         }
5159       else if (internal_p)
5160         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5161       else
5162         Dynarr_add (dst, ISO_CODE_CSI);
5163       Dynarr_add (dst, '0');
5164       Dynarr_add (dst, ']');
5165       if (flags)
5166         *flags &= ~CODING_STATE_R2L;
5167     }
5168 }
5169
5170 /* If FLAGS is a null pointer or specifies a direction different from
5171    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5172    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5173    sequence to DST.  Also update FLAGS if it is not a null pointer.
5174    If INTERNAL_P is set, we are outputting in internal format and
5175    need to handle the CSI differently. */
5176
5177 static void
5178 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5179                           unsigned_char_dynarr *dst, unsigned int *flags,
5180                           int internal_p)
5181 {
5182   if ((!flags || (*flags & CODING_STATE_R2L)) &&
5183       direction == CHARSET_LEFT_TO_RIGHT)
5184     restore_left_to_right_direction (codesys, dst, flags, internal_p);
5185   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5186            && (!flags || !(*flags & CODING_STATE_R2L)) &&
5187            direction == CHARSET_RIGHT_TO_LEFT)
5188     {
5189       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5190         {
5191           Dynarr_add (dst, ISO_CODE_ESC);
5192           Dynarr_add (dst, '[');
5193         }
5194       else if (internal_p)
5195         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5196       else
5197         Dynarr_add (dst, ISO_CODE_CSI);
5198       Dynarr_add (dst, '2');
5199       Dynarr_add (dst, ']');
5200       if (flags)
5201         *flags |= CODING_STATE_R2L;
5202     }
5203 }
5204
5205 /* Convert ISO2022-format data to internal format. */
5206
5207 static void
5208 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5209                        unsigned_char_dynarr *dst, size_t n)
5210 {
5211   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5212   unsigned int flags    = str->flags;
5213   unsigned int cpos     = str->cpos;
5214   unsigned char counter = str->counter;
5215   eol_type_t eol_type   = str->eol_type;
5216 #ifdef ENABLE_COMPOSITE_CHARS
5217   unsigned_char_dynarr *real_dst = dst;
5218 #endif
5219   Lisp_Object coding_system;
5220
5221   XSETCODING_SYSTEM (coding_system, str->codesys);
5222
5223 #ifdef ENABLE_COMPOSITE_CHARS
5224   if (flags & CODING_STATE_COMPOSITE)
5225     dst = str->iso2022.composite_chars;
5226 #endif /* ENABLE_COMPOSITE_CHARS */
5227
5228   while (n--)
5229     {
5230       unsigned char c = *(unsigned char *)src++;
5231       if (flags & CODING_STATE_ESCAPE)
5232         {       /* Within ESC sequence */
5233           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5234                                           c, &flags, 1);
5235
5236           if (retval)
5237             {
5238               switch (str->iso2022.esc)
5239                 {
5240 #ifdef ENABLE_COMPOSITE_CHARS
5241                 case ISO_ESC_START_COMPOSITE:
5242                   if (str->iso2022.composite_chars)
5243                     Dynarr_reset (str->iso2022.composite_chars);
5244                   else
5245                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5246                   dst = str->iso2022.composite_chars;
5247                   break;
5248                 case ISO_ESC_END_COMPOSITE:
5249                   {
5250                     Bufbyte comstr[MAX_EMCHAR_LEN];
5251                     Bytecount len;
5252                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5253                                                          Dynarr_length (dst));
5254                     dst = real_dst;
5255                     len = set_charptr_emchar (comstr, emch);
5256                     Dynarr_add_many (dst, comstr, len);
5257                     break;
5258                   }
5259 #endif /* ENABLE_COMPOSITE_CHARS */
5260
5261                 case ISO_ESC_LITERAL:
5262                   COMPOSE_FLUSH_CHARS (str, dst);
5263                   decode_flush_er_chars (str, dst);
5264                   DECODE_ADD_BINARY_CHAR (c, dst);
5265                   break;
5266
5267                 default:
5268                   /* Everything else handled already */
5269                   break;
5270                 }
5271             }
5272
5273           /* Attempted error recovery. */
5274           if (str->iso2022.output_direction_sequence)
5275             ensure_correct_direction (flags & CODING_STATE_R2L ?
5276                                       CHARSET_RIGHT_TO_LEFT :
5277                                       CHARSET_LEFT_TO_RIGHT,
5278                                       str->codesys, dst, 0, 1);
5279           /* More error recovery. */
5280           if (!retval || str->iso2022.output_literally)
5281             {
5282               /* Output the (possibly invalid) sequence */
5283               int i;
5284               COMPOSE_FLUSH_CHARS (str, dst);
5285               decode_flush_er_chars (str, dst);
5286               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5287                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5288               flags &= CODING_STATE_ISO2022_LOCK;
5289               if (!retval)
5290                 n++, src--;/* Repeat the loop with the same character. */
5291               else
5292                 {
5293                   /* No sense in reprocessing the final byte of the
5294                      escape sequence; it could mess things up anyway.
5295                      Just add it now. */
5296                   COMPOSE_FLUSH_CHARS (str, dst);
5297                   decode_flush_er_chars (str, dst);
5298                   DECODE_ADD_BINARY_CHAR (c, dst);
5299                 }
5300             }
5301           cpos = 0;
5302           counter = 0;
5303         }
5304       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5305         { /* Control characters */
5306
5307           /***** Error-handling *****/
5308
5309           /* If we were in the middle of a character, dump out the
5310              partial character. */
5311           if (counter)
5312             {
5313               COMPOSE_FLUSH_CHARS (str, dst);
5314               decode_flush_er_chars (str, dst);
5315               while (counter > 0)
5316                 {
5317                   counter--;
5318                   DECODE_ADD_BINARY_CHAR
5319                     ((unsigned char)(cpos >> (counter * 8)), dst);
5320                 }
5321               cpos = 0;
5322             }
5323
5324           /* If we just saw a single-shift character, dump it out.
5325              This may dump out the wrong sort of single-shift character,
5326              but least it will give an indication that something went
5327              wrong. */
5328           if (flags & CODING_STATE_SS2)
5329             {
5330               COMPOSE_FLUSH_CHARS (str, dst);
5331               decode_flush_er_chars (str, dst);
5332               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5333               flags &= ~CODING_STATE_SS2;
5334             }
5335           if (flags & CODING_STATE_SS3)
5336             {
5337               COMPOSE_FLUSH_CHARS (str, dst);
5338               decode_flush_er_chars (str, dst);
5339               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5340               flags &= ~CODING_STATE_SS3;
5341             }
5342
5343           /***** Now handle the control characters. *****/
5344
5345           /* Handle CR/LF */
5346 #ifdef UTF2000
5347           if (c == '\r')
5348             {
5349               COMPOSE_FLUSH_CHARS (str, dst);
5350               decode_flush_er_chars (str, dst);
5351               if (eol_type == EOL_CR)
5352                 Dynarr_add (dst, '\n');
5353               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5354                 Dynarr_add (dst, c);
5355               else
5356                 flags |= CODING_STATE_CR;
5357               goto label_continue_loop;
5358             }
5359           else if (flags & CODING_STATE_CR)
5360             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5361               if (c != '\n')
5362                 Dynarr_add (dst, '\r');
5363               flags &= ~CODING_STATE_CR;
5364             }
5365 #else
5366           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5367 #endif
5368
5369           flags &= CODING_STATE_ISO2022_LOCK;
5370
5371           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5372             {
5373               COMPOSE_FLUSH_CHARS (str, dst);
5374               decode_flush_er_chars (str, dst);
5375               DECODE_ADD_BINARY_CHAR (c, dst);
5376             }
5377         }
5378       else
5379         {                       /* Graphic characters */
5380           Lisp_Object charset;
5381 #ifndef UTF2000
5382           Charset_ID lb;
5383 #endif
5384           int reg;
5385
5386 #ifdef UTF2000
5387           if (c == '\r')
5388             {
5389               COMPOSE_FLUSH_CHARS (str, dst);
5390               decode_flush_er_chars (str, dst);
5391               if (eol_type == EOL_CR)
5392                 Dynarr_add (dst, '\n');
5393               else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR)
5394                 Dynarr_add (dst, c);
5395               else
5396                 flags |= CODING_STATE_CR;
5397               goto label_continue_loop;
5398             }
5399           else if (flags & CODING_STATE_CR)
5400             {   /* eol_type == CODING_SYSTEM_EOL_CRLF */
5401               if (c != '\n')
5402                 Dynarr_add (dst, '\r');
5403               flags &= ~CODING_STATE_CR;
5404             }
5405 #else
5406           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5407 #endif
5408
5409           /* Now determine the charset. */
5410           reg = ((flags & CODING_STATE_SS2) ? 2
5411                  : (flags & CODING_STATE_SS3) ? 3
5412                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5413                  : str->iso2022.register_left);
5414           charset = str->iso2022.charset[reg];
5415
5416           /* Error checking: */
5417           if (! CHARSETP (charset)
5418               || str->iso2022.invalid_designated[reg]
5419               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5420                   && XCHARSET_CHARS (charset) == 94))
5421             /* Mrmph.  We are trying to invoke a register that has no
5422                or an invalid charset in it, or trying to add a character
5423                outside the range of the charset.  Insert that char literally
5424                to preserve it for the output. */
5425             {
5426               COMPOSE_FLUSH_CHARS (str, dst);
5427               decode_flush_er_chars (str, dst);
5428               while (counter > 0)
5429                 {
5430                   counter--;
5431                   DECODE_ADD_BINARY_CHAR
5432                     ((unsigned char)(cpos >> (counter * 8)), dst);
5433                 }
5434               cpos = 0;
5435               DECODE_ADD_BINARY_CHAR (c, dst);
5436             }
5437
5438           else
5439             {
5440               /* Things are probably hunky-dorey. */
5441
5442               /* Fetch reverse charset, maybe. */
5443               if (((flags & CODING_STATE_R2L) &&
5444                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5445                   ||
5446                   (!(flags & CODING_STATE_R2L) &&
5447                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5448                 {
5449                   Lisp_Object new_charset =
5450                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5451                   if (!NILP (new_charset))
5452                     charset = new_charset;
5453                 }
5454
5455 #ifdef UTF2000
5456               counter++;
5457               if (XCHARSET_DIMENSION (charset) == counter)
5458                 {
5459                   COMPOSE_ADD_CHAR (str,
5460                                     DECODE_CHAR (charset,
5461                                                  ((cpos & 0x7F7F7F) << 8)
5462                                                  | (c & 0x7F)),
5463                                     dst);
5464                   cpos = 0;
5465                   counter = 0;
5466                 }
5467               else
5468                 cpos = (cpos << 8) | c;
5469 #else
5470               lb = XCHARSET_LEADING_BYTE (charset);
5471               switch (XCHARSET_REP_BYTES (charset))
5472                 {
5473                 case 1: /* ASCII */
5474                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5475                   Dynarr_add (dst, c & 0x7F);
5476                   break;
5477
5478                 case 2: /* one-byte official */
5479                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5480                   Dynarr_add (dst, lb);
5481                   Dynarr_add (dst, c | 0x80);
5482                   break;
5483
5484                 case 3: /* one-byte private or two-byte official */
5485                   if (XCHARSET_PRIVATE_P (charset))
5486                     {
5487                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5488                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5489                       Dynarr_add (dst, lb);
5490                       Dynarr_add (dst, c | 0x80);
5491                     }
5492                   else
5493                     {
5494                       if (ch)
5495                         {
5496                           Dynarr_add (dst, lb);
5497                           Dynarr_add (dst, ch | 0x80);
5498                           Dynarr_add (dst, c | 0x80);
5499                           ch = 0;
5500                         }
5501                       else
5502                         ch = c;
5503                     }
5504                   break;
5505
5506                 default:        /* two-byte private */
5507                   if (ch)
5508                     {
5509                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5510                       Dynarr_add (dst, lb);
5511                       Dynarr_add (dst, ch | 0x80);
5512                       Dynarr_add (dst, c | 0x80);
5513                       ch = 0;
5514                     }
5515                   else
5516                     ch = c;
5517                 }
5518 #endif
5519             }
5520
5521           if (!cpos)
5522             flags &= CODING_STATE_ISO2022_LOCK;
5523         }
5524
5525     label_continue_loop:;
5526     }
5527
5528   if (flags & CODING_STATE_END)
5529     {
5530       COMPOSE_FLUSH_CHARS (str, dst);
5531       decode_flush_er_chars (str, dst);
5532       DECODE_OUTPUT_PARTIAL_CHAR (cpos);
5533     }
5534   str->flags   = flags;
5535   str->cpos    = cpos;
5536   str->counter = counter;
5537 }
5538
5539
5540 /***** ISO2022 encoder *****/
5541
5542 /* Designate CHARSET into register REG. */
5543
5544 static void
5545 iso2022_designate (Lisp_Object charset, unsigned char reg,
5546                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5547 {
5548   static const char inter94[] = "()*+";
5549   static const char inter96[] = ",-./";
5550   unsigned short chars;
5551   unsigned char dimension;
5552   unsigned char final;
5553   Lisp_Object old_charset = str->iso2022.charset[reg];
5554
5555   str->iso2022.charset[reg] = charset;
5556   if (!CHARSETP (charset))
5557     /* charset might be an initial nil or t. */
5558     return;
5559   chars = XCHARSET_CHARS (charset);
5560   dimension = XCHARSET_DIMENSION (charset);
5561   final = XCHARSET_FINAL (charset);
5562   if (!str->iso2022.force_charset_on_output[reg] &&
5563       CHARSETP (old_charset) &&
5564       XCHARSET_CHARS (old_charset) == chars &&
5565       XCHARSET_DIMENSION (old_charset) == dimension &&
5566       XCHARSET_FINAL (old_charset) == final)
5567     return;
5568
5569   str->iso2022.force_charset_on_output[reg] = 0;
5570
5571   {
5572     charset_conversion_spec_dynarr *dyn =
5573       str->codesys->iso2022.output_conv;
5574
5575     if (dyn)
5576       {
5577         int i;
5578
5579         for (i = 0; i < Dynarr_length (dyn); i++)
5580           {
5581             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5582             if (EQ (charset, spec->from_charset))
5583                 charset = spec->to_charset;
5584           }
5585       }
5586   }
5587
5588   Dynarr_add (dst, ISO_CODE_ESC);
5589   switch (chars)
5590     {
5591     case 94:
5592       if (dimension == 1)
5593         Dynarr_add (dst, inter94[reg]);
5594       else
5595         {
5596           Dynarr_add (dst, '$');
5597           if (reg != 0
5598               || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5599               || final < '@'
5600               || final > 'B')
5601             Dynarr_add (dst, inter94[reg]);
5602         }
5603       break;
5604     case 96:
5605       if (dimension == 1)
5606         Dynarr_add (dst, inter96[reg]);
5607       else
5608         {
5609           Dynarr_add (dst, '$');
5610           Dynarr_add (dst, inter96[reg]);
5611         }
5612       break;
5613     }
5614   Dynarr_add (dst, final);
5615 }
5616
5617 static void
5618 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5619 {
5620   if (str->iso2022.register_left != 0)
5621     {
5622       Dynarr_add (dst, ISO_CODE_SI);
5623       str->iso2022.register_left = 0;
5624     }
5625 }
5626
5627 static void
5628 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5629 {
5630   if (str->iso2022.register_left != 1)
5631     {
5632       Dynarr_add (dst, ISO_CODE_SO);
5633       str->iso2022.register_left = 1;
5634     }
5635 }
5636
5637 void
5638 char_encode_iso2022 (struct encoding_stream *str, Emchar ch,
5639                      unsigned_char_dynarr *dst, unsigned int *flags)
5640 {
5641   unsigned char charmask;
5642   Lisp_Coding_System* codesys = str->codesys;
5643   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5644   int i;
5645   Lisp_Object charset = str->iso2022.current_charset;
5646   int half = str->iso2022.current_half;
5647   int code_point = -1;
5648
5649   if (ch <= 0x7F)
5650     {
5651       restore_left_to_right_direction (codesys, dst, flags, 0);
5652
5653       /* Make sure G0 contains ASCII */
5654       if ((ch > ' ' && ch < ISO_CODE_DEL)
5655           || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5656         {
5657           ensure_normal_shift (str, dst);
5658           iso2022_designate (Vcharset_ascii, 0, str, dst);
5659         }
5660
5661       /* If necessary, restore everything to the default state
5662          at end-of-line */
5663       if (ch == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5664         {
5665           restore_left_to_right_direction (codesys, dst, flags, 0);
5666
5667           ensure_normal_shift (str, dst);
5668
5669           for (i = 0; i < 4; i++)
5670             {
5671               Lisp_Object initial_charset =
5672                 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5673               iso2022_designate (initial_charset, i, str, dst);
5674             }
5675         }
5676       if (ch == '\n')
5677         {
5678           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5679             Dynarr_add (dst, '\r');
5680           if (eol_type != EOL_CR)
5681             Dynarr_add (dst, ch);
5682         }
5683       else
5684         {
5685           if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5686               && fit_to_be_escape_quoted (ch))
5687             Dynarr_add (dst, ISO_CODE_ESC);
5688           Dynarr_add (dst, ch);
5689         }
5690     }
5691   else if ( (0x80 <= ch) && (ch <= 0x9f) )
5692     {
5693       charmask = (half == 0 ? 0x00 : 0x80);
5694
5695       if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5696           && fit_to_be_escape_quoted (ch))
5697         Dynarr_add (dst, ISO_CODE_ESC);
5698       /* you asked for it ... */
5699       Dynarr_add (dst, ch);
5700     }
5701   else
5702     {
5703       int reg;
5704
5705       /* Now determine which register to use. */
5706       reg = -1;
5707       for (i = 0; i < 4; i++)
5708         {
5709           if ((CHARSETP (charset = str->iso2022.charset[i])
5710                && ((code_point = charset_code_point (charset, ch)) >= 0))
5711               ||
5712               (CHARSETP
5713                (charset
5714                 = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))
5715                && ((code_point = charset_code_point (charset, ch)) >= 0)))
5716             {
5717               reg = i;
5718               break;
5719             }
5720         }
5721       if (reg == -1)
5722         {
5723           Lisp_Object original_default_coded_charset_priority_list
5724             = Vdefault_coded_charset_priority_list;
5725
5726           while (!EQ (Vdefault_coded_charset_priority_list, Qnil))
5727             {
5728               code_point = ENCODE_CHAR (ch, charset);
5729               if (XCHARSET_FINAL (charset))
5730                 goto found;
5731               Vdefault_coded_charset_priority_list
5732                 = Fcdr (Fmemq (XCHARSET_NAME (charset),
5733                                Vdefault_coded_charset_priority_list));
5734             }
5735           code_point = ENCODE_CHAR (ch, charset);
5736           if (!XCHARSET_FINAL (charset))
5737             {
5738               charset = Vcharset_ascii;
5739               code_point = '~';
5740             }
5741         found:
5742           Vdefault_coded_charset_priority_list
5743             = original_default_coded_charset_priority_list;
5744         }
5745       ensure_correct_direction (XCHARSET_DIRECTION (charset),
5746                                 codesys, dst, flags, 0);
5747
5748       if (reg == -1)
5749         {
5750           if (XCHARSET_GRAPHIC (charset) != 0)
5751             {
5752               if (!NILP (str->iso2022.charset[1]) &&
5753                   (!CODING_SYSTEM_ISO2022_SEVEN (codesys)
5754                    || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5755                 reg = 1;
5756               else if (!NILP (str->iso2022.charset[2]))
5757                 reg = 2;
5758               else if (!NILP (str->iso2022.charset[3]))
5759                 reg = 3;
5760               else
5761                 reg = 0;
5762             }
5763           else
5764             reg = 0;
5765         }
5766
5767       iso2022_designate (charset, reg, str, dst);
5768
5769       /* Now invoke that register. */
5770       switch (reg)
5771         {
5772         case 0:
5773           ensure_normal_shift (str, dst);
5774           half = 0;
5775           break;
5776         case 1:
5777           if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5778             {
5779               ensure_shift_out (str, dst);
5780               half = 0;
5781             }
5782           else
5783             half = 1;
5784           break;
5785         case 2:
5786           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5787             {
5788               Dynarr_add (dst, ISO_CODE_ESC);
5789               Dynarr_add (dst, 'N');
5790               half = 0;
5791             }
5792           else
5793             {
5794               Dynarr_add (dst, ISO_CODE_SS2);
5795               half = 1;
5796             }
5797           break;
5798         case 3:
5799           if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5800             {
5801               Dynarr_add (dst, ISO_CODE_ESC);
5802               Dynarr_add (dst, 'O');
5803               half = 0;
5804             }
5805           else
5806             {
5807               Dynarr_add (dst, ISO_CODE_SS3);
5808               half = 1;
5809             }
5810           break;
5811         default:
5812           abort ();
5813         }
5814
5815       charmask = (half == 0 ? 0x00 : 0x80);
5816
5817       switch (XCHARSET_DIMENSION (charset))
5818         {
5819         case 1:
5820           Dynarr_add (dst, (code_point & 0xFF) | charmask);
5821           break;
5822         case 2:
5823           Dynarr_add (dst, ((code_point >> 8) & 0xFF) | charmask);
5824           Dynarr_add (dst, ( code_point       & 0xFF) | charmask);
5825           break;
5826         case 3:
5827           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5828           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5829           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5830           break;
5831         case 4:
5832           Dynarr_add (dst, ((code_point >> 24) & 0xFF) | charmask);
5833           Dynarr_add (dst, ((code_point >> 16) & 0xFF) | charmask);
5834           Dynarr_add (dst, ((code_point >>  8) & 0xFF) | charmask);
5835           Dynarr_add (dst, ( code_point        & 0xFF) | charmask);
5836           break;
5837         default:
5838           abort ();
5839         }
5840     }
5841   str->iso2022.current_charset = charset;
5842   str->iso2022.current_half = half;
5843 }
5844
5845 void
5846 char_finish_iso2022 (struct encoding_stream *str, unsigned_char_dynarr *dst,
5847                      unsigned int *flags)
5848 {
5849   Lisp_Coding_System* codesys = str->codesys;
5850   int i;
5851
5852   restore_left_to_right_direction (codesys, dst, flags, 0);
5853   ensure_normal_shift (str, dst);
5854   for (i = 0; i < 4; i++)
5855     {
5856       Lisp_Object initial_charset
5857         = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5858       iso2022_designate (initial_charset, i, str, dst);
5859     }
5860 }
5861 #endif /* MULE */
5862 \f
5863 /************************************************************************/
5864 /*                     No-conversion methods                            */
5865 /************************************************************************/
5866
5867 /* This is used when reading in "binary" files -- i.e. files that may
5868    contain all 256 possible byte values and that are not to be
5869    interpreted as being in any particular decoding. */
5870 static void
5871 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5872                              unsigned_char_dynarr *dst, size_t n)
5873 {
5874   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5875   unsigned int flags  = str->flags;
5876   unsigned int cpos   = str->cpos;
5877   eol_type_t eol_type = str->eol_type;
5878
5879   while (n--)
5880     {
5881       unsigned char c = *(unsigned char *)src++;
5882
5883       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5884       DECODE_ADD_BINARY_CHAR (c, dst);
5885     label_continue_loop:;
5886     }
5887
5888   DECODE_HANDLE_END_OF_CONVERSION (flags, cpos, dst);
5889
5890   str->flags = flags;
5891   str->cpos  = cpos;
5892 }
5893
5894 static void
5895 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
5896                              unsigned_char_dynarr *dst, size_t n)
5897 {
5898   unsigned char c;
5899   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5900   unsigned int flags  = str->flags;
5901   unsigned int ch     = str->ch;
5902   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5903 #ifdef UTF2000
5904   unsigned char char_boundary = str->iso2022.current_char_boundary;
5905 #endif
5906
5907   while (n--)
5908     {
5909       c = *src++;
5910 #ifdef UTF2000
5911       if (char_boundary == 0)
5912         if ( c >= 0xfc )
5913           {
5914             ch = c & 0x01;
5915             char_boundary = 5;
5916           }
5917         else if ( c >= 0xf8 )
5918           {
5919             ch = c & 0x03;
5920             char_boundary = 4;
5921           }
5922         else if ( c >= 0xf0 )
5923           {
5924             ch = c & 0x07;
5925             char_boundary = 3;
5926           }
5927         else if ( c >= 0xe0 )
5928           {
5929             ch = c & 0x0f;
5930             char_boundary = 2;
5931           }
5932         else if ( c >= 0xc0 )
5933           {
5934             ch = c & 0x1f;
5935             char_boundary = 1;
5936           }
5937         else
5938           {
5939             ch = 0;
5940             if (c == '\n')
5941               {
5942                 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5943                   Dynarr_add (dst, '\r');
5944                 if (eol_type != EOL_CR)
5945                   Dynarr_add (dst, c);
5946               }
5947             else
5948               Dynarr_add (dst, c);
5949             char_boundary = 0;
5950           }
5951       else if (char_boundary == 1)
5952         {
5953           ch = ( ch << 6 ) | ( c & 0x3f );
5954           Dynarr_add (dst, ch & 0xff);
5955           char_boundary = 0;
5956         }
5957       else
5958         {
5959           ch = ( ch << 6 ) | ( c & 0x3f );
5960           char_boundary--;
5961         }
5962 #else /* not UTF2000 */
5963       if (c == '\n')
5964         {
5965           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5966             Dynarr_add (dst, '\r');
5967           if (eol_type != EOL_CR)
5968             Dynarr_add (dst, '\n');
5969           ch = 0;
5970         }
5971       else if (BYTE_ASCII_P (c))
5972         {
5973           assert (ch == 0);
5974           Dynarr_add (dst, c);
5975         }
5976       else if (BUFBYTE_LEADING_BYTE_P (c))
5977         {
5978           assert (ch == 0);
5979           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5980               c == LEADING_BYTE_CONTROL_1)
5981             ch = c;
5982           else
5983             Dynarr_add (dst, '~'); /* untranslatable character */
5984         }
5985       else
5986         {
5987           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5988             Dynarr_add (dst, c);
5989           else if (ch == LEADING_BYTE_CONTROL_1)
5990             {
5991               assert (c < 0xC0);
5992               Dynarr_add (dst, c - 0x20);
5993             }
5994           /* else it should be the second or third byte of an
5995              untranslatable character, so ignore it */
5996           ch = 0;
5997         }
5998 #endif /* not UTF2000 */
5999     }
6000
6001   str->flags = flags;
6002   str->ch    = ch;
6003 #ifdef UTF2000
6004   str->iso2022.current_char_boundary = char_boundary;
6005 #endif
6006 }
6007
6008 \f
6009
6010 /************************************************************************/
6011 /*                             Initialization                           */
6012 /************************************************************************/
6013
6014 void
6015 syms_of_file_coding (void)
6016 {
6017   INIT_LRECORD_IMPLEMENTATION (coding_system);
6018
6019   deferror (&Qcoding_system_error, "coding-system-error",
6020             "Coding-system error", Qio_error);
6021
6022   DEFSUBR (Fcoding_system_p);
6023   DEFSUBR (Ffind_coding_system);
6024   DEFSUBR (Fget_coding_system);
6025   DEFSUBR (Fcoding_system_list);
6026   DEFSUBR (Fcoding_system_name);
6027   DEFSUBR (Fmake_coding_system);
6028   DEFSUBR (Fcopy_coding_system);
6029   DEFSUBR (Fcoding_system_canonical_name_p);
6030   DEFSUBR (Fcoding_system_alias_p);
6031   DEFSUBR (Fcoding_system_aliasee);
6032   DEFSUBR (Fdefine_coding_system_alias);
6033   DEFSUBR (Fsubsidiary_coding_system);
6034
6035   DEFSUBR (Fcoding_system_type);
6036   DEFSUBR (Fcoding_system_doc_string);
6037 #ifdef MULE
6038   DEFSUBR (Fcoding_system_charset);
6039 #endif
6040   DEFSUBR (Fcoding_system_property);
6041
6042   DEFSUBR (Fcoding_category_list);
6043   DEFSUBR (Fset_coding_priority_list);
6044   DEFSUBR (Fcoding_priority_list);
6045   DEFSUBR (Fset_coding_category_system);
6046   DEFSUBR (Fcoding_category_system);
6047
6048   DEFSUBR (Fdetect_coding_region);
6049   DEFSUBR (Fdecode_coding_region);
6050   DEFSUBR (Fencode_coding_region);
6051 #ifdef MULE
6052   DEFSUBR (Fdecode_shift_jis_char);
6053   DEFSUBR (Fencode_shift_jis_char);
6054   DEFSUBR (Fdecode_big5_char);
6055   DEFSUBR (Fencode_big5_char);
6056 #endif /* MULE */
6057   defsymbol (&Qcoding_systemp, "coding-system-p");
6058   defsymbol (&Qno_conversion, "no-conversion");
6059   defsymbol (&Qraw_text, "raw-text");
6060 #ifdef MULE
6061   defsymbol (&Qbig5, "big5");
6062   defsymbol (&Qshift_jis, "shift-jis");
6063   defsymbol (&Qucs4, "ucs-4");
6064   defsymbol (&Qutf8, "utf-8");
6065   defsymbol (&Qccl, "ccl");
6066   defsymbol (&Qiso2022, "iso2022");
6067 #endif /* MULE */
6068   defsymbol (&Qmnemonic, "mnemonic");
6069   defsymbol (&Qeol_type, "eol-type");
6070   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6071   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6072
6073   defsymbol (&Qcr, "cr");
6074   defsymbol (&Qlf, "lf");
6075   defsymbol (&Qcrlf, "crlf");
6076   defsymbol (&Qeol_cr, "eol-cr");
6077   defsymbol (&Qeol_lf, "eol-lf");
6078   defsymbol (&Qeol_crlf, "eol-crlf");
6079 #ifdef MULE
6080   defsymbol (&Qcharset_g0, "charset-g0");
6081   defsymbol (&Qcharset_g1, "charset-g1");
6082   defsymbol (&Qcharset_g2, "charset-g2");
6083   defsymbol (&Qcharset_g3, "charset-g3");
6084   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6085   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6086   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6087   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6088   defsymbol (&Qno_iso6429, "no-iso6429");
6089   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6090   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6091
6092   defsymbol (&Qshort, "short");
6093   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6094   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6095   defsymbol (&Qseven, "seven");
6096   defsymbol (&Qlock_shift, "lock-shift");
6097   defsymbol (&Qescape_quoted, "escape-quoted");
6098 #endif /* MULE */
6099 #ifdef UTF2000
6100   defsymbol (&Qutf_8_mcs, "utf-8-mcs");
6101   defsymbol (&Qdisable_composition, "disable-composition");
6102   defsymbol (&Quse_entity_reference, "use-entity-reference");
6103   defsymbol (&Qd, "d");
6104   defsymbol (&Qx, "x");
6105   defsymbol (&QX, "X");
6106 #endif
6107   defsymbol (&Qencode, "encode");
6108   defsymbol (&Qdecode, "decode");
6109
6110 #ifdef MULE
6111   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6112              "shift-jis");
6113   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6114              "big5");
6115   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6116              "ucs-4");
6117   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6118              "utf-8");
6119   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6120              "iso-7");
6121   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6122              "iso-8-designate");
6123   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6124              "iso-8-1");
6125   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6126              "iso-8-2");
6127   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6128              "iso-lock-shift");
6129 #endif /* MULE */
6130   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6131              "no-conversion");
6132 }
6133
6134 void
6135 lstream_type_create_file_coding (void)
6136 {
6137   LSTREAM_HAS_METHOD (decoding, reader);
6138   LSTREAM_HAS_METHOD (decoding, writer);
6139   LSTREAM_HAS_METHOD (decoding, rewinder);
6140   LSTREAM_HAS_METHOD (decoding, seekable_p);
6141   LSTREAM_HAS_METHOD (decoding, flusher);
6142   LSTREAM_HAS_METHOD (decoding, closer);
6143   LSTREAM_HAS_METHOD (decoding, marker);
6144
6145   LSTREAM_HAS_METHOD (encoding, reader);
6146   LSTREAM_HAS_METHOD (encoding, writer);
6147   LSTREAM_HAS_METHOD (encoding, rewinder);
6148   LSTREAM_HAS_METHOD (encoding, seekable_p);
6149   LSTREAM_HAS_METHOD (encoding, flusher);
6150   LSTREAM_HAS_METHOD (encoding, closer);
6151   LSTREAM_HAS_METHOD (encoding, marker);
6152 }
6153
6154 void
6155 vars_of_file_coding (void)
6156 {
6157   int i;
6158
6159   fcd = xnew (struct file_coding_dump);
6160   dump_add_root_struct_ptr (&fcd, &fcd_description);
6161
6162   /* Initialize to something reasonable ... */
6163   for (i = 0; i < CODING_CATEGORY_LAST; i++)
6164     {
6165       fcd->coding_category_system[i] = Qnil;
6166       fcd->coding_category_by_priority[i] = i;
6167     }
6168
6169   Fprovide (intern ("file-coding"));
6170
6171   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6172 Coding system used for TTY keyboard input.
6173 Not used under a windowing system.
6174 */ );
6175   Vkeyboard_coding_system = Qnil;
6176
6177   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6178 Coding system used for TTY display output.
6179 Not used under a windowing system.
6180 */ );
6181   Vterminal_coding_system = Qnil;
6182
6183   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6184 Overriding coding system used when reading from a file or process.
6185 You should bind this variable with `let', but do not set it globally.
6186 If this is non-nil, it specifies the coding system that will be used
6187 to decode input on read operations, such as from a file or process.
6188 It overrides `buffer-file-coding-system-for-read',
6189 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6190 this one for permanent changes to the environment.  */ );
6191   Vcoding_system_for_read = Qnil;
6192
6193   DEFVAR_LISP ("coding-system-for-write",
6194                &Vcoding_system_for_write /*
6195 Overriding coding system used when writing to a file or process.
6196 You should bind this variable with `let', but do not set it globally.
6197 If this is non-nil, it specifies the coding system that will be used
6198 to encode output for write operations, such as to a file or process.
6199 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6200 Use those variables instead of this one for permanent changes to the
6201 environment.  */ );
6202   Vcoding_system_for_write = Qnil;
6203
6204   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6205 Coding system used to convert pathnames when accessing files.
6206 */ );
6207   Vfile_name_coding_system = Qnil;
6208
6209   DEFVAR_LISP ("coded-charset-entity-reference-alist",
6210                &Vcoded_charset_entity_reference_alist /*
6211 Alist of coded-charset vs corresponding entity-reference.
6212 Each element looks like (CCS PREFIX CODE-COLUMNS CODE-TYPE).
6213 CCS is coded-charset.
6214 CODE-COLUMNS is columns of code-point of entity-reference.
6215 CODE-TYPE is format type of code-point of entity-reference.
6216 `d' means decimal value and `x' means hexadecimal value.
6217 */ );
6218   Vcoded_charset_entity_reference_alist = Qnil;
6219
6220   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6221 Non-nil means the buffer contents are regarded as multi-byte form
6222 of characters, not a binary code.  This affects the display, file I/O,
6223 and behaviors of various editing commands.
6224
6225 Setting this to nil does not do anything.
6226 */ );
6227   enable_multibyte_characters = 1;
6228 }
6229
6230 void
6231 complex_vars_of_file_coding (void)
6232 {
6233   staticpro (&Vcoding_system_hash_table);
6234   Vcoding_system_hash_table =
6235     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6236
6237   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6238   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6239
6240 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6241 {                                               \
6242   struct codesys_prop csp;                      \
6243   csp.sym = (Sym);                              \
6244   csp.prop_type = (Prop_Type);                  \
6245   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6246 } while (0)
6247
6248   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6249   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6250   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6251   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6252   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6253   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6254   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6255 #ifdef MULE
6256   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6257   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6258   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6259   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6260   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6261   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6262   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6263   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6264   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6265   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6266   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6267   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6268   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6269   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6270   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6271   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6272   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6273
6274   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6275   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6276 #ifdef UTF2000
6277   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qdisable_composition);
6278   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Quse_entity_reference);
6279 #endif
6280 #endif /* MULE */
6281   /* Need to create this here or we're really screwed. */
6282   Fmake_coding_system
6283     (Qraw_text, Qno_conversion,
6284      build_string ("Raw text, which means it converts only line-break-codes."),
6285      list2 (Qmnemonic, build_string ("Raw")));
6286
6287   Fmake_coding_system
6288     (Qbinary, Qno_conversion,
6289      build_string ("Binary, which means it does not convert anything."),
6290      list4 (Qeol_type, Qlf,
6291             Qmnemonic, build_string ("Binary")));
6292
6293 #ifdef UTF2000
6294   Fmake_coding_system
6295     (Qutf_8_mcs, Qutf8,
6296      build_string
6297      ("Coding-system of UTF-8 with Multiple Coded-character-Sets extension."),
6298      list2 (Qmnemonic, build_string ("MTF8")));
6299 #endif
6300
6301   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6302
6303   Fdefine_coding_system_alias (Qfile_name, Qbinary);
6304
6305   Fdefine_coding_system_alias (Qterminal, Qbinary);
6306   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6307
6308   /* Need this for bootstrapping */
6309   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6310     Fget_coding_system (Qraw_text);
6311
6312 #ifdef UTF2000
6313   fcd->coding_category_system[CODING_CATEGORY_UTF8]
6314    = Fget_coding_system (Qutf_8_mcs);
6315 #endif
6316
6317 #if defined(MULE) && !defined(UTF2000)
6318   {
6319     size_t i;
6320
6321     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6322       fcd->ucs_to_mule_table[i] = Qnil;
6323   }
6324   staticpro (&mule_to_ucs_table);
6325   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6326 #endif /* defined(MULE) && !defined(UTF2000) */
6327 }