src/file-coding.c

   1 /* Code conversion functions.
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3    Copyright (C) 1995 Sun Microsystems, Inc.
   4
   5 This file is part of XEmacs.
   6
   7 XEmacs is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 2, or (at your option) any
  10 later version.
  11
  12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with XEmacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* Synched up with: Mule 2.3.   Not in FSF. */
  23
  24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
  25
  26 #include <config.h>
  27 #include "lisp.h"
  28
  29 #include "buffer.h"
  30 #include "elhash.h"
  31 #include "insdel.h"
  32 #include "lstream.h"
  33 #include "opaque.h"
  34 #ifdef MULE
  35 #include "mule-ccl.h"
  36 #include "chartab.h"
  37 #endif
  38 #include "file-coding.h"
  39
  40 Lisp_Object Qcoding_system_error;
  41
  42 Lisp_Object Vkeyboard_coding_system;
  43 Lisp_Object Vterminal_coding_system;
  44 Lisp_Object Vcoding_system_for_read;
  45 Lisp_Object Vcoding_system_for_write;
  46 Lisp_Object Vfile_name_coding_system;
  47
  48 /* Table of symbols identifying each coding category. */
  49 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST];
  50
  51
  52
  53 struct file_coding_dump {
  54   /* Coding system currently associated with each coding category. */
  55   Lisp_Object coding_category_system[CODING_CATEGORY_LAST];
  56
  57   /* Table of all coding categories in decreasing order of priority.
  58      This describes a permutation of the possible coding categories. */
  59   int coding_category_by_priority[CODING_CATEGORY_LAST];
  60
  61 #if defined(MULE) && !defined(UTF2000)
  62   Lisp_Object ucs_to_mule_table[65536];
  63 #endif
  64 } *fcd;
  65
  66 static const struct lrecord_description fcd_description_1[] = {
  67   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST },
  68 #if defined(MULE) && !defined(UTF2000)
  69   { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) },
  70 #endif
  71   { XD_END }
  72 };
  73
  74 static const struct struct_description fcd_description = {
  75   sizeof (struct file_coding_dump),
  76   fcd_description_1
  77 };
  78
  79 Lisp_Object mule_to_ucs_table;
  80
  81 Lisp_Object Qcoding_systemp;
  82
  83 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
  84 /* Qinternal in general.c */
  85
  86 Lisp_Object Qmnemonic, Qeol_type;
  87 Lisp_Object Qcr, Qcrlf, Qlf;
  88 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
  89 Lisp_Object Qpost_read_conversion;
  90 Lisp_Object Qpre_write_conversion;
  91
  92 #ifdef MULE
  93 Lisp_Object Qucs4, Qutf8;
  94 Lisp_Object Qbig5, Qshift_jis;
  95 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
  96 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
  97 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
  98 Lisp_Object Qno_iso6429;
  99 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
 100 Lisp_Object Qescape_quoted;
 101 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
 102 #endif
 103 Lisp_Object Qencode, Qdecode;
 104
 105 Lisp_Object Vcoding_system_hash_table;
 106
 107 int enable_multibyte_characters;
 108
 109 #ifdef MULE
 110 /* Additional information used by the ISO2022 decoder and detector. */
 111 struct iso2022_decoder
 112 {
 113   /* CHARSET holds the character sets currently assigned to the G0
 114      through G3 variables.  It is initialized from the array
 115      INITIAL_CHARSET in CODESYS. */
 116   Lisp_Object charset[4];
 117
 118   /* Which registers are currently invoked into the left (GL) and
 119      right (GR) halves of the 8-bit encoding space? */
 120   int register_left, register_right;
 121
 122   /* ISO_ESC holds a value indicating part of an escape sequence
 123      that has already been seen. */
 124   enum iso_esc_flag esc;
 125
 126   /* This records the bytes we've seen so far in an escape sequence,
 127      in case the sequence is invalid (we spit out the bytes unchanged). */
 128   unsigned char esc_bytes[8];
 129
 130   /* Index for next byte to store in ISO escape sequence. */
 131   int esc_bytes_index;
 132
 133 #ifdef ENABLE_COMPOSITE_CHARS
 134   /* Stuff seen so far when composing a string. */
 135   unsigned_char_dynarr *composite_chars;
 136 #endif
 137
 138   /* If we saw an invalid designation sequence for a particular
 139      register, we flag it here and switch to ASCII.  The next time we
 140      see a valid designation for this register, we turn off the flag
 141      and do the designation normally, but pretend the sequence was
 142      invalid.  The effect of all this is that (most of the time) the
 143      escape sequences for both the switch to the unknown charset, and
 144      the switch back to the known charset, get inserted literally into
 145      the buffer and saved out as such.  The hope is that we can
 146      preserve the escape sequences so that the resulting written out
 147      file makes sense.  If we don't do any of this, the designation
 148      to the invalid charset will be preserved but that switch back
 149      to the known charset will probably get eaten because it was
 150      the same charset that was already present in the register. */
 151   unsigned char invalid_designated[4];
 152
 153   /* We try to do similar things as above for direction-switching
 154      sequences.  If we encountered a direction switch while an
 155      invalid designation was present, or an invalid designation
 156      just after a direction switch (i.e. no valid designation
 157      encountered yet), we insert the direction-switch escape
 158      sequence literally into the output stream, and later on
 159      insert the corresponding direction-restoring escape sequence
 160      literally also. */
 161   unsigned int switched_dir_and_no_valid_charset_yet :1;
 162   unsigned int invalid_switch_dir :1;
 163
 164   /* Tells the decoder to output the escape sequence literally
 165      even though it was valid.  Used in the games we play to
 166      avoid lossage when we encounter invalid designations. */
 167   unsigned int output_literally :1;
 168   /* We encountered a direction switch followed by an invalid
 169      designation.  We didn't output the direction switch
 170      literally because we didn't know about the invalid designation;
 171      but we have to do so now. */
 172   unsigned int output_direction_sequence :1;
 173 };
 174 #endif /* MULE */
 175 EXFUN (Fcopy_coding_system, 2);
 176 #ifdef MULE
 177 struct detection_state;
 178 static int detect_coding_sjis (struct detection_state *st,
 179                                const Extbyte *src, size_t n);
 180 static void decode_coding_sjis (Lstream *decoding, const Extbyte *src,
 181                                 unsigned_char_dynarr *dst, size_t n);
 182 static void encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
 183                                 unsigned_char_dynarr *dst, size_t n);
 184 static int detect_coding_big5 (struct detection_state *st,
 185                                const Extbyte *src, size_t n);
 186 static void decode_coding_big5 (Lstream *decoding, const Extbyte *src,
 187                                 unsigned_char_dynarr *dst, size_t n);
 188 static void encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
 189                                 unsigned_char_dynarr *dst, size_t n);
 190 static int detect_coding_ucs4 (struct detection_state *st,
 191                                const Extbyte *src, size_t n);
 192 static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
 193                                 unsigned_char_dynarr *dst, size_t n);
 194 static void encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
 195                                 unsigned_char_dynarr *dst, size_t n);
 196 static int detect_coding_utf8 (struct detection_state *st,
 197                                const Extbyte *src, size_t n);
 198 static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
 199                                 unsigned_char_dynarr *dst, size_t n);
 200 static void encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
 201                                 unsigned_char_dynarr *dst, size_t n);
 202 static int postprocess_iso2022_mask (int mask);
 203 static void reset_iso2022 (Lisp_Object coding_system,
 204                            struct iso2022_decoder *iso);
 205 static int detect_coding_iso2022 (struct detection_state *st,
 206                                   const Extbyte *src, size_t n);
 207 static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
 208                                    unsigned_char_dynarr *dst, size_t n);
 209 static void encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
 210                                    unsigned_char_dynarr *dst, size_t n);
 211 #endif /* MULE */
 212 static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
 213                                          unsigned_char_dynarr *dst, size_t n);
 214 static void encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
 215                                          unsigned_char_dynarr *dst, size_t n);
 216 static void mule_decode (Lstream *decoding, const Extbyte *src,
 217                          unsigned_char_dynarr *dst, size_t n);
 218 static void mule_encode (Lstream *encoding, const Bufbyte *src,
 219                          unsigned_char_dynarr *dst, size_t n);
 220
 221 typedef struct codesys_prop codesys_prop;
 222 struct codesys_prop
 223 {
 224   Lisp_Object sym;
 225   int prop_type;
 226 };
 227
 228 typedef struct
 229 {
 230   Dynarr_declare (codesys_prop);
 231 } codesys_prop_dynarr;
 232
 233 static const struct lrecord_description codesys_prop_description_1[] = {
 234   { XD_LISP_OBJECT, offsetof (codesys_prop, sym) },
 235   { XD_END }
 236 };
 237
 238 static const struct struct_description codesys_prop_description = {
 239   sizeof (codesys_prop),
 240   codesys_prop_description_1
 241 };
 242
 243 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
 244   XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description),
 245   { XD_END }
 246 };
 247
 248 static const struct struct_description codesys_prop_dynarr_description = {
 249   sizeof (codesys_prop_dynarr),
 250   codesys_prop_dynarr_description_1
 251 };
 252
 253 codesys_prop_dynarr *the_codesys_prop_dynarr;
 254
 255 enum codesys_prop_enum
 256 {
 257   CODESYS_PROP_ALL_OK,
 258   CODESYS_PROP_ISO2022,
 259   CODESYS_PROP_CCL
 260 };
 261
 262 \f
 263 /************************************************************************/
 264 /*                       Coding system functions                        */
 265 /************************************************************************/
 266
 267 static Lisp_Object mark_coding_system (Lisp_Object);
 268 static void print_coding_system (Lisp_Object, Lisp_Object, int);
 269 static void finalize_coding_system (void *header, int for_disksave);
 270
 271 #ifdef MULE
 272 static const struct lrecord_description ccs_description_1[] = {
 273   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 274   { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
 275   { XD_END }
 276 };
 277
 278 static const struct struct_description ccs_description = {
 279   sizeof (charset_conversion_spec),
 280   ccs_description_1
 281 };
 282
 283 static const struct lrecord_description ccsd_description_1[] = {
 284   XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
 285   { XD_END }
 286 };
 287
 288 static const struct struct_description ccsd_description = {
 289   sizeof (charset_conversion_spec_dynarr),
 290   ccsd_description_1
 291 };
 292 #endif
 293
 294 static const struct lrecord_description coding_system_description[] = {
 295   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) },
 296   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) },
 297   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) },
 298   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) },
 299   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) },
 300   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) },
 301   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) },
 302   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) },
 303 #ifdef MULE
 304   { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 },
 305   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.input_conv),  1, &ccsd_description },
 306   { XD_STRUCT_PTR,  offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
 307   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) },
 308   { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) },
 309 #endif
 310   { XD_END }
 311 };
 312
 313 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
 314                                mark_coding_system, print_coding_system,
 315                                finalize_coding_system,
 316                                0, 0, coding_system_description,
 317                                Lisp_Coding_System);
 318
 319 static Lisp_Object
 320 mark_coding_system (Lisp_Object obj)
 321 {
 322   Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
 323
 324   mark_object (CODING_SYSTEM_NAME (codesys));
 325   mark_object (CODING_SYSTEM_DOC_STRING (codesys));
 326   mark_object (CODING_SYSTEM_MNEMONIC (codesys));
 327   mark_object (CODING_SYSTEM_EOL_LF (codesys));
 328   mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
 329   mark_object (CODING_SYSTEM_EOL_CR (codesys));
 330
 331   switch (CODING_SYSTEM_TYPE (codesys))
 332     {
 333 #ifdef MULE
 334       int i;
 335     case CODESYS_ISO2022:
 336       for (i = 0; i < 4; i++)
 337         mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
 338       if (codesys->iso2022.input_conv)
 339         {
 340           for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
 341             {
 342               struct charset_conversion_spec *ccs =
 343                 Dynarr_atp (codesys->iso2022.input_conv, i);
 344               mark_object (ccs->from_charset);
 345               mark_object (ccs->to_charset);
 346             }
 347         }
 348       if (codesys->iso2022.output_conv)
 349         {
 350           for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
 351             {
 352               struct charset_conversion_spec *ccs =
 353                 Dynarr_atp (codesys->iso2022.output_conv, i);
 354               mark_object (ccs->from_charset);
 355               mark_object (ccs->to_charset);
 356             }
 357         }
 358       break;
 359
 360     case CODESYS_CCL:
 361       mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
 362       mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
 363       break;
 364 #endif /* MULE */
 365     default:
 366       break;
 367     }
 368
 369   mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
 370   return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
 371 }
 372
 373 static void
 374 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
 375                      int escapeflag)
 376 {
 377   Lisp_Coding_System *c = XCODING_SYSTEM (obj);
 378   if (print_readably)
 379     error ("printing unreadable object #<coding_system 0x%x>",
 380            c->header.uid);
 381
 382   write_c_string ("#<coding_system ", printcharfun);
 383   print_internal (c->name, printcharfun, 1);
 384   write_c_string (">", printcharfun);
 385 }
 386
 387 static void
 388 finalize_coding_system (void *header, int for_disksave)
 389 {
 390   Lisp_Coding_System *c = (Lisp_Coding_System *) header;
 391   /* Since coding systems never go away, this function is not
 392      necessary.  But it would be necessary if we changed things
 393      so that coding systems could go away. */
 394   if (!for_disksave) /* see comment in lstream.c */
 395     {
 396       switch (CODING_SYSTEM_TYPE (c))
 397         {
 398 #ifdef MULE
 399         case CODESYS_ISO2022:
 400           if (c->iso2022.input_conv)
 401             {
 402               Dynarr_free (c->iso2022.input_conv);
 403               c->iso2022.input_conv = 0;
 404             }
 405           if (c->iso2022.output_conv)
 406             {
 407               Dynarr_free (c->iso2022.output_conv);
 408               c->iso2022.output_conv = 0;
 409             }
 410           break;
 411 #endif /* MULE */
 412         default:
 413           break;
 414         }
 415     }
 416 }
 417
 418 static eol_type_t
 419 symbol_to_eol_type (Lisp_Object symbol)
 420 {
 421   CHECK_SYMBOL (symbol);
 422   if (NILP (symbol))      return EOL_AUTODETECT;
 423   if (EQ (symbol, Qlf))   return EOL_LF;
 424   if (EQ (symbol, Qcrlf)) return EOL_CRLF;
 425   if (EQ (symbol, Qcr))   return EOL_CR;
 426
 427   signal_simple_error ("Unrecognized eol type", symbol);
 428   return EOL_AUTODETECT; /* not reached */
 429 }
 430
 431 static Lisp_Object
 432 eol_type_to_symbol (eol_type_t type)
 433 {
 434   switch (type)
 435     {
 436     default: abort ();
 437     case EOL_LF:         return Qlf;
 438     case EOL_CRLF:       return Qcrlf;
 439     case EOL_CR:         return Qcr;
 440     case EOL_AUTODETECT: return Qnil;
 441     }
 442 }
 443
 444 static void
 445 setup_eol_coding_systems (Lisp_Coding_System *codesys)
 446 {
 447   Lisp_Object codesys_obj;
 448   int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
 449   char *codesys_name = (char *) alloca (len + 7);
 450   int mlen = -1;
 451   char *codesys_mnemonic=0;
 452
 453   Lisp_Object codesys_name_sym, sub_codesys_obj;
 454
 455   /* kludge */
 456
 457   XSETCODING_SYSTEM (codesys_obj, codesys);
 458
 459   memcpy (codesys_name,
 460           string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
 461
 462   if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
 463     {
 464       mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
 465       codesys_mnemonic = (char *) alloca (mlen + 7);
 466       memcpy (codesys_mnemonic,
 467               XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
 468     }
 469
 470 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do {                      \
 471   strcpy (codesys_name + len, "-" op_sys);                                      \
 472   if (mlen != -1)                                                               \
 473     strcpy (codesys_mnemonic + mlen, op_sys_abbr);                              \
 474   codesys_name_sym = intern (codesys_name);                                     \
 475   sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym);        \
 476   XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type;                             \
 477   if (mlen != -1)                                                               \
 478     XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) =                                  \
 479       build_string (codesys_mnemonic);                                          \
 480   CODING_SYSTEM_##Type (codesys) = sub_codesys_obj;                             \
 481 } while (0)
 482
 483   DEFINE_SUB_CODESYS("unix", "", EOL_LF);
 484   DEFINE_SUB_CODESYS("dos",  ":T", EOL_CRLF);
 485   DEFINE_SUB_CODESYS("mac",  ":t", EOL_CR);
 486 }
 487
 488 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
 489 Return t if OBJECT is a coding system.
 490 A coding system is an object that defines how text containing multiple
 491 character sets is encoded into a stream of (typically 8-bit) bytes.
 492 The coding system is used to decode the stream into a series of
 493 characters (which may be from multiple charsets) when the text is read
 494 from a file or process, and is used to encode the text back into the
 495 same format when it is written out to a file or process.
 496
 497 For example, many ISO2022-compliant coding systems (such as Compound
 498 Text, which is used for inter-client data under the X Window System)
 499 use escape sequences to switch between different charsets -- Japanese
 500 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
 501 with "ESC ( B"; and Cyrillic is invoked with "ESC - L".  See
 502 `make-coding-system' for more information.
 503
 504 Coding systems are normally identified using a symbol, and the
 505 symbol is accepted in place of the actual coding system object whenever
 506 a coding system is called for. (This is similar to how faces work.)
 507 */
 508        (object))
 509 {
 510   return CODING_SYSTEMP (object) ? Qt : Qnil;
 511 }
 512
 513 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
 514 Retrieve the coding system of the given name.
 515
 516 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
 517 returned.  Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
 518 If there is no such coding system, nil is returned.  Otherwise the
 519 associated coding system object is returned.
 520 */
 521        (coding_system_or_name))
 522 {
 523   if (NILP (coding_system_or_name))
 524     coding_system_or_name = Qbinary;
 525   else if (CODING_SYSTEMP (coding_system_or_name))
 526     return coding_system_or_name;
 527   else
 528     CHECK_SYMBOL (coding_system_or_name);
 529
 530   while (1)
 531     {
 532       coding_system_or_name =
 533         Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
 534
 535       if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name))
 536         return coding_system_or_name;
 537     }
 538 }
 539
 540 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
 541 Retrieve the coding system of the given name.
 542 Same as `find-coding-system' except that if there is no such
 543 coding system, an error is signaled instead of returning nil.
 544 */
 545        (name))
 546 {
 547   Lisp_Object coding_system = Ffind_coding_system (name);
 548
 549   if (NILP (coding_system))
 550     signal_simple_error ("No such coding system", name);
 551   return coding_system;
 552 }
 553
 554 /* We store the coding systems in hash tables with the names as the key and the
 555    actual coding system object as the value.  Occasionally we need to use them
 556    in a list format.  These routines provide us with that. */
 557 struct coding_system_list_closure
 558 {
 559   Lisp_Object *coding_system_list;
 560 };
 561
 562 static int
 563 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
 564                                   void *coding_system_list_closure)
 565 {
 566   /* This function can GC */
 567   struct coding_system_list_closure *cscl =
 568     (struct coding_system_list_closure *) coding_system_list_closure;
 569   Lisp_Object *coding_system_list = cscl->coding_system_list;
 570
 571   *coding_system_list = Fcons (key, *coding_system_list);
 572   return 0;
 573 }
 574
 575 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
 576 Return a list of the names of all defined coding systems.
 577 */
 578        ())
 579 {
 580   Lisp_Object coding_system_list = Qnil;
 581   struct gcpro gcpro1;
 582   struct coding_system_list_closure coding_system_list_closure;
 583
 584   GCPRO1 (coding_system_list);
 585   coding_system_list_closure.coding_system_list = &coding_system_list;
 586   elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
 587                  &coding_system_list_closure);
 588   UNGCPRO;
 589
 590   return coding_system_list;
 591 }
 592
 593 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
 594 Return the name of the given coding system.
 595 */
 596        (coding_system))
 597 {
 598   coding_system = Fget_coding_system (coding_system);
 599   return XCODING_SYSTEM_NAME (coding_system);
 600 }
 601
 602 static Lisp_Coding_System *
 603 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
 604 {
 605   Lisp_Coding_System *codesys =
 606     alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
 607
 608   zero_lcrecord (codesys);
 609   CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
 610   CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
 611   CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
 612   CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
 613   CODING_SYSTEM_EOL_CR   (codesys) = Qnil;
 614   CODING_SYSTEM_EOL_LF   (codesys) = Qnil;
 615   CODING_SYSTEM_TYPE     (codesys) = type;
 616   CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
 617 #ifdef MULE
 618   if (type == CODESYS_ISO2022)
 619     {
 620       int i;
 621       for (i = 0; i < 4; i++)
 622         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
 623     }
 624   else if (type == CODESYS_CCL)
 625     {
 626       CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
 627       CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
 628     }
 629 #endif /* MULE */
 630   CODING_SYSTEM_NAME (codesys) = name;
 631
 632   return codesys;
 633 }
 634
 635 #ifdef MULE
 636 /* Given a list of charset conversion specs as specified in a Lisp
 637    program, parse it into STORE_HERE. */
 638
 639 static void
 640 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
 641                                 Lisp_Object spec_list)
 642 {
 643   Lisp_Object rest;
 644
 645   EXTERNAL_LIST_LOOP (rest, spec_list)
 646     {
 647       Lisp_Object car = XCAR (rest);
 648       Lisp_Object from, to;
 649       struct charset_conversion_spec spec;
 650
 651       if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
 652         signal_simple_error ("Invalid charset conversion spec", car);
 653       from = Fget_charset (XCAR (car));
 654       to = Fget_charset (XCAR (XCDR (car)));
 655       if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
 656         signal_simple_error_2
 657           ("Attempted conversion between different charset types",
 658            from, to);
 659       spec.from_charset = from;
 660       spec.to_charset = to;
 661
 662       Dynarr_add (store_here, spec);
 663     }
 664 }
 665
 666 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
 667    specs, return the equivalent as the Lisp programmer would see it.
 668
 669    If LOAD_HERE is 0, return Qnil. */
 670
 671 static Lisp_Object
 672 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
 673 {
 674   int i;
 675   Lisp_Object result;
 676
 677   if (!load_here)
 678     return Qnil;
 679   for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
 680     {
 681       struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
 682       result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
 683     }
 684
 685   return Fnreverse (result);
 686 }
 687
 688 #endif /* MULE */
 689
 690 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
 691 Register symbol NAME as a coding system.
 692
 693 TYPE describes the conversion method used and should be one of
 694
 695 nil or 'undecided
 696      Automatic conversion.  XEmacs attempts to detect the coding system
 697      used in the file.
 698 'no-conversion
 699      No conversion.  Use this for binary files and such.  On output,
 700      graphic characters that are not in ASCII or Latin-1 will be
 701      replaced by a ?. (For a no-conversion-encoded buffer, these
 702      characters will only be present if you explicitly insert them.)
 703 'shift-jis
 704      Shift-JIS (a Japanese encoding commonly used in PC operating systems).
 705 'ucs-4
 706      ISO 10646 UCS-4 encoding.
 707 'utf-8
 708      ISO 10646 UTF-8 encoding.
 709 'iso2022
 710      Any ISO2022-compliant encoding.  Among other things, this includes
 711      JIS (the Japanese encoding commonly used for e-mail), EUC (the
 712      standard Unix encoding for Japanese and other languages), and
 713      Compound Text (the encoding used in X11).  You can specify more
 714      specific information about the conversion with the PROPS argument.
 715 'big5
 716      Big5 (the encoding commonly used for Taiwanese).
 717 'ccl
 718      The conversion is performed using a user-written pseudo-code
 719      program.  CCL (Code Conversion Language) is the name of this
 720      pseudo-code.
 721 'internal
 722      Write out or read in the raw contents of the memory representing
 723      the buffer's text.  This is primarily useful for debugging
 724      purposes, and is only enabled when XEmacs has been compiled with
 725      DEBUG_XEMACS defined (via the --debug configure option).
 726      WARNING: Reading in a file using 'internal conversion can result
 727      in an internal inconsistency in the memory representing a
 728      buffer's text, which will produce unpredictable results and may
 729      cause XEmacs to crash.  Under normal circumstances you should
 730      never use 'internal conversion.
 731
 732 DOC-STRING is a string describing the coding system.
 733
 734 PROPS is a property list, describing the specific nature of the
 735 character set.  Recognized properties are:
 736
 737 'mnemonic
 738      String to be displayed in the modeline when this coding system is
 739      active.
 740
 741 'eol-type
 742      End-of-line conversion to be used.  It should be one of
 743
 744         nil
 745                 Automatically detect the end-of-line type (LF, CRLF,
 746                 or CR).  Also generate subsidiary coding systems named
 747                 `NAME-unix', `NAME-dos', and `NAME-mac', that are
 748                 identical to this coding system but have an EOL-TYPE
 749                 value of 'lf, 'crlf, and 'cr, respectively.
 750         'lf
 751                 The end of a line is marked externally using ASCII LF.
 752                 Since this is also the way that XEmacs represents an
 753                 end-of-line internally, specifying this option results
 754                 in no end-of-line conversion.  This is the standard
 755                 format for Unix text files.
 756         'crlf
 757                 The end of a line is marked externally using ASCII
 758                 CRLF.  This is the standard format for MS-DOS text
 759                 files.
 760         'cr
 761                 The end of a line is marked externally using ASCII CR.
 762                 This is the standard format for Macintosh text files.
 763         t
 764                 Automatically detect the end-of-line type but do not
 765                 generate subsidiary coding systems.  (This value is
 766                 converted to nil when stored internally, and
 767                 `coding-system-property' will return nil.)
 768
 769 'post-read-conversion
 770      Function called after a file has been read in, to perform the
 771      decoding.  Called with two arguments, START and END, denoting
 772      a region of the current buffer to be decoded.
 773
 774 'pre-write-conversion
 775      Function called before a file is written out, to perform the
 776      encoding.  Called with two arguments, START and END, denoting
 777      a region of the current buffer to be encoded.
 778
 779
 780 The following additional properties are recognized if TYPE is 'iso2022:
 781
 782 'charset-g0
 783 'charset-g1
 784 'charset-g2
 785 'charset-g3
 786      The character set initially designated to the G0 - G3 registers.
 787      The value should be one of
 788
 789           -- A charset object (designate that character set)
 790           -- nil (do not ever use this register)
 791           -- t (no character set is initially designated to
 792                 the register, but may be later on; this automatically
 793                 sets the corresponding `force-g*-on-output' property)
 794
 795 'force-g0-on-output
 796 'force-g1-on-output
 797 'force-g2-on-output
 798 'force-g2-on-output
 799      If non-nil, send an explicit designation sequence on output before
 800      using the specified register.
 801
 802 'short
 803      If non-nil, use the short forms "ESC $ @", "ESC $ A", and
 804      "ESC $ B" on output in place of the full designation sequences
 805      "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
 806
 807 'no-ascii-eol
 808      If non-nil, don't designate ASCII to G0 at each end of line on output.
 809      Setting this to non-nil also suppresses other state-resetting that
 810      normally happens at the end of a line.
 811
 812 'no-ascii-cntl
 813      If non-nil, don't designate ASCII to G0 before control chars on output.
 814
 815 'seven
 816      If non-nil, use 7-bit environment on output.  Otherwise, use 8-bit
 817      environment.
 818
 819 'lock-shift
 820      If non-nil, use locking-shift (SO/SI) instead of single-shift
 821      or designation by escape sequence.
 822
 823 'no-iso6429
 824      If non-nil, don't use ISO6429's direction specification.
 825
 826 'escape-quoted
 827      If non-nil, literal control characters that are the same as
 828      the beginning of a recognized ISO2022 or ISO6429 escape sequence
 829      (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
 830      SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
 831      so that they can be properly distinguished from an escape sequence.
 832      (Note that doing this results in a non-portable encoding.) This
 833      encoding flag is used for byte-compiled files.  Note that ESC
 834      is a good choice for a quoting character because there are no
 835      escape sequences whose second byte is a character from the Control-0
 836      or Control-1 character sets; this is explicitly disallowed by the
 837      ISO2022 standard.
 838
 839 'input-charset-conversion
 840      A list of conversion specifications, specifying conversion of
 841      characters in one charset to another when decoding is performed.
 842      Each specification is a list of two elements: the source charset,
 843      and the destination charset.
 844
 845 'output-charset-conversion
 846      A list of conversion specifications, specifying conversion of
 847      characters in one charset to another when encoding is performed.
 848      The form of each specification is the same as for
 849      'input-charset-conversion.
 850
 851
 852 The following additional properties are recognized (and required)
 853 if TYPE is 'ccl:
 854
 855 'decode
 856      CCL program used for decoding (converting to internal format).
 857
 858 'encode
 859      CCL program used for encoding (converting to external format).
 860 */
 861        (name, type, doc_string, props))
 862 {
 863   Lisp_Coding_System *codesys;
 864   enum coding_system_type ty;
 865   int need_to_setup_eol_systems = 1;
 866
 867   /* Convert type to constant */
 868   if (NILP (type) || EQ (type, Qundecided))
 869                                       { ty = CODESYS_AUTODETECT; }
 870 #ifdef MULE
 871   else if (EQ (type, Qshift_jis))     { ty = CODESYS_SHIFT_JIS; }
 872   else if (EQ (type, Qiso2022))       { ty = CODESYS_ISO2022; }
 873   else if (EQ (type, Qbig5))          { ty = CODESYS_BIG5; }
 874   else if (EQ (type, Qucs4))          { ty = CODESYS_UCS4; }
 875   else if (EQ (type, Qutf8))          { ty = CODESYS_UTF8; }
 876   else if (EQ (type, Qccl))           { ty = CODESYS_CCL; }
 877 #endif
 878   else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
 879 #ifdef DEBUG_XEMACS
 880   else if (EQ (type, Qinternal))      { ty = CODESYS_INTERNAL; }
 881 #endif
 882   else
 883     signal_simple_error ("Invalid coding system type", type);
 884
 885   CHECK_SYMBOL (name);
 886
 887   codesys = allocate_coding_system (ty, name);
 888
 889   if (NILP (doc_string))
 890     doc_string = build_string ("");
 891   else
 892     CHECK_STRING (doc_string);
 893   CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
 894
 895   {
 896     EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props)
 897       {
 898         if (EQ (key, Qmnemonic))
 899           {
 900             if (!NILP (value))
 901               CHECK_STRING (value);
 902             CODING_SYSTEM_MNEMONIC (codesys) = value;
 903           }
 904
 905         else if (EQ (key, Qeol_type))
 906           {
 907             need_to_setup_eol_systems = NILP (value);
 908             if (EQ (value, Qt))
 909               value = Qnil;
 910             CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
 911           }
 912
 913         else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
 914         else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
 915 #ifdef MULE
 916         else if (ty == CODESYS_ISO2022)
 917           {
 918 #define FROB_INITIAL_CHARSET(charset_num) \
 919   CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
 920     ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
 921
 922             if      (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
 923             else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
 924             else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
 925             else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
 926
 927 #define FROB_FORCE_CHARSET(charset_num) \
 928   CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
 929
 930             else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
 931             else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
 932             else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
 933             else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
 934
 935 #define FROB_BOOLEAN_PROPERTY(prop) \
 936   CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
 937
 938             else if (EQ (key, Qshort))         FROB_BOOLEAN_PROPERTY (SHORT);
 939             else if (EQ (key, Qno_ascii_eol))  FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
 940             else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
 941             else if (EQ (key, Qseven))         FROB_BOOLEAN_PROPERTY (SEVEN);
 942             else if (EQ (key, Qlock_shift))    FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
 943             else if (EQ (key, Qno_iso6429))    FROB_BOOLEAN_PROPERTY (NO_ISO6429);
 944             else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
 945
 946             else if (EQ (key, Qinput_charset_conversion))
 947               {
 948                 codesys->iso2022.input_conv =
 949                   Dynarr_new (charset_conversion_spec);
 950                 parse_charset_conversion_specs (codesys->iso2022.input_conv,
 951                                                 value);
 952               }
 953             else if (EQ (key, Qoutput_charset_conversion))
 954               {
 955                 codesys->iso2022.output_conv =
 956                   Dynarr_new (charset_conversion_spec);
 957                 parse_charset_conversion_specs (codesys->iso2022.output_conv,
 958                                                 value);
 959               }
 960             else
 961               signal_simple_error ("Unrecognized property", key);
 962           }
 963         else if (EQ (type, Qccl))
 964           {
 965             Lisp_Object sym;
 966             struct ccl_program test_ccl;
 967             Extbyte *suffix;
 968
 969             /* Check key first.  */
 970             if (EQ (key, Qdecode))
 971               suffix = "-ccl-decode";
 972             else if (EQ (key, Qencode))
 973               suffix = "-ccl-encode";
 974             else
 975               signal_simple_error ("Unrecognized property", key);
 976
 977             /* If value is vector, register it as a ccl program
 978                associated with an newly created symbol for
 979                backward compatibility.  */
 980             if (VECTORP (value))
 981               {
 982                 sym = Fintern (concat2 (Fsymbol_name (name),
 983                                         build_string (suffix)),
 984                                Qnil);
 985                 Fregister_ccl_program (sym, value);
 986               }
 987             else
 988               {
 989                 CHECK_SYMBOL (value);
 990                 sym = value;
 991               }
 992             /* check if the given ccl programs are valid.  */
 993             if (setup_ccl_program (&test_ccl, sym) < 0)
 994               signal_simple_error ("Invalid CCL program", value);
 995
 996             if (EQ (key, Qdecode))
 997               CODING_SYSTEM_CCL_DECODE (codesys) = sym;
 998             else if (EQ (key, Qencode))
 999               CODING_SYSTEM_CCL_ENCODE (codesys) = sym;
1000
1001           }
1002 #endif /* MULE */
1003         else
1004           signal_simple_error ("Unrecognized property", key);
1005       }
1006   }
1007
1008   if (need_to_setup_eol_systems)
1009     setup_eol_coding_systems (codesys);
1010
1011   {
1012     Lisp_Object codesys_obj;
1013     XSETCODING_SYSTEM (codesys_obj, codesys);
1014     Fputhash (name, codesys_obj, Vcoding_system_hash_table);
1015     return codesys_obj;
1016   }
1017 }
1018
1019 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
1020 Copy OLD-CODING-SYSTEM to NEW-NAME.
1021 If NEW-NAME does not name an existing coding system, a new one will
1022 be created.
1023 */
1024        (old_coding_system, new_name))
1025 {
1026   Lisp_Object new_coding_system;
1027   old_coding_system = Fget_coding_system (old_coding_system);
1028   new_coding_system = Ffind_coding_system (new_name);
1029   if (NILP (new_coding_system))
1030     {
1031       XSETCODING_SYSTEM (new_coding_system,
1032                          allocate_coding_system
1033                          (XCODING_SYSTEM_TYPE (old_coding_system),
1034                           new_name));
1035       Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1036     }
1037
1038   {
1039     Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1040     Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1041     memcpy (((char *) to  ) + sizeof (to->header),
1042             ((char *) from) + sizeof (from->header),
1043             sizeof (*from) - sizeof (from->header));
1044     to->name = new_name;
1045   }
1046   return new_coding_system;
1047 }
1048
1049 DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /*
1050 Return t if OBJECT names a coding system, and is not a coding system alias.
1051 */
1052        (object))
1053 {
1054   return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil))
1055     ? Qt : Qnil;
1056 }
1057
1058 DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /*
1059 Return t if OBJECT is a coding system alias.
1060 All coding system aliases are created by `define-coding-system-alias'.
1061 */
1062        (object))
1063 {
1064   return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero))
1065     ? Qt : Qnil;
1066 }
1067
1068 DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /*
1069 Return the coding-system symbol for which symbol ALIAS is an alias.
1070 */
1071        (alias))
1072 {
1073   Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil);
1074   if (SYMBOLP (aliasee))
1075     return aliasee;
1076   else
1077     signal_simple_error ("Symbol is not a coding system alias", alias);
1078   return Qnil;          /* To keep the compiler happy */
1079 }
1080
1081 static Lisp_Object
1082 append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string)
1083 {
1084   return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)),
1085                   Qnil);
1086 }
1087
1088 /* A maphash function, for removing dangling coding system aliases. */
1089 static int
1090 dangling_coding_system_alias_p (Lisp_Object alias,
1091                                 Lisp_Object aliasee,
1092                                 void *dangling_aliases)
1093 {
1094   if (SYMBOLP (aliasee)
1095       && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil)))
1096     {
1097       (*(int *) dangling_aliases)++;
1098       return 1;
1099     }
1100   else
1101     return 0;
1102 }
1103
1104 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1105 Define symbol ALIAS as an alias for coding system ALIASEE.
1106
1107 You can use this function to redefine an alias that has already been defined,
1108 but you cannot redefine a name which is the canonical name for a coding system.
1109 \(a canonical name of a coding system is what is returned when you call
1110 `coding-system-name' on a coding system).
1111
1112 ALIASEE itself can be an alias, which allows you to define nested aliases.
1113
1114 You are forbidden, however, from creating alias loops or `dangling' aliases.
1115 These will be detected, and an error will be signaled if you attempt to do so.
1116
1117 If ALIASEE is nil, then ALIAS will simply be undefined.
1118
1119 See also `coding-system-alias-p', `coding-system-aliasee',
1120 and `coding-system-canonical-name-p'.
1121 */
1122        (alias, aliasee))
1123 {
1124   Lisp_Object real_coding_system, probe;
1125
1126   CHECK_SYMBOL (alias);
1127
1128   if (!NILP (Fcoding_system_canonical_name_p (alias)))
1129     signal_simple_error
1130       ("Symbol is the canonical name of a coding system and cannot be redefined",
1131        alias);
1132
1133   if (NILP (aliasee))
1134     {
1135       Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix");
1136       Lisp_Object subsidiary_dos  = append_suffix_to_symbol (alias, "-dos");
1137       Lisp_Object subsidiary_mac  = append_suffix_to_symbol (alias, "-mac");
1138
1139       Fremhash (alias, Vcoding_system_hash_table);
1140
1141       /* Undefine subsidiary aliases,
1142          presumably created by a previous call to this function */
1143       if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) &&
1144           ! NILP (Fcoding_system_alias_p (subsidiary_dos))  &&
1145           ! NILP (Fcoding_system_alias_p (subsidiary_mac)))
1146         {
1147           Fdefine_coding_system_alias (subsidiary_unix, Qnil);
1148           Fdefine_coding_system_alias (subsidiary_dos,  Qnil);
1149           Fdefine_coding_system_alias (subsidiary_mac,  Qnil);
1150         }
1151
1152       /* Undefine dangling coding system aliases. */
1153       {
1154         int dangling_aliases;
1155
1156         do {
1157           dangling_aliases = 0;
1158           elisp_map_remhash (dangling_coding_system_alias_p,
1159                              Vcoding_system_hash_table,
1160                              &dangling_aliases);
1161         } while (dangling_aliases > 0);
1162       }
1163
1164       return Qnil;
1165     }
1166
1167   if (CODING_SYSTEMP (aliasee))
1168     aliasee = XCODING_SYSTEM_NAME (aliasee);
1169
1170   /* Checks that aliasee names a coding-system */
1171   real_coding_system = Fget_coding_system (aliasee);
1172
1173   /* Check for coding system alias loops */
1174   if (EQ (alias, aliasee))
1175     alias_loop: signal_simple_error_2
1176       ("Attempt to create a coding system alias loop", alias, aliasee);
1177
1178   for (probe = aliasee;
1179        SYMBOLP (probe);
1180        probe = Fgethash (probe, Vcoding_system_hash_table, Qzero))
1181     {
1182       if (EQ (probe, alias))
1183         goto alias_loop;
1184     }
1185
1186   Fputhash (alias, aliasee, Vcoding_system_hash_table);
1187
1188   /* Set up aliases for subsidiaries.
1189      #### There must be a better way to handle subsidiary coding systems. */
1190   {
1191     static const char *suffixes[] = { "-unix", "-dos", "-mac" };
1192     int i;
1193     for (i = 0; i < countof (suffixes); i++)
1194       {
1195         Lisp_Object alias_subsidiary =
1196           append_suffix_to_symbol (alias, suffixes[i]);
1197         Lisp_Object aliasee_subsidiary =
1198           append_suffix_to_symbol (aliasee, suffixes[i]);
1199
1200         if (! NILP (Ffind_coding_system (aliasee_subsidiary)))
1201           Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary);
1202       }
1203   }
1204   /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1205      but it doesn't look intentional, so I'd rather return something
1206      meaningful or nothing at all. */
1207   return Qnil;
1208 }
1209
1210 static Lisp_Object
1211 subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type)
1212 {
1213   Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1214   Lisp_Object new_coding_system;
1215
1216   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1217     return coding_system;
1218
1219   switch (type)
1220     {
1221     case EOL_AUTODETECT: return coding_system;
1222     case EOL_LF:   new_coding_system = CODING_SYSTEM_EOL_LF   (cs); break;
1223     case EOL_CR:   new_coding_system = CODING_SYSTEM_EOL_CR   (cs); break;
1224     case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1225     default:       abort (); return Qnil;
1226     }
1227
1228   return NILP (new_coding_system) ? coding_system : new_coding_system;
1229 }
1230
1231 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1232 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1233 */
1234        (coding_system, eol_type))
1235 {
1236   coding_system = Fget_coding_system (coding_system);
1237
1238   return subsidiary_coding_system (coding_system,
1239                                    symbol_to_eol_type (eol_type));
1240 }
1241
1242 \f
1243 /************************************************************************/
1244 /*                         Coding system accessors                      */
1245 /************************************************************************/
1246
1247 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1248 Return the doc string for CODING-SYSTEM.
1249 */
1250        (coding_system))
1251 {
1252   coding_system = Fget_coding_system (coding_system);
1253   return XCODING_SYSTEM_DOC_STRING (coding_system);
1254 }
1255
1256 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1257 Return the type of CODING-SYSTEM.
1258 */
1259        (coding_system))
1260 {
1261   switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1262     {
1263     default: abort ();
1264     case CODESYS_AUTODETECT:    return Qundecided;
1265 #ifdef MULE
1266     case CODESYS_SHIFT_JIS:     return Qshift_jis;
1267     case CODESYS_ISO2022:       return Qiso2022;
1268     case CODESYS_BIG5:          return Qbig5;
1269     case CODESYS_UCS4:          return Qucs4;
1270     case CODESYS_UTF8:          return Qutf8;
1271     case CODESYS_CCL:           return Qccl;
1272 #endif
1273     case CODESYS_NO_CONVERSION: return Qno_conversion;
1274 #ifdef DEBUG_XEMACS
1275     case CODESYS_INTERNAL:      return Qinternal;
1276 #endif
1277     }
1278 }
1279
1280 #ifdef MULE
1281 static
1282 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1283 {
1284   Lisp_Object cs
1285     = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1286
1287   return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1288 }
1289
1290 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1291 Return initial charset of CODING-SYSTEM designated to GNUM.
1292 GNUM allows 0 .. 3.
1293 */
1294        (coding_system, gnum))
1295 {
1296   coding_system = Fget_coding_system (coding_system);
1297   CHECK_INT (gnum);
1298
1299   return coding_system_charset (coding_system, XINT (gnum));
1300 }
1301 #endif /* MULE */
1302
1303 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1304 Return the PROP property of CODING-SYSTEM.
1305 */
1306        (coding_system, prop))
1307 {
1308   int i, ok = 0;
1309   enum coding_system_type type;
1310
1311   coding_system = Fget_coding_system (coding_system);
1312   CHECK_SYMBOL (prop);
1313   type = XCODING_SYSTEM_TYPE (coding_system);
1314
1315   for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1316     if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1317       {
1318         ok = 1;
1319         switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1320           {
1321           case CODESYS_PROP_ALL_OK:
1322             break;
1323 #ifdef MULE
1324           case CODESYS_PROP_ISO2022:
1325             if (type != CODESYS_ISO2022)
1326               signal_simple_error
1327                 ("Property only valid in ISO2022 coding systems",
1328                  prop);
1329             break;
1330
1331           case CODESYS_PROP_CCL:
1332             if (type != CODESYS_CCL)
1333               signal_simple_error
1334                 ("Property only valid in CCL coding systems",
1335                  prop);
1336             break;
1337 #endif /* MULE */
1338           default:
1339             abort ();
1340           }
1341       }
1342
1343   if (!ok)
1344     signal_simple_error ("Unrecognized property", prop);
1345
1346   if (EQ (prop, Qname))
1347     return XCODING_SYSTEM_NAME (coding_system);
1348   else if (EQ (prop, Qtype))
1349     return Fcoding_system_type (coding_system);
1350   else if (EQ (prop, Qdoc_string))
1351     return XCODING_SYSTEM_DOC_STRING (coding_system);
1352   else if (EQ (prop, Qmnemonic))
1353     return XCODING_SYSTEM_MNEMONIC (coding_system);
1354   else if (EQ (prop, Qeol_type))
1355     return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1356   else if (EQ (prop, Qeol_lf))
1357     return XCODING_SYSTEM_EOL_LF (coding_system);
1358   else if (EQ (prop, Qeol_crlf))
1359     return XCODING_SYSTEM_EOL_CRLF (coding_system);
1360   else if (EQ (prop, Qeol_cr))
1361     return XCODING_SYSTEM_EOL_CR (coding_system);
1362   else if (EQ (prop, Qpost_read_conversion))
1363     return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1364   else if (EQ (prop, Qpre_write_conversion))
1365     return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1366 #ifdef MULE
1367   else if (type == CODESYS_ISO2022)
1368     {
1369       if (EQ (prop, Qcharset_g0))
1370         return coding_system_charset (coding_system, 0);
1371       else if (EQ (prop, Qcharset_g1))
1372         return coding_system_charset (coding_system, 1);
1373       else if (EQ (prop, Qcharset_g2))
1374         return coding_system_charset (coding_system, 2);
1375       else if (EQ (prop, Qcharset_g3))
1376         return coding_system_charset (coding_system, 3);
1377
1378 #define FORCE_CHARSET(charset_num) \
1379   (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1380    (coding_system, charset_num) ? Qt : Qnil)
1381
1382       else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1383       else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1384       else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1385       else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1386
1387 #define LISP_BOOLEAN(prop) \
1388   (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1389
1390       else if (EQ (prop, Qshort))         return LISP_BOOLEAN (SHORT);
1391       else if (EQ (prop, Qno_ascii_eol))  return LISP_BOOLEAN (NO_ASCII_EOL);
1392       else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1393       else if (EQ (prop, Qseven))         return LISP_BOOLEAN (SEVEN);
1394       else if (EQ (prop, Qlock_shift))    return LISP_BOOLEAN (LOCK_SHIFT);
1395       else if (EQ (prop, Qno_iso6429))    return LISP_BOOLEAN (NO_ISO6429);
1396       else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1397
1398       else if (EQ (prop, Qinput_charset_conversion))
1399         return
1400           unparse_charset_conversion_specs
1401             (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1402       else if (EQ (prop, Qoutput_charset_conversion))
1403         return
1404           unparse_charset_conversion_specs
1405             (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1406       else
1407         abort ();
1408     }
1409   else if (type == CODESYS_CCL)
1410     {
1411       if (EQ (prop, Qdecode))
1412         return XCODING_SYSTEM_CCL_DECODE (coding_system);
1413       else if (EQ (prop, Qencode))
1414         return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1415       else
1416         abort ();
1417     }
1418 #endif /* MULE */
1419   else
1420     abort ();
1421
1422   return Qnil; /* not reached */
1423 }
1424
1425 \f
1426 /************************************************************************/
1427 /*                       Coding category functions                      */
1428 /************************************************************************/
1429
1430 static int
1431 decode_coding_category (Lisp_Object symbol)
1432 {
1433   int i;
1434
1435   CHECK_SYMBOL (symbol);
1436   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1437     if (EQ (coding_category_symbol[i], symbol))
1438       return i;
1439
1440   signal_simple_error ("Unrecognized coding category", symbol);
1441   return 0; /* not reached */
1442 }
1443
1444 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1445 Return a list of all recognized coding categories.
1446 */
1447        ())
1448 {
1449   int i;
1450   Lisp_Object list = Qnil;
1451
1452   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1453     list = Fcons (coding_category_symbol[i], list);
1454   return list;
1455 }
1456
1457 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1458 Change the priority order of the coding categories.
1459 LIST should be list of coding categories, in descending order of
1460 priority.  Unspecified coding categories will be lower in priority
1461 than all specified ones, in the same relative order they were in
1462 previously.
1463 */
1464        (list))
1465 {
1466   int category_to_priority[CODING_CATEGORY_LAST];
1467   int i, j;
1468   Lisp_Object rest;
1469
1470   /* First generate a list that maps coding categories to priorities. */
1471
1472   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1473     category_to_priority[i] = -1;
1474
1475   /* Highest priority comes from the specified list. */
1476   i = 0;
1477   EXTERNAL_LIST_LOOP (rest, list)
1478     {
1479       int cat = decode_coding_category (XCAR (rest));
1480
1481       if (category_to_priority[cat] >= 0)
1482         signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1483       category_to_priority[cat] = i++;
1484     }
1485
1486   /* Now go through the existing categories by priority to retrieve
1487      the categories not yet specified and preserve their priority
1488      order. */
1489   for (j = 0; j < CODING_CATEGORY_LAST; j++)
1490     {
1491       int cat = fcd->coding_category_by_priority[j];
1492       if (category_to_priority[cat] < 0)
1493         category_to_priority[cat] = i++;
1494     }
1495
1496   /* Now we need to construct the inverse of the mapping we just
1497      constructed. */
1498
1499   for (i = 0; i < CODING_CATEGORY_LAST; i++)
1500     fcd->coding_category_by_priority[category_to_priority[i]] = i;
1501
1502   /* Phew!  That was confusing. */
1503   return Qnil;
1504 }
1505
1506 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1507 Return a list of coding categories in descending order of priority.
1508 */
1509        ())
1510 {
1511   int i;
1512   Lisp_Object list = Qnil;
1513
1514   for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1515     list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1516                   list);
1517   return list;
1518 }
1519
1520 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1521 Change the coding system associated with a coding category.
1522 */
1523        (coding_category, coding_system))
1524 {
1525   int cat = decode_coding_category (coding_category);
1526
1527   coding_system = Fget_coding_system (coding_system);
1528   fcd->coding_category_system[cat] = coding_system;
1529   return Qnil;
1530 }
1531
1532 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1533 Return the coding system associated with a coding category.
1534 */
1535        (coding_category))
1536 {
1537   int cat = decode_coding_category (coding_category);
1538   Lisp_Object sys = fcd->coding_category_system[cat];
1539
1540   if (!NILP (sys))
1541     return XCODING_SYSTEM_NAME (sys);
1542   return Qnil;
1543 }
1544
1545 \f
1546 /************************************************************************/
1547 /*                     Detecting the encoding of data                   */
1548 /************************************************************************/
1549
1550 struct detection_state
1551 {
1552   eol_type_t eol_type;
1553   int seen_non_ascii;
1554   int mask;
1555 #ifdef MULE
1556   struct
1557     {
1558       int mask;
1559       int in_second_byte;
1560     }
1561   big5;
1562
1563   struct
1564     {
1565       int mask;
1566       int in_second_byte;
1567     }
1568   shift_jis;
1569
1570   struct
1571     {
1572       int mask;
1573       int in_byte;
1574   }
1575   ucs4;
1576
1577   struct
1578     {
1579       int mask;
1580       int in_byte;
1581     }
1582   utf8;
1583
1584   struct
1585     {
1586       int mask;
1587       int initted;
1588       struct iso2022_decoder iso;
1589       unsigned int flags;
1590       int high_byte_count;
1591       unsigned int saw_single_shift:1;
1592     }
1593   iso2022;
1594 #endif
1595   struct
1596     {
1597       int seen_anything;
1598       int just_saw_cr;
1599     }
1600   eol;
1601 };
1602
1603 static int
1604 acceptable_control_char_p (int c)
1605 {
1606   switch (c)
1607     {
1608       /* Allow and ignore control characters that you might
1609          reasonably see in a text file */
1610     case '\r':
1611     case '\n':
1612     case '\t':
1613     case  7: /* bell */
1614     case  8: /* backspace */
1615     case 11: /* vertical tab */
1616     case 12: /* form feed */
1617     case 26: /* MS-DOS C-z junk */
1618     case 31: /* '^_' -- for info */
1619       return 1;
1620     default:
1621       return 0;
1622     }
1623 }
1624
1625 static int
1626 mask_has_at_most_one_bit_p (int mask)
1627 {
1628   /* Perhaps the only thing useful you learn from intensive Microsoft
1629      technical interviews */
1630   return (mask & (mask - 1)) == 0;
1631 }
1632
1633 static eol_type_t
1634 detect_eol_type (struct detection_state *st, const Extbyte *src,
1635                  size_t n)
1636 {
1637   while (n--)
1638     {
1639       unsigned char c = *(unsigned char *)src++;
1640       if (c == '\n')
1641         {
1642           if (st->eol.just_saw_cr)
1643             return EOL_CRLF;
1644           else if (st->eol.seen_anything)
1645             return EOL_LF;
1646         }
1647       else if (st->eol.just_saw_cr)
1648         return EOL_CR;
1649       else if (c == '\r')
1650         st->eol.just_saw_cr = 1;
1651       else
1652         st->eol.just_saw_cr = 0;
1653       st->eol.seen_anything = 1;
1654     }
1655
1656   return EOL_AUTODETECT;
1657 }
1658
1659 /* Attempt to determine the encoding and EOL type of the given text.
1660    Before calling this function for the first type, you must initialize
1661    st->eol_type as appropriate and initialize st->mask to ~0.
1662
1663    st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1664    not yet known.
1665
1666    st->mask holds the determined coding category mask, or ~0 if only
1667    ASCII has been seen so far.
1668
1669    Returns:
1670
1671    0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1672         is present in st->mask
1673    1 == definitive answers are here for both st->eol_type and st->mask
1674 */
1675
1676 static int
1677 detect_coding_type (struct detection_state *st, const Extbyte *src,
1678                     size_t n, int just_do_eol)
1679 {
1680   if (st->eol_type == EOL_AUTODETECT)
1681     st->eol_type = detect_eol_type (st, src, n);
1682
1683   if (just_do_eol)
1684     return st->eol_type != EOL_AUTODETECT;
1685
1686   if (!st->seen_non_ascii)
1687     {
1688       for (; n; n--, src++)
1689         {
1690           unsigned char c = *(unsigned char *) src;
1691           if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1692             {
1693               st->seen_non_ascii = 1;
1694 #ifdef MULE
1695               st->shift_jis.mask = ~0;
1696               st->big5.mask = ~0;
1697               st->ucs4.mask = ~0;
1698               st->utf8.mask = ~0;
1699               st->iso2022.mask = ~0;
1700 #endif
1701               break;
1702             }
1703         }
1704     }
1705
1706   if (!n)
1707     return 0;
1708 #ifdef MULE
1709   if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1710     st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1711   if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1712     st->shift_jis.mask = detect_coding_sjis (st, src, n);
1713   if (!mask_has_at_most_one_bit_p (st->big5.mask))
1714     st->big5.mask = detect_coding_big5 (st, src, n);
1715   if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1716     st->utf8.mask = detect_coding_utf8 (st, src, n);
1717   if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1718     st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1719
1720   st->mask
1721     = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1722     | st->utf8.mask | st->ucs4.mask;
1723 #endif
1724   {
1725     int retval = mask_has_at_most_one_bit_p (st->mask);
1726     st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1727     return retval && st->eol_type != EOL_AUTODETECT;
1728   }
1729 }
1730
1731 static Lisp_Object
1732 coding_system_from_mask (int mask)
1733 {
1734   if (mask == ~0)
1735     {
1736       /* If the file was entirely or basically ASCII, use the
1737          default value of `buffer-file-coding-system'. */
1738       Lisp_Object retval =
1739         XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1740       if (!NILP (retval))
1741         {
1742           retval = Ffind_coding_system (retval);
1743           if (NILP (retval))
1744             {
1745               warn_when_safe
1746                 (Qbad_variable, Qwarning,
1747                  "Invalid `default-buffer-file-coding-system', set to nil");
1748               XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1749             }
1750         }
1751       if (NILP (retval))
1752         retval = Fget_coding_system (Qraw_text);
1753       return retval;
1754     }
1755   else
1756     {
1757       int i;
1758       int cat = -1;
1759 #ifdef MULE
1760       mask = postprocess_iso2022_mask (mask);
1761 #endif
1762       /* Look through the coding categories by priority and find
1763          the first one that is allowed. */
1764       for (i = 0; i < CODING_CATEGORY_LAST; i++)
1765         {
1766           cat = fcd->coding_category_by_priority[i];
1767           if ((mask & (1 << cat)) &&
1768               !NILP (fcd->coding_category_system[cat]))
1769             break;
1770         }
1771       if (cat >= 0)
1772         return fcd->coding_category_system[cat];
1773       else
1774         return Fget_coding_system (Qraw_text);
1775     }
1776 }
1777
1778 /* Given a seekable read stream and potential coding system and EOL type
1779    as specified, do any autodetection that is called for.  If the
1780    coding system and/or EOL type are not `autodetect', they will be left
1781    alone; but this function will never return an autodetect coding system
1782    or EOL type.
1783
1784    This function does not automatically fetch subsidiary coding systems;
1785    that should be unnecessary with the explicit eol-type argument. */
1786
1787 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1788
1789 void
1790 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1791                               eol_type_t *eol_type_in_out)
1792 {
1793   struct detection_state decst;
1794
1795   if (*eol_type_in_out == EOL_AUTODETECT)
1796     *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1797
1798   xzero (decst);
1799   decst.eol_type = *eol_type_in_out;
1800   decst.mask = ~0;
1801
1802   /* If autodetection is called for, do it now. */
1803   if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1804       || *eol_type_in_out == EOL_AUTODETECT)
1805     {
1806       Extbyte buf[4096];
1807       Lisp_Object coding_system = Qnil;
1808       Extbyte *p;
1809       ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1810       Extbyte *scan_end;
1811
1812       /* Look for initial "-*-"; mode line prefix */
1813       for (p = buf,
1814              scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1815            p <= scan_end
1816              && *p != '\n'
1817              && *p != '\r';
1818            p++)
1819         if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1820           {
1821             Extbyte *local_vars_beg = p + 3;
1822             /* Look for final "-*-"; mode line suffix */
1823             for (p = local_vars_beg,
1824                    scan_end = buf + nread - LENGTH ("-*-");
1825                  p <= scan_end
1826                    && *p != '\n'
1827                    && *p != '\r';
1828                  p++)
1829               if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1830                 {
1831                   Extbyte *suffix = p;
1832                   /* Look for "coding:" */
1833                   for (p = local_vars_beg,
1834                          scan_end = suffix - LENGTH ("coding:?");
1835                        p <= scan_end;
1836                        p++)
1837                     if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1838                         && (p == local_vars_beg
1839                             || (*(p-1) == ' '  ||
1840                                 *(p-1) == '\t' ||
1841                                 *(p-1) == ';')))
1842                       {
1843                         Extbyte save;
1844                         int n;
1845                         p += LENGTH ("coding:");
1846                         while (*p == ' ' || *p == '\t') p++;
1847
1848                         /* Get coding system name */
1849                         save = *suffix; *suffix = '\0';
1850                         /* Characters valid in a MIME charset name (rfc 1521),
1851                            and in a Lisp symbol name. */
1852                         n = strspn ( (char *) p,
1853                                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1854                                     "abcdefghijklmnopqrstuvwxyz"
1855                                     "0123456789"
1856                                     "!$%&*+-.^_{|}~");
1857                         *suffix = save;
1858                         if (n > 0)
1859                           {
1860                             save = p[n]; p[n] = '\0';
1861                             coding_system =
1862                               Ffind_coding_system (intern ((char *) p));
1863                             p[n] = save;
1864                           }
1865                         break;
1866                       }
1867                   break;
1868                 }
1869             break;
1870           }
1871
1872       if (NILP (coding_system))
1873         do
1874           {
1875             if (detect_coding_type (&decst, buf, nread,
1876                                     XCODING_SYSTEM_TYPE (*codesys_in_out)
1877                                     != CODESYS_AUTODETECT))
1878               break;
1879             nread = Lstream_read (stream, buf, sizeof (buf));
1880             if (nread == 0)
1881               break;
1882           }
1883         while (1);
1884
1885       else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1886                && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1887         do
1888           {
1889             if (detect_coding_type (&decst, buf, nread, 1))
1890               break;
1891             nread = Lstream_read (stream, buf, sizeof (buf));
1892             if (!nread)
1893               break;
1894           }
1895         while (1);
1896
1897       *eol_type_in_out = decst.eol_type;
1898       if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1899         {
1900           if (NILP (coding_system))
1901             *codesys_in_out = coding_system_from_mask (decst.mask);
1902           else
1903             *codesys_in_out = coding_system;
1904         }
1905     }
1906
1907   /* If we absolutely can't determine the EOL type, just assume LF. */
1908   if (*eol_type_in_out == EOL_AUTODETECT)
1909     *eol_type_in_out = EOL_LF;
1910
1911   Lstream_rewind (stream);
1912 }
1913
1914 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1915 Detect coding system of the text in the region between START and END.
1916 Return a list of possible coding systems ordered by priority.
1917 If only ASCII characters are found, return 'undecided or one of
1918 its subsidiary coding systems according to a detected end-of-line
1919 type.  Optional arg BUFFER defaults to the current buffer.
1920 */
1921        (start, end, buffer))
1922 {
1923   Lisp_Object val = Qnil;
1924   struct buffer *buf = decode_buffer (buffer, 0);
1925   Bufpos b, e;
1926   Lisp_Object instream, lb_instream;
1927   Lstream *istr, *lb_istr;
1928   struct detection_state decst;
1929   struct gcpro gcpro1, gcpro2;
1930
1931   get_buffer_range_char (buf, start, end, &b, &e, 0);
1932   lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1933   lb_istr = XLSTREAM (lb_instream);
1934   instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1935   istr = XLSTREAM (instream);
1936   GCPRO2 (instream, lb_instream);
1937   xzero (decst);
1938   decst.eol_type = EOL_AUTODETECT;
1939   decst.mask = ~0;
1940   while (1)
1941     {
1942       Extbyte random_buffer[4096];
1943       ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1944
1945       if (!nread)
1946         break;
1947       if (detect_coding_type (&decst, random_buffer, nread, 0))
1948         break;
1949     }
1950
1951   if (decst.mask == ~0)
1952     val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1953                                     decst.eol_type);
1954   else
1955     {
1956       int i;
1957
1958       val = Qnil;
1959 #ifdef MULE
1960       decst.mask = postprocess_iso2022_mask (decst.mask);
1961 #endif
1962       for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--)
1963         {
1964           int sys = fcd->coding_category_by_priority[i];
1965           if (decst.mask & (1 << sys))
1966             {
1967               Lisp_Object codesys = fcd->coding_category_system[sys];
1968               if (!NILP (codesys))
1969                 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1970               val = Fcons (codesys, val);
1971             }
1972         }
1973     }
1974   Lstream_close (istr);
1975   UNGCPRO;
1976   Lstream_delete (istr);
1977   Lstream_delete (lb_istr);
1978   return val;
1979 }
1980
1981 \f
1982 /************************************************************************/
1983 /*           Converting to internal Mule format ("decoding")            */
1984 /************************************************************************/
1985
1986 /* A decoding stream is a stream used for decoding text (i.e.
1987    converting from some external format to internal format).
1988    The decoding-stream object keeps track of the actual coding
1989    stream, the stream that is at the other end, and data that
1990    needs to be persistent across the lifetime of the stream. */
1991
1992 /* Handle the EOL stuff related to just-read-in character C.
1993    EOL_TYPE is the EOL type of the coding stream.
1994    FLAGS is the current value of FLAGS in the coding stream, and may
1995    be modified by this macro.  (The macro only looks at the
1996    CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded
1997    bytes are to be written.  You need to also define a local goto
1998    label "label_continue_loop" that is at the end of the main
1999    character-reading loop.
2000
2001    If C is a CR character, then this macro handles it entirely and
2002    jumps to label_continue_loop.  Otherwise, this macro does not add
2003    anything to DST, and continues normally.  You should continue
2004    processing C normally after this macro. */
2005
2006 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst)         \
2007 do {                                                            \
2008   if (c == '\r')                                                \
2009     {                                                           \
2010       if (eol_type == EOL_CR)                                   \
2011         Dynarr_add (dst, '\n');                                 \
2012       else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
2013         Dynarr_add (dst, c);                                    \
2014       else                                                      \
2015         flags |= CODING_STATE_CR;                               \
2016       goto label_continue_loop;                                 \
2017     }                                                           \
2018   else if (flags & CODING_STATE_CR)                             \
2019     {   /* eol_type == CODING_SYSTEM_EOL_CRLF */                \
2020       if (c != '\n')                                            \
2021         Dynarr_add (dst, '\r');                                 \
2022       flags &= ~CODING_STATE_CR;                                \
2023     }                                                           \
2024 } while (0)
2025
2026 /* C should be a binary character in the range 0 - 255; convert
2027    to internal format and add to Dynarr DST. */
2028
2029 #ifdef UTF2000
2030 #define DECODE_ADD_BINARY_CHAR(c, dst) \
2031 do {                                            \
2032   if (BYTE_ASCII_P (c))                         \
2033     Dynarr_add (dst, c);                        \
2034   else                                          \
2035     {                                           \
2036       Dynarr_add (dst, (c >> 6) | 0xc0);        \
2037       Dynarr_add (dst, (c & 0x3f) | 0x80);      \
2038     }                                           \
2039 } while (0)
2040
2041 INLINE void
2042 DECODE_ADD_UCS_CHAR(Emchar c, unsigned_char_dynarr* dst)
2043 {
2044   if ( c <= 0x7f )
2045     {
2046       Dynarr_add (dst, c);
2047     }
2048   else if ( c <= 0x7ff )
2049     {
2050       Dynarr_add (dst, (c >> 6) | 0xc0);
2051       Dynarr_add (dst, (c & 0x3f) | 0x80);
2052     }
2053   else if ( c <= 0xffff )
2054     {
2055       Dynarr_add (dst,  (c >> 12) | 0xe0);
2056       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2057       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2058     }
2059   else if ( c <= 0x1fffff )
2060     {
2061       Dynarr_add (dst,  (c >> 18) | 0xf0);
2062       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2063       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2064       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2065     }
2066   else if ( c <= 0x3ffffff )
2067     {
2068       Dynarr_add (dst,  (c >> 24) | 0xf8);
2069       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2070       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2071       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2072       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2073     }
2074   else
2075     {
2076       Dynarr_add (dst,  (c >> 30) | 0xfc);
2077       Dynarr_add (dst, ((c >> 24) & 0x3f) | 0x80);
2078       Dynarr_add (dst, ((c >> 18) & 0x3f) | 0x80);
2079       Dynarr_add (dst, ((c >> 12) & 0x3f) | 0x80);
2080       Dynarr_add (dst, ((c >>  6) & 0x3f) | 0x80);
2081       Dynarr_add (dst,  (c        & 0x3f) | 0x80);
2082     }
2083 }
2084 #else
2085 #define DECODE_ADD_BINARY_CHAR(c, dst)          \
2086 do {                                            \
2087   if (BYTE_ASCII_P (c))                         \
2088     Dynarr_add (dst, c);                        \
2089   else if (BYTE_C1_P (c))                       \
2090     {                                           \
2091       Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
2092       Dynarr_add (dst, c + 0x20);               \
2093     }                                           \
2094   else                                          \
2095     {                                           \
2096       Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
2097       Dynarr_add (dst, c);                      \
2098     }                                           \
2099 } while (0)
2100 #endif
2101
2102 #define DECODE_OUTPUT_PARTIAL_CHAR(ch)  \
2103 do {                                    \
2104   if (ch)                               \
2105     {                                   \
2106       DECODE_ADD_BINARY_CHAR (ch, dst); \
2107       ch = 0;                           \
2108     }                                   \
2109 } while (0)
2110
2111 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
2112 do {                                    \
2113   if (flags & CODING_STATE_END)         \
2114     {                                   \
2115       DECODE_OUTPUT_PARTIAL_CHAR (ch);  \
2116       if (flags & CODING_STATE_CR)      \
2117         Dynarr_add (dst, '\r');         \
2118     }                                   \
2119 } while (0)
2120
2121 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
2122
2123 struct decoding_stream
2124 {
2125   /* Coding system that governs the conversion. */
2126   Lisp_Coding_System *codesys;
2127
2128   /* Stream that we read the encoded data from or
2129      write the decoded data to. */
2130   Lstream *other_end;
2131
2132   /* If we are reading, then we can return only a fixed amount of
2133      data, so if the conversion resulted in too much data, we store it
2134      here for retrieval the next time around. */
2135   unsigned_char_dynarr *runoff;
2136
2137   /* FLAGS holds flags indicating the current state of the decoding.
2138      Some of these flags are dependent on the coding system. */
2139   unsigned int flags;
2140
2141   /* CH holds a partially built-up character.  Since we only deal
2142      with one- and two-byte characters at the moment, we only use
2143      this to store the first byte of a two-byte character. */
2144   unsigned int ch;
2145
2146   /* EOL_TYPE specifies the type of end-of-line conversion that
2147      currently applies.  We need to keep this separate from the
2148      EOL type stored in CODESYS because the latter might indicate
2149      automatic EOL-type detection while the former will always
2150      indicate a particular EOL type. */
2151   eol_type_t eol_type;
2152 #ifdef MULE
2153   /* Additional ISO2022 information.  We define the structure above
2154      because it's also needed by the detection routines. */
2155   struct iso2022_decoder iso2022;
2156
2157   /* Additional information (the state of the running CCL program)
2158      used by the CCL decoder. */
2159   struct ccl_program ccl;
2160
2161   /* counter for UTF-8 or UCS-4 */
2162   unsigned char counter;
2163 #endif
2164   struct detection_state decst;
2165 };
2166
2167 static ssize_t decoding_reader (Lstream *stream,
2168                                 unsigned char *data, size_t size);
2169 static ssize_t decoding_writer (Lstream *stream,
2170                                 const unsigned char *data, size_t size);
2171 static int decoding_rewinder   (Lstream *stream);
2172 static int decoding_seekable_p (Lstream *stream);
2173 static int decoding_flusher    (Lstream *stream);
2174 static int decoding_closer     (Lstream *stream);
2175
2176 static Lisp_Object decoding_marker (Lisp_Object stream);
2177
2178 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
2179                                sizeof (struct decoding_stream));
2180
2181 static Lisp_Object
2182 decoding_marker (Lisp_Object stream)
2183 {
2184   Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2185   Lisp_Object str_obj;
2186
2187   /* We do not need to mark the coding systems or charsets stored
2188      within the stream because they are stored in a global list
2189      and automatically marked. */
2190
2191   XSETLSTREAM (str_obj, str);
2192   mark_object (str_obj);
2193   if (str->imp->marker)
2194     return (str->imp->marker) (str_obj);
2195   else
2196     return Qnil;
2197 }
2198
2199 /* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
2200    so we read data from the other end, decode it, and store it into DATA. */
2201
2202 static ssize_t
2203 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2204 {
2205   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2206   unsigned char *orig_data = data;
2207   ssize_t read_size;
2208   int error_occurred = 0;
2209
2210   /* We need to interface to mule_decode(), which expects to take some
2211      amount of data and store the result into a Dynarr.  We have
2212      mule_decode() store into str->runoff, and take data from there
2213      as necessary. */
2214
2215   /* We loop until we have enough data, reading chunks from the other
2216      end and decoding it. */
2217   while (1)
2218     {
2219       /* Take data from the runoff if we can.  Make sure to take at
2220          most SIZE bytes, and delete the data from the runoff. */
2221       if (Dynarr_length (str->runoff) > 0)
2222         {
2223           size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2224           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2225           Dynarr_delete_many (str->runoff, 0, chunk);
2226           data += chunk;
2227           size -= chunk;
2228         }
2229
2230       if (size == 0)
2231         break; /* No more room for data */
2232
2233       if (str->flags & CODING_STATE_END)
2234         /* This means that on the previous iteration, we hit the EOF on
2235            the other end.  We loop once more so that mule_decode() can
2236            output any final stuff it may be holding, or any "go back
2237            to a sane state" escape sequences. (This latter makes sense
2238            during encoding.) */
2239         break;
2240
2241       /* Exhausted the runoff, so get some more.  DATA has at least
2242          SIZE bytes left of storage in it, so it's OK to read directly
2243          into it.  (We'll be overwriting above, after we've decoded it
2244          into the runoff.) */
2245       read_size = Lstream_read (str->other_end, data, size);
2246       if (read_size < 0)
2247         {
2248           error_occurred = 1;
2249           break;
2250         }
2251       if (read_size == 0)
2252         /* There might be some more end data produced in the translation.
2253            See the comment above. */
2254         str->flags |= CODING_STATE_END;
2255       mule_decode (stream, (Extbyte *) data, str->runoff, read_size);
2256     }
2257
2258   if (data - orig_data == 0)
2259     return error_occurred ? -1 : 0;
2260   else
2261     return data - orig_data;
2262 }
2263
2264 static ssize_t
2265 decoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2266 {
2267   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2268   ssize_t retval;
2269
2270   /* Decode all our data into the runoff, and then attempt to write
2271      it all out to the other end.  Remove whatever chunk we succeeded
2272      in writing. */
2273   mule_decode (stream, (Extbyte *) data, str->runoff, size);
2274   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2275                           Dynarr_length (str->runoff));
2276   if (retval > 0)
2277     Dynarr_delete_many (str->runoff, 0, retval);
2278   /* Do NOT return retval.  The return value indicates how much
2279      of the incoming data was written, not how many bytes were
2280      written. */
2281   return size;
2282 }
2283
2284 static void
2285 reset_decoding_stream (struct decoding_stream *str)
2286 {
2287 #ifdef MULE
2288   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2289     {
2290       Lisp_Object coding_system;
2291       XSETCODING_SYSTEM (coding_system, str->codesys);
2292       reset_iso2022 (coding_system, &str->iso2022);
2293     }
2294   else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2295     {
2296       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2297     }
2298   str->counter = 0;
2299 #endif /* MULE */
2300   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT
2301       || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT)
2302     {
2303       xzero (str->decst);
2304       str->decst.eol_type = EOL_AUTODETECT;
2305       str->decst.mask = ~0;
2306     }
2307   str->flags = str->ch = 0;
2308 }
2309
2310 static int
2311 decoding_rewinder (Lstream *stream)
2312 {
2313   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2314   reset_decoding_stream (str);
2315   Dynarr_reset (str->runoff);
2316   return Lstream_rewind (str->other_end);
2317 }
2318
2319 static int
2320 decoding_seekable_p (Lstream *stream)
2321 {
2322   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2323   return Lstream_seekable_p (str->other_end);
2324 }
2325
2326 static int
2327 decoding_flusher (Lstream *stream)
2328 {
2329   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2330   return Lstream_flush (str->other_end);
2331 }
2332
2333 static int
2334 decoding_closer (Lstream *stream)
2335 {
2336   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2337   if (stream->flags & LSTREAM_FL_WRITE)
2338     {
2339       str->flags |= CODING_STATE_END;
2340       decoding_writer (stream, 0, 0);
2341     }
2342   Dynarr_free (str->runoff);
2343 #ifdef MULE
2344 #ifdef ENABLE_COMPOSITE_CHARS
2345   if (str->iso2022.composite_chars)
2346     Dynarr_free (str->iso2022.composite_chars);
2347 #endif
2348 #endif
2349   return Lstream_close (str->other_end);
2350 }
2351
2352 Lisp_Object
2353 decoding_stream_coding_system (Lstream *stream)
2354 {
2355   Lisp_Object coding_system;
2356   struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2357
2358   XSETCODING_SYSTEM (coding_system, str->codesys);
2359   return subsidiary_coding_system (coding_system, str->eol_type);
2360 }
2361
2362 void
2363 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2364 {
2365   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2366   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2367   str->codesys = cs;
2368   if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2369     str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2370   reset_decoding_stream (str);
2371 }
2372
2373 /* WARNING WARNING WARNING WARNING!!!!!  If you open up a decoding
2374    stream for writing, no automatic code detection will be performed.
2375    The reason for this is that automatic code detection requires a
2376    seekable input.  Things will also fail if you open a decoding
2377    stream for reading using a non-fully-specified coding system and
2378    a non-seekable input stream. */
2379
2380 static Lisp_Object
2381 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2382                         const char *mode)
2383 {
2384   Lstream *lstr = Lstream_new (lstream_decoding, mode);
2385   struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2386   Lisp_Object obj;
2387
2388   xzero (*str);
2389   str->other_end = stream;
2390   str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2391   str->eol_type = EOL_AUTODETECT;
2392   if (!strcmp (mode, "r")
2393       && Lstream_seekable_p (stream))
2394     /* We can determine the coding system now. */
2395     determine_real_coding_system (stream, &codesys, &str->eol_type);
2396   set_decoding_stream_coding_system (lstr, codesys);
2397   str->decst.eol_type = str->eol_type;
2398   str->decst.mask = ~0;
2399   XSETLSTREAM (obj, lstr);
2400   return obj;
2401 }
2402
2403 Lisp_Object
2404 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2405 {
2406   return make_decoding_stream_1 (stream, codesys, "r");
2407 }
2408
2409 Lisp_Object
2410 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2411 {
2412   return make_decoding_stream_1 (stream, codesys, "w");
2413 }
2414
2415 /* Note: the decode_coding_* functions all take the same
2416    arguments as mule_decode(), which is to say some SRC data of
2417    size N, which is to be stored into dynamic array DST.
2418    DECODING is the stream within which the decoding is
2419    taking place, but no data is actually read from or
2420    written to that stream; that is handled in decoding_reader()
2421    or decoding_writer().  This allows the same functions to
2422    be used for both reading and writing. */
2423
2424 static void
2425 mule_decode (Lstream *decoding, const Extbyte *src,
2426              unsigned_char_dynarr *dst, size_t n)
2427 {
2428   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2429
2430   /* If necessary, do encoding-detection now.  We do this when
2431      we're a writing stream or a non-seekable reading stream,
2432      meaning that we can't just process the whole input,
2433      rewind, and start over. */
2434
2435   if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2436       str->eol_type == EOL_AUTODETECT)
2437     {
2438       Lisp_Object codesys;
2439
2440       XSETCODING_SYSTEM (codesys, str->codesys);
2441       detect_coding_type (&str->decst, src, n,
2442                           CODING_SYSTEM_TYPE (str->codesys) !=
2443                           CODESYS_AUTODETECT);
2444       if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2445           str->decst.mask != ~0)
2446         /* #### This is cheesy.  What we really ought to do is
2447            buffer up a certain amount of data so as to get a
2448            less random result. */
2449         codesys = coding_system_from_mask (str->decst.mask);
2450       str->eol_type = str->decst.eol_type;
2451       if (XCODING_SYSTEM (codesys) != str->codesys)
2452         {
2453           /* Preserve the CODING_STATE_END flag in case it was set.
2454              If we erase it, bad things might happen. */
2455           int was_end = str->flags & CODING_STATE_END;
2456           set_decoding_stream_coding_system (decoding, codesys);
2457           if (was_end)
2458             str->flags |= CODING_STATE_END;
2459         }
2460     }
2461
2462   switch (CODING_SYSTEM_TYPE (str->codesys))
2463     {
2464 #ifdef DEBUG_XEMACS
2465     case CODESYS_INTERNAL:
2466       Dynarr_add_many (dst, src, n);
2467       break;
2468 #endif
2469     case CODESYS_AUTODETECT:
2470       /* If we got this far and still haven't decided on the coding
2471          system, then do no conversion. */
2472     case CODESYS_NO_CONVERSION:
2473       decode_coding_no_conversion (decoding, src, dst, n);
2474       break;
2475 #ifdef MULE
2476     case CODESYS_SHIFT_JIS:
2477       decode_coding_sjis (decoding, src, dst, n);
2478       break;
2479     case CODESYS_BIG5:
2480       decode_coding_big5 (decoding, src, dst, n);
2481       break;
2482     case CODESYS_UCS4:
2483       decode_coding_ucs4 (decoding, src, dst, n);
2484       break;
2485     case CODESYS_UTF8:
2486       decode_coding_utf8 (decoding, src, dst, n);
2487       break;
2488     case CODESYS_CCL:
2489       str->ccl.last_block = str->flags & CODING_STATE_END;
2490       /* When applying ccl program to stream, MUST NOT set NULL
2491          pointer to src.  */
2492       ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""),
2493                   dst, n, 0, CCL_MODE_DECODING);
2494       break;
2495     case CODESYS_ISO2022:
2496       decode_coding_iso2022 (decoding, src, dst, n);
2497       break;
2498 #endif /* MULE */
2499     default:
2500       abort ();
2501     }
2502 }
2503
2504 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2505 Decode the text between START and END which is encoded in CODING-SYSTEM.
2506 This is useful if you've read in encoded text from a file without decoding
2507 it (e.g. you read in a JIS-formatted file but used the `binary' or
2508 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2509 Return length of decoded text.
2510 BUFFER defaults to the current buffer if unspecified.
2511 */
2512        (start, end, coding_system, buffer))
2513 {
2514   Bufpos b, e;
2515   struct buffer *buf = decode_buffer (buffer, 0);
2516   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2517   Lstream *istr, *ostr;
2518   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2519
2520   get_buffer_range_char (buf, start, end, &b, &e, 0);
2521
2522   barf_if_buffer_read_only (buf, b, e);
2523
2524   coding_system = Fget_coding_system (coding_system);
2525   instream = make_lisp_buffer_input_stream  (buf, b, e, 0);
2526   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2527   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2528                                               coding_system);
2529   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2530                                            Fget_coding_system (Qbinary));
2531   istr = XLSTREAM (instream);
2532   ostr = XLSTREAM (outstream);
2533   GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2534
2535   /* The chain of streams looks like this:
2536
2537      [BUFFER] <----- send through
2538                      ------> [ENCODE AS BINARY]
2539                              ------> [DECODE AS SPECIFIED]
2540                                      ------> [BUFFER]
2541    */
2542
2543   while (1)
2544     {
2545       char tempbuf[1024]; /* some random amount */
2546       Bufpos newpos, even_newer_pos;
2547       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2548       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2549
2550       if (!size_in_bytes)
2551         break;
2552       newpos = lisp_buffer_stream_startpos (istr);
2553       Lstream_write (ostr, tempbuf, size_in_bytes);
2554       even_newer_pos = lisp_buffer_stream_startpos (istr);
2555       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2556                            even_newer_pos, 0);
2557     }
2558   Lstream_close (istr);
2559   Lstream_close (ostr);
2560   UNGCPRO;
2561   Lstream_delete (istr);
2562   Lstream_delete (ostr);
2563   Lstream_delete (XLSTREAM (de_outstream));
2564   Lstream_delete (XLSTREAM (lb_outstream));
2565   return Qnil;
2566 }
2567
2568 \f
2569 /************************************************************************/
2570 /*           Converting to an external encoding ("encoding")            */
2571 /************************************************************************/
2572
2573 /* An encoding stream is an output stream.  When you create the
2574    stream, you specify the coding system that governs the encoding
2575    and another stream that the resulting encoded data is to be
2576    sent to, and then start sending data to it. */
2577
2578 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2579
2580 struct encoding_stream
2581 {
2582   /* Coding system that governs the conversion. */
2583   Lisp_Coding_System *codesys;
2584
2585   /* Stream that we read the encoded data from or
2586      write the decoded data to. */
2587   Lstream *other_end;
2588
2589   /* If we are reading, then we can return only a fixed amount of
2590      data, so if the conversion resulted in too much data, we store it
2591      here for retrieval the next time around. */
2592   unsigned_char_dynarr *runoff;
2593
2594   /* FLAGS holds flags indicating the current state of the encoding.
2595      Some of these flags are dependent on the coding system. */
2596   unsigned int flags;
2597
2598   /* CH holds a partially built-up character.  Since we only deal
2599      with one- and two-byte characters at the moment, we only use
2600      this to store the first byte of a two-byte character. */
2601   unsigned int ch;
2602 #ifdef MULE
2603   /* Additional information used by the ISO2022 encoder. */
2604   struct
2605     {
2606       /* CHARSET holds the character sets currently assigned to the G0
2607          through G3 registers.  It is initialized from the array
2608          INITIAL_CHARSET in CODESYS. */
2609       Lisp_Object charset[4];
2610
2611       /* Which registers are currently invoked into the left (GL) and
2612          right (GR) halves of the 8-bit encoding space? */
2613       int register_left, register_right;
2614
2615       /* Whether we need to explicitly designate the charset in the
2616          G? register before using it.  It is initialized from the
2617          array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2618       unsigned char force_charset_on_output[4];
2619
2620       /* Other state variables that need to be preserved across
2621          invocations. */
2622       Lisp_Object current_charset;
2623       int current_half;
2624       int current_char_boundary;
2625     } iso2022;
2626
2627   /* Additional information (the state of the running CCL program)
2628      used by the CCL encoder. */
2629   struct ccl_program ccl;
2630 #endif /* MULE */
2631 };
2632
2633 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2634 static ssize_t encoding_writer (Lstream *stream, const unsigned char *data,
2635                                 size_t size);
2636 static int encoding_rewinder   (Lstream *stream);
2637 static int encoding_seekable_p (Lstream *stream);
2638 static int encoding_flusher    (Lstream *stream);
2639 static int encoding_closer     (Lstream *stream);
2640
2641 static Lisp_Object encoding_marker (Lisp_Object stream);
2642
2643 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2644                                sizeof (struct encoding_stream));
2645
2646 static Lisp_Object
2647 encoding_marker (Lisp_Object stream)
2648 {
2649   Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2650   Lisp_Object str_obj;
2651
2652   /* We do not need to mark the coding systems or charsets stored
2653      within the stream because they are stored in a global list
2654      and automatically marked. */
2655
2656   XSETLSTREAM (str_obj, str);
2657   mark_object (str_obj);
2658   if (str->imp->marker)
2659     return (str->imp->marker) (str_obj);
2660   else
2661     return Qnil;
2662 }
2663
2664 /* Read SIZE bytes of data and store it into DATA.  We are a encoding stream
2665    so we read data from the other end, encode it, and store it into DATA. */
2666
2667 static ssize_t
2668 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2669 {
2670   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2671   unsigned char *orig_data = data;
2672   ssize_t read_size;
2673   int error_occurred = 0;
2674
2675   /* We need to interface to mule_encode(), which expects to take some
2676      amount of data and store the result into a Dynarr.  We have
2677      mule_encode() store into str->runoff, and take data from there
2678      as necessary. */
2679
2680   /* We loop until we have enough data, reading chunks from the other
2681      end and encoding it. */
2682   while (1)
2683     {
2684       /* Take data from the runoff if we can.  Make sure to take at
2685          most SIZE bytes, and delete the data from the runoff. */
2686       if (Dynarr_length (str->runoff) > 0)
2687         {
2688           int chunk = min ((int) size, Dynarr_length (str->runoff));
2689           memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2690           Dynarr_delete_many (str->runoff, 0, chunk);
2691           data += chunk;
2692           size -= chunk;
2693         }
2694
2695       if (size == 0)
2696         break; /* No more room for data */
2697
2698       if (str->flags & CODING_STATE_END)
2699         /* This means that on the previous iteration, we hit the EOF on
2700            the other end.  We loop once more so that mule_encode() can
2701            output any final stuff it may be holding, or any "go back
2702            to a sane state" escape sequences. (This latter makes sense
2703            during encoding.) */
2704         break;
2705
2706       /* Exhausted the runoff, so get some more.  DATA at least SIZE bytes
2707          left of storage in it, so it's OK to read directly into it.
2708          (We'll be overwriting above, after we've encoded it into the
2709          runoff.) */
2710       read_size = Lstream_read (str->other_end, data, size);
2711       if (read_size < 0)
2712         {
2713           error_occurred = 1;
2714           break;
2715         }
2716       if (read_size == 0)
2717         /* There might be some more end data produced in the translation.
2718            See the comment above. */
2719         str->flags |= CODING_STATE_END;
2720       mule_encode (stream, data, str->runoff, read_size);
2721     }
2722
2723   if (data == orig_data)
2724     return error_occurred ? -1 : 0;
2725   else
2726     return data - orig_data;
2727 }
2728
2729 static ssize_t
2730 encoding_writer (Lstream *stream, const unsigned char *data, size_t size)
2731 {
2732   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2733   ssize_t retval;
2734
2735   /* Encode all our data into the runoff, and then attempt to write
2736      it all out to the other end.  Remove whatever chunk we succeeded
2737      in writing. */
2738   mule_encode (stream, data, str->runoff, size);
2739   retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2740                           Dynarr_length (str->runoff));
2741   if (retval > 0)
2742     Dynarr_delete_many (str->runoff, 0, retval);
2743   /* Do NOT return retval.  The return value indicates how much
2744      of the incoming data was written, not how many bytes were
2745      written. */
2746   return size;
2747 }
2748
2749 static void
2750 reset_encoding_stream (struct encoding_stream *str)
2751 {
2752 #ifdef MULE
2753   switch (CODING_SYSTEM_TYPE (str->codesys))
2754     {
2755     case CODESYS_ISO2022:
2756       {
2757         int i;
2758
2759         for (i = 0; i < 4; i++)
2760           {
2761             str->iso2022.charset[i] =
2762               CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2763             str->iso2022.force_charset_on_output[i] =
2764               CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2765           }
2766         str->iso2022.register_left = 0;
2767         str->iso2022.register_right = 1;
2768         str->iso2022.current_charset = Qnil;
2769         str->iso2022.current_half = 0;
2770 #ifdef UTF2000
2771         str->iso2022.current_char_boundary = 0;
2772 #else
2773         str->iso2022.current_char_boundary = 1;
2774 #endif
2775         break;
2776       }
2777     case CODESYS_CCL:
2778       setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2779       break;
2780     default:
2781       break;
2782     }
2783 #endif /* MULE */
2784
2785   str->flags = str->ch = 0;
2786 }
2787
2788 static int
2789 encoding_rewinder (Lstream *stream)
2790 {
2791   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2792   reset_encoding_stream (str);
2793   Dynarr_reset (str->runoff);
2794   return Lstream_rewind (str->other_end);
2795 }
2796
2797 static int
2798 encoding_seekable_p (Lstream *stream)
2799 {
2800   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2801   return Lstream_seekable_p (str->other_end);
2802 }
2803
2804 static int
2805 encoding_flusher (Lstream *stream)
2806 {
2807   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2808   return Lstream_flush (str->other_end);
2809 }
2810
2811 static int
2812 encoding_closer (Lstream *stream)
2813 {
2814   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2815   if (stream->flags & LSTREAM_FL_WRITE)
2816     {
2817       str->flags |= CODING_STATE_END;
2818       encoding_writer (stream, 0, 0);
2819     }
2820   Dynarr_free (str->runoff);
2821   return Lstream_close (str->other_end);
2822 }
2823
2824 Lisp_Object
2825 encoding_stream_coding_system (Lstream *stream)
2826 {
2827   Lisp_Object coding_system;
2828   struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2829
2830   XSETCODING_SYSTEM (coding_system, str->codesys);
2831   return coding_system;
2832 }
2833
2834 void
2835 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2836 {
2837   Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2838   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2839   str->codesys = cs;
2840   reset_encoding_stream (str);
2841 }
2842
2843 static Lisp_Object
2844 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2845                         const char *mode)
2846 {
2847   Lstream *lstr = Lstream_new (lstream_encoding, mode);
2848   struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2849   Lisp_Object obj;
2850
2851   xzero (*str);
2852   str->runoff = Dynarr_new (unsigned_char);
2853   str->other_end = stream;
2854   set_encoding_stream_coding_system (lstr, codesys);
2855   XSETLSTREAM (obj, lstr);
2856   return obj;
2857 }
2858
2859 Lisp_Object
2860 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2861 {
2862   return make_encoding_stream_1 (stream, codesys, "r");
2863 }
2864
2865 Lisp_Object
2866 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2867 {
2868   return make_encoding_stream_1 (stream, codesys, "w");
2869 }
2870
2871 /* Convert N bytes of internally-formatted data stored in SRC to an
2872    external format, according to the encoding stream ENCODING.
2873    Store the encoded data into DST. */
2874
2875 static void
2876 mule_encode (Lstream *encoding, const Bufbyte *src,
2877              unsigned_char_dynarr *dst, size_t n)
2878 {
2879   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2880
2881   switch (CODING_SYSTEM_TYPE (str->codesys))
2882     {
2883 #ifdef DEBUG_XEMACS
2884     case CODESYS_INTERNAL:
2885       Dynarr_add_many (dst, src, n);
2886       break;
2887 #endif
2888     case CODESYS_AUTODETECT:
2889       /* If we got this far and still haven't decided on the coding
2890          system, then do no conversion. */
2891     case CODESYS_NO_CONVERSION:
2892       encode_coding_no_conversion (encoding, src, dst, n);
2893       break;
2894 #ifdef MULE
2895     case CODESYS_SHIFT_JIS:
2896       encode_coding_sjis (encoding, src, dst, n);
2897       break;
2898     case CODESYS_BIG5:
2899       encode_coding_big5 (encoding, src, dst, n);
2900       break;
2901     case CODESYS_UCS4:
2902       encode_coding_ucs4 (encoding, src, dst, n);
2903       break;
2904     case CODESYS_UTF8:
2905       encode_coding_utf8 (encoding, src, dst, n);
2906       break;
2907     case CODESYS_CCL:
2908       str->ccl.last_block = str->flags & CODING_STATE_END;
2909       /* When applying ccl program to stream, MUST NOT set NULL
2910          pointer to src.  */
2911       ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""),
2912                   dst, n, 0, CCL_MODE_ENCODING);
2913       break;
2914     case CODESYS_ISO2022:
2915       encode_coding_iso2022 (encoding, src, dst, n);
2916       break;
2917 #endif /* MULE */
2918     default:
2919       abort ();
2920     }
2921 }
2922
2923 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2924 Encode the text between START and END using CODING-SYSTEM.
2925 This will, for example, convert Japanese characters into stuff such as
2926 "^[$B!<!+^[(B" if you use the JIS encoding.  Return length of encoded
2927 text.  BUFFER defaults to the current buffer if unspecified.
2928 */
2929        (start, end, coding_system, buffer))
2930 {
2931   Bufpos b, e;
2932   struct buffer *buf = decode_buffer (buffer, 0);
2933   Lisp_Object instream, lb_outstream, de_outstream, outstream;
2934   Lstream *istr, *ostr;
2935   struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2936
2937   get_buffer_range_char (buf, start, end, &b, &e, 0);
2938
2939   barf_if_buffer_read_only (buf, b, e);
2940
2941   coding_system = Fget_coding_system (coding_system);
2942   instream  = make_lisp_buffer_input_stream  (buf, b, e, 0);
2943   lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2944   de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2945                                               Fget_coding_system (Qbinary));
2946   outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2947                                            coding_system);
2948   istr = XLSTREAM (instream);
2949   ostr = XLSTREAM (outstream);
2950   GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2951   /* The chain of streams looks like this:
2952
2953      [BUFFER] <----- send through
2954                      ------> [ENCODE AS SPECIFIED]
2955                              ------> [DECODE AS BINARY]
2956                                      ------> [BUFFER]
2957    */
2958   while (1)
2959     {
2960       char tempbuf[1024]; /* some random amount */
2961       Bufpos newpos, even_newer_pos;
2962       Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2963       ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2964
2965       if (!size_in_bytes)
2966         break;
2967       newpos = lisp_buffer_stream_startpos (istr);
2968       Lstream_write (ostr, tempbuf, size_in_bytes);
2969       even_newer_pos = lisp_buffer_stream_startpos (istr);
2970       buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2971                            even_newer_pos, 0);
2972     }
2973
2974   {
2975     Charcount retlen =
2976       lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2977     Lstream_close (istr);
2978     Lstream_close (ostr);
2979     UNGCPRO;
2980     Lstream_delete (istr);
2981     Lstream_delete (ostr);
2982     Lstream_delete (XLSTREAM (de_outstream));
2983     Lstream_delete (XLSTREAM (lb_outstream));
2984     return make_int (retlen);
2985   }
2986 }
2987
2988 #ifdef MULE
2989 \f
2990 /************************************************************************/
2991 /*                          Shift-JIS methods                           */
2992 /************************************************************************/
2993
2994 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2995    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2996    as is.  A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2997    encoded by "position-code + 0x80".  A character of JISX0208
2998    (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2999    position-codes are divided and shifted so that it fit in the range
3000    below.
3001
3002    --- CODE RANGE of Shift-JIS ---
3003    (character set)      (range)
3004    ASCII                0x00 .. 0x7F
3005    JISX0201-Kana        0xA0 .. 0xDF
3006    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
3007             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3008    -------------------------------
3009
3010 */
3011
3012 /* Is this the first byte of a Shift-JIS two-byte char? */
3013
3014 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
3015   (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
3016
3017 /* Is this the second byte of a Shift-JIS two-byte char? */
3018
3019 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
3020   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
3021
3022 #define BYTE_SJIS_KATAKANA_P(c) \
3023   ((c) >= 0xA1 && (c) <= 0xDF)
3024
3025 static int
3026 detect_coding_sjis (struct detection_state *st, const Extbyte *src, size_t n)
3027 {
3028   while (n--)
3029     {
3030       unsigned char c = *(unsigned char *)src++;
3031       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3032         return 0;
3033       if (st->shift_jis.in_second_byte)
3034         {
3035           st->shift_jis.in_second_byte = 0;
3036           if (c < 0x40)
3037             return 0;
3038         }
3039       else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
3040         st->shift_jis.in_second_byte = 1;
3041     }
3042   return CODING_CATEGORY_SHIFT_JIS_MASK;
3043 }
3044
3045 /* Convert Shift-JIS data to internal format. */
3046
3047 static void
3048 decode_coding_sjis (Lstream *decoding, const Extbyte *src,
3049                     unsigned_char_dynarr *dst, size_t n)
3050 {
3051   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3052   unsigned int flags  = str->flags;
3053   unsigned int ch     = str->ch;
3054   eol_type_t eol_type = str->eol_type;
3055
3056   while (n--)
3057     {
3058       unsigned char c = *(unsigned char *)src++;
3059
3060       if (ch)
3061         {
3062           /* Previous character was first byte of Shift-JIS Kanji char. */
3063           if (BYTE_SJIS_TWO_BYTE_2_P (c))
3064             {
3065               unsigned char e1, e2;
3066
3067               DECODE_SJIS (ch, c, e1, e2);
3068 #ifdef UTF2000
3069               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_japanese_jisx0208,
3070                                             e1 & 0x7F,
3071                                             e2 & 0x7F), dst);
3072 #else
3073               Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3074               Dynarr_add (dst, e1);
3075               Dynarr_add (dst, e2);
3076 #endif
3077             }
3078           else
3079             {
3080               DECODE_ADD_BINARY_CHAR (ch, dst);
3081               DECODE_ADD_BINARY_CHAR (c, dst);
3082             }
3083           ch = 0;
3084         }
3085       else
3086         {
3087           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3088           if (BYTE_SJIS_TWO_BYTE_1_P (c))
3089             ch = c;
3090           else if (BYTE_SJIS_KATAKANA_P (c))
3091             {
3092 #ifdef UTF2000
3093               DECODE_ADD_UCS_CHAR(MAKE_CHAR(Vcharset_katakana_jisx0201,
3094                                             c & 0x7F, 0), dst);
3095 #else
3096               Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
3097               Dynarr_add (dst, c);
3098 #endif
3099             }
3100           else
3101             DECODE_ADD_BINARY_CHAR (c, dst);
3102         }
3103     label_continue_loop:;
3104     }
3105
3106   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3107
3108   str->flags = flags;
3109   str->ch    = ch;
3110 }
3111
3112 /* Convert internally-formatted data to Shift-JIS. */
3113
3114 static void
3115 encode_coding_sjis (Lstream *encoding, const Bufbyte *src,
3116                     unsigned_char_dynarr *dst, size_t n)
3117 {
3118   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3119   unsigned int flags  = str->flags;
3120   unsigned int ch     = str->ch;
3121   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3122 #ifdef UTF2000
3123   unsigned char char_boundary = str->iso2022.current_char_boundary;
3124 #endif
3125
3126   while (n--)
3127     {
3128       Bufbyte c = *src++;
3129 #ifdef UTF2000
3130       switch (char_boundary)
3131         {
3132         case 0:
3133           if ( c >= 0xfc )
3134             {
3135               ch = c & 0x01;
3136               char_boundary = 5;
3137             }
3138           else if ( c >= 0xf8 )
3139             {
3140               ch = c & 0x03;
3141               char_boundary = 4;
3142             }
3143           else if ( c >= 0xf0 )
3144             {
3145               ch = c & 0x07;
3146               char_boundary = 3;
3147             }
3148           else if ( c >= 0xe0 )
3149             {
3150               ch = c & 0x0f;
3151               char_boundary = 2;
3152             }
3153           else if ( c >= 0xc0 )
3154             {
3155               ch = c & 0x1f;
3156               char_boundary = 1;
3157             }
3158           else
3159             {
3160               ch = 0;
3161               if (c == '\n')
3162                 {
3163                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3164                     Dynarr_add (dst, '\r');
3165                   if (eol_type != EOL_CR)
3166                     Dynarr_add (dst, c);
3167                 }
3168               else
3169                 Dynarr_add (dst, c);
3170               char_boundary = 0;
3171             }
3172           break;
3173         case 1:
3174           ch = ( ch << 6 ) | ( c & 0x3f );
3175           {
3176             Lisp_Object charset;
3177             unsigned int c1, c2, s1, s2;
3178
3179             BREAKUP_CHAR (ch, charset, c1, c2);
3180             if (EQ(charset, Vcharset_katakana_jisx0201))
3181               {
3182                 Dynarr_add (dst, c1 | 0x80);
3183               }
3184             else if (EQ(charset, Vcharset_japanese_jisx0208))
3185               {
3186                 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3187                 Dynarr_add (dst, s1);
3188                 Dynarr_add (dst, s2);
3189               }
3190           }
3191           char_boundary = 0;
3192           break;
3193         default:
3194           ch = ( ch << 6 ) | ( c & 0x3f );
3195           char_boundary--;
3196         }
3197 #else
3198       if (c == '\n')
3199         {
3200           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3201             Dynarr_add (dst, '\r');
3202           if (eol_type != EOL_CR)
3203             Dynarr_add (dst, '\n');
3204           ch = 0;
3205         }
3206       else if (BYTE_ASCII_P (c))
3207         {
3208           Dynarr_add (dst, c);
3209           ch = 0;
3210         }
3211       else if (BUFBYTE_LEADING_BYTE_P (c))
3212         ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
3213               c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3214               c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
3215       else if (ch)
3216         {
3217           if (ch == LEADING_BYTE_KATAKANA_JISX0201)
3218             {
3219               Dynarr_add (dst, c);
3220               ch = 0;
3221             }
3222           else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
3223                    ch == LEADING_BYTE_JAPANESE_JISX0208)
3224             ch = c;
3225           else
3226             {
3227               unsigned char j1, j2;
3228               ENCODE_SJIS (ch, c, j1, j2);
3229               Dynarr_add (dst, j1);
3230               Dynarr_add (dst, j2);
3231               ch = 0;
3232             }
3233         }
3234 #endif
3235     }
3236
3237   str->flags = flags;
3238   str->ch    = ch;
3239 #ifdef UTF2000
3240   str->iso2022.current_char_boundary = char_boundary;
3241 #endif
3242 }
3243
3244 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
3245 Decode a JISX0208 character of Shift-JIS coding-system.
3246 CODE is the character code in Shift-JIS as a cons of type bytes.
3247 Return the corresponding character.
3248 */
3249        (code))
3250 {
3251   unsigned char c1, c2, s1, s2;
3252
3253   CHECK_CONS (code);
3254   CHECK_INT (XCAR (code));
3255   CHECK_INT (XCDR (code));
3256   s1 = XINT (XCAR (code));
3257   s2 = XINT (XCDR (code));
3258   if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
3259       BYTE_SJIS_TWO_BYTE_2_P (s2))
3260     {
3261       DECODE_SJIS (s1, s2, c1, c2);
3262       return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
3263                                    c1 & 0x7F, c2 & 0x7F));
3264     }
3265   else
3266     return Qnil;
3267 }
3268
3269 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
3270 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
3271 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
3272 */
3273        (character))
3274 {
3275   Lisp_Object charset;
3276   int c1, c2, s1, s2;
3277
3278   CHECK_CHAR_COERCE_INT (character);
3279   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3280   if (EQ (charset, Vcharset_japanese_jisx0208))
3281     {
3282       ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
3283       return Fcons (make_int (s1), make_int (s2));
3284     }
3285   else
3286     return Qnil;
3287 }
3288
3289 \f
3290 /************************************************************************/
3291 /*                            Big5 methods                              */
3292 /************************************************************************/
3293
3294 /* BIG5 is a coding system encoding two character sets: ASCII and
3295    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3296    character set and is encoded in two-byte.
3297
3298    --- CODE RANGE of BIG5 ---
3299    (character set)      (range)
3300    ASCII                0x00 .. 0x7F
3301    Big5 (1st byte)      0xA1 .. 0xFE
3302         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3303    --------------------------
3304
3305    Since the number of characters in Big5 is larger than maximum
3306    characters in Emacs' charset (96x96), it can't be handled as one
3307    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
3308    and `charset-big5-2'.  Both <type>s are DIMENSION2_CHARS94.  The former
3309    contains frequently used characters and the latter contains less
3310    frequently used characters.  */
3311
3312 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3313   ((c) >= 0xA1 && (c) <= 0xFE)
3314
3315 /* Is this the second byte of a Shift-JIS two-byte char? */
3316
3317 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3318   (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3319
3320 /* Number of Big5 characters which have the same code in 1st byte.  */
3321
3322 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3323
3324 /* Code conversion macros.  These are macros because they are used in
3325    inner loops during code conversion.
3326
3327    Note that temporary variables in macros introduce the classic
3328    dynamic-scoping problems with variable names.  We use capital-
3329    lettered variables in the assumption that XEmacs does not use
3330    capital letters in variables except in a very formalized way
3331    (e.g. Qstring). */
3332
3333 /* Convert Big5 code (b1, b2) into its internal string representation
3334    (lb, c1, c2). */
3335
3336 /* There is a much simpler way to split the Big5 charset into two.
3337    For the moment I'm going to leave the algorithm as-is because it
3338    claims to separate out the most-used characters into a single
3339    charset, which perhaps will lead to optimizations in various
3340    places.
3341
3342    The way the algorithm works is something like this:
3343
3344    Big5 can be viewed as a 94x157 charset, where the row is
3345    encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3346    into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE.  As for frequency,
3347    the split between low and high column numbers is apparently
3348    meaningless; ascending rows produce less and less frequent chars.
3349    Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3350    the first charset, and the upper half (0xC9 .. 0xFE) to the
3351    second.  To do the conversion, we convert the character into
3352    a single number where 0 .. 156 is the first row, 157 .. 313
3353    is the second, etc.  That way, the characters are ordered by
3354    decreasing frequency.  Then we just chop the space in two
3355    and coerce the result into a 94x94 space.
3356    */
3357
3358 #define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
3359 {                                                                       \
3360   int B1 = b1, B2 = b2;                                                 \
3361   unsigned int I                                                        \
3362     = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
3363                                                                         \
3364   if (B1 < 0xC9)                                                        \
3365     {                                                                   \
3366       lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
3367     }                                                                   \
3368   else                                                                  \
3369     {                                                                   \
3370       lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
3371       I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
3372     }                                                                   \
3373   c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
3374   c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
3375 } while (0)
3376
3377 /* Convert the internal string representation of a Big5 character
3378    (lb, c1, c2) into Big5 code (b1, b2). */
3379
3380 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
3381 {                                                                       \
3382   unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);       \
3383                                                                         \
3384   if (lb == LEADING_BYTE_CHINESE_BIG5_2)                                \
3385     {                                                                   \
3386       I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
3387     }                                                                   \
3388   b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
3389   b2 = I % BIG5_SAME_ROW;                                               \
3390   b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
3391 } while (0)
3392
3393 static int
3394 detect_coding_big5 (struct detection_state *st, const Extbyte *src, size_t n)
3395 {
3396   while (n--)
3397     {
3398       unsigned char c = *(unsigned char *)src++;
3399       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3400           (c >= 0x80 && c <= 0xA0))
3401         return 0;
3402       if (st->big5.in_second_byte)
3403         {
3404           st->big5.in_second_byte = 0;
3405           if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3406             return 0;
3407         }
3408       else if (c >= 0xA1)
3409         st->big5.in_second_byte = 1;
3410     }
3411   return CODING_CATEGORY_BIG5_MASK;
3412 }
3413
3414 /* Convert Big5 data to internal format. */
3415
3416 static void
3417 decode_coding_big5 (Lstream *decoding, const Extbyte *src,
3418                     unsigned_char_dynarr *dst, size_t n)
3419 {
3420   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3421   unsigned int flags  = str->flags;
3422   unsigned int ch     = str->ch;
3423   eol_type_t eol_type = str->eol_type;
3424
3425   while (n--)
3426     {
3427       unsigned char c = *(unsigned char *)src++;
3428       if (ch)
3429         {
3430           /* Previous character was first byte of Big5 char. */
3431           if (BYTE_BIG5_TWO_BYTE_2_P (c))
3432             {
3433               unsigned char b1, b2, b3;
3434               DECODE_BIG5 (ch, c, b1, b2, b3);
3435               Dynarr_add (dst, b1);
3436               Dynarr_add (dst, b2);
3437               Dynarr_add (dst, b3);
3438             }
3439           else
3440             {
3441               DECODE_ADD_BINARY_CHAR (ch, dst);
3442               DECODE_ADD_BINARY_CHAR (c, dst);
3443             }
3444           ch = 0;
3445         }
3446       else
3447         {
3448           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3449           if (BYTE_BIG5_TWO_BYTE_1_P (c))
3450             ch = c;
3451           else
3452             DECODE_ADD_BINARY_CHAR (c, dst);
3453         }
3454     label_continue_loop:;
3455     }
3456
3457   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3458
3459   str->flags = flags;
3460   str->ch    = ch;
3461 }
3462
3463 /* Convert internally-formatted data to Big5. */
3464
3465 static void
3466 encode_coding_big5 (Lstream *encoding, const Bufbyte *src,
3467                     unsigned_char_dynarr *dst, size_t n)
3468 {
3469 #ifndef UTF2000
3470   unsigned char c;
3471   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3472   unsigned int flags  = str->flags;
3473   unsigned int ch     = str->ch;
3474   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3475
3476   while (n--)
3477     {
3478       c = *src++;
3479       if (c == '\n')
3480         {
3481           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3482             Dynarr_add (dst, '\r');
3483           if (eol_type != EOL_CR)
3484             Dynarr_add (dst, '\n');
3485         }
3486       else if (BYTE_ASCII_P (c))
3487         {
3488           /* ASCII. */
3489           Dynarr_add (dst, c);
3490         }
3491       else if (BUFBYTE_LEADING_BYTE_P (c))
3492         {
3493           if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3494               c == LEADING_BYTE_CHINESE_BIG5_2)
3495             {
3496               /* A recognized leading byte. */
3497               ch = c;
3498               continue; /* not done with this character. */
3499             }
3500           /* otherwise just ignore this character. */
3501         }
3502       else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3503                ch == LEADING_BYTE_CHINESE_BIG5_2)
3504         {
3505           /* Previous char was a recognized leading byte. */
3506           ch = (ch << 8) | c;
3507           continue; /* not done with this character. */
3508         }
3509       else if (ch)
3510         {
3511           /* Encountering second byte of a Big5 character. */
3512           unsigned char b1, b2;
3513
3514           ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3515           Dynarr_add (dst, b1);
3516           Dynarr_add (dst, b2);
3517         }
3518
3519       ch = 0;
3520     }
3521
3522   str->flags = flags;
3523   str->ch    = ch;
3524 #endif
3525 }
3526
3527
3528 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3529 Decode a Big5 character CODE of BIG5 coding-system.
3530 CODE is the character code in BIG5, a cons of two integers.
3531 Return the corresponding character.
3532 */
3533        (code))
3534 {
3535   unsigned char c1, c2, b1, b2;
3536
3537   CHECK_CONS (code);
3538   CHECK_INT (XCAR (code));
3539   CHECK_INT (XCDR (code));
3540   b1 = XINT (XCAR (code));
3541   b2 = XINT (XCDR (code));
3542   if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3543       BYTE_BIG5_TWO_BYTE_2_P (b2))
3544     {
3545       Charset_ID leading_byte;
3546       Lisp_Object charset;
3547       DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3548       charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3549       return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3550     }
3551   else
3552     return Qnil;
3553 }
3554
3555 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3556 Encode the Big5 character CHARACTER in the BIG5 coding-system.
3557 Return the corresponding character code in Big5.
3558 */
3559        (character))
3560 {
3561   Lisp_Object charset;
3562   int c1, c2, b1, b2;
3563
3564   CHECK_CHAR_COERCE_INT (character);
3565   BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3566   if (EQ (charset, Vcharset_chinese_big5_1) ||
3567       EQ (charset, Vcharset_chinese_big5_2))
3568     {
3569       ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3570                    b1, b2);
3571       return Fcons (make_int (b1), make_int (b2));
3572     }
3573   else
3574     return Qnil;
3575 }
3576
3577 \f
3578 /************************************************************************/
3579 /*                           UCS-4 methods                              */
3580 /*                                                                      */
3581 /*  UCS-4 character codes are implemented as nonnegative integers.      */
3582 /*                                                                      */
3583 /************************************************************************/
3584
3585 #ifndef UTF2000
3586 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3587 Map UCS-4 code CODE to Mule character CHARACTER.
3588
3589 Return T on success, NIL on failure.
3590 */
3591        (code, character))
3592 {
3593   size_t c;
3594
3595   CHECK_CHAR (character);
3596   CHECK_NATNUM (code);
3597   c = XINT (code);
3598
3599   if (c < countof (fcd->ucs_to_mule_table))
3600     {
3601       fcd->ucs_to_mule_table[c] = character;
3602       return Qt;
3603     }
3604   else
3605     return Qnil;
3606 }
3607
3608 static Lisp_Object
3609 ucs_to_char (unsigned long code)
3610 {
3611   if (code < countof (fcd->ucs_to_mule_table))
3612     {
3613       return fcd->ucs_to_mule_table[code];
3614     }
3615   else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3616     {
3617       unsigned int c;
3618
3619       code -= 0xe00000;
3620       c = code % (94 * 94);
3621       return make_char
3622         (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3623                     (94, 2, code / (94 * 94) + '@',
3624                      CHARSET_LEFT_TO_RIGHT),
3625                     c / 94 + 33, c % 94 + 33));
3626     }
3627   else
3628     return Qnil;
3629 }
3630
3631 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3632 Return Mule character corresponding to UCS code CODE (a positive integer).
3633 */
3634        (code))
3635 {
3636   CHECK_NATNUM (code);
3637   return ucs_to_char (XINT (code));
3638 }
3639
3640 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3641 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3642 */
3643        (character, code))
3644 {
3645   /* #### Isn't this gilding the lily?  Fput_char_table checks its args.
3646           Fset_char_ucs is more restrictive on index arg, but should
3647           check code arg in a char_table method. */
3648   CHECK_CHAR (character);
3649   CHECK_NATNUM (code);
3650   return Fput_char_table (character, code, mule_to_ucs_table);
3651 }
3652
3653 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3654 Return the UCS code (a positive integer) corresponding to CHARACTER.
3655 */
3656        (character))
3657 {
3658   return Fget_char_table (character, mule_to_ucs_table);
3659 }
3660 #endif
3661
3662 #ifdef UTF2000
3663 #define decode_ucs4 DECODE_ADD_UCS_CHAR
3664 #else
3665 /* Decode a UCS-4 character into a buffer.  If the lookup fails, use
3666    <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3667    is not found, instead.
3668    #### do something more appropriate (use blob?)
3669         Danger, Will Robinson!  Data loss.  Should we signal user? */
3670 static void
3671 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3672 {
3673   Lisp_Object chr = ucs_to_char (ch);
3674
3675   if (! NILP (chr))
3676     {
3677       Bufbyte work[MAX_EMCHAR_LEN];
3678       int len;
3679
3680       ch = XCHAR (chr);
3681       len = (ch < 128) ?
3682         simple_set_charptr_emchar (work, ch) :
3683         non_ascii_set_charptr_emchar (work, ch);
3684       Dynarr_add_many (dst, work, len);
3685     }
3686   else
3687     {
3688       Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3689       Dynarr_add (dst, 34 + 128);
3690       Dynarr_add (dst, 46 + 128);
3691     }
3692 }
3693 #endif
3694
3695 #ifndef UTF2000
3696 static unsigned long
3697 mule_char_to_ucs4 (Lisp_Object charset,
3698                    unsigned char h, unsigned char l)
3699 {
3700   Lisp_Object code
3701     = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3702                        mule_to_ucs_table);
3703
3704   if (INTP (code))
3705     {
3706       return XINT (code);
3707     }
3708   else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3709             (XCHARSET_CHARS (charset) == 94) )
3710     {
3711       unsigned char final = XCHARSET_FINAL (charset);
3712
3713       if ( ('@' <= final) && (final < 0x7f) )
3714         {
3715           return 0xe00000 + (final - '@') * 94 * 94
3716             + ((h & 127) - 33) * 94 + (l & 127) - 33;
3717         }
3718       else
3719         {
3720           return '?';
3721         }
3722     }
3723   else
3724     {
3725       return '?';
3726     }
3727 }
3728
3729 static void
3730 encode_ucs4 (Lisp_Object charset,
3731              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3732 {
3733   unsigned long code = mule_char_to_ucs4 (charset, h, l);
3734   Dynarr_add (dst,  code >> 24);
3735   Dynarr_add (dst, (code >> 16) & 255);
3736   Dynarr_add (dst, (code >>  8) & 255);
3737   Dynarr_add (dst,  code        & 255);
3738 }
3739 #endif
3740
3741 static int
3742 detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, size_t n)
3743 {
3744   while (n--)
3745     {
3746       unsigned char c = *(unsigned char *)src++;
3747       switch (st->ucs4.in_byte)
3748         {
3749         case 0:
3750           if (c >= 128)
3751             return 0;
3752           else
3753             st->ucs4.in_byte++;
3754           break;
3755         case 3:
3756           st->ucs4.in_byte = 0;
3757           break;
3758         default:
3759           st->ucs4.in_byte++;
3760         }
3761     }
3762   return CODING_CATEGORY_UCS4_MASK;
3763 }
3764
3765 static void
3766 decode_coding_ucs4 (Lstream *decoding, const Extbyte *src,
3767                     unsigned_char_dynarr *dst, size_t n)
3768 {
3769   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3770   unsigned int flags = str->flags;
3771   unsigned int ch    = str->ch;
3772   unsigned char counter = str->counter;
3773
3774   while (n--)
3775     {
3776       unsigned char c = *(unsigned char *)src++;
3777       switch (counter)
3778         {
3779         case 0:
3780           ch = c;
3781           counter = 3;
3782           break;
3783         case 1:
3784           decode_ucs4 ( ( ch << 8 ) | c, dst);
3785           ch = 0;
3786           counter = 0;
3787           break;
3788         default:
3789           ch = ( ch << 8 ) | c;
3790           counter--;
3791         }
3792     }
3793   if (counter & CODING_STATE_END)
3794     DECODE_OUTPUT_PARTIAL_CHAR (ch);
3795
3796   str->flags = flags;
3797   str->ch    = ch;
3798   str->counter = counter;
3799 }
3800
3801 static void
3802 encode_coding_ucs4 (Lstream *encoding, const Bufbyte *src,
3803                     unsigned_char_dynarr *dst, size_t n)
3804 {
3805 #ifndef UTF2000
3806   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3807   unsigned int flags = str->flags;
3808   unsigned int ch = str->ch;
3809   unsigned char char_boundary = str->iso2022.current_char_boundary;
3810   Lisp_Object charset = str->iso2022.current_charset;
3811
3812 #ifdef ENABLE_COMPOSITE_CHARS
3813   /* flags for handling composite chars.  We do a little switcharoo
3814      on the source while we're outputting the composite char. */
3815   unsigned int saved_n = 0;
3816   const unsigned char *saved_src = NULL;
3817   int in_composite = 0;
3818
3819  back_to_square_n:
3820 #endif
3821
3822   while (n--)
3823     {
3824       unsigned char c = *src++;
3825
3826       if (BYTE_ASCII_P (c))
3827         {               /* Processing ASCII character */
3828           ch = 0;
3829           encode_ucs4 (Vcharset_ascii, c, 0, dst);
3830           char_boundary = 1;
3831         }
3832       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3833         { /* Processing Leading Byte */
3834           ch = 0;
3835           charset = CHARSET_BY_LEADING_BYTE (c);
3836           if (LEADING_BYTE_PREFIX_P(c))
3837             ch = c;
3838           char_boundary = 0;
3839         }
3840       else
3841         {                       /* Processing Non-ASCII character */
3842           char_boundary = 1;
3843           if (EQ (charset, Vcharset_control_1))
3844             {
3845               encode_ucs4 (Vcharset_control_1, c, 0, dst);
3846             }
3847           else
3848             {
3849               switch (XCHARSET_REP_BYTES (charset))
3850                 {
3851                 case 2:
3852                   encode_ucs4 (charset, c, 0, dst);
3853                   break;
3854                 case 3:
3855                   if (XCHARSET_PRIVATE_P (charset))
3856                     {
3857                       encode_ucs4 (charset, c, 0, dst);
3858                       ch = 0;
3859                     }
3860                   else if (ch)
3861                     {
3862 #ifdef ENABLE_COMPOSITE_CHARS
3863                       if (EQ (charset, Vcharset_composite))
3864                         {
3865                           if (in_composite)
3866                             {
3867                               /* #### Bother! We don't know how to
3868                                  handle this yet. */
3869                               Dynarr_add (dst, '\0');
3870                               Dynarr_add (dst, '\0');
3871                               Dynarr_add (dst, '\0');
3872                               Dynarr_add (dst, '~');
3873                             }
3874                           else
3875                             {
3876                               Emchar emch = MAKE_CHAR (Vcharset_composite,
3877                                                        ch & 0x7F, c & 0x7F);
3878                               Lisp_Object lstr = composite_char_string (emch);
3879                               saved_n = n;
3880                               saved_src = src;
3881                               in_composite = 1;
3882                               src = XSTRING_DATA   (lstr);
3883                               n   = XSTRING_LENGTH (lstr);
3884                             }
3885                         }
3886                       else
3887 #endif /* ENABLE_COMPOSITE_CHARS */
3888                         {
3889                           encode_ucs4(charset, ch, c, dst);
3890                         }
3891                       ch = 0;
3892                     }
3893                   else
3894                     {
3895                       ch = c;
3896                       char_boundary = 0;
3897                     }
3898                   break;
3899                 case 4:
3900                   if (ch)
3901                     {
3902                       encode_ucs4 (charset, ch, c, dst);
3903                       ch = 0;
3904                     }
3905                   else
3906                     {
3907                       ch = c;
3908                       char_boundary = 0;
3909                     }
3910                   break;
3911                 default:
3912                   abort ();
3913                 }
3914             }
3915         }
3916     }
3917
3918 #ifdef ENABLE_COMPOSITE_CHARS
3919   if (in_composite)
3920     {
3921       n = saved_n;
3922       src = saved_src;
3923       in_composite = 0;
3924       goto back_to_square_n; /* Wheeeeeeeee ..... */
3925     }
3926 #endif /* ENABLE_COMPOSITE_CHARS */
3927
3928   str->flags = flags;
3929   str->ch = ch;
3930   str->iso2022.current_char_boundary = char_boundary;
3931   str->iso2022.current_charset = charset;
3932
3933   /* Verbum caro factum est! */
3934 #endif
3935 }
3936
3937 \f
3938 /************************************************************************/
3939 /*                           UTF-8 methods                              */
3940 /************************************************************************/
3941
3942 static int
3943 detect_coding_utf8 (struct detection_state *st, const Extbyte *src, size_t n)
3944 {
3945   while (n--)
3946     {
3947       unsigned char c = *(unsigned char *)src++;
3948       switch (st->utf8.in_byte)
3949         {
3950         case 0:
3951           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3952             return 0;
3953           else if (c >= 0xfc)
3954             st->utf8.in_byte = 5;
3955           else if (c >= 0xf8)
3956             st->utf8.in_byte = 4;
3957           else if (c >= 0xf0)
3958             st->utf8.in_byte = 3;
3959           else if (c >= 0xe0)
3960             st->utf8.in_byte = 2;
3961           else if (c >= 0xc0)
3962             st->utf8.in_byte = 1;
3963           else if (c >= 0x80)
3964             return 0;
3965           break;
3966         default:
3967           if ((c & 0xc0) != 0x80)
3968             return 0;
3969           else
3970             st->utf8.in_byte--;
3971         }
3972     }
3973   return CODING_CATEGORY_UTF8_MASK;
3974 }
3975
3976 static void
3977 decode_coding_utf8 (Lstream *decoding, const Extbyte *src,
3978                     unsigned_char_dynarr *dst, size_t n)
3979 {
3980   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3981   unsigned int flags  = str->flags;
3982   unsigned int ch     = str->ch;
3983   eol_type_t eol_type = str->eol_type;
3984   unsigned char counter = str->counter;
3985
3986   while (n--)
3987     {
3988       unsigned char c = *(unsigned char *)src++;
3989       switch (counter)
3990         {
3991         case 0:
3992           if ( c >= 0xfc )
3993             {
3994               ch = c & 0x01;
3995               counter = 5;
3996             }
3997           else if ( c >= 0xf8 )
3998             {
3999               ch = c & 0x03;
4000               counter = 4;
4001             }
4002           else if ( c >= 0xf0 )
4003             {
4004               ch = c & 0x07;
4005               counter = 3;
4006             }
4007           else if ( c >= 0xe0 )
4008             {
4009               ch = c & 0x0f;
4010               counter = 2;
4011             }
4012           else if ( c >= 0xc0 )
4013             {
4014               ch = c & 0x1f;
4015               counter = 1;
4016             }
4017           else
4018             {
4019               DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4020               decode_ucs4 (c, dst);
4021             }
4022           break;
4023         case 1:
4024           ch = ( ch << 6 ) | ( c & 0x3f );
4025           decode_ucs4 (ch, dst);
4026           ch = 0;
4027           counter = 0;
4028           break;
4029         default:
4030           ch = ( ch << 6 ) | ( c & 0x3f );
4031           counter--;
4032         }
4033     label_continue_loop:;
4034     }
4035
4036   if (flags & CODING_STATE_END)
4037     DECODE_OUTPUT_PARTIAL_CHAR (ch);
4038
4039   str->flags = flags;
4040   str->ch    = ch;
4041   str->counter = counter;
4042 }
4043
4044 #ifndef UTF2000
4045 static void
4046 encode_utf8 (Lisp_Object charset,
4047              unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
4048 {
4049   unsigned long code = mule_char_to_ucs4 (charset, h, l);
4050   if ( code <= 0x7f )
4051     {
4052       Dynarr_add (dst, code);
4053     }
4054   else if ( code <= 0x7ff )
4055     {
4056       Dynarr_add (dst, (code >> 6) | 0xc0);
4057       Dynarr_add (dst, (code & 0x3f) | 0x80);
4058     }
4059   else if ( code <= 0xffff )
4060     {
4061       Dynarr_add (dst,  (code >> 12) | 0xe0);
4062       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4063       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4064     }
4065   else if ( code <= 0x1fffff )
4066     {
4067       Dynarr_add (dst,  (code >> 18) | 0xf0);
4068       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4069       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4070       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4071     }
4072   else if ( code <= 0x3ffffff )
4073     {
4074       Dynarr_add (dst,  (code >> 24) | 0xf8);
4075       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4076       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4077       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4078       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4079     }
4080   else
4081     {
4082       Dynarr_add (dst,  (code >> 30) | 0xfc);
4083       Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
4084       Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
4085       Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
4086       Dynarr_add (dst, ((code >>  6) & 0x3f) | 0x80);
4087       Dynarr_add (dst,  (code        & 0x3f) | 0x80);
4088     }
4089 }
4090 #endif
4091
4092 static void
4093 encode_coding_utf8 (Lstream *encoding, const Bufbyte *src,
4094                     unsigned_char_dynarr *dst, size_t n)
4095 {
4096   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4097   unsigned int flags  = str->flags;
4098   unsigned int ch     = str->ch;
4099   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4100   unsigned char char_boundary = str->iso2022.current_char_boundary;
4101 #ifdef UTF2000
4102
4103   while (n--)
4104     {
4105       unsigned char c = *src++;
4106       switch (char_boundary)
4107         {
4108         case 0:
4109           if ( c >= 0xfc )
4110             {
4111               Dynarr_add (dst, c);
4112               char_boundary = 5;
4113             }
4114           else if ( c >= 0xf8 )
4115             {
4116               Dynarr_add (dst, c);
4117               char_boundary = 4;
4118             }
4119           else if ( c >= 0xf0 )
4120             {
4121               Dynarr_add (dst, c);
4122               char_boundary = 3;
4123             }
4124           else if ( c >= 0xe0 )
4125             {
4126               Dynarr_add (dst, c);
4127               char_boundary = 2;
4128             }
4129           else if ( c >= 0xc0 )
4130             {
4131               Dynarr_add (dst, c);
4132               char_boundary = 1;
4133             }
4134           else
4135             {
4136               if (c == '\n')
4137                 {
4138                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4139                     Dynarr_add (dst, '\r');
4140                   if (eol_type != EOL_CR)
4141                     Dynarr_add (dst, c);
4142                 }
4143               else
4144                 Dynarr_add (dst, c);
4145               char_boundary = 0;
4146             }
4147           break;
4148         case 1:
4149           Dynarr_add (dst, c);
4150           char_boundary = 0;
4151           break;
4152         default:
4153           Dynarr_add (dst, c);
4154           char_boundary--;
4155         }
4156     }
4157 #else /* not UTF2000 */
4158   Lisp_Object charset = str->iso2022.current_charset;
4159
4160 #ifdef ENABLE_COMPOSITE_CHARS
4161   /* flags for handling composite chars.  We do a little switcharoo
4162      on the source while we're outputting the composite char. */
4163   unsigned int saved_n = 0;
4164   const unsigned char *saved_src = NULL;
4165   int in_composite = 0;
4166
4167  back_to_square_n:
4168 #endif /* ENABLE_COMPOSITE_CHARS */
4169
4170   while (n--)
4171     {
4172       unsigned char c = *src++;
4173
4174       if (BYTE_ASCII_P (c))
4175         {               /* Processing ASCII character */
4176           ch = 0;
4177           if (c == '\n')
4178             {
4179               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4180                 Dynarr_add (dst, '\r');
4181               if (eol_type != EOL_CR)
4182                 Dynarr_add (dst, c);
4183             }
4184           else
4185             encode_utf8 (Vcharset_ascii, c, 0, dst);
4186           char_boundary = 1;
4187         }
4188       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4189         { /* Processing Leading Byte */
4190           ch = 0;
4191           charset = CHARSET_BY_LEADING_BYTE (c);
4192           if (LEADING_BYTE_PREFIX_P(c))
4193             ch = c;
4194           char_boundary = 0;
4195         }
4196       else
4197         {                       /* Processing Non-ASCII character */
4198           char_boundary = 1;
4199           if (EQ (charset, Vcharset_control_1))
4200             {
4201               encode_utf8 (Vcharset_control_1, c, 0, dst);
4202             }
4203           else
4204             {
4205               switch (XCHARSET_REP_BYTES (charset))
4206                 {
4207                 case 2:
4208                   encode_utf8 (charset, c, 0, dst);
4209                   break;
4210                 case 3:
4211                   if (XCHARSET_PRIVATE_P (charset))
4212                     {
4213                       encode_utf8 (charset, c, 0, dst);
4214                       ch = 0;
4215                     }
4216                   else if (ch)
4217                     {
4218 #ifdef ENABLE_COMPOSITE_CHARS
4219                       if (EQ (charset, Vcharset_composite))
4220                         {
4221                           if (in_composite)
4222                             {
4223                               /* #### Bother! We don't know how to
4224                                  handle this yet. */
4225                               encode_utf8 (Vcharset_ascii, '~', 0, dst);
4226                             }
4227                           else
4228                             {
4229                               Emchar emch = MAKE_CHAR (Vcharset_composite,
4230                                                        ch & 0x7F, c & 0x7F);
4231                               Lisp_Object lstr = composite_char_string (emch);
4232                               saved_n = n;
4233                               saved_src = src;
4234                               in_composite = 1;
4235                               src = XSTRING_DATA   (lstr);
4236                               n   = XSTRING_LENGTH (lstr);
4237                             }
4238                         }
4239                       else
4240 #endif /* ENABLE_COMPOSITE_CHARS */
4241                         {
4242                           encode_utf8 (charset, ch, c, dst);
4243                         }
4244                       ch = 0;
4245                     }
4246                   else
4247                     {
4248                       ch = c;
4249                       char_boundary = 0;
4250                     }
4251                   break;
4252                 case 4:
4253                   if (ch)
4254                     {
4255                       encode_utf8 (charset, ch, c, dst);
4256                       ch = 0;
4257                     }
4258                   else
4259                     {
4260                       ch = c;
4261                       char_boundary = 0;
4262                     }
4263                   break;
4264                 default:
4265                   abort ();
4266                 }
4267             }
4268         }
4269     }
4270
4271 #ifdef ENABLE_COMPOSITE_CHARS
4272   if (in_composite)
4273     {
4274       n = saved_n;
4275       src = saved_src;
4276       in_composite = 0;
4277       goto back_to_square_n; /* Wheeeeeeeee ..... */
4278     }
4279 #endif
4280
4281 #endif /* not UTF2000 */
4282   str->flags = flags;
4283   str->ch    = ch;
4284   str->iso2022.current_char_boundary = char_boundary;
4285 #ifndef UTF2000
4286   str->iso2022.current_charset = charset;
4287 #endif
4288
4289   /* Verbum caro factum est! */
4290 }
4291
4292 \f
4293 /************************************************************************/
4294 /*                           ISO2022 methods                            */
4295 /************************************************************************/
4296
4297 /* The following note describes the coding system ISO2022 briefly.
4298    Since the intention of this note is to help understand the
4299    functions in this file, some parts are NOT ACCURATE or OVERLY
4300    SIMPLIFIED.  For thorough understanding, please refer to the
4301    original document of ISO2022.
4302
4303    ISO2022 provides many mechanisms to encode several character sets
4304    in 7-bit and 8-bit environments.  For 7-bit environments, all text
4305    is encoded using bytes less than 128.  This may make the encoded
4306    text a little bit longer, but the text passes more easily through
4307    several gateways, some of which strip off MSB (Most Signigant Bit).
4308
4309    There are two kinds of character sets: control character set and
4310    graphic character set.  The former contains control characters such
4311    as `newline' and `escape' to provide control functions (control
4312    functions are also provided by escape sequences).  The latter
4313    contains graphic characters such as 'A' and '-'.  Emacs recognizes
4314    two control character sets and many graphic character sets.
4315
4316    Graphic character sets are classified into one of the following
4317    four classes, according to the number of bytes (DIMENSION) and
4318    number of characters in one dimension (CHARS) of the set:
4319    - DIMENSION1_CHARS94
4320    - DIMENSION1_CHARS96
4321    - DIMENSION2_CHARS94
4322    - DIMENSION2_CHARS96
4323
4324    In addition, each character set is assigned an identification tag,
4325    unique for each set, called "final character" (denoted as <F>
4326    hereafter).  The <F> of each character set is decided by ECMA(*)
4327    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
4328    (0x30..0x3F are for private use only).
4329
4330    Note (*): ECMA = European Computer Manufacturers Association
4331
4332    Here are examples of graphic character set [NAME(<F>)]:
4333         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
4334         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
4335         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
4336         o DIMENSION2_CHARS96 -- none for the moment
4337
4338    A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4339         C0 [0x00..0x1F] -- control character plane 0
4340         GL [0x20..0x7F] -- graphic character plane 0
4341         C1 [0x80..0x9F] -- control character plane 1
4342         GR [0xA0..0xFF] -- graphic character plane 1
4343
4344    A control character set is directly designated and invoked to C0 or
4345    C1 by an escape sequence.  The most common case is that:
4346    - ISO646's  control character set is designated/invoked to C0, and
4347    - ISO6429's control character set is designated/invoked to C1,
4348    and usually these designations/invocations are omitted in encoded
4349    text.  In a 7-bit environment, only C0 can be used, and a control
4350    character for C1 is encoded by an appropriate escape sequence to
4351    fit into the environment.  All control characters for C1 are
4352    defined to have corresponding escape sequences.
4353
4354    A graphic character set is at first designated to one of four
4355    graphic registers (G0 through G3), then these graphic registers are
4356    invoked to GL or GR.  These designations and invocations can be
4357    done independently.  The most common case is that G0 is invoked to
4358    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
4359    these invocations and designations are omitted in encoded text.
4360    In a 7-bit environment, only GL can be used.
4361
4362    When a graphic character set of CHARS94 is invoked to GL, codes
4363    0x20 and 0x7F of the GL area work as control characters SPACE and
4364    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
4365    be used.
4366
4367    There are two ways of invocation: locking-shift and single-shift.
4368    With locking-shift, the invocation lasts until the next different
4369    invocation, whereas with single-shift, the invocation affects the
4370    following character only and doesn't affect the locking-shift
4371    state.  Invocations are done by the following control characters or
4372    escape sequences:
4373
4374    ----------------------------------------------------------------------
4375    abbrev  function                  cntrl escape seq   description
4376    ----------------------------------------------------------------------
4377    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
4378    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
4379    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
4380    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
4381    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
4382    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
4383    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
4384    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
4385    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
4386    ----------------------------------------------------------------------
4387    (*) These are not used by any known coding system.
4388
4389    Control characters for these functions are defined by macros
4390    ISO_CODE_XXX in `coding.h'.
4391
4392    Designations are done by the following escape sequences:
4393    ----------------------------------------------------------------------
4394    escape sequence      description
4395    ----------------------------------------------------------------------
4396    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
4397    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
4398    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
4399    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
4400    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
4401    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
4402    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
4403    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
4404    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
4405    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
4406    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
4407    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
4408    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
4409    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
4410    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
4411    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
4412    ----------------------------------------------------------------------
4413
4414    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4415    of dimension 1, chars 94, and final character <F>, etc...
4416
4417    Note (*): Although these designations are not allowed in ISO2022,
4418    Emacs accepts them on decoding, and produces them on encoding
4419    CHARS96 character sets in a coding system which is characterized as
4420    7-bit environment, non-locking-shift, and non-single-shift.
4421
4422    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4423    '(' can be omitted.  We refer to this as "short-form" hereafter.
4424
4425    Now you may notice that there are a lot of ways for encoding the
4426    same multilingual text in ISO2022.  Actually, there exist many
4427    coding systems such as Compound Text (used in X11's inter client
4428    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4429    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4430    localized platforms), and all of these are variants of ISO2022.
4431
4432    In addition to the above, Emacs handles two more kinds of escape
4433    sequences: ISO6429's direction specification and Emacs' private
4434    sequence for specifying character composition.
4435
4436    ISO6429's direction specification takes the following form:
4437         o CSI ']'      -- end of the current direction
4438         o CSI '0' ']'  -- end of the current direction
4439         o CSI '1' ']'  -- start of left-to-right text
4440         o CSI '2' ']'  -- start of right-to-left text
4441    The control character CSI (0x9B: control sequence introducer) is
4442    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4443
4444    Character composition specification takes the following form:
4445         o ESC '0' -- start character composition
4446         o ESC '1' -- end character composition
4447    Since these are not standard escape sequences of any ISO standard,
4448    their use with these meanings is restricted to Emacs only.  */
4449
4450 static void
4451 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4452 {
4453   int i;
4454
4455   for (i = 0; i < 4; i++)
4456     {
4457       if (!NILP (coding_system))
4458         iso->charset[i] =
4459           XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4460       else
4461         iso->charset[i] = Qt;
4462       iso->invalid_designated[i] = 0;
4463     }
4464   iso->esc = ISO_ESC_NOTHING;
4465   iso->esc_bytes_index = 0;
4466   iso->register_left = 0;
4467   iso->register_right = 1;
4468   iso->switched_dir_and_no_valid_charset_yet = 0;
4469   iso->invalid_switch_dir = 0;
4470   iso->output_direction_sequence = 0;
4471   iso->output_literally = 0;
4472 #ifdef ENABLE_COMPOSITE_CHARS
4473   if (iso->composite_chars)
4474     Dynarr_reset (iso->composite_chars);
4475 #endif
4476 }
4477
4478 static int
4479 fit_to_be_escape_quoted (unsigned char c)
4480 {
4481   switch (c)
4482     {
4483     case ISO_CODE_ESC:
4484     case ISO_CODE_CSI:
4485     case ISO_CODE_SS2:
4486     case ISO_CODE_SS3:
4487     case ISO_CODE_SO:
4488     case ISO_CODE_SI:
4489       return 1;
4490
4491     default:
4492       return 0;
4493     }
4494 }
4495
4496 /* Parse one byte of an ISO2022 escape sequence.
4497    If the result is an invalid escape sequence, return 0 and
4498    do not change anything in STR.  Otherwise, if the result is
4499    an incomplete escape sequence, update ISO2022.ESC and
4500    ISO2022.ESC_BYTES and return -1.  Otherwise, update
4501    all the state variables (but not ISO2022.ESC_BYTES) and
4502    return 1.
4503
4504    If CHECK_INVALID_CHARSETS is non-zero, check for designation
4505    or invocation of an invalid character set and treat that as
4506    an unrecognized escape sequence.
4507
4508    ********************************************************************
4509
4510    #### Strategies for error annotation and coding orthogonalization
4511
4512    We really want to separate out a number of things.  Conceptually,
4513    there is a nested syntax.
4514
4515    At the top level is the ISO 2022 extension syntax, including charset
4516    designation and invocation, and certain auxiliary controls such as the
4517    ISO 6429 direction specification.  These are octet-oriented, with the
4518    single exception (AFAIK) of the "exit Unicode" sequence which uses the
4519    UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
4520    UTF-16, and 4 bytes for UCS-4 and UTF-32).  This will be treated as a
4521    (deprecated) special case in Unicode processing.
4522
4523    The middle layer is ISO 2022 character interpretation.  This will depend
4524    on the current state of the ISO 2022 registers, and assembles octets
4525    into the character's internal representation.
4526
4527    The lowest level is translating system control conventions.  At present
4528    this is restricted to newline translation, but one could imagine doing
4529    tab conversion or line wrapping here.  "Escape from Unicode" processing
4530    would be done at this level.
4531
4532    At each level the parser will verify the syntax.  In the case of a
4533    syntax error or warning (such as a redundant escape sequence that affects
4534    no characters), the parser will take some action, typically inserting the
4535    erroneous octets directly into the output and creating an annotation
4536    which can be used by higher level I/O to mark the affected region.
4537
4538    This should make it possible to do something sensible about separating
4539    newline convention processing from character construction, and about
4540    preventing ISO 2022 escape sequences from being recognized
4541    inappropriately.
4542
4543    The basic strategy will be to have octet classification tables, and
4544    switch processing according to the table entry.
4545
4546    It's possible that, by doing the processing with tables of functions or
4547    the like, the parser can be used for both detection and translation. */
4548
4549 static int
4550 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4551                    unsigned char c, unsigned int *flags,
4552                    int check_invalid_charsets)
4553 {
4554   /* (1) If we're at the end of a designation sequence, CS is the
4555      charset being designated and REG is the register to designate
4556      it to.
4557
4558      (2) If we're at the end of a locking-shift sequence, REG is
4559      the register to invoke and HALF (0 == left, 1 == right) is
4560      the half to invoke it into.
4561
4562      (3) If we're at the end of a single-shift sequence, REG is
4563      the register to invoke. */
4564   Lisp_Object cs = Qnil;
4565   int reg, half;
4566
4567   /* NOTE: This code does goto's all over the fucking place.
4568      The reason for this is that we're basically implementing
4569      a state machine here, and hierarchical languages like C
4570      don't really provide a clean way of doing this. */
4571
4572   if (! (*flags & CODING_STATE_ESCAPE))
4573     /* At beginning of escape sequence; we need to reset our
4574        escape-state variables. */
4575     iso->esc = ISO_ESC_NOTHING;
4576
4577   iso->output_literally = 0;
4578   iso->output_direction_sequence = 0;
4579
4580   switch (iso->esc)
4581     {
4582     case ISO_ESC_NOTHING:
4583       iso->esc_bytes_index = 0;
4584       switch (c)
4585         {
4586         case ISO_CODE_ESC:      /* Start escape sequence */
4587           *flags |= CODING_STATE_ESCAPE;
4588           iso->esc = ISO_ESC;
4589           goto not_done;
4590
4591         case ISO_CODE_CSI:      /* ISO6429 (specifying directionality) */
4592           *flags |= CODING_STATE_ESCAPE;
4593           iso->esc = ISO_ESC_5_11;
4594           goto not_done;
4595
4596         case ISO_CODE_SO:       /* locking shift 1 */
4597           reg = 1; half = 0;
4598           goto locking_shift;
4599         case ISO_CODE_SI:       /* locking shift 0 */
4600           reg = 0; half = 0;
4601           goto locking_shift;
4602
4603         case ISO_CODE_SS2:      /* single shift */
4604           reg = 2;
4605           goto single_shift;
4606         case ISO_CODE_SS3:      /* single shift */
4607           reg = 3;
4608           goto single_shift;
4609
4610         default:                        /* Other control characters */
4611           return 0;
4612         }
4613
4614     case ISO_ESC:
4615       switch (c)
4616         {
4617           /**** single shift ****/
4618
4619         case 'N':       /* single shift 2 */
4620           reg = 2;
4621           goto single_shift;
4622         case 'O':       /* single shift 3 */
4623           reg = 3;
4624           goto single_shift;
4625
4626           /**** locking shift ****/
4627
4628         case '~':       /* locking shift 1 right */
4629           reg = 1; half = 1;
4630           goto locking_shift;
4631         case 'n':       /* locking shift 2 */
4632           reg = 2; half = 0;
4633           goto locking_shift;
4634         case '}':       /* locking shift 2 right */
4635           reg = 2; half = 1;
4636           goto locking_shift;
4637         case 'o':       /* locking shift 3 */
4638           reg = 3; half = 0;
4639           goto locking_shift;
4640         case '|':       /* locking shift 3 right */
4641           reg = 3; half = 1;
4642           goto locking_shift;
4643
4644 #ifdef ENABLE_COMPOSITE_CHARS
4645           /**** composite ****/
4646
4647         case '0':
4648           iso->esc = ISO_ESC_START_COMPOSITE;
4649           *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4650             CODING_STATE_COMPOSITE;
4651           return 1;
4652
4653         case '1':
4654           iso->esc = ISO_ESC_END_COMPOSITE;
4655           *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4656             ~CODING_STATE_COMPOSITE;
4657           return 1;
4658 #endif /* ENABLE_COMPOSITE_CHARS */
4659
4660           /**** directionality ****/
4661
4662         case '[':
4663           iso->esc = ISO_ESC_5_11;
4664           goto not_done;
4665
4666           /**** designation ****/
4667
4668         case '$':       /* multibyte charset prefix */
4669           iso->esc = ISO_ESC_2_4;
4670           goto not_done;
4671
4672         default:
4673           if (0x28 <= c && c <= 0x2F)
4674             {
4675               iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4676               goto not_done;
4677             }
4678
4679           /* This function is called with CODESYS equal to nil when
4680              doing coding-system detection. */
4681           if (!NILP (codesys)
4682               && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4683               && fit_to_be_escape_quoted (c))
4684             {
4685               iso->esc = ISO_ESC_LITERAL;
4686               *flags &= CODING_STATE_ISO2022_LOCK;
4687               return 1;
4688             }
4689
4690           /* bzzzt! */
4691           return 0;
4692         }
4693
4694
4695
4696       /**** directionality ****/
4697
4698     case ISO_ESC_5_11:          /* ISO6429 direction control */
4699       if (c == ']')
4700         {
4701           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4702           goto directionality;
4703         }
4704       if      (c == '0') iso->esc = ISO_ESC_5_11_0;
4705       else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4706       else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4707       else               return 0;
4708       goto not_done;
4709
4710     case ISO_ESC_5_11_0:
4711       if (c == ']')
4712         {
4713           *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4714           goto directionality;
4715         }
4716       return 0;
4717
4718     case ISO_ESC_5_11_1:
4719       if (c == ']')
4720         {
4721           *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4722           goto directionality;
4723         }
4724       return 0;
4725
4726     case ISO_ESC_5_11_2:
4727       if (c == ']')
4728         {
4729           *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4730           goto directionality;
4731         }
4732       return 0;
4733
4734     directionality:
4735       iso->esc = ISO_ESC_DIRECTIONALITY;
4736       /* Various junk here to attempt to preserve the direction sequences
4737          literally in the text if they would otherwise be swallowed due
4738          to invalid designations that don't show up as actual charset
4739          changes in the text. */
4740       if (iso->invalid_switch_dir)
4741         {
4742           /* We already inserted a direction switch literally into the
4743              text.  We assume (#### this may not be right) that the
4744              next direction switch is the one going the other way,
4745              and we need to output that literally as well. */
4746           iso->output_literally = 1;
4747           iso->invalid_switch_dir = 0;
4748         }
4749       else
4750         {
4751           int jj;
4752
4753           /* If we are in the thrall of an invalid designation,
4754            then stick the directionality sequence literally into the
4755            output stream so it ends up in the original text again. */
4756           for (jj = 0; jj < 4; jj++)
4757             if (iso->invalid_designated[jj])
4758               break;
4759           if (jj < 4)
4760             {
4761               iso->output_literally = 1;
4762               iso->invalid_switch_dir = 1;
4763             }
4764           else
4765             /* Indicate that we haven't yet seen a valid designation,
4766                so that if a switch-dir is directly followed by an
4767                invalid designation, both get inserted literally. */
4768             iso->switched_dir_and_no_valid_charset_yet = 1;
4769         }
4770       return 1;
4771
4772
4773       /**** designation ****/
4774
4775     case ISO_ESC_2_4:
4776       if (0x28 <= c && c <= 0x2F)
4777         {
4778           iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4779           goto not_done;
4780         }
4781       if (0x40 <= c && c <= 0x42)
4782         {
4783           cs = CHARSET_BY_ATTRIBUTES (94, 2, c,
4784                                       *flags & CODING_STATE_R2L ?
4785                                       CHARSET_RIGHT_TO_LEFT :
4786                                       CHARSET_LEFT_TO_RIGHT);
4787           reg = 0;
4788           goto designated;
4789         }
4790       return 0;
4791
4792     default:
4793       {
4794         /* int type =-1; */
4795         int chars = 0;
4796         int single = 0;
4797
4798         if (c < '0' || c > '~')
4799           return 0; /* bad final byte */
4800
4801         if (iso->esc >= ISO_ESC_2_8 &&
4802             iso->esc <= ISO_ESC_2_15)
4803           {
4804             chars = (iso->esc >= ISO_ESC_2_12) ? 96 : 94;
4805             single = 1; /* single-byte */
4806             reg = (iso->esc - ISO_ESC_2_8) & 3;
4807           }
4808         else if (iso->esc >= ISO_ESC_2_4_8 &&
4809                  iso->esc <= ISO_ESC_2_4_15)
4810           {
4811             chars = (iso->esc >= ISO_ESC_2_4_12) ? 96 : 94;
4812             single = -1; /* multi-byte */
4813             reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4814           }
4815         else
4816           {
4817             /* Can this ever be reached? -slb */
4818             abort();
4819             return 0;
4820           }
4821
4822         cs = CHARSET_BY_ATTRIBUTES (chars, single, c,
4823                                     *flags & CODING_STATE_R2L ?
4824                                     CHARSET_RIGHT_TO_LEFT :
4825                                     CHARSET_LEFT_TO_RIGHT);
4826         goto designated;
4827       }
4828     }
4829
4830  not_done:
4831   iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4832   return -1;
4833
4834  single_shift:
4835   if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4836     /* can't invoke something that ain't there. */
4837     return 0;
4838   iso->esc = ISO_ESC_SINGLE_SHIFT;
4839   *flags &= CODING_STATE_ISO2022_LOCK;
4840   if (reg == 2)
4841     *flags |= CODING_STATE_SS2;
4842   else
4843     *flags |= CODING_STATE_SS3;
4844   return 1;
4845
4846  locking_shift:
4847   if (check_invalid_charsets &&
4848       !CHARSETP (iso->charset[reg]))
4849     /* can't invoke something that ain't there. */
4850     return 0;
4851   if (half)
4852     iso->register_right = reg;
4853   else
4854     iso->register_left = reg;
4855   *flags &= CODING_STATE_ISO2022_LOCK;
4856   iso->esc = ISO_ESC_LOCKING_SHIFT;
4857   return 1;
4858
4859  designated:
4860   if (NILP (cs) && check_invalid_charsets)
4861     {
4862       iso->invalid_designated[reg] = 1;
4863       iso->charset[reg] = Vcharset_ascii;
4864       iso->esc = ISO_ESC_DESIGNATE;
4865       *flags &= CODING_STATE_ISO2022_LOCK;
4866       iso->output_literally = 1;
4867       if (iso->switched_dir_and_no_valid_charset_yet)
4868         {
4869           /* We encountered a switch-direction followed by an
4870              invalid designation.  Ensure that the switch-direction
4871              gets outputted; otherwise it will probably get eaten
4872              when the text is written out again. */
4873           iso->switched_dir_and_no_valid_charset_yet = 0;
4874           iso->output_direction_sequence = 1;
4875           /* And make sure that the switch-dir going the other
4876              way gets outputted, as well. */
4877           iso->invalid_switch_dir = 1;
4878         }
4879       return 1;
4880     }
4881   /* This function is called with CODESYS equal to nil when
4882      doing coding-system detection. */
4883   if (!NILP (codesys))
4884     {
4885       charset_conversion_spec_dynarr *dyn =
4886         XCODING_SYSTEM (codesys)->iso2022.input_conv;
4887
4888       if (dyn)
4889         {
4890           int i;
4891
4892           for (i = 0; i < Dynarr_length (dyn); i++)
4893             {
4894               struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4895               if (EQ (cs, spec->from_charset))
4896                 cs = spec->to_charset;
4897             }
4898         }
4899     }
4900
4901   iso->charset[reg] = cs;
4902   iso->esc = ISO_ESC_DESIGNATE;
4903   *flags &= CODING_STATE_ISO2022_LOCK;
4904   if (iso->invalid_designated[reg])
4905     {
4906       iso->invalid_designated[reg] = 0;
4907       iso->output_literally = 1;
4908     }
4909   if (iso->switched_dir_and_no_valid_charset_yet)
4910     iso->switched_dir_and_no_valid_charset_yet = 0;
4911   return 1;
4912 }
4913
4914 static int
4915 detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, size_t n)
4916 {
4917   int mask;
4918
4919   /* #### There are serious deficiencies in the recognition mechanism
4920      here.  This needs to be much smarter if it's going to cut it.
4921      The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4922      it should be detected as Latin-1.
4923      All the ISO2022 stuff in this file should be synced up with the
4924      code from FSF Emacs-20.4, in which Mule should be more or less stable.
4925      Perhaps we should wait till R2L works in FSF Emacs? */
4926
4927   if (!st->iso2022.initted)
4928     {
4929       reset_iso2022 (Qnil, &st->iso2022.iso);
4930       st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4931                           CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4932                           CODING_CATEGORY_ISO_8_1_MASK |
4933                           CODING_CATEGORY_ISO_8_2_MASK |
4934                           CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4935       st->iso2022.flags = 0;
4936       st->iso2022.high_byte_count = 0;
4937       st->iso2022.saw_single_shift = 0;
4938       st->iso2022.initted = 1;
4939     }
4940
4941   mask = st->iso2022.mask;
4942
4943   while (n--)
4944     {
4945       unsigned char c = *(unsigned char *)src++;
4946       if (c >= 0xA0)
4947         {
4948           mask &= ~CODING_CATEGORY_ISO_7_MASK;
4949           st->iso2022.high_byte_count++;
4950         }
4951       else
4952         {
4953           if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4954             {
4955               if (st->iso2022.high_byte_count & 1)
4956                 /* odd number of high bytes; assume not iso-8-2 */
4957                 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4958             }
4959           st->iso2022.high_byte_count = 0;
4960           st->iso2022.saw_single_shift = 0;
4961           if (c > 0x80)
4962             mask &= ~CODING_CATEGORY_ISO_7_MASK;
4963         }
4964       if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4965           && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4966         { /* control chars */
4967           switch (c)
4968             {
4969               /* Allow and ignore control characters that you might
4970                  reasonably see in a text file */
4971             case '\r':
4972             case '\n':
4973             case '\t':
4974             case  7: /* bell */
4975             case  8: /* backspace */
4976             case 11: /* vertical tab */
4977             case 12: /* form feed */
4978             case 26: /* MS-DOS C-z junk */
4979             case 31: /* '^_' -- for info */
4980               goto label_continue_loop;
4981
4982             default:
4983               break;
4984             }
4985         }
4986
4987       if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4988           || BYTE_C1_P (c))
4989         {
4990           if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4991                                  &st->iso2022.flags, 0))
4992             {
4993               switch (st->iso2022.iso.esc)
4994                 {
4995                 case ISO_ESC_DESIGNATE:
4996                   mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4997                   mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4998                   break;
4999                 case ISO_ESC_LOCKING_SHIFT:
5000                   mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
5001                   goto ran_out_of_chars;
5002                 case ISO_ESC_SINGLE_SHIFT:
5003                   mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
5004                   st->iso2022.saw_single_shift = 1;
5005                   break;
5006                 default:
5007                   break;
5008                 }
5009             }
5010           else
5011             {
5012               mask = 0;
5013               goto ran_out_of_chars;
5014             }
5015         }
5016     label_continue_loop:;
5017     }
5018
5019  ran_out_of_chars:
5020
5021   return mask;
5022 }
5023
5024 static int
5025 postprocess_iso2022_mask (int mask)
5026 {
5027   /* #### kind of cheesy */
5028   /* If seven-bit ISO is allowed, then assume that the encoding is
5029      entirely seven-bit and turn off the eight-bit ones. */
5030   if (mask & CODING_CATEGORY_ISO_7_MASK)
5031     mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
5032                CODING_CATEGORY_ISO_8_1_MASK |
5033                CODING_CATEGORY_ISO_8_2_MASK);
5034   return mask;
5035 }
5036
5037 /* If FLAGS is a null pointer or specifies right-to-left motion,
5038    output a switch-dir-to-left-to-right sequence to DST.
5039    Also update FLAGS if it is not a null pointer.
5040    If INTERNAL_P is set, we are outputting in internal format and
5041    need to handle the CSI differently. */
5042
5043 static void
5044 restore_left_to_right_direction (Lisp_Coding_System *codesys,
5045                                  unsigned_char_dynarr *dst,
5046                                  unsigned int *flags,
5047                                  int internal_p)
5048 {
5049   if (!flags || (*flags & CODING_STATE_R2L))
5050     {
5051       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5052         {
5053           Dynarr_add (dst, ISO_CODE_ESC);
5054           Dynarr_add (dst, '[');
5055         }
5056       else if (internal_p)
5057         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5058       else
5059         Dynarr_add (dst, ISO_CODE_CSI);
5060       Dynarr_add (dst, '0');
5061       Dynarr_add (dst, ']');
5062       if (flags)
5063         *flags &= ~CODING_STATE_R2L;
5064     }
5065 }
5066
5067 /* If FLAGS is a null pointer or specifies a direction different from
5068    DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
5069    CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
5070    sequence to DST.  Also update FLAGS if it is not a null pointer.
5071    If INTERNAL_P is set, we are outputting in internal format and
5072    need to handle the CSI differently. */
5073
5074 static void
5075 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
5076                           unsigned_char_dynarr *dst, unsigned int *flags,
5077                           int internal_p)
5078 {
5079   if ((!flags || (*flags & CODING_STATE_R2L)) &&
5080       direction == CHARSET_LEFT_TO_RIGHT)
5081     restore_left_to_right_direction (codesys, dst, flags, internal_p);
5082   else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
5083            && (!flags || !(*flags & CODING_STATE_R2L)) &&
5084            direction == CHARSET_RIGHT_TO_LEFT)
5085     {
5086       if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5087         {
5088           Dynarr_add (dst, ISO_CODE_ESC);
5089           Dynarr_add (dst, '[');
5090         }
5091       else if (internal_p)
5092         DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
5093       else
5094         Dynarr_add (dst, ISO_CODE_CSI);
5095       Dynarr_add (dst, '2');
5096       Dynarr_add (dst, ']');
5097       if (flags)
5098         *flags |= CODING_STATE_R2L;
5099     }
5100 }
5101
5102 /* Convert ISO2022-format data to internal format. */
5103
5104 static void
5105 decode_coding_iso2022 (Lstream *decoding, const Extbyte *src,
5106                        unsigned_char_dynarr *dst, size_t n)
5107 {
5108   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5109   unsigned int flags  = str->flags;
5110   unsigned int ch     = str->ch;
5111   eol_type_t eol_type = str->eol_type;
5112 #ifdef ENABLE_COMPOSITE_CHARS
5113   unsigned_char_dynarr *real_dst = dst;
5114 #endif
5115   Lisp_Object coding_system;
5116
5117   XSETCODING_SYSTEM (coding_system, str->codesys);
5118
5119 #ifdef ENABLE_COMPOSITE_CHARS
5120   if (flags & CODING_STATE_COMPOSITE)
5121     dst = str->iso2022.composite_chars;
5122 #endif /* ENABLE_COMPOSITE_CHARS */
5123
5124   while (n--)
5125     {
5126       unsigned char c = *(unsigned char *)src++;
5127       if (flags & CODING_STATE_ESCAPE)
5128         {       /* Within ESC sequence */
5129           int retval = parse_iso2022_esc (coding_system, &str->iso2022,
5130                                           c, &flags, 1);
5131
5132           if (retval)
5133             {
5134               switch (str->iso2022.esc)
5135                 {
5136 #ifdef ENABLE_COMPOSITE_CHARS
5137                 case ISO_ESC_START_COMPOSITE:
5138                   if (str->iso2022.composite_chars)
5139                     Dynarr_reset (str->iso2022.composite_chars);
5140                   else
5141                     str->iso2022.composite_chars = Dynarr_new (unsigned_char);
5142                   dst = str->iso2022.composite_chars;
5143                   break;
5144                 case ISO_ESC_END_COMPOSITE:
5145                   {
5146                     Bufbyte comstr[MAX_EMCHAR_LEN];
5147                     Bytecount len;
5148                     Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
5149                                                          Dynarr_length (dst));
5150                     dst = real_dst;
5151                     len = set_charptr_emchar (comstr, emch);
5152                     Dynarr_add_many (dst, comstr, len);
5153                     break;
5154                   }
5155 #endif /* ENABLE_COMPOSITE_CHARS */
5156
5157                 case ISO_ESC_LITERAL:
5158                   DECODE_ADD_BINARY_CHAR (c, dst);
5159                   break;
5160
5161                 default:
5162                   /* Everything else handled already */
5163                   break;
5164                 }
5165             }
5166
5167           /* Attempted error recovery. */
5168           if (str->iso2022.output_direction_sequence)
5169             ensure_correct_direction (flags & CODING_STATE_R2L ?
5170                                       CHARSET_RIGHT_TO_LEFT :
5171                                       CHARSET_LEFT_TO_RIGHT,
5172                                       str->codesys, dst, 0, 1);
5173           /* More error recovery. */
5174           if (!retval || str->iso2022.output_literally)
5175             {
5176               /* Output the (possibly invalid) sequence */
5177               int i;
5178               for (i = 0; i < str->iso2022.esc_bytes_index; i++)
5179                 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
5180               flags &= CODING_STATE_ISO2022_LOCK;
5181               if (!retval)
5182                 n++, src--;/* Repeat the loop with the same character. */
5183               else
5184                 {
5185                   /* No sense in reprocessing the final byte of the
5186                      escape sequence; it could mess things up anyway.
5187                      Just add it now. */
5188                   DECODE_ADD_BINARY_CHAR (c, dst);
5189                 }
5190             }
5191           ch = 0;
5192         }
5193       else if (BYTE_C0_P (c) || BYTE_C1_P (c))
5194         { /* Control characters */
5195
5196           /***** Error-handling *****/
5197
5198           /* If we were in the middle of a character, dump out the
5199              partial character. */
5200           DECODE_OUTPUT_PARTIAL_CHAR (ch);
5201
5202           /* If we just saw a single-shift character, dump it out.
5203              This may dump out the wrong sort of single-shift character,
5204              but least it will give an indication that something went
5205              wrong. */
5206           if (flags & CODING_STATE_SS2)
5207             {
5208               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
5209               flags &= ~CODING_STATE_SS2;
5210             }
5211           if (flags & CODING_STATE_SS3)
5212             {
5213               DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
5214               flags &= ~CODING_STATE_SS3;
5215             }
5216
5217           /***** Now handle the control characters. *****/
5218
5219           /* Handle CR/LF */
5220           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5221
5222           flags &= CODING_STATE_ISO2022_LOCK;
5223
5224           if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
5225             DECODE_ADD_BINARY_CHAR (c, dst);
5226         }
5227       else
5228         {                       /* Graphic characters */
5229           Lisp_Object charset;
5230 #ifndef UTF2000
5231           Charset_ID lb;
5232 #endif
5233           int reg;
5234
5235           DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5236
5237           /* Now determine the charset. */
5238           reg = ((flags & CODING_STATE_SS2) ? 2
5239                  : (flags & CODING_STATE_SS3) ? 3
5240                  : !BYTE_ASCII_P (c) ? str->iso2022.register_right
5241                  : str->iso2022.register_left);
5242           charset = str->iso2022.charset[reg];
5243
5244           /* Error checking: */
5245           if (! CHARSETP (charset)
5246               || str->iso2022.invalid_designated[reg]
5247               || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
5248                   && XCHARSET_CHARS (charset) == 94))
5249             /* Mrmph.  We are trying to invoke a register that has no
5250                or an invalid charset in it, or trying to add a character
5251                outside the range of the charset.  Insert that char literally
5252                to preserve it for the output. */
5253             {
5254               DECODE_OUTPUT_PARTIAL_CHAR (ch);
5255               DECODE_ADD_BINARY_CHAR (c, dst);
5256             }
5257
5258           else
5259             {
5260               /* Things are probably hunky-dorey. */
5261
5262               /* Fetch reverse charset, maybe. */
5263               if (((flags & CODING_STATE_R2L) &&
5264                    XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
5265                   ||
5266                   (!(flags & CODING_STATE_R2L) &&
5267                    XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
5268                 {
5269                   Lisp_Object new_charset =
5270                     XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
5271                   if (!NILP (new_charset))
5272                     charset = new_charset;
5273                 }
5274
5275 #ifdef UTF2000
5276               if (XCHARSET_DIMENSION (charset) == 1)
5277                 {
5278                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5279                   DECODE_ADD_UCS_CHAR
5280                     (MAKE_CHAR (charset, c & 0x7F, 0), dst);
5281                 }
5282               else if (ch)
5283                 {
5284                   DECODE_ADD_UCS_CHAR
5285                     (MAKE_CHAR (charset, ch & 0x7F, c & 0x7F), dst);
5286                   ch = 0;
5287                 }
5288               else
5289                 ch = c;
5290 #else
5291               lb = XCHARSET_LEADING_BYTE (charset);
5292               switch (XCHARSET_REP_BYTES (charset))
5293                 {
5294                 case 1: /* ASCII */
5295                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5296                   Dynarr_add (dst, c & 0x7F);
5297                   break;
5298
5299                 case 2: /* one-byte official */
5300                   DECODE_OUTPUT_PARTIAL_CHAR (ch);
5301                   Dynarr_add (dst, lb);
5302                   Dynarr_add (dst, c | 0x80);
5303                   break;
5304
5305                 case 3: /* one-byte private or two-byte official */
5306                   if (XCHARSET_PRIVATE_P (charset))
5307                     {
5308                       DECODE_OUTPUT_PARTIAL_CHAR (ch);
5309                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
5310                       Dynarr_add (dst, lb);
5311                       Dynarr_add (dst, c | 0x80);
5312                     }
5313                   else
5314                     {
5315                       if (ch)
5316                         {
5317                           Dynarr_add (dst, lb);
5318                           Dynarr_add (dst, ch | 0x80);
5319                           Dynarr_add (dst, c | 0x80);
5320                           ch = 0;
5321                         }
5322                       else
5323                         ch = c;
5324                     }
5325                   break;
5326
5327                 default:        /* two-byte private */
5328                   if (ch)
5329                     {
5330                       Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
5331                       Dynarr_add (dst, lb);
5332                       Dynarr_add (dst, ch | 0x80);
5333                       Dynarr_add (dst, c | 0x80);
5334                       ch = 0;
5335                     }
5336                   else
5337                     ch = c;
5338                 }
5339 #endif
5340             }
5341
5342           if (!ch)
5343             flags &= CODING_STATE_ISO2022_LOCK;
5344         }
5345
5346     label_continue_loop:;
5347     }
5348
5349   if (flags & CODING_STATE_END)
5350     DECODE_OUTPUT_PARTIAL_CHAR (ch);
5351
5352   str->flags = flags;
5353   str->ch    = ch;
5354 }
5355
5356
5357 /***** ISO2022 encoder *****/
5358
5359 /* Designate CHARSET into register REG. */
5360
5361 static void
5362 iso2022_designate (Lisp_Object charset, unsigned char reg,
5363                    struct encoding_stream *str, unsigned_char_dynarr *dst)
5364 {
5365   static const char inter94[] = "()*+";
5366   static const char inter96[] = ",-./";
5367   unsigned int type;
5368   unsigned char final;
5369   Lisp_Object old_charset = str->iso2022.charset[reg];
5370
5371   str->iso2022.charset[reg] = charset;
5372   if (!CHARSETP (charset))
5373     /* charset might be an initial nil or t. */
5374     return;
5375   type = XCHARSET_TYPE (charset);
5376   final = XCHARSET_FINAL (charset);
5377   if (!str->iso2022.force_charset_on_output[reg] &&
5378       CHARSETP (old_charset) &&
5379       XCHARSET_TYPE (old_charset) == type &&
5380       XCHARSET_FINAL (old_charset) == final)
5381     return;
5382
5383   str->iso2022.force_charset_on_output[reg] = 0;
5384
5385   {
5386     charset_conversion_spec_dynarr *dyn =
5387       str->codesys->iso2022.output_conv;
5388
5389     if (dyn)
5390       {
5391         int i;
5392
5393         for (i = 0; i < Dynarr_length (dyn); i++)
5394           {
5395             struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
5396             if (EQ (charset, spec->from_charset))
5397                 charset = spec->to_charset;
5398           }
5399       }
5400   }
5401
5402   Dynarr_add (dst, ISO_CODE_ESC);
5403   switch (type)
5404     {
5405     case CHARSET_TYPE_94:
5406       Dynarr_add (dst, inter94[reg]);
5407       break;
5408     case CHARSET_TYPE_96:
5409       Dynarr_add (dst, inter96[reg]);
5410       break;
5411     case CHARSET_TYPE_94X94:
5412       Dynarr_add (dst, '$');
5413       if (reg != 0
5414           || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
5415           || final < '@'
5416           || final > 'B')
5417         Dynarr_add (dst, inter94[reg]);
5418       break;
5419     case CHARSET_TYPE_96X96:
5420       Dynarr_add (dst, '$');
5421       Dynarr_add (dst, inter96[reg]);
5422       break;
5423     }
5424   Dynarr_add (dst, final);
5425 }
5426
5427 static void
5428 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5429 {
5430   if (str->iso2022.register_left != 0)
5431     {
5432       Dynarr_add (dst, ISO_CODE_SI);
5433       str->iso2022.register_left = 0;
5434     }
5435 }
5436
5437 static void
5438 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5439 {
5440   if (str->iso2022.register_left != 1)
5441     {
5442       Dynarr_add (dst, ISO_CODE_SO);
5443       str->iso2022.register_left = 1;
5444     }
5445 }
5446
5447 /* Convert internally-formatted data to ISO2022 format. */
5448
5449 static void
5450 encode_coding_iso2022 (Lstream *encoding, const Bufbyte *src,
5451                        unsigned_char_dynarr *dst, size_t n)
5452 {
5453   unsigned char charmask, c;
5454   unsigned char char_boundary;
5455   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5456   unsigned int flags          = str->flags;
5457   Emchar ch                   = str->ch;
5458   Lisp_Coding_System *codesys = str->codesys;
5459   eol_type_t eol_type         = CODING_SYSTEM_EOL_TYPE (str->codesys);
5460   int i;
5461   Lisp_Object charset;
5462   int half;
5463 #ifdef UTF2000
5464   unsigned int byte1, byte2;
5465 #endif
5466
5467 #ifdef ENABLE_COMPOSITE_CHARS
5468   /* flags for handling composite chars.  We do a little switcharoo
5469      on the source while we're outputting the composite char. */
5470   unsigned int saved_n = 0;
5471   const unsigned char *saved_src = NULL;
5472   int in_composite = 0;
5473 #endif /* ENABLE_COMPOSITE_CHARS */
5474
5475   char_boundary = str->iso2022.current_char_boundary;
5476   charset = str->iso2022.current_charset;
5477   half = str->iso2022.current_half;
5478
5479 #ifdef ENABLE_COMPOSITE_CHARS
5480  back_to_square_n:
5481 #endif
5482 #ifdef UTF2000
5483   while (n--)
5484     {
5485       c = *src++;
5486
5487       switch (char_boundary)
5488         {
5489         case 0:
5490           if ( c >= 0xfc )
5491             {
5492               ch = c & 0x01;
5493               char_boundary = 5;
5494             }
5495           else if ( c >= 0xf8 )
5496             {
5497               ch = c & 0x03;
5498               char_boundary = 4;
5499             }
5500           else if ( c >= 0xf0 )
5501             {
5502               ch = c & 0x07;
5503               char_boundary = 3;
5504             }
5505           else if ( c >= 0xe0 )
5506             {
5507               ch = c & 0x0f;
5508               char_boundary = 2;
5509             }
5510           else if ( c >= 0xc0 )
5511             {
5512               ch = c & 0x1f;
5513               char_boundary = 1;
5514             }
5515           else
5516             {
5517               ch = 0;
5518
5519               restore_left_to_right_direction (codesys, dst, &flags, 0);
5520
5521               /* Make sure G0 contains ASCII */
5522               if ((c > ' ' && c < ISO_CODE_DEL) ||
5523                   !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5524                 {
5525                   ensure_normal_shift (str, dst);
5526                   iso2022_designate (Vcharset_ascii, 0, str, dst);
5527                 }
5528
5529               /* If necessary, restore everything to the default state
5530                  at end-of-line */
5531               if (c == '\n' &&
5532                   !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5533                 {
5534                   restore_left_to_right_direction (codesys, dst, &flags, 0);
5535
5536                   ensure_normal_shift (str, dst);
5537
5538                   for (i = 0; i < 4; i++)
5539                     {
5540                       Lisp_Object initial_charset =
5541                         CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5542                       iso2022_designate (initial_charset, i, str, dst);
5543                     }
5544                 }
5545               if (c == '\n')
5546                 {
5547                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5548                     Dynarr_add (dst, '\r');
5549                   if (eol_type != EOL_CR)
5550                     Dynarr_add (dst, c);
5551                 }
5552               else
5553                 {
5554                   if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5555                       && fit_to_be_escape_quoted (c))
5556                     Dynarr_add (dst, ISO_CODE_ESC);
5557                   Dynarr_add (dst, c);
5558                 }
5559               char_boundary = 0;
5560             }
5561           break;
5562         case 1:
5563           ch = ( ch << 6 ) | ( c & 0x3f );
5564
5565           char_boundary = 0;
5566           if ( (0x80 <= ch) && (ch <= 0x9f) )
5567             {
5568               charmask = (half == 0 ? 0x00 : 0x80);
5569
5570               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5571                   && fit_to_be_escape_quoted (ch))
5572                 Dynarr_add (dst, ISO_CODE_ESC);
5573               /* you asked for it ... */
5574               Dynarr_add (dst, ch);
5575             }
5576           else
5577             {
5578               int reg;
5579
5580               BREAKUP_CHAR (ch, charset, byte1, byte2);
5581               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5582                                         codesys, dst, &flags, 0);
5583
5584               /* Now determine which register to use. */
5585               reg = -1;
5586               for (i = 0; i < 4; i++)
5587                 {
5588                   if (EQ (charset, str->iso2022.charset[i]) ||
5589                       EQ (charset,
5590                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5591                     {
5592                       reg = i;
5593                       break;
5594                     }
5595                 }
5596
5597               if (reg == -1)
5598                 {
5599                   if (XCHARSET_GRAPHIC (charset) != 0)
5600                     {
5601                       if (!NILP (str->iso2022.charset[1]) &&
5602                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5603                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5604                         reg = 1;
5605                       else if (!NILP (str->iso2022.charset[2]))
5606                         reg = 2;
5607                       else if (!NILP (str->iso2022.charset[3]))
5608                         reg = 3;
5609                       else
5610                         reg = 0;
5611                     }
5612                   else
5613                     reg = 0;
5614                 }
5615
5616               iso2022_designate (charset, reg, str, dst);
5617
5618               /* Now invoke that register. */
5619               switch (reg)
5620                 {
5621                 case 0:
5622                   ensure_normal_shift (str, dst);
5623                   half = 0;
5624                   break;
5625
5626                 case 1:
5627                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5628                     {
5629                       ensure_shift_out (str, dst);
5630                       half = 0;
5631                     }
5632                   else
5633                     half = 1;
5634                   break;
5635
5636                 case 2:
5637                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5638                     {
5639                       Dynarr_add (dst, ISO_CODE_ESC);
5640                       Dynarr_add (dst, 'N');
5641                       half = 0;
5642                     }
5643                   else
5644                     {
5645                       Dynarr_add (dst, ISO_CODE_SS2);
5646                       half = 1;
5647                     }
5648                   break;
5649
5650                 case 3:
5651                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5652                     {
5653                       Dynarr_add (dst, ISO_CODE_ESC);
5654                       Dynarr_add (dst, 'O');
5655                       half = 0;
5656                     }
5657                   else
5658                     {
5659                       Dynarr_add (dst, ISO_CODE_SS3);
5660                       half = 1;
5661                     }
5662                   break;
5663
5664                 default:
5665                   abort ();
5666                 }
5667
5668               charmask = (half == 0 ? 0x00 : 0x80);
5669
5670               switch (XCHARSET_DIMENSION (charset))
5671                 {
5672                 case 1:
5673                   Dynarr_add (dst, byte1 | charmask);
5674                   break;
5675                 case 2:
5676                   Dynarr_add (dst, byte1 | charmask);
5677                   Dynarr_add (dst, byte2 | charmask);
5678                   break;
5679                 default:
5680                   abort ();
5681                 }
5682             }
5683           ch =0;
5684           break;
5685         default:
5686           ch = ( ch << 6 ) | ( c & 0x3f );
5687           char_boundary--;
5688         }
5689     }
5690 #else /* not UTF2000 */
5691
5692   while (n--)
5693     {
5694       c = *src++;
5695
5696       if (BYTE_ASCII_P (c))
5697         {               /* Processing ASCII character */
5698           ch = 0;
5699
5700           restore_left_to_right_direction (codesys, dst, &flags, 0);
5701
5702           /* Make sure G0 contains ASCII */
5703           if ((c > ' ' && c < ISO_CODE_DEL) ||
5704               !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5705             {
5706               ensure_normal_shift (str, dst);
5707               iso2022_designate (Vcharset_ascii, 0, str, dst);
5708             }
5709
5710           /* If necessary, restore everything to the default state
5711              at end-of-line */
5712           if (c == '\n' &&
5713               !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5714             {
5715               restore_left_to_right_direction (codesys, dst, &flags, 0);
5716
5717               ensure_normal_shift (str, dst);
5718
5719               for (i = 0; i < 4; i++)
5720                 {
5721                   Lisp_Object initial_charset =
5722                     CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5723                   iso2022_designate (initial_charset, i, str, dst);
5724                 }
5725             }
5726           if (c == '\n')
5727             {
5728               if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5729                 Dynarr_add (dst, '\r');
5730               if (eol_type != EOL_CR)
5731                 Dynarr_add (dst, c);
5732             }
5733           else
5734             {
5735               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5736                   && fit_to_be_escape_quoted (c))
5737                 Dynarr_add (dst, ISO_CODE_ESC);
5738               Dynarr_add (dst, c);
5739             }
5740           char_boundary = 1;
5741         }
5742
5743       else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5744         { /* Processing Leading Byte */
5745           ch = 0;
5746           charset = CHARSET_BY_LEADING_BYTE (c);
5747           if (LEADING_BYTE_PREFIX_P(c))
5748             ch = c;
5749           else if (!EQ (charset, Vcharset_control_1)
5750 #ifdef ENABLE_COMPOSITE_CHARS
5751                    && !EQ (charset, Vcharset_composite)
5752 #endif
5753                    )
5754             {
5755               int reg;
5756
5757               ensure_correct_direction (XCHARSET_DIRECTION (charset),
5758                                         codesys, dst, &flags, 0);
5759
5760               /* Now determine which register to use. */
5761               reg = -1;
5762               for (i = 0; i < 4; i++)
5763                 {
5764                   if (EQ (charset, str->iso2022.charset[i]) ||
5765                       EQ (charset,
5766                           CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5767                     {
5768                       reg = i;
5769                       break;
5770                     }
5771                 }
5772
5773               if (reg == -1)
5774                 {
5775                   if (XCHARSET_GRAPHIC (charset) != 0)
5776                     {
5777                       if (!NILP (str->iso2022.charset[1]) &&
5778                           (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5779                            CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5780                         reg = 1;
5781                       else if (!NILP (str->iso2022.charset[2]))
5782                         reg = 2;
5783                       else if (!NILP (str->iso2022.charset[3]))
5784                         reg = 3;
5785                       else
5786                         reg = 0;
5787                     }
5788                   else
5789                     reg = 0;
5790                 }
5791
5792               iso2022_designate (charset, reg, str, dst);
5793
5794               /* Now invoke that register. */
5795               switch (reg)
5796                 {
5797                 case 0:
5798                   ensure_normal_shift (str, dst);
5799                   half = 0;
5800                   break;
5801
5802                 case 1:
5803                   if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5804                     {
5805                       ensure_shift_out (str, dst);
5806                       half = 0;
5807                     }
5808                   else
5809                     half = 1;
5810                   break;
5811
5812                 case 2:
5813                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5814                     {
5815                       Dynarr_add (dst, ISO_CODE_ESC);
5816                       Dynarr_add (dst, 'N');
5817                       half = 0;
5818                     }
5819                   else
5820                     {
5821                       Dynarr_add (dst, ISO_CODE_SS2);
5822                       half = 1;
5823                     }
5824                   break;
5825
5826                 case 3:
5827                   if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5828                     {
5829                       Dynarr_add (dst, ISO_CODE_ESC);
5830                       Dynarr_add (dst, 'O');
5831                       half = 0;
5832                     }
5833                   else
5834                     {
5835                       Dynarr_add (dst, ISO_CODE_SS3);
5836                       half = 1;
5837                     }
5838                   break;
5839
5840                 default:
5841                   abort ();
5842                 }
5843             }
5844           char_boundary = 0;
5845         }
5846       else
5847         {                       /* Processing Non-ASCII character */
5848           charmask = (half == 0 ? 0x7F : 0xFF);
5849           char_boundary = 1;
5850           if (EQ (charset, Vcharset_control_1))
5851             {
5852               if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5853                   && fit_to_be_escape_quoted (c))
5854                 Dynarr_add (dst, ISO_CODE_ESC);
5855               /* you asked for it ... */
5856               Dynarr_add (dst, c - 0x20);
5857             }
5858           else
5859             {
5860               switch (XCHARSET_REP_BYTES (charset))
5861                 {
5862                 case 2:
5863                   Dynarr_add (dst, c & charmask);
5864                   break;
5865                 case 3:
5866                   if (XCHARSET_PRIVATE_P (charset))
5867                     {
5868                       Dynarr_add (dst, c & charmask);
5869                       ch = 0;
5870                     }
5871                   else if (ch)
5872                     {
5873 #ifdef ENABLE_COMPOSITE_CHARS
5874                       if (EQ (charset, Vcharset_composite))
5875                         {
5876                           if (in_composite)
5877                             {
5878                               /* #### Bother! We don't know how to
5879                                  handle this yet. */
5880                               Dynarr_add (dst, '~');
5881                             }
5882                           else
5883                             {
5884                               Emchar emch = MAKE_CHAR (Vcharset_composite,
5885                                                        ch & 0x7F, c & 0x7F);
5886                               Lisp_Object lstr = composite_char_string (emch);
5887                               saved_n = n;
5888                               saved_src = src;
5889                               in_composite = 1;
5890                               src = XSTRING_DATA   (lstr);
5891                               n   = XSTRING_LENGTH (lstr);
5892                               Dynarr_add (dst, ISO_CODE_ESC);
5893                               Dynarr_add (dst, '0'); /* start composing */
5894                             }
5895                         }
5896                       else
5897 #endif /* ENABLE_COMPOSITE_CHARS */
5898                         {
5899                           Dynarr_add (dst, ch & charmask);
5900                           Dynarr_add (dst, c & charmask);
5901                         }
5902                       ch = 0;
5903                     }
5904                   else
5905                     {
5906                       ch = c;
5907                       char_boundary = 0;
5908                     }
5909                   break;
5910                 case 4:
5911                   if (ch)
5912                     {
5913                       Dynarr_add (dst, ch & charmask);
5914                       Dynarr_add (dst, c & charmask);
5915                       ch = 0;
5916                     }
5917                   else
5918                     {
5919                       ch = c;
5920                       char_boundary = 0;
5921                     }
5922                   break;
5923                 default:
5924                   abort ();
5925                 }
5926             }
5927         }
5928     }
5929 #endif /* not UTF2000 */
5930
5931 #ifdef ENABLE_COMPOSITE_CHARS
5932   if (in_composite)
5933     {
5934       n = saved_n;
5935       src = saved_src;
5936       in_composite = 0;
5937       Dynarr_add (dst, ISO_CODE_ESC);
5938       Dynarr_add (dst, '1'); /* end composing */
5939       goto back_to_square_n; /* Wheeeeeeeee ..... */
5940     }
5941 #endif /* ENABLE_COMPOSITE_CHARS */
5942
5943 #ifdef UTF2000
5944   if ( (char_boundary == 0) && flags & CODING_STATE_END)
5945 #else
5946   if (char_boundary && flags & CODING_STATE_END)
5947 #endif
5948     {
5949       restore_left_to_right_direction (codesys, dst, &flags, 0);
5950       ensure_normal_shift (str, dst);
5951       for (i = 0; i < 4; i++)
5952         {
5953           Lisp_Object initial_charset =
5954             CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5955           iso2022_designate (initial_charset, i, str, dst);
5956         }
5957     }
5958
5959   str->flags = flags;
5960   str->ch    = ch;
5961   str->iso2022.current_char_boundary = char_boundary;
5962   str->iso2022.current_charset = charset;
5963   str->iso2022.current_half = half;
5964
5965   /* Verbum caro factum est! */
5966 }
5967 #endif /* MULE */
5968 \f
5969 /************************************************************************/
5970 /*                     No-conversion methods                            */
5971 /************************************************************************/
5972
5973 /* This is used when reading in "binary" files -- i.e. files that may
5974    contain all 256 possible byte values and that are not to be
5975    interpreted as being in any particular decoding. */
5976 static void
5977 decode_coding_no_conversion (Lstream *decoding, const Extbyte *src,
5978                              unsigned_char_dynarr *dst, size_t n)
5979 {
5980   struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5981   unsigned int flags  = str->flags;
5982   unsigned int ch     = str->ch;
5983   eol_type_t eol_type = str->eol_type;
5984
5985   while (n--)
5986     {
5987       unsigned char c = *(unsigned char *)src++;
5988
5989       DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5990       DECODE_ADD_BINARY_CHAR (c, dst);
5991     label_continue_loop:;
5992     }
5993
5994   DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5995
5996   str->flags = flags;
5997   str->ch    = ch;
5998 }
5999
6000 static void
6001 encode_coding_no_conversion (Lstream *encoding, const Bufbyte *src,
6002                              unsigned_char_dynarr *dst, size_t n)
6003 {
6004   unsigned char c;
6005   struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
6006   unsigned int flags  = str->flags;
6007   unsigned int ch     = str->ch;
6008   eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
6009 #ifdef UTF2000
6010   unsigned char char_boundary = str->iso2022.current_char_boundary;
6011 #endif
6012
6013   while (n--)
6014     {
6015       c = *src++;
6016 #ifdef UTF2000
6017       switch (char_boundary)
6018         {
6019         case 0:
6020           if ( c >= 0xfc )
6021             {
6022               ch = c & 0x01;
6023               char_boundary = 5;
6024             }
6025           else if ( c >= 0xf8 )
6026             {
6027               ch = c & 0x03;
6028               char_boundary = 4;
6029             }
6030           else if ( c >= 0xf0 )
6031             {
6032               ch = c & 0x07;
6033               char_boundary = 3;
6034             }
6035           else if ( c >= 0xe0 )
6036             {
6037               ch = c & 0x0f;
6038               char_boundary = 2;
6039             }
6040           else if ( c >= 0xc0 )
6041             {
6042               ch = c & 0x1f;
6043               char_boundary = 1;
6044             }
6045           else
6046             {
6047               ch = 0;
6048
6049               if (c == '\n')
6050                 {
6051                   if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6052                     Dynarr_add (dst, '\r');
6053                   if (eol_type != EOL_CR)
6054                     Dynarr_add (dst, c);
6055                 }
6056               else
6057                 Dynarr_add (dst, c);
6058               char_boundary = 0;
6059             }
6060           break;
6061         case 1:
6062           ch = ( ch << 6 ) | ( c & 0x3f );
6063           Dynarr_add (dst, ch & 0xff);
6064           char_boundary = 0;
6065           break;
6066         default:
6067           ch = ( ch << 6 ) | ( c & 0x3f );
6068           char_boundary--;
6069         }
6070 #else /* not UTF2000 */
6071       if (c == '\n')
6072         {
6073           if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
6074             Dynarr_add (dst, '\r');
6075           if (eol_type != EOL_CR)
6076             Dynarr_add (dst, '\n');
6077           ch = 0;
6078         }
6079       else if (BYTE_ASCII_P (c))
6080         {
6081           assert (ch == 0);
6082           Dynarr_add (dst, c);
6083         }
6084       else if (BUFBYTE_LEADING_BYTE_P (c))
6085         {
6086           assert (ch == 0);
6087           if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
6088               c == LEADING_BYTE_CONTROL_1)
6089             ch = c;
6090           else
6091             Dynarr_add (dst, '~'); /* untranslatable character */
6092         }
6093       else
6094         {
6095           if (ch == LEADING_BYTE_LATIN_ISO8859_1)
6096             Dynarr_add (dst, c);
6097           else if (ch == LEADING_BYTE_CONTROL_1)
6098             {
6099               assert (c < 0xC0);
6100               Dynarr_add (dst, c - 0x20);
6101             }
6102           /* else it should be the second or third byte of an
6103              untranslatable character, so ignore it */
6104           ch = 0;
6105         }
6106 #endif /* not UTF2000 */
6107     }
6108
6109   str->flags = flags;
6110   str->ch    = ch;
6111 #ifdef UTF2000
6112   str->iso2022.current_char_boundary = char_boundary;
6113 #endif
6114 }
6115
6116 \f
6117
6118 /************************************************************************/
6119 /*                             Initialization                           */
6120 /************************************************************************/
6121
6122 void
6123 syms_of_file_coding (void)
6124 {
6125   INIT_LRECORD_IMPLEMENTATION (coding_system);
6126
6127   DEFERROR_STANDARD (Qcoding_system_error, Qio_error);
6128
6129   DEFSUBR (Fcoding_system_p);
6130   DEFSUBR (Ffind_coding_system);
6131   DEFSUBR (Fget_coding_system);
6132   DEFSUBR (Fcoding_system_list);
6133   DEFSUBR (Fcoding_system_name);
6134   DEFSUBR (Fmake_coding_system);
6135   DEFSUBR (Fcopy_coding_system);
6136   DEFSUBR (Fcoding_system_canonical_name_p);
6137   DEFSUBR (Fcoding_system_alias_p);
6138   DEFSUBR (Fcoding_system_aliasee);
6139   DEFSUBR (Fdefine_coding_system_alias);
6140   DEFSUBR (Fsubsidiary_coding_system);
6141
6142   DEFSUBR (Fcoding_system_type);
6143   DEFSUBR (Fcoding_system_doc_string);
6144 #ifdef MULE
6145   DEFSUBR (Fcoding_system_charset);
6146 #endif
6147   DEFSUBR (Fcoding_system_property);
6148
6149   DEFSUBR (Fcoding_category_list);
6150   DEFSUBR (Fset_coding_priority_list);
6151   DEFSUBR (Fcoding_priority_list);
6152   DEFSUBR (Fset_coding_category_system);
6153   DEFSUBR (Fcoding_category_system);
6154
6155   DEFSUBR (Fdetect_coding_region);
6156   DEFSUBR (Fdecode_coding_region);
6157   DEFSUBR (Fencode_coding_region);
6158 #ifdef MULE
6159   DEFSUBR (Fdecode_shift_jis_char);
6160   DEFSUBR (Fencode_shift_jis_char);
6161   DEFSUBR (Fdecode_big5_char);
6162   DEFSUBR (Fencode_big5_char);
6163 #ifndef UTF2000
6164   DEFSUBR (Fset_ucs_char);
6165   DEFSUBR (Fucs_char);
6166   DEFSUBR (Fset_char_ucs);
6167   DEFSUBR (Fchar_ucs);
6168 #endif /* not UTF2000 */
6169 #endif /* MULE */
6170   defsymbol (&Qcoding_systemp, "coding-system-p");
6171   defsymbol (&Qno_conversion, "no-conversion");
6172   defsymbol (&Qraw_text, "raw-text");
6173 #ifdef MULE
6174   defsymbol (&Qbig5, "big5");
6175   defsymbol (&Qshift_jis, "shift-jis");
6176   defsymbol (&Qucs4, "ucs-4");
6177   defsymbol (&Qutf8, "utf-8");
6178   defsymbol (&Qccl, "ccl");
6179   defsymbol (&Qiso2022, "iso2022");
6180 #endif /* MULE */
6181   defsymbol (&Qmnemonic, "mnemonic");
6182   defsymbol (&Qeol_type, "eol-type");
6183   defsymbol (&Qpost_read_conversion, "post-read-conversion");
6184   defsymbol (&Qpre_write_conversion, "pre-write-conversion");
6185
6186   defsymbol (&Qcr, "cr");
6187   defsymbol (&Qlf, "lf");
6188   defsymbol (&Qcrlf, "crlf");
6189   defsymbol (&Qeol_cr, "eol-cr");
6190   defsymbol (&Qeol_lf, "eol-lf");
6191   defsymbol (&Qeol_crlf, "eol-crlf");
6192 #ifdef MULE
6193   defsymbol (&Qcharset_g0, "charset-g0");
6194   defsymbol (&Qcharset_g1, "charset-g1");
6195   defsymbol (&Qcharset_g2, "charset-g2");
6196   defsymbol (&Qcharset_g3, "charset-g3");
6197   defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
6198   defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
6199   defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
6200   defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
6201   defsymbol (&Qno_iso6429, "no-iso6429");
6202   defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
6203   defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
6204
6205   defsymbol (&Qshort, "short");
6206   defsymbol (&Qno_ascii_eol, "no-ascii-eol");
6207   defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
6208   defsymbol (&Qseven, "seven");
6209   defsymbol (&Qlock_shift, "lock-shift");
6210   defsymbol (&Qescape_quoted, "escape-quoted");
6211 #endif /* MULE */
6212   defsymbol (&Qencode, "encode");
6213   defsymbol (&Qdecode, "decode");
6214
6215 #ifdef MULE
6216   defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
6217              "shift-jis");
6218   defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
6219              "big5");
6220   defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
6221              "ucs-4");
6222   defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
6223              "utf-8");
6224   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
6225              "iso-7");
6226   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
6227              "iso-8-designate");
6228   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
6229              "iso-8-1");
6230   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
6231              "iso-8-2");
6232   defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
6233              "iso-lock-shift");
6234 #endif /* MULE */
6235   defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
6236              "no-conversion");
6237 }
6238
6239 void
6240 lstream_type_create_file_coding (void)
6241 {
6242   LSTREAM_HAS_METHOD (decoding, reader);
6243   LSTREAM_HAS_METHOD (decoding, writer);
6244   LSTREAM_HAS_METHOD (decoding, rewinder);
6245   LSTREAM_HAS_METHOD (decoding, seekable_p);
6246   LSTREAM_HAS_METHOD (decoding, flusher);
6247   LSTREAM_HAS_METHOD (decoding, closer);
6248   LSTREAM_HAS_METHOD (decoding, marker);
6249
6250   LSTREAM_HAS_METHOD (encoding, reader);
6251   LSTREAM_HAS_METHOD (encoding, writer);
6252   LSTREAM_HAS_METHOD (encoding, rewinder);
6253   LSTREAM_HAS_METHOD (encoding, seekable_p);
6254   LSTREAM_HAS_METHOD (encoding, flusher);
6255   LSTREAM_HAS_METHOD (encoding, closer);
6256   LSTREAM_HAS_METHOD (encoding, marker);
6257 }
6258
6259 void
6260 vars_of_file_coding (void)
6261 {
6262   int i;
6263
6264   fcd = xnew (struct file_coding_dump);
6265   dump_add_root_struct_ptr (&fcd, &fcd_description);
6266
6267   /* Initialize to something reasonable ... */
6268   for (i = 0; i < CODING_CATEGORY_LAST; i++)
6269     {
6270       fcd->coding_category_system[i] = Qnil;
6271       fcd->coding_category_by_priority[i] = i;
6272     }
6273
6274   Fprovide (intern ("file-coding"));
6275
6276   DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
6277 Coding system used for TTY keyboard input.
6278 Not used under a windowing system.
6279 */ );
6280   Vkeyboard_coding_system = Qnil;
6281
6282   DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
6283 Coding system used for TTY display output.
6284 Not used under a windowing system.
6285 */ );
6286   Vterminal_coding_system = Qnil;
6287
6288   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
6289 Overriding coding system used when reading from a file or process.
6290 You should bind this variable with `let', but do not set it globally.
6291 If this is non-nil, it specifies the coding system that will be used
6292 to decode input on read operations, such as from a file or process.
6293 It overrides `buffer-file-coding-system-for-read',
6294 `insert-file-contents-pre-hook', etc.  Use those variables instead of
6295 this one for permanent changes to the environment.  */ );
6296   Vcoding_system_for_read = Qnil;
6297
6298   DEFVAR_LISP ("coding-system-for-write",
6299                &Vcoding_system_for_write /*
6300 Overriding coding system used when writing to a file or process.
6301 You should bind this variable with `let', but do not set it globally.
6302 If this is non-nil, it specifies the coding system that will be used
6303 to encode output for write operations, such as to a file or process.
6304 It overrides `buffer-file-coding-system', `write-region-pre-hook', etc.
6305 Use those variables instead of this one for permanent changes to the
6306 environment.  */ );
6307   Vcoding_system_for_write = Qnil;
6308
6309   DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
6310 Coding system used to convert pathnames when accessing files.
6311 */ );
6312   Vfile_name_coding_system = Qnil;
6313
6314   DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
6315 Non-nil means the buffer contents are regarded as multi-byte form
6316 of characters, not a binary code.  This affects the display, file I/O,
6317 and behaviors of various editing commands.
6318
6319 Setting this to nil does not do anything.
6320 */ );
6321   enable_multibyte_characters = 1;
6322 }
6323
6324 void
6325 complex_vars_of_file_coding (void)
6326 {
6327   staticpro (&Vcoding_system_hash_table);
6328   Vcoding_system_hash_table =
6329     make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
6330
6331   the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
6332   dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
6333
6334 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do  \
6335 {                                               \
6336   struct codesys_prop csp;                      \
6337   csp.sym = (Sym);                              \
6338   csp.prop_type = (Prop_Type);                  \
6339   Dynarr_add (the_codesys_prop_dynarr, csp);    \
6340 } while (0)
6341
6342   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qmnemonic);
6343   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_type);
6344   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_cr);
6345   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_crlf);
6346   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qeol_lf);
6347   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpost_read_conversion);
6348   DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK,  Qpre_write_conversion);
6349 #ifdef MULE
6350   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
6351   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
6352   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
6353   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
6354   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
6355   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
6356   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
6357   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
6358   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
6359   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
6360   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
6361   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
6362   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
6363   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
6364   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
6365   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
6366   DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
6367
6368   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qencode);
6369   DEFINE_CODESYS_PROP (CODESYS_PROP_CCL,     Qdecode);
6370 #endif /* MULE */
6371   /* Need to create this here or we're really screwed. */
6372   Fmake_coding_system
6373     (Qraw_text, Qno_conversion,
6374      build_string ("Raw text, which means it converts only line-break-codes."),
6375      list2 (Qmnemonic, build_string ("Raw")));
6376
6377   Fmake_coding_system
6378     (Qbinary, Qno_conversion,
6379      build_string ("Binary, which means it does not convert anything."),
6380      list4 (Qeol_type, Qlf,
6381             Qmnemonic, build_string ("Binary")));
6382
6383 #ifdef UTF2000
6384   Fmake_coding_system
6385     (Qutf8, Qutf8,
6386      build_string ("Coding-system of ISO/IEC 10646 UTF-8."),
6387      list2 (Qmnemonic, build_string ("UTF8")));
6388 #endif
6389
6390   Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
6391
6392   Fdefine_coding_system_alias (Qfile_name, Qbinary);
6393
6394   Fdefine_coding_system_alias (Qterminal, Qbinary);
6395   Fdefine_coding_system_alias (Qkeyboard, Qbinary);
6396
6397   /* Need this for bootstrapping */
6398   fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
6399     Fget_coding_system (Qraw_text);
6400
6401 #ifdef UTF2000
6402   fcd->coding_category_system[CODING_CATEGORY_UTF8]
6403    = Fget_coding_system (Qutf8);
6404 #endif
6405
6406 #if defined(MULE) && !defined(UTF2000)
6407   {
6408     size_t i;
6409
6410     for (i = 0; i < countof (fcd->ucs_to_mule_table); i++)
6411       fcd->ucs_to_mule_table[i] = Qnil;
6412   }
6413   staticpro (&mule_to_ucs_table);
6414   mule_to_ucs_table = Fmake_char_table(Qgeneric);
6415 #endif /* defined(MULE) && !defined(UTF2000) */
6416 }