--- /dev/null
+/* Header for multilingual functions.
+ Copyright (C) 1992, 1995 Free Software Foundation, Inc.
+ Copyright (C) 1995 Sun Microsystems, Inc.
+
+This file is part of XEmacs.
+
+XEmacs is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with XEmacs; see the file COPYING. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+/* Synched up with: Mule 2.3. Not in FSF. */
+
+/* Rewritten by Ben Wing <ben@xemacs.org>. */
+
+#ifndef INCLUDED_mule_charset_h_
+#define INCLUDED_mule_charset_h_
+
+/*
+ 1. Character Sets
+ =================
+
+ A character set (or "charset") is an ordered set of characters.
+ A particular character in a charset is indexed using one or
+ more "position codes", which are non-negative integers.
+ The number of position codes needed to identify a particular
+ character in a charset is called the "dimension" of the
+ charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
+ and the size of all charsets (except for a few special cases)
+ is either 94, 96, 94 by 94, or 96 by 96. The range of
+ position codes used to index characters from any of these
+ types of character sets is as follows:
+
+ Charset type Position code 1 Position code 2
+ ------------------------------------------------------------
+ 94 33 - 126 N/A
+ 96 32 - 127 N/A
+ 94x94 33 - 126 33 - 126
+ 96x96 32 - 127 32 - 127
+
+ Note that in the above cases position codes do not start at
+ an expected value such as 0 or 1. The reason for this will
+ become clear later.
+
+ For example, Latin-1 is a 96-character charset, and JISX0208
+ (the Japanese national character set) is a 94x94-character
+ charset.
+
+ [Note that, although the ranges above define the *valid*
+ position codes for a charset, some of the slots in a particular
+ charset may in fact be empty. This is the case for JISX0208,
+ for example, where (e.g.) all the slots whose first
+ position code is in the range 118 - 126 are empty.]
+
+ There are three charsets that do not follow the above rules.
+ All of them have one dimension, and have ranges of position
+ codes as follows:
+
+ Charset name Position code 1
+ ------------------------------------
+ ASCII 0 - 127
+ Control-1 0 - 31
+ Composite 0 - some large number
+
+ (The upper bound of the position code for composite characters
+ has not yet been determined, but it will probably be at
+ least 16,383).
+
+ ASCII is the union of two subsidiary character sets:
+ Printing-ASCII (the printing ASCII character set,
+ consisting of position codes 33 - 126, like for a standard
+ 94-character charset) and Control-ASCII (the non-printing
+ characters that would appear in a binary file with codes 0
+ - 32 and 127).
+
+ Control-1 contains the non-printing characters that would
+ appear in a binary file with codes 128 - 159.
+
+ Composite contains characters that are generated by
+ overstriking one or more characters from other charsets.
+
+ Note that some characters in ASCII, and all characters
+ in Control-1, are "control" (non-printing) characters.
+ These have no printed representation but instead control
+ some other function of the printing (e.g. TAB, code 9, moves
+ the current character position to the next tab stop).
+ All other characters in all charsets are "graphic"
+ (printing) characters.
+
+ When a binary file is read in, the bytes in the file are
+ assigned to character sets as follows:
+
+ Bytes Character set Range
+ --------------------------------------------------
+ 0 - 127 ASCII 0 - 127
+ 128 - 159 Control-1 0 - 31
+ 160 - 255 Latin-1 32 - 127
+
+ This is a bit ad-hoc but gets the job done.
+
+ 2. Encodings
+ ============
+
+ An "encoding" is a way of numerically representing
+ characters from one or more character sets. If an encoding
+ only encompasses one character set, then the position codes
+ for the characters in that character set could be used
+ directly. This is not possible, however, if more than one
+ character set is to be used in the encoding.
+
+ For example, the conversion detailed above between bytes in
+ a binary file and characters is effectively an encoding
+ that encompasses the three character sets ASCII, Control-1,
+ and Latin-1 in a stream of 8-bit bytes.
+
+ Thus, an encoding can be viewed as a way of encoding
+ characters from a specified group of character sets using a
+ stream of bytes, each of which contains a fixed number of
+ bits (but not necessarily 8, as in the common usage of
+ "byte").
+
+ Here are descriptions of a couple of common
+ encodings:
+
+
+ A. Japanese EUC (Extended Unix Code)
+
+ This encompasses the character sets:
+ - Printing-ASCII,
+ - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
+ - Japanese-JISX0208
+ - Japanese-JISX0212
+ It uses 8-bit bytes.
+
+ Note that Printing-ASCII and Katakana-JISX0201 are 94-character
+ charsets, while Japanese-JISX0208 is a 94x94-character charset.
+
+ The encoding is as follows:
+
+ Character set Representation (PC == position-code)
+ ------------- --------------
+ Printing-ASCII PC1
+ Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
+ Katakana-JISX0201 0x8E | PC1 + 0x80
+ Japanese-JISX0212 0x8F | PC1 + 0x80 | PC2 + 0x80
+
+
+ B. JIS7
+
+ This encompasses the character sets:
+ - Printing-ASCII
+ - Latin-JISX0201 (the left half of JISX0201; this character set is
+ very similar to Printing-ASCII and is a 94-character charset)
+ - Japanese-JISX0208
+ - Katakana-JISX0201
+ It uses 7-bit bytes.
+
+ Unlike Japanese EUC, this is a "modal" encoding, which
+ means that there are multiple states that the encoding can
+ be in, which affect how the bytes are to be interpreted.
+ Special sequences of bytes (called "escape sequences")
+ are used to change states.
+
+ The encoding is as follows:
+
+ Character set Representation
+ ------------- --------------
+ Printing-ASCII PC1
+ Latin-JISX0201 PC1
+ Katakana-JISX0201 PC1
+ Japanese-JISX0208 PC1 | PC2
+
+ Escape sequence ASCII equivalent Meaning
+ --------------- ---------------- -------
+ 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
+ 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
+ 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
+ 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
+
+ Initially, Printing-ASCII is invoked.
+
+ 3. Internal Mule Encodings
+ ==========================
+
+ In XEmacs/Mule, each character set is assigned a unique number,
+ called a "leading byte". This is used in the encodings of a
+ character. Leading bytes are in the range 0x80 - 0xFF
+ (except for ASCII, which has a leading byte of 0), although
+ some leading bytes are reserved.
+
+ Charsets whose leading byte is in the range 0x80 - 0x9F are
+ called "official" and are used for built-in charsets.
+ Other charsets are called "private" and have leading bytes
+ in the range 0xA0 - 0xFF; these are user-defined charsets.
+
+ More specifically:
+
+ Character set Leading byte
+ ------------- ------------
+ ASCII 0
+ Composite 0x80
+ Dimension-1 Official 0x81 - 0x8D
+ (0x8E stands for ASCII internally
+ but is omitted in strings)
+ Control 0x8F
+ Dimension-2 Official 0x90 - 0x99
+ (0x9A - 0x9D are free;
+ 0x9E and 0x9F are reserved)
+ Dimension-1 Private 0xA0 - 0xEF
+ Dimension-2 Private 0xF0 - 0xFF
+
+ There are two internal encodings for characters in XEmacs/Mule.
+ One is called "string encoding" and is an 8-bit encoding that
+ is used for representing characters in a buffer or string.
+ It uses 1 to 4 bytes per character. The other is called
+ "character encoding" and is a 19-bit encoding that is used
+ for representing characters individually in a variable.
+
+ (In the following descriptions, we'll ignore composite
+ characters for the moment. We also give a general (structural)
+ overview first, followed later by the exact details.)
+
+ A. Internal String Encoding
+
+ ASCII characters are encoded using their position code directly.
+ Other characters are encoded using their leading byte followed
+ by their position code(s) with the high bit set. Characters
+ in private character sets have their leading byte prefixed with
+ a "leading byte prefix", which is either 0x9E or 0x9F. (No
+ character sets are ever assigned these leading bytes.) Specifically:
+
+ Character set Encoding (PC == position-code)
+ ------------- -------- (LB == leading-byte)
+ ASCII PC1 |
+ Control-1 LB | PC1 + 0xA0
+ Dimension-1 official LB | PC1 + 0x80
+ Dimension-1 private 0x9E | LB | PC1 + 0x80
+ Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
+ Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
+
+ The basic characteristic of this encoding is that the first byte
+ of every character is in the range 0x00 - 0x9F, and the second and
+ following bytes of every character are in the range 0xA0 - 0xFF.
+ This means that it is impossible to get out of sync, or more
+ specifically:
+
+ 1. Given any byte position, the beginning of the character it is
+ within can be determined in constant time.
+ 2. Given any byte position at the beginning of a character, the
+ beginning of the next character can be determined in constant
+ time.
+ 3. Given any byte position at the beginning of a character, the
+ beginning of the previous character can be determined in constant
+ time.
+ 4. Textual searches can simply treat encoded strings as if they
+ were encoded in a one-byte-per-character fashion rather than
+ the actual multi-byte encoding.
+
+ None of the standard non-modal encodings meet all of these
+ conditions. For example, EUC satisfies only (2) and (3), while
+ Shift-JIS and Big5 (not yet described) satisfy only (2). (All
+ non-modal encodings must satisfy (2), in order to be unambiguous.)
+
+ B. Internal Character Encoding
+
+ One 19-bit word represents a single character. The word is
+ separated into three fields:
+
+ Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ <------------> <------------------> <------------------>
+ Field: 1 2 3
+
+ Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
+
+ Character set Field 1 Field 2 Field 3
+ ------------- ------- ------- -------
+ ASCII 0 0 PC1
+ range: (00 - 7F)
+ Control-1 0 1 PC1
+ range: (00 - 1F)
+ Dimension-1 official 0 LB - 0x80 PC1
+ range: (01 - 0D) (20 - 7F)
+ Dimension-1 private 0 LB - 0x80 PC1
+ range: (20 - 6F) (20 - 7F)
+ Dimension-2 official LB - 0x8F PC1 PC2
+ range: (01 - 0A) (20 - 7F) (20 - 7F)
+ Dimension-2 private LB - 0xE1 PC1 PC2
+ range: (0F - 1E) (20 - 7F) (20 - 7F)
+ Composite 0x1F ? ?
+
+ Note that character codes 0 - 255 are the same as the "binary encoding"
+ described above.
+*/
+
+/*
+ About Unicode support:
+
+ Adding Unicode support is very desirable. Unicode will likely be a
+ very common representation in the future, and thus we should
+ represent Unicode characters using three bytes instead of four.
+ This means we need to find leading bytes for Unicode. Given that
+ there are 65,536 characters in Unicode and we can attach 96x96 =
+ 9,216 characters per leading byte, we need eight leading bytes for
+ Unicode. We currently have four free (0x9A - 0x9D), and with a
+ little bit of rearranging we can get five: ASCII doesn't really
+ need to take up a leading byte. (We could just as well use 0x7F,
+ with a little change to the functions that assume that 0x80 is the
+ lowest leading byte.) This means we still need to dump three
+ leading bytes and move them into private space. The CNS charsets
+ are good candidates since they are rarely used, and
+ JAPANESE_JISX0208_1978 is becoming less and less used and could
+ also be dumped. */
+
+\f
+/************************************************************************/
+/* Definition of leading bytes */
+/************************************************************************/
+
+/* Lowest value a leading byte may take. */
+#define MIN_LEADING_BYTE 0x80
+
+/* Leading bytes that need special treatment in a string and/or in a
+   character. */
+#define LEADING_BYTE_COMPOSITE 0x80 /* for a composite character */
+#define LEADING_BYTE_ASCII 0x8E /* omitted in a buffer */
+#define LEADING_BYTE_CONTROL_1 0x8F /* represents the normal 80-9F */
+
+/* NOTE: a gap inside either official range can cause a core dump,
+   because the first and last values are used to determine whether a
+   charset is defined or not in non_ascii_valid_char_p. */
+
+/** Leading bytes for 1-byte characters in an official charset.
+    Every value is spelled out so the assignment is visible at a
+    glance. **/
+enum LEADING_BYTE_OFFICIAL_1
+{
+  LEADING_BYTE_LATIN_ISO8859_1    = 0x81, /* Right half of ISO 8859-1 */
+  LEADING_BYTE_LATIN_ISO8859_2    = 0x82, /* Right half of ISO 8859-2 */
+  LEADING_BYTE_LATIN_ISO8859_3    = 0x83, /* Right half of ISO 8859-3 */
+  LEADING_BYTE_LATIN_ISO8859_4    = 0x84, /* Right half of ISO 8859-4 */
+  LEADING_BYTE_THAI_TIS620        = 0x85, /* TIS620-2533 */
+  LEADING_BYTE_GREEK_ISO8859_7    = 0x86, /* Right half of ISO 8859-7 */
+  LEADING_BYTE_ARABIC_ISO8859_6   = 0x87, /* Right half of ISO 8859-6 */
+  LEADING_BYTE_HEBREW_ISO8859_8   = 0x88, /* Right half of ISO 8859-8 */
+  LEADING_BYTE_KATAKANA_JISX0201  = 0x89, /* Right half of JIS X0201-1976 */
+  LEADING_BYTE_LATIN_JISX0201     = 0x8A, /* Left half of JIS X0201-1976 */
+  LEADING_BYTE_CYRILLIC_ISO8859_5 = 0x8B, /* Right half of ISO 8859-5 */
+  LEADING_BYTE_LATIN_ISO8859_9    = 0x8C  /* Right half of ISO 8859-9 */
+  /* 0x8D is unused. */
+};
+
+/* First and last enumerators of LEADING_BYTE_OFFICIAL_1. */
+#define MIN_LEADING_BYTE_OFFICIAL_1 LEADING_BYTE_LATIN_ISO8859_1
+#define MAX_LEADING_BYTE_OFFICIAL_1 LEADING_BYTE_LATIN_ISO8859_9
+
+/** Leading bytes for 2-byte characters in an official charset.
+    Every value is spelled out so the assignment is visible at a
+    glance. **/
+enum LEADING_BYTE_OFFICIAL_2
+{
+  LEADING_BYTE_JAPANESE_JISX0208_1978 = 0x90, /* Japanese JIS X0208-1978 */
+  LEADING_BYTE_CHINESE_GB2312         = 0x91, /* Chinese Hanzi GB2312-1980 */
+  LEADING_BYTE_JAPANESE_JISX0208      = 0x92, /* Japanese JIS X0208-1983 */
+  LEADING_BYTE_KOREAN_KSC5601         = 0x93, /* Hangul KS C5601-1987 */
+  LEADING_BYTE_JAPANESE_JISX0212      = 0x94, /* Japanese JIS X0212-1990 */
+  LEADING_BYTE_CHINESE_CNS11643_1     = 0x95, /* Chinese CNS11643 Set 1 */
+  LEADING_BYTE_CHINESE_CNS11643_2     = 0x96, /* Chinese CNS11643 Set 2 */
+  LEADING_BYTE_CHINESE_BIG5_1         = 0x97, /* Big5 Level 1 */
+  LEADING_BYTE_CHINESE_BIG5_2         = 0x98  /* Big5 Level 2 */
+  /* 0x99 through 0x9C are unused. */
+};
+
+/* First and last enumerators of LEADING_BYTE_OFFICIAL_2. */
+#define MIN_LEADING_BYTE_OFFICIAL_2 LEADING_BYTE_JAPANESE_JISX0208_1978
+#define MAX_LEADING_BYTE_OFFICIAL_2 LEADING_BYTE_CHINESE_BIG5_2
+
+/** Constants for 1- and 2-byte characters in a private charset. **/
+
+/* Prefix bytes, one per private charset dimension. */
+#define PRE_LEADING_BYTE_PRIVATE_1 0x9E /* 1-byte char-set */
+#define PRE_LEADING_BYTE_PRIVATE_2 0x9F /* 2-byte char-set */
+
+/* Inclusive leading-byte range of the 1-byte private charsets. */
+#define MIN_LEADING_BYTE_PRIVATE_1 0xA0
+#define MAX_LEADING_BYTE_PRIVATE_1 0xEF
+
+/* Inclusive leading-byte range of the 2-byte private charsets. */
+#define MIN_LEADING_BYTE_PRIVATE_2 0xF0
+#define MAX_LEADING_BYTE_PRIVATE_2 0xFF
+
+/* Count of leading-byte values; 0x80 through 0xFF gives 128. */
+#define NUM_LEADING_BYTES 128
+
+\f
+/************************************************************************/
+/* Operations on leading bytes */
+/************************************************************************/
+
+/* Nonzero if LB is a leading byte for a private charset, i.e. it is
+   at or above MIN_LEADING_BYTE_PRIVATE_1. */
+
+#define LEADING_BYTE_PRIVATE_P(lb) (MIN_LEADING_BYTE_PRIVATE_1 <= (lb))
+
+/* Return 1 if LB is one of the two prefix bytes that precede a private
+   leading byte (PRE_LEADING_BYTE_PRIVATE_1 or
+   PRE_LEADING_BYTE_PRIVATE_2), and 0 otherwise. */
+
+INLINE_HEADER int LEADING_BYTE_PREFIX_P (Bufbyte lb);
+INLINE_HEADER int
+LEADING_BYTE_PREFIX_P (Bufbyte lb)
+{
+  switch (lb)
+    {
+    case PRE_LEADING_BYTE_PRIVATE_1:
+    case PRE_LEADING_BYTE_PRIVATE_2:
+      return 1;
+    default:
+      return 0;
+    }
+}
+
+/* Map a private leading byte LB to the prefix byte stored in front of
+   it in a string: PRE_LEADING_BYTE_PRIVATE_2 when LB is at or above
+   MIN_LEADING_BYTE_PRIVATE_2, PRE_LEADING_BYTE_PRIVATE_1 otherwise. */
+
+#define PRIVATE_LEADING_BYTE_PREFIX(lb)				\
+  ((unsigned int) (lb) >= MIN_LEADING_BYTE_PRIVATE_2		\
+   ? PRE_LEADING_BYTE_PRIVATE_2					\
+   : PRE_LEADING_BYTE_PRIVATE_1)
+
+\f
+/************************************************************************/
+/* Operations on individual bytes */
+/* of any format */
+/************************************************************************/
+
+/* Byte classification predicates.  They must give the same answer
+   whether BYTE is a signed or an unsigned char.
+
+   BYTE_ASCII_P and BYTE_C0_P reject anything with a bit set above the
+   range they accept, so a sign-extended negative char is correctly
+   rejected.  BYTE_C1_P has to accept bytes with the high bit set;
+   comparing the sign-extended value against 0x80 would always fail for
+   a signed char, so it masks with 0xE0 and looks only at bits 5 - 7 of
+   the byte.
+
+   Note that SPC and DEL are considered ASCII, not control. */
+
+#define BYTE_ASCII_P(byte) (((byte) & ~0x7f) == 0)
+#define BYTE_C0_P(byte) (((byte) & ~0x1f) == 0)
+#define BYTE_C1_P(byte) (((byte) & 0xE0) == 0x80)
+
+\f
+/************************************************************************/
+/* Operations on individual bytes */
+/* in a Mule-formatted string */
+/************************************************************************/
+
+/* Nonzero if BYTE is the first byte of a character, i.e. it lies
+   below 0xA0. */
+
+#define BUFBYTE_FIRST_BYTE_P(byte) (0xA0 > (byte))
+
+/* Nonzero if BYTE is the first byte of a multi-byte character; this is
+   exactly the BYTE_C1_P test. */
+
+#define BUFBYTE_LEADING_BYTE_P(byte) (BYTE_C1_P (byte))
+
+\f
+/************************************************************************/
+/* Information about a particular character set */
+/************************************************************************/
+
+/* Per-charset record.  The fields are read through the CHARSET_* and
+   XCHARSET_* accessor macros defined below. */
+struct Lisp_Charset
+{
+ struct lcrecord_header header;
+
+ /* Charset id; CHARSET_LEADING_BYTE yields this value cast to
+    Bufbyte. */
+ int id;
+ Lisp_Object name;
+ Lisp_Object doc_string;
+ Lisp_Object registry;
+ Lisp_Object short_name;
+ Lisp_Object long_name;
+
+ Lisp_Object reverse_direction_charset;
+
+ Lisp_Object ccl_program;
+
+ /* Final byte of this character set in ISO2022 designating escape sequence */
+ Bufbyte final;
+
+ /* Number of bytes (1 - 4) required in the internal representation
+ for characters in this character set. This is *not* the
+ same as the dimension of the character set. */
+ unsigned int rep_bytes;
+
+ /* Number of columns a character in this charset takes up, on TTY
+ devices. Not used for X devices. */
+ unsigned int columns;
+
+ /* Direction of this character set (CHARSET_LEFT_TO_RIGHT or
+    CHARSET_RIGHT_TO_LEFT) */
+ unsigned int direction;
+
+ /* Type of this character set (94, 96, 94x94, 96x96); one of the
+    CHARSET_TYPE_* constants */
+ unsigned int type;
+
+ /* Number of bytes used in encoding of this character set (1 or 2) */
+ unsigned int dimension;
+
+ /* Number of chars in each dimension (usually 94 or 96) */
+ unsigned int chars;
+
+ /* Which half of font to be used to display this character set */
+ unsigned int graphic;
+};
+typedef struct Lisp_Charset Lisp_Charset;
+
+/* Lisp-record glue for charset objects.  These wrap the generic
+   record macros (XRECORD, XSETRECORD, RECORDP, CHECK_RECORD,
+   CONCHECK_RECORD), which are defined outside this file. */
+DECLARE_LRECORD (charset, Lisp_Charset);
+#define XCHARSET(x) XRECORD (x, charset, Lisp_Charset)
+#define XSETCHARSET(x, p) XSETRECORD (x, p, charset)
+#define CHARSETP(x) RECORDP (x, charset)
+#define CHECK_CHARSET(x) CHECK_RECORD (x, charset)
+#define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset)
+
+/* Values for the `type' field of struct Lisp_Charset. */
+#define CHARSET_TYPE_94 0 /* This charset includes 94 characters. */
+#define CHARSET_TYPE_96 1 /* This charset includes 96 characters. */
+#define CHARSET_TYPE_94X94 2 /* This charset includes 94x94 characters. */
+#define CHARSET_TYPE_96X96 3 /* This charset includes 96x96 characters. */
+
+/* Values for the `direction' field of struct Lisp_Charset. */
+#define CHARSET_LEFT_TO_RIGHT 0
+#define CHARSET_RIGHT_TO_LEFT 1
+
+/* Field accessors taking a Lisp_Charset pointer CS, listed in the
+   order the fields are declared in struct Lisp_Charset. */
+
+/* Leading byte and id have been regrouped. -- OG */
+#define CHARSET_ID(cs) ((cs)->id)
+#define CHARSET_LEADING_BYTE(cs) ((Bufbyte) CHARSET_ID(cs))
+
+#define CHARSET_NAME(cs) ((cs)->name)
+#define CHARSET_DOC_STRING(cs) ((cs)->doc_string)
+#define CHARSET_REGISTRY(cs) ((cs)->registry)
+#define CHARSET_SHORT_NAME(cs) ((cs)->short_name)
+#define CHARSET_LONG_NAME(cs) ((cs)->long_name)
+#define CHARSET_REVERSE_DIRECTION_CHARSET(cs) ((cs)->reverse_direction_charset)
+#define CHARSET_CCL_PROGRAM(cs) ((cs)->ccl_program)
+#define CHARSET_FINAL(cs) ((cs)->final)
+#define CHARSET_REP_BYTES(cs) ((cs)->rep_bytes)
+#define CHARSET_COLUMNS(cs) ((cs)->columns)
+#define CHARSET_DIRECTION(cs) ((cs)->direction)
+#define CHARSET_TYPE(cs) ((cs)->type)
+#define CHARSET_DIMENSION(cs) ((cs)->dimension)
+#define CHARSET_CHARS(cs) ((cs)->chars)
+#define CHARSET_GRAPHIC(cs) ((cs)->graphic)
+
+/* Nonzero if CS is a private charset, judged by its leading byte. */
+#define CHARSET_PRIVATE_P(cs) LEADING_BYTE_PRIVATE_P (CHARSET_LEADING_BYTE (cs))
+
+/* The same accessors for a charset held in a Lisp_Object: each one
+   applies XCHARSET and then the matching CHARSET_* macro. */
+#define XCHARSET_ID(cs) CHARSET_ID (XCHARSET (cs))
+#define XCHARSET_LEADING_BYTE(cs) CHARSET_LEADING_BYTE (XCHARSET (cs))
+#define XCHARSET_NAME(cs) CHARSET_NAME (XCHARSET (cs))
+#define XCHARSET_DOC_STRING(cs) CHARSET_DOC_STRING (XCHARSET (cs))
+#define XCHARSET_REGISTRY(cs) CHARSET_REGISTRY (XCHARSET (cs))
+#define XCHARSET_SHORT_NAME(cs) CHARSET_SHORT_NAME (XCHARSET (cs))
+#define XCHARSET_LONG_NAME(cs) CHARSET_LONG_NAME (XCHARSET (cs))
+#define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \
+  CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs))
+#define XCHARSET_CCL_PROGRAM(cs) CHARSET_CCL_PROGRAM (XCHARSET (cs))
+#define XCHARSET_FINAL(cs) CHARSET_FINAL (XCHARSET (cs))
+#define XCHARSET_REP_BYTES(cs) CHARSET_REP_BYTES (XCHARSET (cs))
+#define XCHARSET_COLUMNS(cs) CHARSET_COLUMNS (XCHARSET (cs))
+#define XCHARSET_DIRECTION(cs) CHARSET_DIRECTION (XCHARSET (cs))
+#define XCHARSET_TYPE(cs) CHARSET_TYPE (XCHARSET (cs))
+#define XCHARSET_DIMENSION(cs) CHARSET_DIMENSION (XCHARSET (cs))
+#define XCHARSET_CHARS(cs) CHARSET_CHARS (XCHARSET (cs))
+#define XCHARSET_GRAPHIC(cs) CHARSET_GRAPHIC (XCHARSET (cs))
+#define XCHARSET_PRIVATE_P(cs) CHARSET_PRIVATE_P (XCHARSET (cs))
+
+/* Lookup tables used by CHARSET_BY_LEADING_BYTE and
+   CHARSET_BY_ATTRIBUTES. */
+struct charset_lookup
+{
+  /* Charsets indexed by leading byte, with 128 subtracted from the
+     leading byte to form the index. */
+  Lisp_Object charset_by_leading_byte[NUM_LEADING_BYTES];
+
+  /* Charsets indexed by [type][final byte][direction]. */
+  Lisp_Object charset_by_attributes[4][128][2];
+
+  Bufbyte next_allocated_1_byte_leading_byte;
+  Bufbyte next_allocated_2_byte_leading_byte;
+};
+
+/* Return the charset stored for leading byte LB in
+   chlook->charset_by_leading_byte.  LB must lie in 0x80 - 0xFF; this
+   is only checked when ERROR_CHECK_TYPECHECK is defined. */
+INLINE_HEADER Lisp_Object CHARSET_BY_LEADING_BYTE (Bufbyte lb);
+INLINE_HEADER Lisp_Object
+CHARSET_BY_LEADING_BYTE (Bufbyte lb)
+{
+  extern struct charset_lookup *chlook;
+
+#ifdef ERROR_CHECK_TYPECHECK
+  /* Do not fold `tem' away: with error-checking on, x86 GCC 2.95.2 -O3
+     miscompiles the assertion unless the byte is first copied into
+     an int. */
+  int tem = lb;
+  type_checking_assert (tem >= 0x80 && tem <= 0xFF);
+#endif
+  /* The table starts at the lowest leading byte, so rebase the index. */
+  return chlook->charset_by_leading_byte[lb - MIN_LEADING_BYTE];
+}
+
+/* Return the charset stored in chlook->charset_by_attributes for the
+   given TYPE, FINAL byte and direction DIR.  When type checking is
+   enabled, all three indices are asserted to be inside the table. */
+INLINE_HEADER Lisp_Object
+CHARSET_BY_ATTRIBUTES (unsigned int type, unsigned char final, int dir);
+INLINE_HEADER Lisp_Object
+CHARSET_BY_ATTRIBUTES (unsigned int type, unsigned char final, int dir)
+{
+  extern struct charset_lookup *chlook;
+  Lisp_Object found;
+
+  type_checking_assert (type < countof (chlook->charset_by_attributes) &&
+			final < countof (chlook->charset_by_attributes[0]) &&
+			dir < countof (chlook->charset_by_attributes[0][0]));
+
+  found = chlook->charset_by_attributes[type][final][dir];
+  return found;
+}
+
+/* Length in bytes of a character's string representation, looked up
+   by the first byte of that representation.
+
+   The same value could be derived another way -- e.g. something like
+   XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (first_byte))
+   -- but the table is faster. */
+extern const Bytecount rep_bytes_by_first_byte[0xA0];
+
+/* Return the number of bytes in the string representation of the
+   character whose first byte is FB.  FB must be below 0xA0 (asserted
+   when type checking is enabled). */
+INLINE_HEADER int REP_BYTES_BY_FIRST_BYTE (Bufbyte fb);
+INLINE_HEADER int
+REP_BYTES_BY_FIRST_BYTE (Bufbyte fb)
+{
+  int nbytes;
+
+  type_checking_assert (fb < 0xA0);
+  nbytes = rep_bytes_by_first_byte[fb];
+  return nbytes;
+}
+
+\f
+/************************************************************************/
+/* Dealing with characters */
+/************************************************************************/
+
+/* Nonzero if character C takes more than one byte in a string, i.e.
+   its code is 0x80 or above. */
+
+#define CHAR_MULTIBYTE_P(c) (0x80 <= (c))
+
+/* Nonzero if character C is ASCII, i.e. not multibyte. */
+
+#define CHAR_ASCII_P(c) (!CHAR_MULTIBYTE_P (c))
+
+/* The bit fields of character are divided into 3 parts:
+ FIELD1(5bits):FIELD2(7bits):FIELD3(7bits)
+
+ FIELD1 occupies bits 14 - 18, FIELD2 bits 7 - 13 and FIELD3
+ bits 0 - 6. */
+
+#define CHAR_FIELD1_MASK (0x1F << 14)
+#define CHAR_FIELD2_MASK (0x7F << 7)
+#define CHAR_FIELD3_MASK 0x7F
+
+/* Macros to access each field of a character code of C.  Each one
+ masks out its field and shifts it down to bit 0. */
+
+#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
+#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
+#define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK)
+
+/* Field 1, if non-zero, usually holds a leading byte for a
+   dimension-2 charset.  Field 2, if non-zero, usually holds a leading
+   byte for a dimension-1 charset. */
+
+/* Offsets added to a field value to obtain the leading byte (and
+   subtracted from a leading byte to obtain the field value). */
+
+#define FIELD1_TO_OFFICIAL_LEADING_BYTE 0x8F
+#define FIELD1_TO_PRIVATE_LEADING_BYTE 0xE1
+
+#define FIELD2_TO_OFFICIAL_LEADING_BYTE 0x80
+#define FIELD2_TO_PRIVATE_LEADING_BYTE 0x80
+
+/* Minimum and maximum allowed values for field 1 (dimension-2
+   charsets). */
+
+#define MIN_CHAR_FIELD1_OFFICIAL \
+  (MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
+#define MAX_CHAR_FIELD1_OFFICIAL \
+  (MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
+#define MIN_CHAR_FIELD1_PRIVATE \
+  (MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
+#define MAX_CHAR_FIELD1_PRIVATE \
+  (MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
+
+/* Minimum and maximum allowed values for field 2 (dimension-1
+   charsets). */
+
+#define MIN_CHAR_FIELD2_OFFICIAL \
+  (MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
+#define MAX_CHAR_FIELD2_OFFICIAL \
+  (MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
+#define MIN_CHAR_FIELD2_PRIVATE \
+  (MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
+#define MAX_CHAR_FIELD2_PRIVATE \
+  (MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
+
+/* Minimum character code of each <type> character: the minimum field
+   value shifted into the position of its field. */
+
+#define MIN_CHAR_OFFICIAL_TYPE9N (MIN_CHAR_FIELD2_OFFICIAL << 7)
+#define MIN_CHAR_PRIVATE_TYPE9N (MIN_CHAR_FIELD2_PRIVATE << 7)
+#define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_CHAR_FIELD1_OFFICIAL << 14)
+#define MIN_CHAR_PRIVATE_TYPE9NX9N (MIN_CHAR_FIELD1_PRIVATE << 14)
+#define MIN_CHAR_COMPOSITION (0x1F << 14)
+
+/* Return the leading byte of character C.
+
+   The tests run from the lowest character codes upward, so each one
+   only has to check the upper bound of its range:
+     - ASCII characters yield LEADING_BYTE_ASCII;
+     - codes below 0xA0 yield LEADING_BYTE_CONTROL_1;
+     - codes below MIN_CHAR_OFFICIAL_TYPE9NX9N take the leading byte
+       from field 2;
+     - higher codes below MIN_CHAR_COMPOSITION take it from field 1,
+       with the official or the private offset;
+     - anything else is a composite character, which is only valid
+       when ENABLE_COMPOSITE_CHARS is defined.
+
+   NOTE: the field 2 case serves official and private charsets alike
+   because FIELD2_TO_OFFICIAL_LEADING_BYTE and
+   FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+ */
+
+INLINE_HEADER Bufbyte CHAR_LEADING_BYTE (Emchar c);
+INLINE_HEADER Bufbyte
+CHAR_LEADING_BYTE (Emchar c)
+{
+  if (CHAR_ASCII_P (c))
+    return LEADING_BYTE_ASCII;
+
+  if (c < 0xA0)
+    return LEADING_BYTE_CONTROL_1;
+
+  if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N)
+    return CHAR_FIELD2 (c) + FIELD2_TO_OFFICIAL_LEADING_BYTE;
+
+  if (c < MIN_CHAR_PRIVATE_TYPE9NX9N)
+    return CHAR_FIELD1 (c) + FIELD1_TO_OFFICIAL_LEADING_BYTE;
+
+  if (c < MIN_CHAR_COMPOSITION)
+    return CHAR_FIELD1 (c) + FIELD1_TO_PRIVATE_LEADING_BYTE;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+  return LEADING_BYTE_COMPOSITE;
+#else
+  ABORT();
+  return 0;
+#endif /* ENABLE_COMPOSITE_CHARS */
+}
+
+/* The charset of character C: look up C's leading byte in the
+   leading-byte table. */
+#define CHAR_CHARSET(c) CHARSET_BY_LEADING_BYTE (CHAR_LEADING_BYTE (c))
+
+/* Build the character that belongs to CHARSET and has position codes
+   C1 and C2.  C2 is ignored for dimension-1 (TYPE9N) charsets, as well
+   as for ASCII and Control-1.
+
+   ASCII, Control-1 and (when enabled) composite characters are handled
+   first; all other charsets are encoded from their leading byte, in
+   field 2 for dimension 1 and in field 1 otherwise.
+
+   NOTE: the dimension-1 case serves official and private charsets
+   alike because FIELD2_TO_OFFICIAL_LEADING_BYTE and
+   FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+ */
+
+INLINE_HEADER Emchar MAKE_CHAR (Lisp_Object charset, int c1, int c2);
+INLINE_HEADER Emchar
+MAKE_CHAR (Lisp_Object charset, int c1, int c2)
+{
+  int lb;
+
+  if (EQ (charset, Vcharset_ascii))
+    return c1;
+
+  if (EQ (charset, Vcharset_control_1))
+    return c1 | 0x80;
+
+#ifdef ENABLE_COMPOSITE_CHARS
+  if (EQ (charset, Vcharset_composite))
+    return (0x1F << 14) | (c1 << 7) | c2;
+#endif
+
+  if (XCHARSET_DIMENSION (charset) == 1)
+    {
+      lb = XCHARSET_LEADING_BYTE (charset);
+      return ((lb - FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | c1;
+    }
+
+  /* Dimension 2: the field 1 offset depends on official vs. private. */
+  if (XCHARSET_PRIVATE_P (charset))
+    lb = XCHARSET_LEADING_BYTE (charset) - FIELD1_TO_PRIVATE_LEADING_BYTE;
+  else
+    lb = XCHARSET_LEADING_BYTE (charset) - FIELD1_TO_OFFICIAL_LEADING_BYTE;
+
+  return (lb << 14) | (c1 << 7) | c2;
+}
+
+/* The charset of character C is set to CHARSET, and the
+   position-codes of C are set to C1 and C2.  C2 of TYPE9N character
+   is 0. */
+
+/* BREAKUP_CHAR_1_UNSAFE assumes that the charset has already been
+   calculated, and just computes c1 and c2:
+     - for a dimension-1 CHARSET, C1 gets field 3 of C and C2 gets 0;
+     - otherwise C1 gets field 2 of C and C2 gets field 3.
+   C1 and C2 must be lvalues.  C may be evaluated more than once, so it
+   must not have side effects.
+
+   The whole expansion is parenthesized so that the conditional
+   expression stays intact when the macro is used inside a larger
+   expression.
+
+   BREAKUP_CHAR also computes and stores the charset. */
+
+#define BREAKUP_CHAR_1_UNSAFE(c, charset, c1, c2)		\
+  (XCHARSET_DIMENSION (charset) == 1				\
+   ? ((c1) = CHAR_FIELD3 (c), (c2) = 0)				\
+   : ((c1) = CHAR_FIELD2 (c),					\
+      (c2) = CHAR_FIELD3 (c)))
+
+/* Split character C into its charset and position codes: store
+   CHAR_CHARSET (C) through CHARSET, then let BREAKUP_CHAR_1_UNSAFE
+   fill in *C1 and *C2. */
+INLINE_HEADER void breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2);
+INLINE_HEADER void
+breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2)
+{
+  Lisp_Object cs = CHAR_CHARSET (c);
+
+  *charset = cs;
+  BREAKUP_CHAR_1_UNSAFE (c, cs, *c1, *c2);
+}
+
+/* Lvalue-taking wrapper around breakup_char_1: CHARSET, C1 and C2 are
+   variables that receive the results. */
+#define BREAKUP_CHAR(c, charset, c1, c2) \
+  breakup_char_1 (c, &(charset), &(c1), &(c2))
+
+
+\f
+#ifdef ENABLE_COMPOSITE_CHARS
+/************************************************************************/
+/* Composite characters */
+/************************************************************************/
+
+/* Declared only when composite characters are compiled in; the
+   definitions are not in this header.
+   NOTE(review): judging by the names, these map between a composite
+   character and the string it is built from -- confirm against the
+   definitions. */
+Emchar lookup_composite_char (Bufbyte *str, int len);
+Lisp_Object composite_char_string (Emchar ch);
+#endif /* ENABLE_COMPOSITE_CHARS */
+
+\f
+/************************************************************************/
+/* Exported functions */
+/************************************************************************/
+
+/* Lisp primitives, each declared with one argument. */
+EXFUN (Ffind_charset, 1);
+EXFUN (Fget_charset, 1);
+
+/* Charset objects defined outside this header. */
+extern Lisp_Object Vcharset_chinese_big5_1;
+extern Lisp_Object Vcharset_chinese_big5_2;
+extern Lisp_Object Vcharset_japanese_jisx0208;
+
+/* Character-level Lstream operations.
+   NOTE(review): presumably get / put / unget of a single Emchar on
+   STREAM -- confirm against the definitions. */
+Emchar Lstream_get_emchar_1 (Lstream *stream, int first_char);
+int Lstream_fput_emchar (Lstream *stream, Emchar ch);
+void Lstream_funget_emchar (Lstream *stream, Emchar ch);
+
+/* Conversions between internal (Bufbyte) and external (unsigned char)
+   text.  NOTE(review): the return values look like the number of
+   units written to the destination -- confirm against the
+   definitions.  Note the asymmetric length types (Bytecount vs. int). */
+int copy_internal_to_external (const Bufbyte *internal, Bytecount len,
+ unsigned char *external);
+Bytecount copy_external_to_internal (const unsigned char *external,
+ int len, Bufbyte *internal);
+
+#endif /* INCLUDED_mule_charset_h_ */