From: handa
Date: Tue, 6 Sep 2005 00:33:59 +0000 (+0000)
Subject: *** empty log message ***
X-Git-Tag: REL-1-3-0~183
X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=d9bc7319d98b357dee332681a29c26adea18c604;p=m17n%2Fm17n-lib.git

*** empty log message ***
---

diff --git a/src/ChangeLog b/src/ChangeLog
index 0296335..8c5964e 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,15 @@
+2005-09-06  TAKAHASHI Naoto
+
+	* mtext.c (tricky_chars, cased, soft_dotted, case_mapping)
+	(combining_class, Mlt, Mtr, Maz, gr03A3, lt0049)
+	(lt004A, lt012E, lt00CC, lt00CD, lt0128, tr0130, tr0049, tr0069):
+	New variables.
+	(init_case_conversion): New function.
+	(CASE_CONV_INIT, REPLACE, DELETE, LOOKUP): New macros.
+	(uppercase_precheck, lowercase_precheck, final_sigma)
+	(after_soft_dotted, more_above, before_dot, after_i)
+	(mtext_uppercase, mtext_titlecase, mtext_lowercase): New functions.
+
 2005-09-05  TAKAHASHI Naoto
 
 	* plist.c (read_mtext_element): Fix previous change.
diff --git a/src/mtext.c b/src/mtext.c
index 363bf2d..eab2431 100644
--- a/src/mtext.c
+++ b/src/mtext.c
@@ -611,6 +611,8 @@ free_mtext (void *object)
   free (object);
 }
 
+/** Case handler (case-folding comparison and case conversion) */
+
 /** Structure for an iterator used in case-fold comparison. */
 struct casecmp_iterator
 {
@@ -684,6 +686,343 @@ case_compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2)
   return (it2.pos == to2 ? (it1.pos < to1) : -1);
 }
 
+/* Character property tables consulted during case conversion, plus
+   TRICKY_CHARS, the set of code points examined by
+   lowercase_precheck ().  All are set up by init_case_conversion ().  */
+static MCharTable *tricky_chars, *cased, *soft_dotted, *case_mapping;
+static MCharTable *combining_class;
+
+/* Languages that require special handling in case-conversion.  */
+static MSymbol Mlt, Mtr, Maz;
+
+/* Replacement sequences for the special cases: gr03A3 for U+03A3,
+   ltXXXX for U+XXXX in text tagged "lt", and trXXXX for U+XXXX in
+   text tagged "tr" or "az".  */
+static MText *gr03A3;
+static MText *lt0049, *lt004A, *lt012E, *lt00CC, *lt00CD, *lt0128;
+static MText *tr0130, *tr0049, *tr0069;
+
+/* Set up the tables and replacement M-texts used by the case
+   conversion functions.  Return 0 on success, or -1 if one of the
+   required character property tables cannot be obtained.  */
+
+static int
+init_case_conversion (void)
+{
+  /* Fetch the property tables first so that a failure returns before
+     anything is allocated.  A NULL table means failure, hence the
+     negated tests.  */
+  if (! (cased = mchar_get_prop_table (msymbol ("cased"), NULL)))
+    return -1;
+  if (! (soft_dotted = mchar_get_prop_table (msymbol ("soft-dotted"), NULL)))
+    return -1;
+  if (! (case_mapping = mchar_get_prop_table (msymbol ("case-mapping"), NULL)))
+    return -1;
+  if (! (combining_class = mchar_get_prop_table (Mcombining_class, NULL)))
+    return -1;
+
+  Mlt = msymbol ("lt");
+  Mtr = msymbol ("tr");
+  Maz = msymbol ("az");
+
+  gr03A3 = mtext ();
+  mtext_cat_char (gr03A3, 0x03C2);
+
+  lt0049 = mtext ();
+  mtext_cat_char (lt0049, 0x0069);
+  mtext_cat_char (lt0049, 0x0307);
+
+  lt004A = mtext ();
+  mtext_cat_char (lt004A, 0x006A);
+  mtext_cat_char (lt004A, 0x0307);
+
+  lt012E = mtext ();
+  mtext_cat_char (lt012E, 0x012F);
+  mtext_cat_char (lt012E, 0x0307);
+
+  lt00CC = mtext ();
+  mtext_cat_char (lt00CC, 0x0069);
+  mtext_cat_char (lt00CC, 0x0307);
+  mtext_cat_char (lt00CC, 0x0300);
+
+  lt00CD = mtext ();
+  mtext_cat_char (lt00CD, 0x0069);
+  mtext_cat_char (lt00CD, 0x0307);
+  mtext_cat_char (lt00CD, 0x0301);
+
+  lt0128 = mtext ();
+  mtext_cat_char (lt0128, 0x0069);
+  mtext_cat_char (lt0128, 0x0307);
+  mtext_cat_char (lt0128, 0x0303);
+
+  tr0130 = mtext ();
+  mtext_cat_char (tr0130, 0x0069);
+
+  tr0049 = mtext ();
+  mtext_cat_char (tr0049, 0x0131);
+
+  tr0069 = mtext ();
+  mtext_cat_char (tr0069, 0x0130);
+
+  /* TRICKY_CHARS is created last: CASE_CONV_INIT treats a non-NULL
+     TRICKY_CHARS as "already initialized".  */
+  tricky_chars = mchartable (Mnil, 0);
+  mchartable_set (tricky_chars, 0x0049, (void *) 1);
+  mchartable_set (tricky_chars, 0x004A, (void *) 1);
+  mchartable_set (tricky_chars, 0x00CC, (void *) 1);
+  mchartable_set (tricky_chars, 0x00CD, (void *) 1);
+  mchartable_set (tricky_chars, 0x0128, (void *) 1);
+  mchartable_set (tricky_chars, 0x012E, (void *) 1);
+  mchartable_set (tricky_chars, 0x0130, (void *) 1);
+  mchartable_set (tricky_chars, 0x0307, (void *) 1);
+  mchartable_set (tricky_chars, 0x03A3, (void *) 1);
+  return 0;
+}
+
+/* Run init_case_conversion () on first use.  On failure, invoke
+   MERROR with MERROR_MTEXT and RET (presumably returning RET from
+   the calling function).  */
+
+#define CASE_CONV_INIT(ret)		\
+  do {					\
+    if (! tricky_chars			\
+        && init_case_conversion () < 0)	\
+      MERROR (MERROR_MTEXT, ret);	\
+  } while (0)
+
+
+/* Replace the character at I of MT with VAR, increment I and LEN,
+   and set MODIFIED to 1.  */
+
+#define REPLACE(var)					\
+  do {							\
+    int varlen = mtext_nchars (var);			\
+							\
+    mtext_replace (mt, i, i + 1, var, 0, varlen);	\
+    i += varlen;					\
+    len += varlen - 1;					\
+    modified = 1;					\
+  } while (0)
+
+/* Delete the character at I of MT, decrement LEN,
+   and set MODIFIED to 1.  */
+
+#define DELETE()		\
+  do {				\
+    mtext_del (mt, i, i + 1);	\
+    len--;			\
+    modified = 1;		\
+  } while (0)
+
+/* Replace the character C at I of MT with its lowercase mapping taken
+   from CASE_MAPPING when the mapping differs from C, and advance I
+   past the result.  LEN and MODIFIED are updated as in REPLACE.  */
+
+#define LOOKUP()							\
+  do {									\
+    MPlist *pl = mchartable_lookup (case_mapping, c);			\
+									\
+    if (pl)								\
+      {									\
+        /* Lowercase is the 1st element. */				\
+        MText *lower = MPLIST_VAL ((MPlist *) MPLIST_VAL (pl));	\
+        int llen = mtext_nchars (lower);				\
+									\
+        if (mtext_ref_char (lower, 0) != c || llen > 1)		\
+          {								\
+            mtext_replace (mt, i, i + 1, lower, 0, llen);		\
+            i += llen;							\
+            len += llen - 1;						\
+            modified = 1;						\
+          }								\
+        else								\
+          i++;								\
+      }									\
+    else								\
+      i++;								\
+  } while (0)
+
+
+/* Return 1 if MT contains U+0307 tagged with language "lt", i.e. if
+   mtext_uppercase () will need to consult the original text; return
+   0 otherwise.  */
+
+static int
+uppercase_precheck (MText *mt)
+{
+  int len = mtext_nchars (mt), i;
+
+  for (i = 0; i < len; i++)
+    if (mtext_ref_char (mt, i) == 0x0307 &&
+        (MSymbol) mtext_get_prop (mt, i, Mlanguage) == Mlt)
+      return 1;
+  return 0;
+}
+
+/* Return 1 if MT contains a character whose lowercasing depends on
+   context: U+03A3, or one of the "lt" / "tr" / "az" special cases.
+   Return 0 otherwise.  */
+
+static int
+lowercase_precheck (MText *mt)
+{
+  int len = mtext_len (mt), i;
+
+  for (i = 0; i < len; i++)
+    {
+      int c = mtext_ref_char (mt, i);
+
+      if ((int) mchartable_lookup (tricky_chars, c) == 1)
+        {
+          MSymbol lang;
+
+          if (c == 0x03A3)
+            return 1;
+
+          lang = mtext_get_prop (mt, i, Mlanguage);
+
+          if (lang == Mlt &&
+              (c == 0x0049 || c == 0x004A || c == 0x012E ||
+               c == 0x00CC || c == 0x00CD || c == 0x0128))
+            return 1;
+
+          if ((lang == Mtr || lang == Maz) &&
+              (c == 0x0130 || c == 0x0307 || c == 0x0049))
+            return 1;
+        }
+    }
+  return 0;
+}
+
+/* Bit flags tested by final_sigma () in values from the CASED table.  */
+#define CASED 1
+#define CASE_IGNORABLE 2
+
+/* Return 1 if the character at POS of MT is preceded by a cased
+   character and not followed by one, skipping case-ignorable
+   characters in both directions; return 0 otherwise.  */
+
+static int
+final_sigma (MText *mt, int pos)
+{
+  int i, len = mtext_len (mt);
+  int c;
+
+  for (i = pos - 1; i >= 0; i--)
+    {
+      c = (int) mchartable_lookup (cased, mtext_ref_char (mt, i));
+      if (c == -1)
+        c = 0;
+      if (c & CASED)
+        break;
+      if (! (c & CASE_IGNORABLE))
+        return 0;
+    }
+
+  if (i == -1)
+    return 0;
+
+  for (i = pos + 1; i < len; i++)
+    {
+      c = (int) mchartable_lookup (cased, mtext_ref_char (mt, i));
+      if (c == -1)
+        c = 0;
+      if (c & CASED)
+        return 0;
+      if (! (c & CASE_IGNORABLE))
+        return 1;
+    }
+
+  return 1;
+}
+
+/* Return 1 if a soft-dotted character precedes position I of MT with
+   no intervening character of combining class 0 or 230; return 0
+   otherwise.  */
+
+static int
+after_soft_dotted (MText *mt, int i)
+{
+  int c, class;
+
+  for (i--; i >= 0; i--)
+    {
+      c = mtext_ref_char (mt, i);
+      if ((MSymbol) mchartable_lookup (soft_dotted, c) == Mt)
+        return 1;
+      class = (int) mchartable_lookup (combining_class, c);
+      if (class == 0 || class == 230)
+        return 0;
+    }
+
+  return 0;
+}
+
+/* Return 1 if a character of combining class 230 follows position I
+   of MT before any character of combining class 0; return 0
+   otherwise.  */
+
+static int
+more_above (MText *mt, int i)
+{
+  int class, len = mtext_len (mt);
+
+  for (i++; i < len; i++)
+    {
+      class = (int) mchartable_lookup (combining_class,
+                                       mtext_ref_char (mt, i));
+      if (class == 230)
+        return 1;
+      if (class == 0)
+        return 0;
+    }
+
+  return 0;
+}
+
+/* Return 1 if U+0307 follows position I of MT with no intervening
+   character of combining class 0 or 230; return 0 otherwise.  */
+
+static int
+before_dot (MText *mt, int i)
+{
+  int c, class, len = mtext_len (mt);
+
+  for (i++; i < len; i++)
+    {
+      c = mtext_ref_char (mt, i);
+      if (c == 0x0307)
+        return 1;
+      class = (int) mchartable_lookup (combining_class, c);
+      if (class == 230 || class == 0)
+        return 0;
+    }
+
+  return 0;
+}
+
+/* Return 1 if 'I' (U+0049) precedes position I of MT with no
+   intervening character of combining class 0 or 230; return 0
+   otherwise.  */
+
+static int
+after_i (MText *mt, int i)
+{
+  int c, class;
+
+  for (i--; i >= 0; i--)
+    {
+      c = mtext_ref_char (mt, i);
+      if (c == (int) 'I')
+        return 1;
+      class = (int) mchartable_lookup (combining_class, c);
+      if (class == 230 || class == 0)
+        return 0;
+    }
+
+  return 0;
+}
+
 
 /* Internal API */
 
@@ -2952,6 +3291,276 @@ mtext_case_compare (MText *mt1, int from1, int to1,
   return case_compare (mt1, from1, to1, mt2, from2, to2);
 }
 
+/*=*/
+
+/***en
+    @brief Uppercase an M-text.
+
+    The mtext_uppercase () function destructively converts each
+    character in M-text $MT to uppercase.  Adjacent characters in $MT
+    may affect the case conversion.  If the Mlanguage text property is
+    attached to $MT, it may also affect the conversion.  The length of
+    $MT may change.  Characters that cannot be converted to uppercase
+    are left unchanged.  All the text properties are inherited.
+
+    @return
+    If at least one character is converted, 1 is returned.
+    Otherwise, 0 is returned.
+*/
+
+/***ja
+    @brief M-text を大文字にする.
+
+    関数 mtext_uppercase () は M-text $MT 中の各文字を破壊的に大文字に変
+    換する。変換に際して隣接する文字の影響を受けることがある。$MT にテ
+    キストプロパティ Mlanguage が付いている場合は、それも変換に影響を
+    与えうる。$MT の長さは変わることがある。大文字に変換できなかった文
+    字はそのまま残る。テキストプロパティはすべて継承される。
+
+    @return
+    1文字以上が変換された場合は1が返される。そうでない場合は0が返される。
+*/
+
+/***
+    @seealso mtext_lowercase (), mtext_titlecase ()
+*/
+
+int
+mtext_uppercase (MText *mt)
+{
+  int len = mtext_len (mt), i, j;
+  int c;
+  int modified = 0;
+  MText *orig = NULL;
+  MSymbol lang;
+
+  CASE_CONV_INIT (-1);
+
+  /* MT is edited in place, so the context tests below look at a copy
+     of the original text, made only when uppercase_precheck () asks.  */
+  if (uppercase_precheck (mt))
+    orig = mtext_dup (mt);
+
+  /* i moves over mt, j moves over orig. */
+  for (i = 0, j = 0; i < len; j++)
+    {
+      c = mtext_ref_char (mt, i);
+      lang = (MSymbol) mtext_get_prop (mt, i, Mlanguage);
+
+      if (c == 0x0307 && lang == Mlt && after_soft_dotted (orig, j))
+        DELETE ();
+
+      else if (c == 0x0069 && (lang == Mtr || lang == Maz))
+        REPLACE (tr0069);
+
+      else
+        {
+          MPlist *pl = (MPlist *) mchartable_lookup (case_mapping, c);
+
+          if (pl)
+            {
+              MText *upper;
+              int ulen;
+
+              /* Uppercase is the 3rd element. */
+              upper = (MText *) mplist_value (mplist_next (mplist_next (mplist_value (pl))));
+              ulen = mtext_len (upper);
+
+              if (mtext_ref_char (upper, 0) != c || ulen > 1)
+                {
+                  mtext_replace (mt, i, i + 1, upper, 0, ulen);
+                  modified = 1;
+                  i += ulen;
+                  len += ulen - 1;
+                }
+
+              else
+                i++;
+            }
+
+          else /* pl == NULL */
+            i++;
+        }
+    }
+
+  if (orig)
+    m17n_object_unref (orig);
+  return modified;
+}
+
+/*=*/
+
+/***en
+    @brief Titlecase an M-text.
+
+    The mtext_titlecase () function destructively converts the first
+    character in M-text $MT to titlecase.  The length of $MT may
+    change.  If the character cannot be converted to titlecase, it is
+    left unchanged.  All the text properties are inherited.
+
+    @return
+    If the character is converted, 1 is returned.  Otherwise, 0 is
+    returned.
+*/
+
+/***ja
+    @brief M-text をタイトルケースにする.
+
+    関数 mtext_titlecase () は M-text $MT の先頭の文字を破壊的にタイトル
+    ケースに変換する。$MT の長さは変わることがある。タイトルケースに
+    変換できなかった場合はそのままで変わらない。テキストプロパティはす
+    べて継承される。
+
+    @return
+    文字が変換された場合は1が返される。そうでない場合は0が返される。
+*/
+
+/***
+    @seealso mtext_lowercase (), mtext_uppercase ()
+*/
+
+int
+mtext_titlecase (MText *mt)
+{
+  int c;
+  MSymbol lang;
+  MPlist *pl;
+  int modified = 0;
+
+  CASE_CONV_INIT (-1);
+
+  /* Nothing to convert in an empty M-text; there is no character 0.  */
+  if (mtext_len (mt) == 0)
+    return 0;
+
+  c = mtext_ref_char (mt, 0);
+  lang = (MSymbol) mtext_get_prop (mt, 0, Mlanguage);
+
+  if ((lang == Mtr || lang == Maz) && c == 0x0069)
+    {
+      mtext_replace (mt, 0, 1, tr0069, 0, 1);
+      modified = 1;
+    }
+
+  else if ((pl = mchartable_lookup (case_mapping, c)))
+    {
+      /* Titlecase is the 2nd element. */
+      MText *title = (MText *) mplist_value (mplist_next (mplist_value (pl)));
+      int tlen = mtext_len (title);
+
+      if (mtext_ref_char (title, 0) != c || tlen > 1)
+        {
+          mtext_replace (mt, 0, 1, title, 0, tlen);
+          modified = 1;
+        }
+    }
+
+  return modified;
+}
+
+/*=*/
+
+/***en
+    @brief Lowercase an M-text.
+
+    The mtext_lowercase () function destructively converts each
+    character in M-text $MT to lowercase.  Adjacent characters in $MT
+    may affect the case conversion.  If the Mlanguage text property is
+    attached to $MT, it may also affect the conversion.  The length of
+    $MT may change.  Characters that cannot be converted to lowercase
+    are left unchanged.  All the text properties are inherited.
+
+    @return
+    If at least one character is converted, 1 is returned.
+    Otherwise, 0 is returned.
+*/
+
+/***ja
+    @brief M-text を小文字にする.
+
+    関数 mtext_lowercase () は M-text $MT 中の各文字を破壊的に小文字に変
+    換する。変換に際して隣接する文字の影響を受けることがある。$MT にテ
+    キストプロパティ Mlanguage が付いている場合は、それも変換に影響を
+    与えうる。$MT の長さは変わることがある。小文字に変換できなかった文
+    字はそのまま残る。テキストプロパティはすべて継承される。
+
+    @return
+    1文字以上が変換された場合は1が返される。そうでない場合は0が返される。
+*/
+
+/***
+    @seealso mtext_titlecase (), mtext_uppercase ()
+*/
+
+int
+mtext_lowercase (MText *mt)
+{
+  int len = mtext_len (mt), i, j;
+  int c;
+  int modified = 0;
+  MText *orig = NULL;
+  MSymbol lang;
+
+  CASE_CONV_INIT (-1);
+
+  /* MT is edited in place, so the context tests below look at a copy
+     of the original text, made only when lowercase_precheck () asks.  */
+  if (lowercase_precheck (mt))
+    orig = mtext_dup (mt);
+
+  /* i moves over mt, j moves over orig. */
+  for (i = 0, j = 0; i < len; j++)
+    {
+      c = mtext_ref_char (mt, i);
+      lang = (MSymbol) mtext_get_prop (mt, i, Mlanguage);
+
+      if (c == 0x03A3 && final_sigma (orig, j))
+        REPLACE (gr03A3);
+
+      else if (lang == Mlt)
+        {
+          if (c == 0x00CC)
+            REPLACE (lt00CC);
+          else if (c == 0x00CD)
+            REPLACE (lt00CD);
+          else if (c == 0x0128)
+            REPLACE (lt0128);
+          else if (orig && more_above (orig, j))
+            {
+              if (c == 0x0049)
+                REPLACE (lt0049);
+              else if (c == 0x004A)
+                REPLACE (lt004A);
+              else if (c == 0x012E)
+                REPLACE (lt012E);
+              else
+                LOOKUP ();
+            }
+          else
+            LOOKUP ();
+        }
+
+      else if (lang == Mtr || lang == Maz)
+        {
+          if (c == 0x0130)
+            REPLACE (tr0130);
+          else if (c == 0x0307 && after_i (orig, j))
+            DELETE ();
+          else if (c == 0x0049 && ! before_dot (orig, j))
+            REPLACE (tr0049);
+          else
+            LOOKUP ();
+        }
+
+      else
+        LOOKUP ();
+    }
+
+  if (orig)
+    m17n_object_unref (orig);
+  return modified;
+}
+
 /*** @} */
 
 #include