X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=src%2Fmtext.c;h=b13ef8810e2f6c420313618e87a58600959d174a;hb=ebb9aa922a01d5052acee38d06d4b175086ba725;hp=22d6cee5e3df04854bd8ca3287a6973cd7603c1a;hpb=92bef2270a390d059529d535b5eb7f82a7971528;p=m17n%2Fm17n-lib.git diff --git a/src/mtext.c b/src/mtext.c index 22d6cee..b13ef88 100644 --- a/src/mtext.c +++ b/src/mtext.c @@ -1,5 +1,5 @@ /* mtext.c -- M-text module. - Copyright (C) 2003, 2004 + Copyright (C) 2003, 2004, 2005 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H15PRO112 @@ -93,6 +93,9 @@ #include "character.h" #include "mtext.h" #include "plist.h" +#ifdef HAVE_THAI_WORDSEG +#include "word-thai.h" +#endif static M17NObjectArray mtext_table; @@ -682,11 +685,18 @@ case_compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) /* Internal API */ +MCharTable *wordseg_func_table; + int mtext__init () { + M17N_OBJECT_ADD_ARRAY (mtext_table, "M-text"); M_charbag = msymbol_as_managing_key (" charbag"); mtext_table.count = 0; + wordseg_func_table = mchartable (Mnil, NULL); +#ifdef HAVE_THAI_WORDSEG + mtext__word_thai_init (); +#endif return 0; } @@ -694,7 +704,11 @@ mtext__init () void mtext__fini (void) { - mdebug__report_object ("M-text", &mtext_table); +#ifdef HAVE_THAI_WORDSEG + mtext__word_thai_fini (); +#endif + M17N_OBJECT_UNREF (wordseg_func_table); + wordseg_func_table = NULL; } @@ -1103,6 +1117,22 @@ mtext__eol (MText *mt, int pos) } } +typedef int (*MTextWordsegFunc) (MText *mt, int pos, int *from, int *to); + +int +mtext__word_segment (MText *mt, int pos, int *from, int *to) +{ + int c = mtext_ref_char (mt, pos); + MTextWordsegFunc func + = (MTextWordsegFunc) mchartable_lookup (wordseg_func_table, c); + + if (func) + return (func) (mt, pos, from, to); + *from = *to = pos; + return -1; +} + + /*** @} */ #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */ @@ -1110,9 +1140,9 @@ mtext__eol (MText *mt, int pos) /* External API */ #ifdef 
WORDS_BIGENDIAN -const int MTEXT_FORMAT_UTF_16 = MTEXT_FORMAT_UTF_16BE; +const enum MTextFormat MTEXT_FORMAT_UTF_16 = MTEXT_FORMAT_UTF_16BE; #else -const int MTEXT_FORMAT_UTF_16 = MTEXT_FORMAT_UTF_16LE; +const enum MTextFormat MTEXT_FORMAT_UTF_16 = MTEXT_FORMAT_UTF_16LE; #endif #ifdef WORDS_BIGENDIAN @@ -1232,6 +1262,86 @@ mtext_from_data (const void *data, int nitems, enum MTextFormat format) /*=*/ /***en + @brief Get information about the text data in M-text. + + The mtext_data () function returns a pointer to the text data of + M-text $MT. If $FMT is not NULL, the format of the text data is + stored in it. If $NUNITS is not NULL, the number of units of the + text data is stored in it. + + If $POS_IDX is not NULL and it points to a non-negative number, + what it points to is a character position. In this case, the + return value is a pointer to the text data of a character at that + position. + + Otherwise, if $UNIT_IDX is not NULL, it points to a unit position. + In this case, the return value is a pointer to the text data of a + character containing that unit. + + The character position and unit position of the return value are + stored in $POS_IDX and $UNIT_IDX respectively if they are not + NULL. 
+ + */ + +void * +mtext_data (MText *mt, enum MTextFormat *fmt, int *nunits, + int *pos_idx, int *unit_idx) +{ + void *data; + int pos = 0, unit_pos = 0; + + if (fmt) + *fmt = mt->format; + data = MTEXT_DATA (mt); + if (pos_idx && *pos_idx >= 0) + { + pos = *pos_idx; + if (pos > mtext_nchars (mt)) + MERROR (MERROR_MTEXT, NULL); + unit_pos = POS_CHAR_TO_BYTE (mt, pos); + } + else if (unit_idx) + { + unit_pos = *unit_idx; + + if (unit_pos < 0 || unit_pos > mtext_nbytes (mt)) + MERROR (MERROR_MTEXT, NULL); + pos = POS_BYTE_TO_CHAR (mt, unit_pos); + unit_pos = POS_CHAR_TO_BYTE (mt, pos); + } + if (nunits) + *nunits = mtext_nbytes (mt) - unit_pos; + if (pos_idx) + *pos_idx = pos; + if (unit_idx) + *unit_idx = unit_pos; + if (unit_pos > 0) + { + if (mt->format <= MTEXT_FORMAT_UTF_8) + data = (unsigned char *) data + unit_pos; + else if (mt->format <= MTEXT_FORMAT_UTF_16BE) + data = (unsigned short *) data + unit_pos; + else + data = (unsigned int *) data + unit_pos; + } + return data; +} + +/*=*/ + +/***en @brief Number of characters in M-text. 
The mtext_len () function returns the number of characters in @@ -2477,9 +2587,7 @@ int mtext_text (MText *mt1, int pos, MText *mt2) { int from = pos; - int pos_byte = POS_CHAR_TO_BYTE (mt1, pos); int c = mtext_ref_char (mt2, 0); - int nbytes1 = mtext_nbytes (mt1); int nbytes2 = mtext_nbytes (mt2); int limit; int use_memcmp = (mt1->format == mt2->format @@ -2487,13 +2595,14 @@ mtext_text (MText *mt1, int pos, MText *mt2) && mt2->format == MTEXT_FORMAT_UTF_8)); int unit_bytes = UNIT_BYTES (mt1->format); - if (nbytes2 > pos_byte + nbytes1) + if (from + mtext_nchars (mt2) > mtext_nchars (mt1)) return -1; - pos_byte = nbytes1 - nbytes2; - limit = POS_BYTE_TO_CHAR (mt1, pos_byte); + limit = mtext_nchars (mt1) - mtext_nchars (mt2) + 1; while (1) { + int pos_byte; + if ((pos = mtext_character (mt1, from, limit, c)) < 0) return -1; pos_byte = POS_CHAR_TO_BYTE (mt1, pos); @@ -2568,7 +2677,7 @@ mtext_search (MText *mt1, int from, int to, MText *mt2) return -1; while (1) { - if ((from = find_char_backward (mt1, from, to, c)) < 0) + if ((from = find_char_backward (mt1, to, from + 1, c)) < 0) return -1; from_byte = POS_CHAR_TO_BYTE (mt1, from); if (! memcmp (mt1->data + from_byte, mt2->data, nbytes2))