X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=src%2Flanguage.c;h=8eb389b55bdfaa6fa833b84cc6d3c4e5a8d514fa;hb=20af8aaac75317a3156d0e39840d10f1be3e9e23;hp=a9b591211daa1990982940ac08b78501798e327e;hpb=cf32e0de6e84959f6afc5e8c6914b2223a30ae8b;p=m17n%2Fm17n-lib.git diff --git a/src/language.c b/src/language.c index a9b5912..8eb389b 100644 --- a/src/language.c +++ b/src/language.c @@ -1,5 +1,5 @@ -/* language.c -- language module. - Copyright (C) 2003, 2004 +/* language.c -- language (and script) module. + Copyright (C) 2003, 2004, 2006 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H15PRO112 @@ -17,7 +17,7 @@ You should have received a copy of the GNU Lesser General Public License along with the m17n library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 02111-1307, USA. */ #include @@ -27,227 +27,563 @@ #include "internal.h" #include "language.h" #include "symbol.h" +#include "plist.h" +#include "mtext.h" #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE) +static MPlist *language_list; +static MPlist *script_list; + +static MPlist * +load_lang_script_list (MSymbol tag0, MSymbol tag1, MSymbol tag2, MSymbol tag3) +{ + MDatabase *mdb = mdatabase_find (tag0, tag1, tag2, tag3); + MPlist *plist, *pl, *p; + + if (! mdb + || ! (plist = mdatabase_load (mdb))) + return NULL; + /* Check at least if the plist is ((SYMBOL ...) ...). */ + MPLIST_DO (pl, plist) + { + if (! MPLIST_PLIST_P (pl)) + break; + p = MPLIST_PLIST (pl); + if (! MPLIST_SYMBOL_P (p)) + break; + } + if (! MPLIST_TAIL_P (pl)) + { + M17N_OBJECT_UNREF (plist); + return NULL; + } + return plist; +} + +static int +init_language_list (void) +{ + language_list = load_lang_script_list (msymbol ("standard"), Mlanguage, + msymbol ("iso639"), Mnil); + if (! 
language_list) + { + language_list = mplist (); + MERROR (MERROR_DB, -1); + } + return 0; +} + + +static int +init_script_list (void) +{ + script_list = load_lang_script_list (msymbol ("standard"), Mscript, + msymbol ("unicode"), Mnil); + if (! script_list) + { + script_list = mplist (); + MERROR (MERROR_DB, -1); + } + return 0; +} + /* Internal API */ int mlang__init () { - /* ISO 639 */ - struct { - char *name, *fullname; - } lang_rec[] = - { {"ab", "Abkhazian"}, - {"aa", "Afar"}, - {"af", "Afrikaans"}, - {"ak", "Akan"}, - {"sq", "Albanian"}, - {"am", "Amharic"}, - {"ar", "Arabic"}, - {"hy", "Armenian"}, - {"as", "Assamese"}, - {"ay", "Aymara"}, - {"az", "Azerbaijani"}, - {"ba", "Bashkir"}, - {"eu", "Basque"}, - {"bn", "Bengali"}, /* Bangla */ - {"dz", "Bhutani"}, - {"bh", "Bihari"}, - {"bi", "Bislama"}, - {"br", "Breton"}, - {"bg", "Bulgarian"}, - {"my", "Burmese"}, - {"be", "Byelorussian"}, /* Belarusian */ - {"km", "Cambodian"}, /* Khmer */ - {"ca", "Catalan"}, -#if 0 - {"??", "Cherokee"}, - {"??", "Chewa"}, -#endif - {"zh", "Chinese"}, - {"co", "Corsican"}, - {"hr", "Croatian"}, - {"cs", "Czech"}, - {"da", "Danish"}, - {"dv", "Dhivehi"}, - {"nl", "Dutch"}, -#if 0 - {"??", "Edo"}, -#endif - {"en", "English"}, - {"eo", "Esperanto"}, - {"et", "Estonian"}, - {"fo", "Faeroese"}, - {"fa", "Farsi"}, - {"fj", "Fiji"}, - {"fi", "Finnish"}, -#if 0 - {"??", "Flemish"}, -#endif - {"fr", "French"}, - {"fy", "Frisian"}, -#if 0 - {"??", "Fulfulde"}, -#endif - {"gl", "Galician"}, - {"gd", "Gaelic(Scottish)"}, /* Scottish */ - {"gv", "Gaelic(Manx)"}, /* Manx */ - {"ka", "Georgian"}, - {"de", "German"}, - {"el", "Greek"}, - {"kl", "Greenlandic"}, - {"gn", "Guarani"}, - {"gu", "Gujarati"}, - {"ha", "Hausa"}, -#if 0 - {"??", "Hawaiian"}, - {"iw", "Hebrew"}, -#endif - {"he", "Hebrew"}, - {"hi", "Hindi"}, - {"hu", "Hungarian"}, -#if 0 - {"??", "Ibibio"}, -#endif - {"is", "Icelandic"}, -#if 0 - {"??", "Igbo"}, - {"in", "Indonesian"}, -#endif - {"id", "Indonesian"}, -#if 0 - {"ia", 
"Interlingua"}, - {"ie", "Interlingue"}, -#endif - {"iu", "Inuktitut"}, - {"ik", "Inupiak"}, - {"ga", "Irish"}, - {"it", "Italian"}, - {"ja", "Japanese"}, - {"jw", "Javanese"}, - {"kn", "Kannada"}, -#if 0 - {"??", "Kanuri"}, -#endif - {"ks", "Kashmiri"}, - {"kk", "Kazakh"}, - {"rw", "Kinyarwanda"}, /* Ruanda */ - {"ky", "Kirghiz"}, - {"rn", "Kirundi"}, /* Rundi */ - {"ko", "Korean"}, - {"ku", "Kurdish"}, - {"lo", "Laothian"}, - {"la", "Latin"}, - {"lv", "Latvian"}, /* Lettish */ - {"ln", "Lingala"}, - {"lt", "Lithuanian"}, - {"mk", "Macedonian"}, - {"mg", "Malagasy"}, - {"ms", "Malay"}, - {"ml", "Malayalam"}, -#if 0 - {"??", "Manipuri"}, -#endif - {"mt", "Maltese"}, - {"mi", "Maori"}, - {"mr", "Marathi"}, - {"mo", "Moldavian"}, - {"mn", "Mongolian"}, - {"na", "Nauru"}, - {"ne", "Nepali"}, - {"no", "Norwegian"}, - {"oc", "Occitan"}, - {"or", "Oriya"}, - {"om", "Oromo"}, /* Afan, Galla */ -#if 0 - {"??", "Papiamentu"}, -#endif - {"ps", "Pashto"}, /* Pushto */ - {"pl", "Polish"}, - {"pt", "Portuguese"}, - {"pa", "Punjabi"}, - {"qu", "Quechua"}, - {"rm", "Rhaeto-Romance"}, - {"ro", "Romanian"}, - {"ru", "Russian"}, -#if 0 - {"??", "Sami"}, /* Lappish */ -#endif - {"sm", "Samoan"}, - {"sg", "Sangro"}, - {"sa", "Sanskrit"}, - {"sr", "Serbian"}, - {"sh", "Serbo-Croatian"}, - {"st", "Sesotho"}, - {"tn", "Setswana"}, - {"sn", "Shona"}, - {"sd", "Sindhi"}, - {"si", "Sinhalese"}, - {"ss", "Siswati"}, - {"sk", "Slovak"}, - {"sl", "Slovenian"}, - {"so", "Somali"}, - {"es", "Spanish"}, - {"su", "Sundanese"}, - {"sw", "Swahili"}, /* Kiswahili */ - {"sv", "Swedish"}, -#if 0 - {"??", "Syriac"}, -#endif - {"tl", "Tagalog"}, - {"tg", "Tajik"}, -#if 0 - {"??", "Tamazight"}, -#endif - {"ta", "Tamil"}, - {"tt", "Tatar"}, - {"te", "Telugu"}, - {"th", "Thai"}, - {"bo", "Tibetan"}, - {"ti", "Tigrinya"}, - {"to", "Tonga"}, - {"ts", "Tsonga"}, - {"tr", "Turkish"}, - {"tk", "Turkmen"}, - {"tw", "Twi"}, - {"ug", "Uighur"}, - {"uk", "Ukrainian"}, - {"ur", "Urdu"}, - {"uz", "Uzbek"}, -#if 0 - 
{"??", "Venda"}, -#endif - {"vi", "Vietnamese"}, - {"vo", "Volapuk"}, - {"cy", "Welsh"}, - {"wo", "Wolof"}, - {"xh", "Xhosa"}, -#if 0 - {"??", "Yi"}, - {"ji", "Yiddish"}, -#endif - {"yi", "Yiddish"}, - {"yo", "Yoruba"}, - {"zu", "Zulu"} }; - int i; - - Mlanguage = msymbol ("language"); - msymbol_put (Mlanguage, Mtext_prop_serializer, - (void *) msymbol__serializer); - msymbol_put (Mlanguage, Mtext_prop_deserializer, - (void *) msymbol__deserializer); - for (i = 0; i < ((sizeof lang_rec) / (sizeof lang_rec[0])); i++) - msymbol_put (msymbol (lang_rec[i].name), Mlanguage, - msymbol (lang_rec[i].fullname)); + msymbol_put_func (Mlanguage, Mtext_prop_serializer, + M17N_FUNC (msymbol__serializer)); + msymbol_put_func (Mlanguage, Mtext_prop_deserializer, + M17N_FUNC (msymbol__deserializer)); + Miso639_2 = msymbol ("iso639-2"); + Miso639_1 = msymbol ("iso639-1"); + + language_list = script_list = NULL; return 0; } void mlang__fini (void) { + M17N_OBJECT_UNREF (language_list); + language_list = NULL; + M17N_OBJECT_UNREF (script_list); + script_list = NULL; +} + +/*=*/ + +/***en + @brief Get information about a language. + + The mlanguage_info () function returns a well-formed @e plist that + contains information about $LANGUAGE. $LANGUAGE is a symbol whose + name is an ISO639-2 3-letter language code, an ISO639-1 2-letter + language codes, or an English word. + + The format of the plist is: + +@verbatim + (ISO639-2 [ISO639-1 | nil] ENGLISH-NAME ["NATIVE-NAME" | nil] + ["REPRESENTATIVE-CHARACTERS"]) +@endverbatim + + where, ISO639-2 is a symbol whose name is 3-letter language code + of ISO639-2, ISO639-1 is a symbol whose name is 2-letter language + code of ISO639-1, ENGLISH-NAME is a symbol whose name is the + English name of the language, "NATIVE-NAME" is an M-text written + by the most natural way in the language, + "REPRESENTATIVE-CHARACTERS" is an M-text that contains + representative characters used by the language. 
+ + It is assured that the formats of both M-texts are + #MTEXT_FORMAT_UTF_8. + + @return + If the information is available, this function returns a plist + that should not be modified nor freed. Otherwise, it returns + @c NULL. + + @seealso + mlanguage_list () */ + +MPlist * +mlanguage__info (MSymbol language) +{ + MPlist *plist; + + if (! language_list + && init_language_list () < 0) + return NULL; + + MPLIST_DO (plist, language_list) + { + MPlist *pl = MPLIST_PLIST (plist); + + if (MPLIST_SYMBOL_P (pl) && MPLIST_SYMBOL (pl) == language) + return MPLIST_PLIST (plist); + if (! MPLIST_TAIL_P (pl)) + { + pl = MPLIST_NEXT (pl); + if (MPLIST_SYMBOL_P (pl) && MPLIST_SYMBOL (pl) == language) + return MPLIST_PLIST (plist); + if (! MPLIST_TAIL_P (pl)) + { + pl = MPLIST_NEXT (pl); + if (MPLIST_SYMBOL_P (pl) && MPLIST_SYMBOL (pl) == language) + return MPLIST_PLIST (plist); + } + } + } + return NULL; +} + +static MPlist * +mscript__info (MSymbol script) +{ + MPlist *plist; + + if (! script_list + && init_script_list () < 0) + return NULL; + MPLIST_DO (plist, script_list) + { + MPlist *pl = MPLIST_PLIST (plist); + + if (MPLIST_SYMBOL (pl) == script) + return pl; + } + return NULL; +} + +MPlist * +mscript__char_list (MSymbol name) +{ + MPlist *plist = mscript__info (name); + + if (plist /* script name */ + && (plist = MPLIST_NEXT (plist)) /* language list */ + && ! MPLIST_TAIL_P (plist) + && (plist = MPLIST_NEXT (plist)) /* char list */ + && MPLIST_PLIST_P (plist)) + return MPLIST_PLIST (plist); + return NULL; +} + +MSymbol +mscript__otf_tag (MSymbol script) +{ + MPlist *plist = mscript__info (script); + + if (plist /* script name */ + && (plist = MPLIST_NEXT (plist)) /* language list */ + && ! MPLIST_TAIL_P (plist) + && (plist = MPLIST_NEXT (plist)) /* char list */ + && ! 
MPLIST_TAIL_P (plist) + && (plist = MPLIST_NEXT (plist)) /* otf tag */ + && MPLIST_SYMBOL_P (plist)) + return MPLIST_SYMBOL (plist); + return NULL; +} + +MSymbol +mscript__from_otf_tag (MSymbol otf_tag) +{ + MPlist *plist; + /* As it is expected that this function is called in a sequence with + the same argument, we use a cache. */ + static MSymbol last_otf_tag, script; + + if (! script_list) + { + last_otf_tag = script = Mnil; + if (init_script_list () < 0) + return Mnil; + } + if (otf_tag == last_otf_tag) + return script; + last_otf_tag = otf_tag; + script = Mnil; + MPLIST_DO (plist, script_list) + { + MPlist *pl = MPLIST_PLIST (plist), *p; + + if (pl /* script name */ + && (p = MPLIST_NEXT (pl)) /* language tag */ + && ! MPLIST_TAIL_P (p) + && (p = MPLIST_NEXT (p)) /* char list */ + && ! MPLIST_TAIL_P (p) + && (p = MPLIST_NEXT (p)) /* otf tag */ + && MPLIST_SYMBOL_P (p) + && otf_tag == MPLIST_SYMBOL (p)) + { + script = MPLIST_SYMBOL (pl); + break; + } + } + return script; } #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */ + + +/* External API */ + +MSymbol Miso639_1, Miso639_2; + +/*=*/ + +/***en + @brief List 3-letter language codes. + + The mlanguage_list () function returns a well-formed plist whose + keys are #Msymbol and values are symbols whose names are ISO639-2 + 3-letter language codes. + + @return + This function returns a plist. The caller should free it by + m17n_object_unref (). + + @seealso + mscript_list (). */ + +/***ja + @brief 3文字言語コードをリストする. + + 関数 mlanguage_list () は、整形式 (well-formed) plist を返す。各キー + は #Msymbol であり、個々の値は ISO639-2 に定められた3文字言語コー + ドを名前とするシンボルである。 + + @return + この関数が返す plist は、呼び出し側が m17n_object_unref () を使っ + て解放する必要がある。 + + @seealso + mscript_list (). */ + +MPlist * +mlanguage_list (void) +{ + MPlist *plist, *pl, *p, *p0; + + if (! 
language_list + && init_language_list () < 0) + return NULL; + plist = pl = mplist (); + MPLIST_DO (p, language_list) + { + p0 = MPLIST_PLIST (p); + pl = mplist_add (pl, Msymbol, MPLIST_VAL (p0)); + } + return plist; +} + +/*=*/ + +/***en + @brief Get a language code. + + The mlanguage_code () function returns a symbol whose name is the + ISO639 language code of $LANGUAGE. $LANGUAGE is a symbol whose + name is an ISO639-2 3-letter language code, an ISO639-1 2-letter + language code, or an English word. + + $LEN specifies the type of the returned language code. If it is + 3, an ISO639-2 3-letter language code is returned. If it is 2, an + ISO639-1 2-letter language code is returned when defined; + otherwise #Mnil is returned. If it is 0, a 2-letter code is + returned when defined; otherwise a 3-letter code is returned. + + @return + If the information is available, this function returns a non-#Mnil + symbol. Otherwise, it returns #Mnil. + + @seealso + mlanguage_name (), mlanguage_text (). */ + +/***ja + @brief 言語コードを得る. + + 関数 mlanguage_code () は、$LANGUAGE に対応した ISO-639 言語コード + が名前であるようなシンボルを返す。$LANGUAGE はシンボルであり、その + 名前は、ISO639-2 3文字言語コード、ISO639-1 2文字言語コード、英語名、 + のいずれかである。 + + $LEN は返される言語コードの種類を決定する。$LEN が3の場合は + ISO639-2 3文字言語コードが返される。2の場合は、もし定義されていれ + ば ISO639-1 2文字言語コードが、そうでなければ #Mnil が返される。0 + の場合は、もし定義されていれば2文字コードが、そうでなければ3文字コー + ドが返される。 + + @return + もし情報が得られれば、この関数は #Mnil 以外のシンボルを返す。そう + でなければ #Mnil を返す。 + + @seealso + mlanguage_name (), mlanguage_text (). */ + +MSymbol +mlanguage_code (MSymbol language, int len) +{ + MPlist *plist = mlanguage__info (language); + MSymbol code; + + if (! plist) + return Mnil; + if (! MPLIST_SYMBOL_P (plist)) + return Mnil; + code = MPLIST_SYMBOL (plist); + if (len == 3) + return code; + plist = MPLIST_NEXT (plist); + return ((MPLIST_SYMBOL_P (plist) && MPLIST_SYMBOL (plist) != Mnil) + ? MPLIST_SYMBOL (plist) + : len == 0 ? code : Mnil); +} + +/*=*/ + +/***en + @brief Get an English language name. 
+ + The mlanguage_name () function returns a symbol whose name is an + English name of $LANGUAGE. $LANGUAGE is a symbol whose name is an + ISO639-2 3-letter language code, an ISO639-1 2-letter language + code, or an English word. + + @return + If the information is available, this function returns a non-#Mnil + symbol. Otherwise, it returns #Mnil. + + @seealso + mlanguage_code (), mlanguage_text (). */ + +/***ja + @brief 言語の英語名を得る. + + 関数 mlanguage_name () は、$LANGUAGE の英語名を名前とするようなシ + ンボルを返す。$LANGUAGE はシンボルであり、その名前は、ISO639-2 3文 + 字言語コード、ISO639-1 2文字言語コード、英語名、のいずれかである。 + + @return + 求めている情報が得られるなら、この関数は #Mnil 以外のシンボルを返 + す。そうでなければ #Mnil を返す。 + + @seealso + mlanguage_code (), mlanguage_text (). */ + +MSymbol +mlanguage_name (MSymbol language) +{ + MPlist *plist = mlanguage__info (language); + + if (! plist) /* 3-letter code */ + return Mnil; + plist = MPLIST_NEXT (plist); /* 2-letter code */ + if (MPLIST_TAIL_P (plist)) + return Mnil; + plist = MPLIST_NEXT (plist); /* english name */ + if (! MPLIST_SYMBOL_P (plist)) + return Mnil; + return MPLIST_SYMBOL (plist); +} + +/*=*/ + +/***en + @brief Return the language name written in that language. + + The mlanguage_text () function returns, in the form of M-text, the + language name of $LANGUAGE written in $LANGUAGE. If the + representative characters of the language are known, the + characters of the returned M-text have a text property whose key is + #Mtext and whose value is an M-text that contains the + representative characters. + + @return + If the information is available, this function returns an M-text + that should not be modified nor freed. Otherwise, it returns @c + NULL. + + @seealso + mlanguage_code (), mlanguage_name (). */ + +/***ja + @brief 与えられた言語自身で書かれた言語名を返す. 
+ + 関数 mlanguage_text () は、言語 $LANGUAGE で書かれた $LANGUAGE の + 名前を M-text の形式で返す。その言語の代表的な文字がわかっている場 + 合は、返される M-text の各文字に、キーが #Mtext で値がその代表的な + 文字を含む M-text であるようなテキストプロパティが付加される。 + + @return + 求める情報が得られた場合、この関数が返す M-text を変更したり解放し + たりしてはいけない。情報が得られなかった場合は @c NULL が返される。 + + @seealso + mlanguage_code (), mlanguage_name (). */ + +MText * +mlanguage_text (MSymbol language) +{ + MPlist *plist = mlanguage__info (language); + MText *mt; + + if (! plist) + return NULL; + plist = MPLIST_NEXT (plist); + if (MPLIST_TAIL_P (plist)) + return NULL; + plist = MPLIST_NEXT (plist); + if (MPLIST_TAIL_P (plist)) + return NULL; + plist = MPLIST_NEXT (plist); + if (! MPLIST_MTEXT_P (plist)) + return NULL; + mt = MPLIST_MTEXT (plist); + if (mtext_nchars (mt) == 0) + return NULL; + plist = MPLIST_NEXT (plist); + if (MPLIST_MTEXT_P (plist) + && ! mtext_get_prop (mt, 0, Mtext)) + mtext_put_prop (mt, 0, mtext_nchars (mt), Mtext, MPLIST_MTEXT (plist)); + return mt; +} + +/***en + @brief List script names. + + The mscript_list () function returns a well-formed plist whose + keys are #Msymbol and values are symbols whose names are script + names. + + @return + This function returns a plist. The caller should free it by + m17n_object_unref (). + + @seealso + mscript_language_list (), mlanguage_list (). */ + +/***ja + @brief スクリプト名をリストする. + + 関数 mscript_list () は、整形式 (well-formed) plist を返す。各キー + は #Msymbol であり、個々の値はスクリプト名を名前とするシンボルであ + る。 + + @return + この関数が返す plist は、呼び出し側が m17n_object_unref () を使っ + て解放する必要がある。 + + @seealso + mscript_language_list (), mlanguage_list (). */ + +MPlist * +mscript_list (void) +{ + MPlist *plist, *pl, *p, *p0; + + if (! script_list + && init_script_list () < 0) + return NULL; + plist = pl = mplist (); + MPLIST_DO (p, script_list) + { + p0 = MPLIST_PLIST (p); + pl = mplist_add (pl, Msymbol, MPLIST_VAL (p0)); + } + return plist; +} + +/*=*/ + +/***en + @brief List languages that use a specified script. 
+ + The mscript_language_list () function lists languages that use + $SCRIPT. $SCRIPT is a symbol whose name is the lower-cased + version of a script name that appears in the Unicode Character + Database. + + @return + + This function returns a well-formed plist whose keys are #Msymbol + and values are symbols whose names are ISO639-1 2-letter codes (or + ISO639-2 3-letter codes, if the former is not available). The + caller should not modify nor free it. If the m17n library does + not know about $SCRIPT, it returns @c NULL. + + @seealso + mscript_list (), mlanguage_list (). */ + +/***ja + @brief 与えられたスクリプトを用いる言語をリストする. + + 関数 mscript_language_list () は、$SCRIPT を用いる言語をリストする。 + $SCRIPT はシンボルで、その名前は Unicode Character Database に示さ + れているスクリプト名をすべて小文字にしたものである。 + + @return この関数は、整形式 (well-formed) plist を返す。各キーは + #Msymbol であり、個々の値は ISO639-1 に定められた2文字言語コード + (定義されていない場合は ISO639-2 に定められた3文字言語コード) を名 + 前とするシンボルである。返される plist は変更したり解放したりして + はならない。$SCRIPT が未知の場合は @c NULL が返される。 + + @seealso + mscript_list (), mlanguage_list (). */ + + +MPlist * +mscript_language_list (MSymbol script) +{ + MPlist *plist = mscript__info (script); + + if (plist /* script name */ + && (plist = MPLIST_NEXT (plist)) /* language list */ + && MPLIST_PLIST_P (plist)) + return MPLIST_PLIST (plist); + return NULL; +}