From b18aec04fda8780834ce61dacfbc519169cf721c Mon Sep 17 00:00:00 2001 From: ntakahas Date: Thu, 1 Jul 2010 09:40:57 +0000 Subject: [PATCH] Add support for several new Unicode characters. Add a 'j&n' combination to produce the literal 'jn' compound. Cleans up the formatting. --- im/te-rts.mim | 375 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 201 insertions(+), 174 deletions(-) diff --git a/im/te-rts.mim b/im/te-rts.mim index 77f31be..d16779b 100644 --- a/im/te-rts.mim +++ b/im/te-rts.mim @@ -3,7 +3,7 @@ ;; Copyright (C) 2003, 2004, 2005, 2006 ;; National Institute of Advanced Industrial Science and Technology (AIST) ;; Registration Number H15PRO112 -;; Copyright 2005, 2006 Suraj N. Kurapati +;; Copyright 2005, 2006, 2010 Suraj N. Kurapati ;; Copyright 2006 Chaitanya Kamisetty @@ -26,47 +26,43 @@ ;; Boston, MA 02110-1301, USA. - (input-method te rts) (description "Input method for Telugu script with RTS method. For the detail of RTS, see the page: - . - -This input method is based upon the Telugu Rice Transliteration -Standard (RTS) specification and its Rice Inverse -Transliterator (RIT) supplement. - -The original RTS specification was written by Ananda Kishore and -Rama Rao Kanneganti in 1992 and can presently be accessed in the -\"soc.culture.indian.telugu\" newsgroup archives (see -). - -The RIT supplement adds alternative combinations for -transliteration but, in general, does not distract from the -original specification (see -). Whenever a -supplemental combination conflicts with the original RTS, the RTS -version has precedence and the supplemental combination is -disregarded (such as 'ea' from RIT 3.0). - -Finally, this input method deviates slightly from the RTS in the -following ways: - - (1) The combinations \"\@n\", \"\@2\", \"~c\", and \"~j\" - yield \"�\" because their corresponding glyphs do not - yet exist in Telugu's Unicode chart. - - (2) If it appears at the end of a word, the combination - \"m\" yields \"ం\". 
The user can type \"m&\" to - bypass this behavior and force \"m\" to yield \"మ్\". + . + +This input method is based on the Telugu Rice Transliteration Standard (RTS) +specification[1] and its Rice Inverse Transliterator (RIT) supplement[2]. + +The original RTS specification was written by Ananda Kishore and Rama Rao +Kanneganti in 1992 and can presently be accessed in the archives[1] of the +'soc.culture.indian.telugu' USENET newsgroup. + +The RIT supplement[2] enriches RTS with alternative combinations. However, +in cases where RIT and RTS define conflicting mappings for the same +combination, such as 'ea', only the RTS mapping is honored. + +Finally, this input method deviates from the RTS in the following ways: + +* The combination '\@n' yields '�' because its corresponding glyph does not + yet exist in the Telugu unicode chart. + +* The combination 'm' yields 'ం' if it appears at the end of a word. The + user can type 'm&' to bypass this behavior and force 'm' to yield 'మ్'. + +* The sunna prevention operator '&' can be used to force a more literal + transliteration of consonant compounds such as 'jn' by writing 'j&n'. 
+ +[1]: http://groups.google.com/groups?selm=Bv0A9M.27B@rice.edu +[2]: http://www.teluguworld.org/RIT/rit3.0/manual.html ") (title "క") (map (starter - ((S-\ )) ((C-@)) ; m17n stuff + ((S-\ )) ((C-@)) ("a") ("b") ("c") ("d") ("e") ("f") ("g") ("h") ("i") ("j") ("k") ("l") ("m") ("n") ("o") ("p") ("r") ("s") ("t") ("u") @@ -78,17 +74,15 @@ following ways: ("0") ("1") ("2") ("3") ("4") ("5") ("6") ("7") ("8") ("9") - ("@") ("|") ("~") - - ("#") ("_") + ("@") ("|") ("~") ("#") ) - - - ; these consonants undergo automatic sunna generation (consonant - ; row 1 + ;------------------------------------------------------------------------- + ; row 1 - క ఖ గ ఘ ఙ + ;------------------------------------------------------------------------- + ("k" "క్") ("kh" "ఖ్") @@ -105,23 +99,27 @@ following ways: ("Gh" "ఘ్") ("GH" "ఘ్") + ("~m" "ఙ్") + + ;------------------------------------------------------------------------- + ; row 2 - చ ఛ జ ఝ ఞ + ;------------------------------------------------------------------------- - ; row 2 ("c" "చ్") ("ch" "చ్") ("cH" "చ్") - - ("~c" "�") ; త్స (tsa) allophone of చ (cha) + + ("~c" "ౘ") ("C" "ఛ్") ("Ch" "ఛ్") ("CH" "ఛ్") - ("c'" "ఛ్") ; from RIT 2.0, 3.0 + ("c'" "ఛ్") ; from RIT 2.0, 3.0 ("j" "జ్") - ("z" "జ్") ; from RIT 3.0 + ("z" "జ్") ; from RIT 3.0 - ("~j" "�") ; డ్జ (dza) allophone of జ (ja) + ("~j" "ౙ") ("jh" "ఝ్") ("jH" "ఝ్") @@ -129,8 +127,12 @@ following ways: ("Jh" "ఝ్") ("JH" "ఝ్") + ("~n" "ఞ్") + + ;------------------------------------------------------------------------- + ; row 3 - ట ఠ డ ఢ ణ + ;------------------------------------------------------------------------- - ; row 3 ("T" "ట్") ("t'" "ట్") @@ -147,8 +149,15 @@ following ways: ("dh'" "ఢ్") ("dH'" "ఢ్") + ("N" "ణ్") + ("nh" "ణ్") + ("nH" "ణ్") + ("n'" "ణ్") ; from RIT 2.0, 3.0 + + ;------------------------------------------------------------------------- + ; row 4 - త థ ద ధ న + ;------------------------------------------------------------------------- - ; row 4 ("t" "త్") 
("th" "థ్") @@ -159,8 +168,10 @@ following ways: ("dh" "ధ్") ("dH" "ధ్") + ;------------------------------------------------------------------------- + ; row 5 - ప ఫ బ భ మ + ;------------------------------------------------------------------------- - ; row 5 ("p" "ప్") ("f" "ఫ్") @@ -178,97 +189,110 @@ following ways: ("Bh" "భ్") ("BH" "భ్") + ;------------------------------------------------------------------------- + ; row 6 - య ర ల వ శ ష స హ ళ క్ష ఱ + ;------------------------------------------------------------------------- + + ; ("y" "య్") is defined below in consonant-without-sunna + + ; ("r" "ర్") is defined below in consonant-without-sunna - ; row 6 ("l" "ల్") ("v" "వ్") - ("V" "వ్") ; from RIT 3.0 + ("V" "వ్") ; from RIT 3.0 ("w" "వ్") - ("W" "వ్") ; from RIT 3.0 + ("W" "వ్") ; from RIT 3.0 ("S" "శ్") - ("s'" "శ్") ; from RIT 2.0, 3.0 + ("s'" "శ్") ; from RIT 2.0, 3.0 + + ("sh" "ష్") + ("sH" "ష్") + ("Sh" "ష్") ; from RIT 3.0 + ("SH" "ష్") ; from RIT 3.0 ("s" "స్") + ("h" "హ్") + ("H" "హ్") + + ("L" "ళ్") + ("lh" "ళ్") + ("lH" "ళ్") + ("Lh" "ళ్") + ("LH" "ళ్") + ("l'" "ళ్") ; from RIT 2.0, 3.0 ("x" "క్ష్") ("ksh" "క్ష్") ("ksH" "క్ష్") - ("ks" "క్స్") ; workaround for inputting "క్స్" - + ("ks" "క్స్") ; disambiguation for this input method's 1-character lookahead - ; misc. 
compounds - ("dd'" "డ్డ్") ; from RIT 3.0 - ("dd" "ద్ద్") + ("~r" "ఱ్") + ("r''" "ఱ్") ; from RIT 2.0, 3.0 - ("tt'" "ట్ట్") ; from RIT 3.0 - ("tt" "త్త్") + ;--------------------------------------------------------------------------- + ; compounds + ;--------------------------------------------------------------------------- ("jn" "జ్ఞ్") - ) - + ("j&n" "జ్న్") ; apply sunna prevention operator to produce literal compound + ("dd'" "డ్డ్") ; from RIT 3.0 + ("dd" "ద్ద్") ; disambiguation for this input method's 1-character lookahead - ; these consonants do NOT undergo automatic sunna generation - (consonant2 - ("~m" "ఙ్") + ("tt'" "ట్ట్") ; from RIT 3.0 + ("tt" "త్త్") ; disambiguation for this input method's 1-character lookahead + ) - ("~n" "ఞ్") + (consonant-without-sunna - ("N" "ణ్") - ("nh" "ణ్") - ("nH" "ణ్") - ("n'" "ణ్") ; from RIT 2.0, 3.0 + ; Quotation from "sunna generation" section of RIT 3.0 specification: + ; + ; when 'n' or 'm' is followed by a consonant except 'r' or 'y' RIT + ; assumes it to be a sunna + ; + ("r" "ర్") + ("y" "య్") + ; Quotation from "sunna generation" section of RIT 3.0 specification: + ; + ; You can prevent a sunna generation by writing 'n&' or 'm&'. 
+ ; ("n&" "న్") - ("m&" "మ్") - - ("y" "య్") - - ("r" "ర్") - - ("sh" "ష్") - ("sH" "ష్") - ("Sh" "ష్") ; from RIT 3.0 - ("SH" "ష్") ; from RIT 3.0 - - ("h" "హ్") - ("H" "హ్") - - ("L" "ళ్") - ("lh" "ళ్") - ("lH" "ళ్") - ("Lh" "ళ్") - ("LH" "ళ్") - ("l'" "ళ్") ; from RIT 2.0, 3.0 - - ("~r" "ఱ్") - ("r''" "ఱ్") ; from RIT 2.0, 3.0 ) - - - ; these consonants are converted into sunna by the automatic sunna generation logic, if they appear inside a word (sunna-inside-word ("n" "న్") - ("m" "మ్") ) - - - ; these sequences are converted into sunna by the automatic sunna generation logic, if they appear at the end of a word (sunna-endof-word - ((m Tab) "ం ") - ((m Return) "ం") + ;------------------------------------------------------------------------- + ; whitespace + ;------------------------------------------------------------------------- - ; the sequences below, using punctuation marks to denote the end of a word, are generated by the following shell command. keys in [1] the (starter) block, [2] the (independent) block, and [3] those which begin with the 'm' key are intentionally excluded from this command to ensure that they are transliterated normally. - ; for ch in ' ' '!' '\"' '#' '$' '%' "'" '(' ')' '*' '+' ',' '-' '.' '/' '\\' ':' ';' '<' '=' '>' '?' '[' ']' '_' '`' '{' '}'; do echo " (\"m${ch}\" \"ం${ch}\")"; done # exclude '^' '&' '|' '@' '~' ("m " "ం ") + ((m Tab) "ం\t") + ((m Return) "ం\n") + + ;------------------------------------------------------------------------- + ; punctuation + ;------------------------------------------------------------------------- + ; + ; The sequences below are generated by this Bourne shell script: + ; + ; for ch in '!' '\"' '#' '$' '%' "'" '(' ')' '*' '+' ',' '-' '.' \ + ; '/' '\\' ':' ';' '<' '=' '>' '?' 
'[' ']' '_' '`' '{' '}' + ; do echo " (\"m${ch}\" \"ం${ch}\")"; done + ; + ; Sequences ending with '^' '&' '|' '@' '~' are omitted from the above + ; loop because those punctuation marks already serve a purpose in this + ; input method. + ; ("m!" "ం!") ("m\"" "ం\"") ("m#" "ం#") @@ -298,16 +322,19 @@ following ways: ("m}" "ం}") ) - - (independent + ((S-\ ) "‌") + ((C-@) "‍") + + ;------------------------------------------------------------------------- + ; vowels + ;------------------------------------------------------------------------- - ; అచ్చులు (vowels) ("a" "అ") ("aa" "ఆ") ("a'" "ఆ") - ("A" "ఆ") ; from RIT 2.0, 3.0 + ("A" "ఆ") ; from RIT 2.0, 3.0 ("i" "ఇ") @@ -315,7 +342,7 @@ following ways: ("ii" "ఈ") ("ia" "ఈ") ("i'" "ఈ") - ("I" "ఈ") ; from RIT 2.0, 3.0 + ("I" "ఈ") ; from RIT 2.0, 3.0 ("u" "ఉ") @@ -326,14 +353,14 @@ following ways: ("u'" "ఊ") ("R" "ఋ") - ("r'" "ఋ") ; from RIT 2.0 + ("r'" "ఋ") ; from RIT 2.0 ("Ru" "ౠ") - ("r'u" "ౠ") ; from RIT 2.0 + ("r'u" "ౠ") ; from RIT 2.0 - ("~l" "ఌ") + ("~l" "ౢ") - ("~L" "ౡ") + ("~L" "ౣ") ("e" "ఎ") @@ -343,7 +370,7 @@ following ways: ("e'" "ఏ") ("ai" "ఐ") - ("ei" "ఐ") ; from RIT 3.0 + ("ei" "ఐ") ; from RIT 3.0 ("o" "ఒ") @@ -354,10 +381,28 @@ following ways: ("au" "ఔ") ("ou" "ఔ") - ("ow" "ఔ") ; from RIT 3.0 + ("ow" "ఔ") ; from RIT 3.0 + + ; This combination is defined in the "internal representation" section of + ; RTS. It was widely used in early RTS implementations which lacked the + ; automatic sunna generation capability and has thus become the de facto + ; way of producing a sunna manually. 
+ ("M" "ం") + + ("@M" "ఁ") + ("@m" "ఁ") ; from RIT 3.0 + + ("@h" "ః") + ("@n" "�") + ("@N" "�") ; from RIT 3.0 + + ("@2" "ఽ") + + ;------------------------------------------------------------------------- + ; digits + ;------------------------------------------------------------------------- - ; అంకెలు (numbers) ("0" "౦") ("1" "౧") ("2" "౨") @@ -369,44 +414,29 @@ following ways: ("8" "౮") ("9" "౯") - + ;------------------------------------------------------------------------- ; punctuation - ("|" "।") ; from RIT 3.0 - ("||" "॥") ; from Yudit - - - ; additional modifiers - ("M" "ం") ; from "internal representation" section of RTS. This combination has been included because it is very widely used in RTS implementations which do not support automatic sunna generation and thus has become the defacto way of manually producing sunna. - - ("@M" "ఁ") ; అర్ధసున్న (ardhasunna), చంద్ర బిందు (chandra bindu) - ("@m" "ఁ") ; from RIT 3.0 - - ("@h" "ః") ; విసర్గ (visarga) - ("@H" "ః") - - ("@n" "�") ; నకర పొల్లు (nakara-pollu), నకర విరమ (nakara-virama) - ("@N" "�") ; from RIT 3.0 - - ("@2" "�") ; అవగ్రహ (avagraha) + ;------------------------------------------------------------------------- - ("^" "్‌") ; పొల్లు (pollu), విరమ (virama), halant - - ("_" "") ; ignored according to RTS - - - ; m17n stuff - ((S-\ ) "‌") - ((C-@) "‍") + ; The characters at the right-hand-side of these mappings are borrowed + ; from the Devanagari Unicode chart because they do not yet exist in the + ; Telugu Unicode chart. 
+ ("|" "।") ; from RIT 3.0 + ("||" "॥") ; from Yudit ) + (dependent + ("^" (delete @-) "్‌") + ;------------------------------------------------------------------------- + ; vowels + ;------------------------------------------------------------------------- - (dependent ("a" (delete @-) "") ("aa" (delete @-) "ా") ("a'" (delete @-) "ా") - ("A" (delete @-) "ా") ; from RIT 3.0 + ("A" (delete @-) "ా") ; from RIT 3.0 ("i" (delete @-) "ి") @@ -414,7 +444,7 @@ following ways: ("ii" (delete @-) "ీ") ("ia" (delete @-) "ీ") ("i'" (delete @-) "ీ") - ("I" (delete @-) "ీ") ; from RIT 3.0 + ("I" (delete @-) "ీ") ; from RIT 3.0 ("u" (delete @-) "ు") @@ -425,14 +455,14 @@ following ways: ("u'" (delete @-) "ూ") ("R" (delete @-) "ృ") - ("r'" (delete @-) "ృ") ; from RIT 2.0 + ("r'" (delete @-) "ృ") ; from RIT 2.0 ("Ru" (delete @-) "ౄ") - ("r'u" (delete @-) "ౄ") ; from RIT 2.0 + ("r'u" (delete @-) "ౄ") ; from RIT 2.0 - ("~l" (delete @-) "") + ("~l" (delete @-) "ౢ") - ("~L" (delete @-) "") + ("~L" (delete @-) "ౣ") ("e" (delete @-) "ె") @@ -442,7 +472,7 @@ following ways: ("e'" (delete @-) "ే") ("ai" (delete @-) "ై") - ("ei" (delete @-) "ై") ; from RIT 3.0 + ("ei" (delete @-) "ై") ; from RIT 3.0 ("o" (delete @-) "ొ") @@ -453,14 +483,10 @@ following ways: ("au" (delete @-) "ౌ") ("ou" (delete @-) "ౌ") - ("ow" (delete @-) "ౌ") ; from RIT 3.0 - - - ; additional modifiers - ("^" (delete @-) "్‌") ; పొల్లు (pollu), విరమ (virama), halant + ("ow" (delete @-) "ౌ") ; from RIT 3.0 ) -(single_hash + (single_hash ("#" "") ) @@ -468,25 +494,29 @@ following ways: ("###" "#") ) - (invariant - ("a" "a" ) ("b" "b" ) ("c" "c" ) ("d" "d" ) ("e" "e") ("f" "f") ("g" "g") ("h" "h") ("i" "i") ("j" "j") - ("k" "k" ) ("l" "l" ) ("m" "m" ) ("n" "n" ) ("o" "o" ) ("p" "p" ) ("q" "q") ("r" "r" ) ("s" "s" ) ("t" "t" ) ("u" "u" ) - ("v" "v" ) ("w" "w" ) ("x" "x" ) ("y" "y" ) ("z" "z" ) + (invariant + ("a" "a") ("b" "b") ("c" "c") ("d" "d") ("e" "e") ("f" "f") ("g" "g") + ("h" "h") ("i" "i") ("j" "j") ("k" "k") ("l" 
"l") ("m" "m") ("n" "n") + ("o" "o") ("p" "p") ("q" "q") ("r" "r") ("s" "s") ("t" "t") ("u" "u") + ("v" "v") ("w" "w") ("x" "x") ("y" "y") ("z" "z") - ("A" "A" ) ("B" "B" ) ("C" "C" ) ("D" "D" ) ("E" "E" ) ("F" "F") ("G" "G" ) ("H" "H" ) ("I" "I" ) ("J" "J" ) ("K" "K" ) - ("L" "L" ) ("M" "M" ) ("N" "N" ) ("O" "O" ) ("P" "P" ) ("Q" "Q") ("R" "R" ) ("S" "S" ) ("T" "T" ) ("U" "U" ) ("V" "V" ) - ("W" "W" ) ("X" "X" ) ("Y" "Y" ) ("Z" "Z" ) + ("A" "A") ("B" "B") ("C" "C") ("D" "D") ("E" "E") ("F" "F") ("G" "G") + ("H" "H") ("I" "I") ("J" "J") ("K" "K") ("L" "L") ("M" "M") ("N" "N") + ("O" "O") ("P" "P") ("Q" "Q") ("R" "R") ("S" "S") ("T" "T") ("U" "U") + ("V" "V") ("W" "W") ("X" "X") ("Y" "Y") ("Z" "Z") - ("0" "0" ) ("1" "1" ) ("2" "2" ) ("3" "3" ) ("4" "4" ) ("5" "5" ) ("6" "6" ) ("7" "7" ) ("8" "8" ) ("9" "9" ) + ("0" "0") ("1" "1") ("2" "2") ("3" "3") ("4" "4") ("5" "5") ("6" "6") + ("7" "7") ("8" "8") ("9" "9") - ("~" "~") ("`" "`") ("!" "!") ("@" "@" ) ("$" "$") ("%" "%") ("^" "^") ("&" "&") ("*" "*") ("(" "(") (")" ")") ("_" "_") - ("-" "-") ("+" "+") ("=" "=") ("{" "{") ("[" "[") ("}" "}") ("]" "]") ("|" "|" ) ("\\" "\\") (":" ":") (";" ";") - ("\"" "\"") ("\'" "\'") ("<" "<") ("," ",") (">" ">") ("." ".") ("?" "?") ("/" "/") + ("~" "~") ("`" "`") ("!" "!") ("@" "@") ("$" "$") ("%" "%") ("^" "^") + ("&" "&") ("*" "*") ("(" "(") (")" ")") ("_" "_") ("-" "-") ("+" "+") + ("=" "=") ("{" "{") ("[" "[") ("}" "}") ("]" "]") ("|" "|" ) ("\\" "\\") + (":" ":") (";" ";") ("\"" "\"") ("\'" "\'") ("<" "<") ("," ",") (">" ">") + ("." ".") ("?" 
"?") ("/" "/") (" " " ") ((Tab) ("\t")) ((BackSpace) (undo)) ((Return) ("\n")) - ) + ) - ; m17n stuff (return ((Return))) @@ -494,9 +524,6 @@ following ways: ((BackSpace) (undo))) ) - - -; state machine for transliteration (state (init (starter (pushback 1) (shift intermediate)) @@ -504,7 +531,7 @@ following ways: (intermediate (consonant (shift second)) - (consonant2 (shift second)) + (consonant-without-sunna (shift second)) (sunna-inside-word (shift second-sunna-inside-word)) (sunna-endof-word (shift init)) (independent (shift init)) @@ -516,7 +543,7 @@ following ways: (second (consonant) - (consonant2) + (consonant-without-sunna) (sunna-inside-word (shift second-sunna-inside-word)) (sunna-endof-word (shift init)) (dependent (shift init)) @@ -527,7 +554,7 @@ following ways: (second-sunna-inside-word (t (mark p)) (consonant (move p) (delete @-) (delete @-) "ం" (move @>) (shift second)) - (consonant2 (shift second)) + (consonant-without-sunna (shift second)) (sunna-inside-word) (sunna-endof-word (shift init)) (dependent (shift init)) -- 1.7.10.4