chise/parser.rb

   1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
   2
   3 require "chise/chisedb"
   4 require "chise/utf8"
   5
   6 module CHISE
   7   module EntityReferenceModule
   8     PART    = "&([-+0-9A-Za-z#]+);"
   9     ALL     = '\A'+PART+'\Z'
  10     PART_RE = Regexp.new(PART)
  11     ALL_RE  = Regexp.new(ALL)
  12
  13     def contain_er?(s)  (PART_RE =~ s) != nil;  end
  14     def is_er?(s)       (ALL_RE =~ s)  != nil;  end
  15
  16     # the order is important.  The primary charset should be selectable.
  17     CCS_TABLE = [
  18       %w( =jis-x0208-1990       J90- 4 X),
  19       %w( =jis-x0208-1983       J83- 4 X),
  20       %w( =jis-x0208-1978       J78- 4 X),
  21       %w( =jis-x0208            J90- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  22       %w( =jis-x0208            J83- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  23       %w( =jis-x0208            J78- 4 X), # \8cp\8f³\82Ì\83A\83h\83z\83b\83N\82È\8eÀ\91\95
  24       %w( =jis-x0213-1-2000     JX1- 4 X),
  25       %w( =jis-x0213-2-2000     JX2- 4 X),
  26       %w( =jis-x0212            JSP- 4 X),
  27       %w( =big5-cdp             CDP- 4 X),
  28       %w( =big5                 B-   4 X),
  29       %w( =cns11643-1           C1-  4 X),
  30       %w( =cns11643-2           C2-  4 X),
  31       %w( =cns11643-3           C3-  4 X),
  32       %w( =cns11643-4           C4-  4 X),
  33       %w( =cns11643-5           C5-  4 X),
  34       %w( =cns11643-6           C6-  4 X),
  35       %w( =cns11643-7           C7-  4 X),
  36       %w( =ks-x1001             K0-  4 X),
  37       %w( =daikanwa             M-   5 d),
  38       %w( =cbeta                CB   5 d),
  39       %w( =gt                   GT-  5 d),
  40       %w( =gt-k                 GT-K 5 d),
  41       %w( =hanziku-1            HZK01- 4 X),
  42       %w( =hanziku-2            HZK02- 4 X),
  43       %w( =hanziku-3            HZK03- 4 X),
  44       %w( =hanziku-4            HZK04- 4 X),
  45       %w( =hanziku-5            HZK05- 4 X),
  46       %w( =hanziku-6            HZK06- 4 X),
  47       %w( =hanziku-7            HZK07- 4 X),
  48       %w( =hanziku-8            HZK08- 4 X),
  49       %w( =hanziku-9            HZK09- 4 X),
  50       %w( =hanziku-10           HZK10- 4 X),
  51       %w( =hanziku-11           HZK11- 4 X),
  52       %w( =hanziku-12           HZK12- 4 X),
  53       %w( =ruimoku-v6           RUI6-  4 X),
  54       %w( =jef-china3           JC3-   4 X),
  55     ]
  56   end
  57
  # Turns the various textual and numeric designations of a character into
  # its code number (MCS).
  class CharacterParser
    include EntityReferenceModule
    include UTF8Value

    # Base value that the number in an "&my-N;" reference is added to.
    PRIVATE_USE_AREA = 0xe000

    # Parse a value and return a number (MCS).
    #
    # Accepted forms of +c+:
    # * a String starting with "?": the remainder is handed to u8toi
    #   (not defined in this class; presumably provided by UTF8Value --
    #   TODO confirm)
    # * a String that is exactly one entity reference: see parse_er
    # * a String made of decimal digits only: converted with to_i
    # * a Numeric: converted with to_i; a negative value first has
    #   0x80000000 added to it
    #
    # Raises RuntimeError when +c+ is nil, is a String in none of the forms
    # above, or is neither a String nor a Numeric.
    def parse(c)
      raise "c is nil" if c.nil?

      if c.kind_of?(String)
        if /\A\?/ =~ c
          return u8toi(c.sub(/\A\?/, "")) # remove "?" in the head
        end

        return parse_er(c) if is_er?(c) # ER?

        # Anchor on the whole string rather than on a line: with ^...$ a
        # value such as "12\nabc" was accepted, and "abc\n12" silently
        # became 0.
        return c.to_i if /\A\d+\Z/ =~ c # only numbers?

        raise "unknown format"
      end

      if c.kind_of?(Numeric)
        # NOTE(review): presumably folds ids that arrive as negative numbers
        # back into the non-negative range -- confirm against the callers.
        c = 0x80000000 + c if c < 0
        return c.to_i
      end

      raise "unknown object"
    end

    # Parse an entity reference and return a number (MCS).
    #
    # +s+ must be exactly one entity reference ("&name;").  The name is
    # tried against these notations, in order:
    # * MCS-hex, U+hex / U-hex / Uhex, #xhex : the hexadecimal value
    # * #decimal                             : the decimal value
    # * my-decimal                           : PRIVATE_USE_AREA + value
    # * an optional "I-" prefix is dropped, then every CCS_TABLE row is
    #   tried; the first row whose prefix and digit count match and whose
    #   code resolves through get_ccs wins.
    #
    # Raises RuntimeError ("wrong ER.") when +s+ is not an entity reference
    # and RuntimeError ("unknown Entity Reference") when nothing matches.
    def parse_er(s)
      raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1.

      name = $1 # the part between "&" and ";"

      return $1.hex if name =~ /\AMCS-([0-9A-Fa-f]+)\Z/

      # Unicode code point in hexadecimal.
      return $1.hex if name =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
                       name =~ /\A#x([0-9A-Fa-f]+)\Z/

      # Unicode code point in decimal.
      return $1.to_i if name =~ /\A#([0-9]+)\Z/

      # "my-N" is placed relative to PRIVATE_USE_AREA.
      return PRIVATE_USE_AREA + $1.to_i if name =~ /\Amy-([0-9]+)\Z/

      # "I-" stands for Isolated character; the rest is looked up as usual.
      name = name.sub(/\AI-/, "")

      CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
        case numtype
        when "d" then digit = '\d'
        when "X" then digit = "[0-9A-Fa-f]"
        else next # unknown digit type: the row cannot be matched
        end

        # The prefix is literal text, so escape it before embedding it.
        pattern = "\\A#{Regexp.escape(er_prefix)}(#{digit}{#{keta},#{keta}})\\Z"
        next unless Regexp.new(pattern) =~ name

        codestr = $1
        code = (numtype == "d") ? codestr.to_i : codestr.hex

        u8 = get_ccs(ccs, code)
        next if u8.nil? # not found in this CCS; a later row may share the prefix

        num = parse(u8)
        next if num.nil?

        return num
      }

      raise "unknown Entity Reference"
    end

    private

    # Look up +code_point+ of the coded character set +ccs+ by delegating to
    # ChiseDB.instance.decode_char.  parse_er treats a nil result as
    # "not found".
    def get_ccs(ccs, code_point)
      ChiseDB.instance.decode_char(ccs, code_point)
    end
  end
 149
 150   class EntityReferenceParser
 151     include EntityReferenceModule
 152
 153     def de_er(s) # replace EntityReference with corresponding character.
 154       return s unless PART_RE =~ s # don't use contain_er? to get $1
 155
 156       er = "&"+$1+";"
 157       char = Character.get(er)
 158       ss = s.sub(Regexp.new(Regexp.escape(er)), char.utf8_mcs)
 159
 160       return de_er(ss) if contain_er?(ss) # recursive
 161       ss
 162     end
 163   end
 164
 165   class EntityReferenceEncoder
 166     include EntityReferenceModule
 167
 168     def to_er(char)
 169       cid = char.char_id
 170       return "&#x%04x;" % cid if cid <=  0xffff
 171       return "&#x%05x;" % cid if cid <= 0xfffff
 172
 173       CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
 174         code = char[ccs]
 175         next if code.nil?
 176         return "&#{er_prefix}%0#{keta}#{numtype};" % code
 177       }
 178
 179       "&MCS-%08X;" % cid # the last answer
 180     end
 181
 182     def to_er_by_ccs(cid, ccs) # not yet
 183     end
 184
 185   end
 186 end