7 ;; | (| . rxs) alternative
8 ;; | (rx . rx) concatination
9 ;; | (cc c1 c2 c3 ...) character class (natset)
11 ;; | (+ . rx) positive closure
14 ;; | (non-ascii) (cc 128)
18 ;; | pc primitive character class
19 ;; | (act . int) action
24 ;; | (c1 . c2) [c1 c2)
32 (defun rx-empty-p (rx) (equal rx []))
33 (defun rx-null-p (rx) (equal rx ()))
34 (defun rx-act-p (rx) (and (consp rx) (eq (car rx) 'act)))
35 (defun rx-cc-p (rx) (and (consp rx) (eq (car rx) 'cc)))
36 (defalias 'rx-pc-p 'integerp)
38 (defun rx-clo-p (rx) (and (consp rx) (eq (car rx) '*)))
39 (defun rx-alt-p (rx) (and (consp rx) (eq (car rx) '|)))
40 (defun rx-con-p (rx) (and (consp rx) (or (null (car rx)) (not (symbolp (car rx))))))
52 ((null (cdr rxs)) (car rxs))
54 (defun rx-alt2 (r1 r2)
66 ((and (rx-act-p r1) (rx-act-p r2)) r2)
68 (defun rx-act (obj) (cons 'act obj))
69 (defun rx-cc (cs) (cons 'cc cs))
71 ;;; regular expression preprocessing
73 (defun rx-range-to-ns (range)
75 ((char-or-char-int-p range)
76 (natset-single (char-int range)))
78 (let ((ns (natset-empty)) (chars (string-to-int-list range)))
80 (setq ns (natset-union ns (natset-single (car chars)))
83 ((eq range 'non-ascii)
87 (char-or-char-int-p (car range)))
88 (natset-seg (car range)))
92 (char-or-char-int-p (car range))
93 (char-or-char-int-p (cadr range)))
94 (natset-seg (char-int (car range)) (char-int (cadr range))))
96 (char-or-char-int-p (car range))
97 (char-or-char-int-p (cdr range)))
98 (natset-seg (char-int (car range)) (1- (char-int (cdr range)))))
99 (t (error "not range %s" range))))
101 (defun rx-vcc-to-rx (vcc)
102 (let ((l (append vcc ())) neg ns)
106 (setq l (mapcar 'rx-range-to-ns l))
107 (setq ns (natset-empty))
109 (setq ns (natset-union ns (car l))
111 (if neg (setq ns (natset-negate ns)))
112 (if (natset-empty-p ns)
116 (defun rx-desugar (rx)
118 ((stringp rx) (rx-desugar (string-to-int-list rx)))
119 ((vectorp rx) (rx-vcc-to-rx rx))
120 ((char-or-char-int-p rx) (rx-cc (natset-single (char-int rx))))
121 ((and (consp rx) (eq (car rx) '+)) (let ((r (rx-desugar (cdr rx)))) (rx-con r (rx-clo r))))
122 ((and (consp rx) (eq (car rx) 'non-ascii)) (rx-cc (natset-seg 128)))
123 ((and (consp rx) (eq (car rx) 'any)) (rx-cc (natset-full)))
128 ((rx-clo-p rx) (rx-clo (rx-desugar (cdr rx))))
129 ((rx-alt-p rx) (rx-alt (mapcar 'rx-desugar (cdr rx))))
130 ((rx-con-p rx) (rx-con (rx-desugar (car rx)) (rx-desugar (cdr rx))))
131 (t (error "not rx %s" rx))))
133 (defun rx-collect-cc (rx &optional cs)
138 ((rx-cc-p rx) (append (cdr rx) cs))
139 ((rx-clo-p rx) (rx-collect-cc (cdr rx) cs))
143 (setq cs (rx-collect-cc (car rx) cs)
146 ((rx-con-p rx) (rx-collect-cc (car rx) (rx-collect-cc (cdr rx) cs)))
147 (t (error "not rx %s" rx))))
149 (defun rx-cc-to-pc (rx cs)
157 (while (and (consp rx) (consp (cdr rx)))
158 (let ((start (car rx)) (end (cadr rx)))
159 (setq res (rx-filter (lambda (c) (and (<= start c) (< c end))) cs res)
162 (let ((start (car rx)))
163 (setq res (rx-filter (lambda (c) (<= start c)) cs res))))
164 (rx-alt (rx-sort-int res))))
165 ((rx-clo-p rx) (rx-clo (rx-cc-to-pc (cdr rx) cs)))
166 ((rx-alt-p rx) (rx-alt (mapcar (lambda (r) (rx-cc-to-pc r cs)) (cdr rx))))
167 ((rx-con-p rx) (rx-con (rx-cc-to-pc (car rx) cs) (rx-cc-to-pc (cdr rx) cs)))
168 (t (error "not rx %s" rx))))
170 (defun rx-categolize-char (rx)
171 (let ((cs (rx-sort-int (rx-collect-cc rx))))
178 (defun rx-nullable-p (rx)
180 ((rx-empty-p rx) nil)
187 (while (and (consp rx) (not (rx-nullable-p (car rx))))
191 (and (rx-nullable-p (car rx)) (rx-nullable-p (cdr rx))))
192 (t (error "not rx %s" rx))))
194 (defun rx-simplify (rx)
201 (rx-clo (rx-simplify (cdr rx))))
203 (let ((in (cdr rx)) (out ())
204 already-simplified-list already-simplified)
208 already-simplified (car already-simplified-list)
209 already-simplified-list (cdr already-simplified-list))
211 (setq in (append (cdr rx) in))
213 (setq rx (if already-simplified rx (rx-simplify rx)))
215 ((rx-empty-p rx)) ; [] is identity element for alternation.
217 (setq in (append (cdr rx) in)
218 already-simplified-list (append (make-list (length (cdr rx)) nil) already-simplified-list)))
219 ((not (member rx out))
220 (setq out (cons rx out)))))))
221 (rx-alt (rx-sort-rx (reverse out)))))
224 (let ((in (list (car rx) (cdr rx))) (out ())
225 already-simplified-list already-simplified)
229 already-simplified (car already-simplified-list)
230 already-simplified-list (cdr already-simplified-list))
232 (setq in (rx-cons* (car rx) (cdr rx) in))
234 (setq rx (if already-simplified rx (rx-simplify rx)))
236 ((rx-empty-p rx) ; [] is zero element for concatination.
238 ((rx-null-p rx)) ; () is identity element for concatination.
240 (setq in (rx-cons* (car rx) (cdr rx) in))
241 already-simplified-list (rx-cons* t t already-simplified-list))
243 (setq out (cons rx out)))))))
244 (if (= (length out) 1)
247 (t (error "not rx %s" rx))))
251 (defun rx-head-pcs (rx &optional res)
253 ((rx-empty-p rx) res)
256 ((rx-pc-p rx) (if (member rx res) res (cons rx res)))
257 ((rx-clo-p rx) (rx-head-pcs (cdr rx) res))
261 (setq res (rx-head-pcs (car rx) res)
265 (setq res (rx-head-pcs (car rx) res))
266 (if (rx-nullable-p (car rx))
267 (setq res (rx-head-pcs (cdr rx) res)))
269 (t (error "not rx %s" rx))))
271 (defun rx-head-act (rx &optional res)
273 ((rx-empty-p rx) res)
275 ((rx-act-p rx) (rx-better-act rx res))
277 ((rx-clo-p rx) (rx-head-act (cdr rx) res))
281 (setq res (rx-head-act (car rx) res)
285 (setq res (rx-head-act (car rx) res))
286 (if (rx-nullable-p (car rx))
287 (setq res (rx-head-act (cdr rx) res)))
289 (t (error "not rx %s" rx))))
293 (defun rx-step-internal (rx pc)
298 ((rx-pc-p rx) (if (= rx pc) () []))
299 ((rx-clo-p rx) (rx-con (rx-step-internal (cdr rx) pc) rx))
300 ((rx-alt-p rx) (rx-alt (mapcar (lambda (r) (rx-step-internal r pc)) (cdr rx))))
302 (if (rx-nullable-p (car rx))
304 (rx-con (rx-step-internal (car rx) pc) (cdr rx))
305 (rx-step-internal (cdr rx) pc))
306 (rx-con (rx-step-internal (car rx) pc) (cdr rx))))
307 (t (error "not rx %s" rx))))
309 (defun rx-step (rx &rest pcs)
311 (setq rx (rx-simplify (rx-step-internal rx (car pcs)))
317 (defun rx-better-act (a1 a2)
320 ((< (cdr a1) (cdr a2)) a1)
323 (defun rx-cons* (elt &rest lst)
326 (cons elt (apply 'rx-cons* (car lst) (cdr lst)))))
328 (defun rx-filter (fun lst &optional rest)
331 (if (funcall fun (car lst))
332 (cons (car lst) (rx-filter fun (cdr lst) rest))
333 (rx-filter fun (cdr lst) rest))))
335 (defun rx-cmp-index (rx)
337 ((rx-null-p rx) (list 0))
338 ((rx-act-p rx) (list 1 (cdr rx)))
339 ((rx-empty-p rx) (list 2))
340 ((rx-clo-p rx) (list 3 (cdr rx)))
341 ((rx-alt-p rx) (cons 4 (cdr rx)))
342 ((rx-con-p rx) (list 5 (car rx) (cdr rx)))
343 ((rx-pc-p rx) (list 6 rx))
344 (t (error "not rx %s" rx))))
346 (defun rx-cmp-int (i1 i2)
352 (defun rx-cmp-rx (r1 r2)
353 (let ((i1 (rx-cmp-index r1)) (i2 (rx-cmp-index r2)))
355 ((< (car i1) (car i2)) -1)
356 ((> (car i1) (car i2)) 1)
360 (while (and (consp i1) (consp i2))
361 (let ((r (if (and (integerp (car i1)) (integerp (car i2)))
362 (rx-cmp-int (car i1) (car i2))
363 (rx-cmp-rx (car i1) (car i2)))))
368 (if (null i1) (if (null i2) 0 -1) 1))))))
370 (defun rx-sort-rx (l &optional res)
373 (let ((e (car l)) lt gt cmp)
376 (setq cmp (rx-cmp-rx (car l) e))
378 ((< cmp 0) (setq lt (cons (car l) lt)))
379 ((< 0 cmp) (setq gt (cons (car l) gt))))
381 (rx-sort-rx lt (cons e (rx-sort-rx gt res))))))
383 (defun rx-sort-int (l &optional res)
386 (let ((e (car l)) lt gt)
390 ((< (car l) e) (setq lt (cons (car l) lt)))
391 ((< e (car l)) (setq gt (cons (car l) gt))))
393 (rx-sort-int lt (cons e (rx-sort-int gt res))))))