;;;-*- Mode: Lisp; Package: REGEX -*- ;;; ;; Copyright (C) Paul Meurer 2000. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;;; Regular expression parser. ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;; KNOWN BUGS: ;; ;;------------------------------------------------------------------------------------- (in-package "REGEX") (defclass tag-fsa (regexp-fsa) ()) (defclass tag-nfa (tag-fsa nfa) ()) (defclass tag-dfa (tag-fsa dfa) ()) (defmethod make-nfa ((fsa tag-fsa)) (make-instance 'tag-nfa)) (defmethod make-dfa ((fsa tag-fsa)) (make-instance 'tag-dfa)) (defmethod any-determinize ((fsa tag-dfa)) (let ((nfa (fsa::copy-to-nfa fsa))) (any-determinize nfa))) (defmethod any-determinize ((nfa tag-nfa)) (relation-map #'(lambda (state relation) (declare (ignore state)) (let ((any-dest (relation-get :@ relation))) (when any-dest (relation-map #'(lambda (key value) (unless (find key '(:@ :nl)) (set-map #'(lambda (dest) (set-insert dest value)) any-dest))) relation)))) (fsa-delta nfa)) (minimize nfa)) #+old? (defmethod string-match ((fsa regexp-dfa) (token cgp::token) &key minimal maximal exact) "If exact = t returns t if there is an exact match, and if MINIMAL/MAXIMAL = T only if the exact match is minimal/maximal. If EXACT = NIL returns all matching endpositions, only the first one if MINIMAL = T, only the last one if MAXIMAL = T" (let ((end ()) (string (cgp::token-value token))) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state = fsa-start-state and i = 0 when (set-member-p state fsa-final-states) do (cond (exact (let ((success (= i (length string)))) (when (or minimal success) (return-from string-match success)))) (maximal (if (and minimal end) (return-from string-match) (setf end i))) (minimal (setf end i)) (t (push i end))) until (or (= i (length string)) (null state) (and minimal (not maximal) end)) do (setf state (delta-get (char string i) state fsa-delta)) (incf i))) end)) #+old?? (defmethod string-match ((fsa regexp-dfa) (token cgp::token) &key minimal maximal exact) "If exact = t returns t if there is an exact match, and if MINIMAL/MAXIMAL = T only if the exact match is minimal/maximal. If EXACT = NIL returns all matching endpositions, only the first one if MINIMAL = T, only the last one if MAXIMAL = T" (let ((end ()) (string (cgp::token-value token))) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state = fsa-start-state and i = 0 when (set-member-p state fsa-final-states) do (cond (exact (let ((success (= i (length string)))) (when (or minimal success) (return-from string-match token)))) (maximal (if (and minimal end) (return-from string-match) (setf end i))) (minimal (setf end i)) (t (push i end))) until (or (and (= i (length string)) (null (cgp::token-next token))) (null state) (and minimal (not maximal) end)) do (if (= i (length string)) (setf state (delta-get #\Space state fsa-delta) token (cgp::token-next token) string (cgp::token-value token) i -1) (setf state (delta-get (char string i) state fsa-delta))) (incf i))) (values end token))) (defmethod string-match ((fsa regexp-dfa) (token cgp::token) &key minimal maximal exact end-test) "If exact = t returns the token which is the last one contributing to an exact match (or NIL if there is no such match), and if MINIMAL/MAXIMAL = T only if the exact match is minimal/maximal. If EXACT = NIL returns all pairs of matching endpositions (token-relative) and tokens they are in, only the first one (as first value token, as second value position) if MINIMAL = T, and only the last one if MAXIMAL = T" (declare (optimize (speed 3) (safety 0))) (let ((end ()) (string (cgp::token-value token)) (end-token nil)) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state = fsa-start-state and i = 0 when (and state (set-member-p state fsa-final-states)) do (cond (exact (let ((success (= i (length string)))) (cond (minimal (return-from string-match token)) (success (if maximal (setf end-token token) (return-from string-match token)))))) (maximal (if (and minimal end) (return-from string-match nil) (setf end i end-token token))) (minimal (setf end i)) (t (push (cons i token) end))) until (or (and (= i (length string)) (or (and end-test (funcall end-test (cgp::next-str-token token))) (null (cgp::next-str-token token)))) (null state) (and minimal (not maximal) end)) do (if (= i (length string)) (setf state (delta-get #\Space state fsa-delta) token (cgp::next-str-token token) string (cgp::token-value token) i -1) (setf state (delta-get (char string i) state fsa-delta))) (incf i))) (cond (exact end-token) ((consp end) end) (t (values (or end-token token) end))))) (defmethod string-match ((fsa regexp-dfa) (string-list list) &key minimal maximal exact &allow-other-keys) "If exact = t returns t if there is an exact match, and if MINIMAL/MAXIMAL = T only if the exact match is minimal/maximal. If EXACT = NIL returns all matching endpositions, only the first one if MINIMAL = T, only the last one if MAXIMAL = T" (let ((end ()) (string (car string-list)) (end-string nil)) (setf string-list (cdr string-list)) (with-slots (fsa-start-state fsa-final-states fsa-delta) fsa (loop with state = fsa-start-state and i = 0 when (and state (set-member-p state fsa-final-states)) do (cond (exact (let ((success (= i (length string)))) (cond (minimal (return-from string-match string)) (success (if maximal (setf end-string string) (return-from string-match string)))))) (maximal (if (and minimal end) (return-from string-match nil) (setf end i end-string string))) (minimal (setf end i)) (t (push (cons i string) end))) until (or (and (= i (length string)) (null string-list)) (null state) (and minimal (not maximal) end)) do (if (= i (length string)) (setf state (delta-get #\Space state fsa-delta) ;previous-string string string (car string-list) string-list (cdr string-list) i -1) (setf state (delta-get (char string i) state fsa-delta))) (incf i))) (cond (exact end-string) ((consp end) end) #+ignore ((zerop end) (values previous-string (when previous-string (length previous-string)))) (t (values (or end-string string) end))))) #+test (string-match *number-regexp* '("53" "153" "458") :maximal t :minimal nil :exact t) #+test (string-match *number-regexp* "+135 4506" :maximal t) (defparameter *number-regexp* (compile-regexp (u::concat "[\\+\\-]?[0-9][0-9]?[0-9]?[\\.\\ ]?([0-9][0-9][0-9][\\.\\ ]?)*(\\,[0-9]+)?[\\.\\%]?" "([\\/\\-\\:][\\+\\-]?[0-9][0-9]?[0-9]?[\\.\\ ]?([0-9][0-9][0-9][\\.\\ ]?)*(\\,[0-9]+)?[\\.\\%]?)?"))) #+test (string-match *number-regexp* "10:3" :exact t :minimal nil :maximal t) #+test (string-match *number-regexp* "9 ." :exact t :minimal nil :maximal nil) #+test (string-match *number-regexp* "5" :exact nil :minimal t :maximal nil) #+test (string-match *number-regexp* "54 334.129,4%-+5" :exact nil :minimal nil :maximal t) (defparameter *special-number-regexp* (compile-regexp "([0-9]+\\.)*[0-9]+\\.?")) #+test (string-match *special-number-regexp* "192.168.24.115" :exact t) (defparameter *area-measure-regexp* (compile-regexp "[0-9]+[xX]*")) ; ?? #+test (string-match *area-measure-regexp* "7xx" :exact t) (defparameter *rational-number-regexp* (compile-regexp "[\\+\\-]?[0-9]* ?[0-9]+(( / )|/)[0-9]+")) #+test (string-match *rational-number-regexp* "-17 3/5" :exact t) (defparameter *date-regexp* (compile-regexp (u::concat "((([0-2]?[0-9])|(3[0-1]))\\. ?((0?[0-9])|(1[0-2]))\\. ?[0-9][0-9]([0-9][0-9])?)|" "((([0-2]?[0-9])|(3[0-1]))\\-((0?[0-9])|(1[0-2]))\\-[0-9][0-9]([0-9][0-9])?)|" "((([0-2]?[0-9])|(3[0-1]))\\/((0?[0-9])|(1[0-2]))\\/[0-9][0-9]([0-9][0-9])?)|" "((([0-2]?[0-9])|(3[0-1]))\\/((0?[0-9])|(1[0-2]))(\\-[0-9][0-9]([0-9][0-9])?)?)|" "([0-9][0-9]([0-9][0-9])?\\.((0?[0-9])|(1[0-2]))\\.(([0-2]?[0-9])|(3[0-1])))|" "([0-9][0-9]([0-9][0-9])?\\-((0?[0-9])|(1[0-2]))\\-(([0-2]?[0-9])|(3[0-1])))|" "([0-9][0-9]([0-9][0-9])?\\/((0?[0-9])|(1[0-2]))\\/(([0-2]?[0-9])|(3[0-1])))"))) #+test (string-match *date-regexp* "21.12. 1980" :exact t) #+test (string-match *date-regexp* "1980.07.05" :exact t) #+test (string-match *date-regexp* "5/4-99" :exact t) (defparameter *time-regexp* (compile-regexp "(([0-1]?[0-9])|(2[0-4]))\\.[0-5][0-9]")) #+test (string-match *time-regexp* "23.05" :exact t) ; $romartalU = 'IVXLCDM'; ; $romartalL = 'ivxlcdm'; (defparameter *amount-regexp* (compile-regexp "[\\+\\-]?[0-9][0-9]?[0-9]?[\\.\\ ]?([0-9][0-9][0-9][\\.\\ ]?)*\\,\\-")) #+test (string-match *amount-regexp* "28.278,-" :exact t) ;; could be much more elaborate! (defparameter *uppercase-roman-numeral-regexp* (compile-regexp "[IVXLCDM][IVXLCDM]+")) ;(defparameter *roman-nr* (compile-regexp "(I?X|X?(I(V|((II?)?))|V(I(II?)?)?))")) #+test (string-match *uppercase-roman-numeral-regexp* "IIVIVVI" :exact t) ; hihi (defparameter *sentence-start-roman-numeral-regexp* (compile-regexp "[ivxlcdm]+\\ ?[\\.\\)]")) #+test (string-match *sentence-start-roman-numeral-regexp* "xi)" :exact t) ;; templates: ; : number ; : alphabetic character ; : lowercase character ; : uppercase character ; : newline ; : space