;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- ;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.35 2009/09/17 19:17:31 edi Exp $ ;;; The lexer's responsibility is to convert the regex string into a ;;; sequence of tokens which are in turn consumed by the parser. ;;; ;;; The lexer is aware of Perl's 'extended mode' and it also 'knows' ;;; (with a little help from the parser) how many register groups it ;;; has opened so far. (The latter is necessary for interpreting ;;; strings like "\\10" correctly.) ;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved. ;;; Redistribution and use in source and binary forms, with or without ;;; modification, are permitted provided that the following conditions ;;; are met: ;;; * Redistributions of source code must retain the above copyright ;;; notice, this list of conditions and the following disclaimer. ;;; * Redistributions in binary form must reproduce the above ;;; copyright notice, this list of conditions and the following ;;; disclaimer in the documentation and/or other materials ;;; provided with the distribution. ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (in-package :cl-ppcre) (declaim (inline map-char-to-special-class)) (defun map-char-to-special-char-class (chr) (declare #.*standard-optimize-settings*) "Maps escaped characters like \"\\d\" to the tokens which represent their associated character classes." (case chr ((#\d) :digit-class) ((#\D) :non-digit-class) ((#\w) :word-char-class) ((#\W) :non-word-char-class) ((#\s) :whitespace-char-class) ((#\S) :non-whitespace-char-class))) (defstruct (lexer (:constructor make-lexer-internal)) "LEXER structures are used to hold the regex string which is currently lexed and to keep track of the lexer's state." (str "" :type string :read-only t) (len 0 :type fixnum :read-only t) (reg 0 :type fixnum) (pos 0 :type fixnum) (last-pos nil :type list)) (defun make-lexer (string) (declare (inline make-lexer-internal) #-:genera (string string)) (make-lexer-internal :str (maybe-coerce-to-simple-string string) :len (length string))) (declaim (inline end-of-string-p)) (defun end-of-string-p (lexer) (declare #.*standard-optimize-settings*) "Tests whether we're at the end of the regex string." (<= (lexer-len lexer) (lexer-pos lexer))) (declaim (inline looking-at-p)) (defun looking-at-p (lexer chr) (declare #.*standard-optimize-settings*) "Tests whether the next character the lexer would see is CHR. Does not respect extended mode." (and (not (end-of-string-p lexer)) (char= (schar (lexer-str lexer) (lexer-pos lexer)) chr))) (declaim (inline next-char-non-extended)) (defun next-char-non-extended (lexer) (declare #.*standard-optimize-settings*) "Returns the next character which is to be examined and updates the POS slot. Does not respect extended mode." (cond ((end-of-string-p lexer) nil) (t (prog1 (schar (lexer-str lexer) (lexer-pos lexer)) (incf (lexer-pos lexer)))))) (defun next-char (lexer) (declare #.*standard-optimize-settings*) "Returns the next character which is to be examined and updates the POS slot. Respects extended mode, i.e. whitespace, comments, and also nested comments are skipped if applicable." (let ((next-char (next-char-non-extended lexer)) last-loop-pos) (loop ;; remember where we started (setq last-loop-pos (lexer-pos lexer)) ;; first we look for nested comments like (?#foo) (when (and next-char (char= next-char #\() (looking-at-p lexer #\?)) (incf (lexer-pos lexer)) (cond ((looking-at-p lexer #\#) ;; must be a nested comment - so we have to search for ;; the closing parenthesis (let ((error-pos (- (lexer-pos lexer) 2))) (unless ;; loop 'til ')' or end of regex string and ;; return NIL if ')' wasn't encountered (loop for skip-char = next-char then (next-char-non-extended lexer) while (and skip-char (char/= skip-char #\))) finally (return skip-char)) (signal-syntax-error* error-pos "Comment group not closed."))) (setq next-char (next-char-non-extended lexer))) (t ;; undo effect of previous INCF if we didn't see a # (decf (lexer-pos lexer))))) (when *extended-mode-p* ;; now - if we're in extended mode - we skip whitespace and ;; comments; repeat the following loop while we look at ;; whitespace or #\# (loop while (and next-char (or (char= next-char #\#) (whitespacep next-char))) do (setq next-char (if (char= next-char #\#) ;; if we saw a comment marker skip until ;; we're behind #\Newline... (loop for skip-char = next-char then (next-char-non-extended lexer) while (and skip-char (char/= skip-char #\Newline)) finally (return (next-char-non-extended lexer))) ;; ...otherwise (whitespace) skip until we ;; see the next non-whitespace character (loop for skip-char = next-char then (next-char-non-extended lexer) while (and skip-char (whitespacep skip-char)) finally (return skip-char)))))) ;; if the position has moved we have to repeat our tests ;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which ;; would be equivalent to /^a{3}c/ in Perl (unless (> (lexer-pos lexer) last-loop-pos) (return next-char))))) (declaim (inline fail)) (defun fail (lexer) (declare #.*standard-optimize-settings*) "Moves (LEXER-POS LEXER) back to the last position stored in \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack." (unless (lexer-last-pos lexer) (signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer)) (setf (lexer-pos lexer) (pop (lexer-last-pos lexer))) nil) (defun get-number (lexer &key (radix 10) max-length no-whitespace-p) (declare #.*standard-optimize-settings*) "Read and consume the number the lexer is currently looking at and return it. Returns NIL if no number could be identified. RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL we don't tolerate whitespace in front of the number." (when (or (end-of-string-p lexer) (and no-whitespace-p (whitespacep (schar (lexer-str lexer) (lexer-pos lexer))))) (return-from get-number nil)) (multiple-value-bind (integer new-pos) (parse-integer (lexer-str lexer) :start (lexer-pos lexer) :end (if max-length (let ((end-pos (+ (lexer-pos lexer) (the fixnum max-length))) (lexer-len (lexer-len lexer))) (if (< end-pos lexer-len) end-pos lexer-len)) (lexer-len lexer)) :radix radix :junk-allowed t) (cond ((and integer (>= (the fixnum integer) 0)) (setf (lexer-pos lexer) new-pos) integer) (t nil)))) (declaim (inline try-number)) (defun try-number (lexer &key (radix 10) max-length no-whitespace-p) (declare #.*standard-optimize-settings*) "Like GET-NUMBER but won't consume anything if no number is seen." ;; remember current position (push (lexer-pos lexer) (lexer-last-pos lexer)) (let ((number (get-number lexer :radix radix :max-length max-length :no-whitespace-p no-whitespace-p))) (or number (fail lexer)))) (declaim (inline make-char-from-code)) (defun make-char-from-code (number error-pos) (declare #.*standard-optimize-settings*) "Create character from char-code NUMBER. NUMBER can be NIL which is interpreted as 0. ERROR-POS is the position where the corresponding number started within the regex string." ;; only look at rightmost eight bits in compliance with Perl (let ((code (logand #o377 (the fixnum (or number 0))))) (or (and (< code char-code-limit) (code-char code)) (signal-syntax-error* error-pos "No character for hex-code ~X." number)))) (defun unescape-char (lexer) (declare #.*standard-optimize-settings*) "Convert the characters\(s) following a backslash into a token which is returned. This function is to be called when the backslash has already been consumed. Special character classes like \\W are handled elsewhere." (when (end-of-string-p lexer) (signal-syntax-error "String ends with backslash.")) (let ((chr (next-char-non-extended lexer))) (case chr ((#\E) ;; if \Q quoting is on this is ignored, otherwise it's just an ;; #\E (if *allow-quoting* :void #\E)) ((#\c) ;; \cx means control-x in Perl (let ((next-char (next-char-non-extended lexer))) (unless next-char (signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c' at position ~A.")) (code-char (logxor #x40 (char-code (char-upcase next-char)))))) ((#\x) ;; \x should be followed by a hexadecimal char code, ;; two digits or less (let* ((error-pos (lexer-pos lexer)) (number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t))) ;; note that it is OK if \x is followed by zero digits (make-char-from-code number error-pos))) ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) ;; \x should be followed by an octal char code, ;; three digits or less (let* ((error-pos (decf (lexer-pos lexer))) (number (get-number lexer :radix 8 :max-length 3))) (make-char-from-code number error-pos))) ;; the following five character names are 'semi-standard' ;; according to the CLHS but I'm not aware of any implementation ;; that doesn't implement them ((#\t) #\Tab) ((#\n) #\Newline) ((#\r) #\Return) ((#\f) #\Page) ((#\b) #\Backspace) ((#\a) (code-char 7)) ; ASCII bell ((#\e) (code-char 27)) ; ASCII escape (otherwise ;; all other characters aren't affected by a backslash chr)))) (defun read-char-property (lexer first-char) (declare #.*standard-optimize-settings*) (unless (eql (next-char-non-extended lexer) #\{) (signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char)) (let ((name (with-output-to-string (out nil :element-type #+:lispworks 'lw:simple-char #-:lispworks 'character) (loop (let ((char (or (next-char-non-extended lexer) (signal-syntax-error "Unexpected EOF after \\~A{." first-char)))) (when (char= char #\}) (return)) (write-char char out)))))) (list (if (char= first-char #\p) :property :inverted-property) ;; we must reverse here because of what PARSE-STRING does (nreverse name)))) (defun collect-char-class (lexer) "Reads and consumes characters from regex string until a right bracket is seen. Assembles them into a list \(which is returned) of characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and tokens representing special character classes." (declare #.*standard-optimize-settings*) (let ((start-pos (lexer-pos lexer)) ; remember start for error message hyphen-seen last-char list) (flet ((handle-char (c) "Do the right thing with character C depending on whether we're inside a range or not." (cond ((and hyphen-seen last-char) (setf (car list) (list :range last-char c) last-char nil)) (t (push c list) (setq last-char c))) (setq hyphen-seen nil))) (loop for first = t then nil for c = (next-char-non-extended lexer) ;; leave loop if at end of string while c do (cond ((char= c #\\) ;; we've seen a backslash (let ((next-char (next-char-non-extended lexer))) (case next-char ((#\d #\D #\w #\W #\s #\S) ;; a special character class (push (map-char-to-special-char-class next-char) list) ;; if the last character was a hyphen ;; just collect it literally (when hyphen-seen (push #\- list)) ;; if the next character is a hyphen do the same (when (looking-at-p lexer #\-) (push #\- list) (incf (lexer-pos lexer))) (setq hyphen-seen nil)) ((#\P #\p) ;; maybe a character property (cond ((null *property-resolver*) (handle-char next-char)) (t (push (read-char-property lexer next-char) list) ;; if the last character was a hyphen ;; just collect it literally (when hyphen-seen (push #\- list)) ;; if the next character is a hyphen do the same (when (looking-at-p lexer #\-) (push #\- list) (incf (lexer-pos lexer))) (setq hyphen-seen nil)))) ((#\E) ;; if \Q quoting is on we ignore \E, ;; otherwise it's just a plain #\E (unless *allow-quoting* (handle-char #\E))) (otherwise ;; otherwise unescape the following character(s) (decf (lexer-pos lexer)) (handle-char (unescape-char lexer)))))) (first ;; the first character must not be a right bracket ;; and isn't treated specially if it's a hyphen (handle-char c)) ((char= c #\]) ;; end of character class ;; make sure we collect a pending hyphen (when hyphen-seen (setq hyphen-seen nil) (handle-char #\-)) ;; reverse the list to preserve the order intended ;; by the author of the regex string (return-from collect-char-class (nreverse list))) ((and (char= c #\-) last-char (not hyphen-seen)) ;; if the last character was 'just a character' ;; we expect to be in the middle of a range (setq hyphen-seen t)) ((char= c #\-) ;; otherwise this is just an ordinary hyphen (handle-char #\-)) (t ;; default case - just collect the character (handle-char c)))) ;; we can only exit the loop normally if we've reached the end ;; of the regex string without seeing a right bracket (signal-syntax-error* start-pos "Missing right bracket to close character class.")))) (defun maybe-parse-flags (lexer) (declare #.*standard-optimize-settings*) "Reads a sequence of modifiers \(including #\\- to reverse their meaning) and returns a corresponding list of \"flag\" tokens. The \"x\" modifier is treated specially in that it dynamically modifies the behaviour of the lexer itself via the special variable *EXTENDED-MODE-P*." (prog1 (loop with set = t for chr = (next-char-non-extended lexer) unless chr do (signal-syntax-error "Unexpected end of string.") while (find chr "-imsx" :test #'char=) ;; the first #\- will invert the meaning of all modifiers ;; following it if (char= chr #\-) do (setq set nil) else if (char= chr #\x) do (setq *extended-mode-p* set) else collect (if set (case chr ((#\i) :case-insensitive-p) ((#\m) :multi-line-mode-p) ((#\s) :single-line-mode-p)) (case chr ((#\i) :case-sensitive-p) ((#\m) :not-multi-line-mode-p) ((#\s) :not-single-line-mode-p)))) (decf (lexer-pos lexer)))) (defun get-quantifier (lexer) (declare #.*standard-optimize-settings*) "Returns a list of two values (min max) if what the lexer is looking at can be interpreted as a quantifier. Otherwise returns NIL and resets the lexer to its old position." ;; remember starting position for FAIL and UNGET-TOKEN functions (push (lexer-pos lexer) (lexer-last-pos lexer)) (let ((next-char (next-char lexer))) (case next-char ((#\*) ;; * (Kleene star): match 0 or more times '(0 nil)) ((#\+) ;; +: match 1 or more times '(1 nil)) ((#\?) ;; ?: match 0 or 1 times '(0 1)) ((#\{) ;; one of ;; {n}: match exactly n times ;; {n,}: match at least n times ;; {n,m}: match at least n but not more than m times ;; note that anything not matching one of these patterns will ;; be interpreted literally - even whitespace isn't allowed (let ((num1 (get-number lexer :no-whitespace-p t))) (if num1 (let ((next-char (next-char-non-extended lexer))) (case next-char ((#\,) (let* ((num2 (get-number lexer :no-whitespace-p t)) (next-char (next-char-non-extended lexer))) (case next-char ((#\}) ;; this is the case {n,} (NUM2 is NIL) or {n,m} (list num1 num2)) (otherwise (fail lexer))))) ((#\}) ;; this is the case {n} (list num1 num1)) (otherwise (fail lexer)))) ;; no number following left curly brace, so we treat it ;; like a normal character (fail lexer)))) ;; cannot be a quantifier (otherwise (fail lexer))))) (defun parse-register-name-aux (lexer) "Reads and returns the name in a named register group. It is assumed that the starting #\< character has already been read. The closing #\> will also be consumed." ;; we have to look for an ending > character now (let ((end-name (position #\> (lexer-str lexer) :start (lexer-pos lexer) :test #'char=))) (unless end-name ;; there has to be > somewhere, syntax error otherwise (signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>.")) (let ((name (subseq (lexer-str lexer) (lexer-pos lexer) end-name))) (unless (every #'(lambda (char) (or (alphanumericp char) (char= #\- char))) name) ;; register name can contain only alphanumeric characters or #\- (signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group.")) ;; advance lexer beyond "" part (setf (lexer-pos lexer) (1+ end-name)) name))) (defun get-token (lexer) (declare #.*standard-optimize-settings*) "Returns and consumes the next token from the regex string \(or NIL)." ;; remember starting position for UNGET-TOKEN function (push (lexer-pos lexer) (lexer-last-pos lexer)) (let ((next-char (next-char lexer))) (cond (next-char (case next-char ;; the easy cases first - the following six characters ;; always have a special meaning and get translated ;; into tokens immediately ((#\)) :close-paren) ((#\|) :vertical-bar) ((#\?) :question-mark) ((#\.) :everything) ((#\^) :start-anchor) ((#\$) :end-anchor) ((#\+ #\*) ;; quantifiers will always be consumend by ;; GET-QUANTIFIER, they must not appear here (signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char)) ((#\{) ;; left brace isn't a special character in it's own ;; right but we must check if what follows might ;; look like a quantifier (let ((this-pos (lexer-pos lexer)) (this-last-pos (lexer-last-pos lexer))) (unget-token lexer) (when (get-quantifier lexer) (signal-syntax-error* (car this-last-pos) "Quantifier '~A' not allowed." (subseq (lexer-str lexer) (car this-last-pos) (lexer-pos lexer)))) (setf (lexer-pos lexer) this-pos (lexer-last-pos lexer) this-last-pos) next-char)) ((#\[) ;; left bracket always starts a character class (cons (cond ((looking-at-p lexer #\^) (incf (lexer-pos lexer)) :inverted-char-class) (t :char-class)) (collect-char-class lexer))) ((#\\) ;; backslash might mean different things so we have ;; to peek one char ahead: (let ((next-char (next-char-non-extended lexer))) (case next-char ((#\A) :modeless-start-anchor) ((#\Z) :modeless-end-anchor) ((#\z) :modeless-end-anchor-no-newline) ((#\b) :word-boundary) ((#\B) :non-word-boundary) ((#\k) (cond ((and *allow-named-registers* (looking-at-p lexer #\<)) ;; back-referencing a named register (incf (lexer-pos lexer)) (list :back-reference (nreverse (parse-register-name-aux lexer)))) (t ;; false alarm, just unescape \k #\k))) ((#\d #\D #\w #\W #\s #\S) ;; these will be treated like character classes (map-char-to-special-char-class next-char)) ((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) ;; uh, a digit... (let* ((old-pos (decf (lexer-pos lexer))) ;; ...so let's get the whole number first (backref-number (get-number lexer))) (declare (fixnum backref-number)) (cond ((and (> backref-number (lexer-reg lexer)) (<= 10 backref-number)) ;; \10 and higher are treated as octal ;; character codes if we haven't ;; opened that much register groups ;; yet (setf (lexer-pos lexer) old-pos) ;; re-read the number from the old ;; position and convert it to its ;; corresponding character (make-char-from-code (get-number lexer :radix 8 :max-length 3) old-pos)) (t ;; otherwise this must refer to a ;; backreference (list :back-reference backref-number))))) ((#\0) ;; this always means an octal character code ;; (at most three digits) (let ((old-pos (decf (lexer-pos lexer)))) (make-char-from-code (get-number lexer :radix 8 :max-length 3) old-pos))) ((#\P #\p) ;; might be a named property (cond (*property-resolver* (read-char-property lexer next-char)) (t next-char))) (otherwise ;; in all other cases just unescape the ;; character (decf (lexer-pos lexer)) (unescape-char lexer))))) ((#\() ;; an open parenthesis might mean different things ;; depending on what follows... (cond ((looking-at-p lexer #\?) ;; this is the case '(?' (and probably more behind) (incf (lexer-pos lexer)) ;; we have to check for modifiers first ;; because a colon might follow (let* ((flags (maybe-parse-flags lexer)) (next-char (next-char-non-extended lexer))) ;; modifiers are only allowed if a colon ;; or a closing parenthesis are following (when (and flags (not (find next-char ":)" :test #'char=))) (signal-syntax-error* (car (lexer-last-pos lexer)) "Sequence '~A' not recognized." (subseq (lexer-str lexer) (car (lexer-last-pos lexer)) (lexer-pos lexer)))) (case next-char ((nil) ;; syntax error (signal-syntax-error "End of string following '(?'.")) ((#\)) ;; an empty group except for the flags ;; (if there are any) (or (and flags (cons :flags flags)) :void)) ((#\() ;; branch :open-paren-paren) ((#\>) ;; standalone :open-paren-greater) ((#\=) ;; positive look-ahead :open-paren-equal) ((#\!) ;; negative look-ahead :open-paren-exclamation) ((#\:) ;; non-capturing group - return flags as ;; second value (values :open-paren-colon flags)) ((#\<) ;; might be a look-behind assertion or a named group, so ;; check next character (let ((next-char (next-char-non-extended lexer))) (if (alpha-char-p next-char) (progn ;; we have encountered a named group ;; are we supporting register naming? (unless *allow-named-registers* (signal-syntax-error* (1- (lexer-pos lexer)) "Character '~A' may not follow '(?<'." next-char)) ;; put the letter back (decf (lexer-pos lexer)) ;; named group :open-paren-less-letter) (case next-char ((#\=) ;; positive look-behind :open-paren-less-equal) ((#\!) ;; negative look-behind :open-paren-less-exclamation) ((#\)) ;; Perl allows "(?<)" and treats ;; it like a null string :void) ((nil) ;; syntax error (signal-syntax-error "End of string following '(?<'.")) (t ;; also syntax error (signal-syntax-error* (1- (lexer-pos lexer)) "Character '~A' may not follow '(?<'." next-char )))))) (otherwise (signal-syntax-error* (1- (lexer-pos lexer)) "Character '~A' may not follow '(?'." next-char))))) (t ;; if next-char was not #\? (this is within ;; the first COND), we've just seen an opening ;; parenthesis and leave it like that :open-paren))) (otherwise ;; all other characters are their own tokens next-char))) ;; we didn't get a character (this if the "else" branch from ;; the first IF), so we don't return a token but NIL (t (pop (lexer-last-pos lexer)) nil)))) (declaim (inline unget-token)) (defun unget-token (lexer) (declare #.*standard-optimize-settings*) "Moves the lexer back to the last position stored in the LAST-POS stack." (if (lexer-last-pos lexer) (setf (lexer-pos lexer) (pop (lexer-last-pos lexer))) (error "No token to unget \(this should not happen)"))) (declaim (inline start-of-subexpr-p)) (defun start-of-subexpr-p (lexer) (declare #.*standard-optimize-settings*) "Tests whether the next token can start a valid sub-expression, i.e. a stand-alone regex." (let* ((pos (lexer-pos lexer)) (next-char (next-char lexer))) (not (or (null next-char) (prog1 (member (the character next-char) '(#\) #\|) :test #'char=) (setf (lexer-pos lexer) pos))))))