;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2000-2004. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; ;; ;; Reimplementation in ANSI CommonLisp of the multi-tagger module ;; (Perl program) written by Lars Jørgen Tvedt, UiO 1999 ;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO ;; (Dokumentasjonsprosjektet; Tekstlaboratoriet) ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;;------------------------------------------------------------------------------------- (in-package "CGP") ;; should split up into several classes (defclass token () ((chain :initform nil :initarg :chain :accessor token-chain) ; token-chain (sentence) the token is a member of (prev :initform nil :initarg :prev :accessor token-prev) ; previous token in chain (next :initform nil :initarg :next :accessor token-next) ; next token in chain (id :initform nil :reader token-id) ;; value of token (eg. a word, :newline etc.) (value :initform nil :initarg :value :accessor token-value) (normalized-value :initform nil :initarg :normalized-value :accessor token-normalized-value) ;; tokens this token is a concatenation of (expansion :initform nil :initarg :expansion :accessor token-expansion) (features :initform nil :initarg :features :accessor token-features) ; token features (filtered-features :initform nil :initarg :filtered-features :accessor token-filtered-features) ; filtered (transformed) token features (attributes :initform nil :initarg :attributes :accessor token-attributes) ; other attributes (language :initform nil :initarg :language :accessor token-language) ; language of word token (position :initform nil :initarg :position :accessor token-stream-position) (tree-position :initform nil :initarg :tree-position :accessor token-tree-position) (whitespace :initform nil :initarg :whitespace :accessor token-whitespace) ;; rules used when eliminating readings in CG parse (used-rules :initform nil :initarg :used-rules :accessor token-used-rules) (correct-reading-discarded-p :initform nil :accessor correct-reading-discarded-p) (lc-features :initform nil :accessor lc-features) (alist :initform () :accessor token-alist) (insignificant-p :initform nil :initarg insignificant-p :accessor insignificant-p) )) (defmethod initialize-instance :after ((token token) &key &allow-other-keys) (with-slots (id chain) token (when chain (setf id (incf (token-max-id chain)))))) ;; token class for comparing two CGs (defclass compare-token (token) ((compare-features :initform nil :accessor compare-features) (compare-used-rules :initform nil :accessor compare-used-rules))) (defvar *token-class* 'token) (defmethod token-class ((sentence-class (eql 'sentence))) *token-class*) (defmethod token-class ((sentence-class (eql 'compare-sentence))) 'compare-token) (defun get-token (&rest keys &key token-class &allow-other-keys) (apply #'make-instance (or token-class (token-class *sentence-class*)) keys)) (defmethod print-object ((token token) stream) (with-slots (value) token (print-unreadable-object (token stream :type t :identity t) (when (slot-boundp token 'value) (if (and (eq value :insignificant) (token-attributes token)) (write (token-attributes token) :stream stream :escape t) (write value :stream stream :escape t)))))) (defun token-p (object) (typep object 'token)) ;; A TOKEN-CHAIN is a double-linked list of tokens (defclass token-chain () ((first-token :initform nil :initarg :first-token :accessor first-token) (last-token :initform nil :initarg :last-token :accessor last-token) (token-max-id :initform 0 :accessor token-max-id))) ;; The SENTENCE-ARRAY is an array containing the tokens of the sentence token-chain, ;; plus an initial sentence-start token (>>>). ;; SENTENCE-ARRAY is useful in the CG parser module and is initialized only before ;; disambiguation. (defclass sentence (token-chain) ((sentence-array :accessor sentence-array) (stream :initarg :stream :accessor sentence-stream) (tokenizer :initform *tokenizer* :initarg :tokenizer :reader tokenizer) (encoding :initarg :encoding :initform :macintosh :reader encoding) (cg :initarg :cg :reader constraint-grammar) (feature-vector :initform nil :initarg :feature-vector :reader feature-vector))) (defclass document-sentence (sentence) ((document :initform nil :initarg :document :reader sentence-document) (start :initform nil :initarg :start :reader document-sentence-start) (end :initform nil :initarg :end :reader document-sentence-end) (read-only-p :initform nil :initarg :read-only-p :accessor read-only-p) (tagging-changed-p :initform nil :accessor tagging-changed-p))) (defclass cgp-document-sentence (document-sentence) ()) (defmethod print-object ((sentence sentence) stream) (print-unreadable-object (sentence stream :type t :identity t) (labels ((walk (token count) (cond ((null token) nil) ((or (not (slot-boundp token 'value)) (eq (token-value token) :newline)) (walk (token-next token) count)) ((> count 3) (write-string "..." stream)) ((eq (token-value token) :insignificant) (write-string "% " stream) (walk (token-next token) count)) (t (format stream "~a" (token-value token)) (write-char #\Space stream) (walk (token-next token) (1+ count)))))) (walk (first-token sentence) 0)))) (defmethod matches ((sentence sentence)) nil) (defclass regexp-sentence (sentence) ((matches :initform () :accessor matches) (pending-operations :initform nil :accessor pending-operations))) (defclass regexp-token (token) (#+old(lc-features :initform nil :accessor lc-features) (match :initform nil :accessor match))) (defclass control-token (regexp-token) ((control-features :initform () :initarg :control-features :accessor token-control-features))) (defmethod initialize-instance :after ((sentence sentence) &key &allow-other-keys) (setf (sentence-array sentence) (make-array 1 :initial-element (get-token :value '(>>>) :features (list (cons '>>> (encode-features '>>>))) :chain sentence) :adjustable t :fill-pointer t)) sentence) (defmethod sentence-length ((sentence sentence)) (fill-pointer (sentence-array sentence))) (defvar *sentence-class* 'sentence) ;; class for testing two grammars against each other (defclass compare-sentence (sentence) ((compare-cg :initarg :compare-cg :accessor compare-cg))) ;; change later: get sentence from token pool if pool is not empty? (defun get-sentence (&rest rest &key (sentence-class *sentence-class*) (tokenizer *tokenizer*) &allow-other-keys) (apply #'make-instance sentence-class :tokenizer tokenizer rest)) (defun insert-token (token new-token) "Inserts a NEW-TOKEN between a TOKEN and its successor. Returns TOKEN." (let ((next (token-next token))) (setf (token-next token) new-token (token-prev new-token) token (token-next new-token) next) (when next (setf (token-prev next) new-token)) token)) (defun remove-token (token) "Removes a TOKEN from a token chain. Returns next and previous token." (let ((prev (token-prev token)) (next (token-next token))) (when prev (setf (token-next prev) next)) (when next (setf (token-prev next) prev)) ;; ev. reset first-token or last-token of chain (let ((chain (token-chain token))) (cond ((null chain) nil) ((eq token (first-token chain)) (setf (first-token chain) next)) ((eq token (last-token chain)) (setf (last-token chain) prev)) (t nil))) ;; add removed token to token pool? (values next prev))) ;; change later: get token from token pool if pool is not empty #+ignore (defun get-token (&key value features prev next chain) (make-token :value value :features features :prev prev :next next :chain chain)) (defmethod add-token ((chain token-chain) value &rest rest &key features normalized-value &allow-other-keys) "Adds a token at the end of a token chain, returns that token." #+debug(print (cons :value value)) (if (null (first-token chain)) (setf (first-token chain) (apply #'get-token :value value :chain chain rest) (last-token chain) (first-token chain)) (let ((token (token-next (last-token chain)))) (if token (setf (token-value token) value (token-features token) features (token-normalized-value token) normalized-value) (let ((last-token (last-token chain))) (setf token (apply #'get-token :value value :chain chain rest) (token-next last-token) token (token-prev token) last-token))) (setf (last-token chain) token)))) (defmethod push-token ((chain token-chain) value &rest rest &key features normalized-value &allow-other-keys) "Adds a token at the start of a token chain, returns that token." (let ((first-token (first-token chain)) (new-token (apply #'get-token :value value :chain chain rest))) (setf (first-token chain) new-token) (if (null first-token) (setf (last-token chain) (first-token chain)) (setf (token-next new-token) first-token (token-prev first-token) new-token)))) (defmethod find-token ((token-chain token-chain) value &key start (key #'identity) (test #'eq) from-end) "Finds a token in a token chain" (labels ((walk (token) (cond ((null token) nil) ((funcall test value (funcall key token)) token) (from-end (walk (token-prev token))) (t (walk (token-next token)))))) (walk (cond (start start) (from-end (last-token token-chain)) (t (first-token token-chain)))))) (defmethod find-token-if ((token-chain token-chain) predicate &key start (key #'identity) from-end) (labels ((walk (token) (cond ((null token) nil) ((funcall predicate (funcall key token)) token) (from-end (walk (token-prev token))) (t (walk (token-next token)))))) (walk (cond (start start) (from-end (last-token token-chain)) (t (first-token token-chain)))))) (defmethod token-position ((chain token-chain) token) "position of TOKEN in CHAIN or NIL" (do ((tk (first-token chain) (token-next tk)) (i 0 (1+ i))) ((or (eq tk token) (null tk)) (when tk i)))) (defmethod token-distance (token1 token2) "distance between TOKEN1 <= TOKEN2, or NIL" (do ((tk token1 (token-next tk)) (i 0 (1+ i))) ((or (eq tk token2) (null tk)) (when tk i)))) (defmethod map-tokens ((token-chain token-chain) function &key start all-p expand-tokens-p walk-function) (labels ((walk (token &optional concat-token) (cond ((null token) nil) (walk-function (funcall function token) (walk (funcall walk-function token))) ((and (or expand-tokens-p all-p) (token-expansion token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (when all-p (funcall function token)) (walk (token-next token) nil)) (t (unless concat-token (funcall function token) (walk (token-next token) nil)))))) (walk (or start (first-token token-chain))))) #+old (defmethod map-tokens ((token-chain token-chain) function &key start walk-function) (labels ((walk (token) (when token (funcall function token) (walk (funcall (or walk-function #'token-next) token))))) (walk (or start (first-token token-chain))))) ; push-token ; pop-token :eof