;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-
;;
;; Copyright (C) Paul Meurer 2000 - 2007. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
;; Reimplementation in ANSI Common Lisp of the multi-tagger module
;; (Perl program) written by Lars Jørgen Tvedt, UiO 1999.
;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO
;; (Dokumentasjonsprosjektet; Tekstlaboratoriet).
;;
;; Short description of the algorithm
;;
;; Parsing consists of the following steps:
;; 1. Tokenization
;; 2. Determination of sentence boundaries
;; 3. Multitagging of tokens (including recognition of compounds)
;; 4. Disambiguation using the CG parser
;;
;; The tokenizer performs an initial tokenization of the input (NEXT-WORD), using a line
;; buffer which holds the last line read and the current position in that line.
;; In BUILD-SENTENCE, the tokens are successively read into a TOKEN-CHAIN (SENTENCE).
;; Each token that might represent a sentence boundary (newline, punctuation) is immediately
;; tested using left context only (no right context is available at that stage), and the
;; previously found candidate sentence terminator (if any) undergoes a final test, this
;; time using both left and right context (right context is now available up to the new
;; candidate terminator). If the token passes the test, we have found a sentence boundary:
;; the token chain is split at that token, the left part representing the fully tokenized
;; sentence and the right part a partially tokenized new sentence.
;; After a sentence is found, several adjustments are made to the tokenization, e.g.
;; words divided across lines are joined.
;;
;; In the next step, the tokens are multitagged (TAG-SENTENCE).
;;
;; Since the CG parser needs random access to the tokens in a sentence, whereas a token-chain
;; only provides sequential access, a vector (SENTENCE-ARRAY) containing the tokens is built.
;;
;; Thread safety is achieved by using object slots instead of globals (e.g. in "compounds.lisp")
;; or by rebinding globals dynamically to thread-owned objects (*line-buffer*).
;; Thread safety is essential in a web context where several users may access the
;; tagger simultaneously.
;;-------------------------------------------------------------------------------------
;; TO DO:
;; - parentheses and quotes after sentence-ending punctuation
;;
;; KNOWN BUGS:
;; - q. e. d.
;;
;; QUESTIONS:
;; "så" SBU missing in "delkorp.cor"
;; "da" pref missing
;; "i morgen" etc., but not "i dag"
;;-------------------------------------------------------------------------------------

(in-package :cgp)

(defparameter *terminate-on-following-insignificant-token-p* nil)
(defparameter *lookup-unknown-in-nny-lexicon-p* nil)
(defparameter *merge-hyphenated-words-p* t)
(defparameter *recognize-sgml-tags-p* nil)
(defparameter *no-compounds-p* nil)

(defvar *token-memory* nil)

(#-sbcl defconstant #+sbcl defparameter $encoding
        #+(and mcl (not openmcl)) :macintosh
        #+unix :unix
        #+mswindows :windows)

(defparameter $expression-max-length 5)

;; Table holding all registered Constraint Grammars
(defvar *cg-table* (make-hash-table :test #'equal :size 16))

(defvar *nbo-cg*)
(defvar *nny-cg*)

;; Object holding the CG rules. When parsing, dynamically bind *CG* to a grammar.
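;; For example, a caller (typically one per thread or web request) might set up the
;; dynamic state along these lines. This is only a sketch: it assumes that *NBO-CG*,
;; *NBO-TAGGER*, the lexica and DISAMBIGUATE-FROM-STRING have been loaded and
;; initialized elsewhere in the system.
#+test
(let ((*cg* *nbo-cg*)
      (*tagger* *nbo-tagger*)
      (*line-buffer* (make-instance 'line-buffer))) ; fresh buffer for thread safety
  (disambiguate-from-string "Du er dr. scient. eller Mag. art. og ingenting annet."))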
(defvar *cg* t) (defparameter *context-size* 1000) (defparameter *regexp-parser* nil) (defparameter *term-extractor* nil) (defparameter *np-recognizer* nil) ;; is set in term-extractor.lisp (defparameter *name-term-extractor* nil) ;; debug (defparameter *memory* nil) ;; for debugging (defvar *sentence* nil) (defparameter *record-tokenization-p* t) (defparameter *debug-mem* nil) ;; buffer holding the line being tokenized (defclass line-buffer () ((line :initform "" :accessor buffered-line) (length :initform 0 :accessor buffered-line-length) (pos :initform 0 :accessor buffered-line-position) (stream-pos :initform 0 :accessor stream-pos) (stream-line-length :initform 0 :accessor stream-line-length) (encoding :initarg :encoding :initform $encoding :reader encoding))) ;; Must be bound dynamically to a fresh buffer in order to achieve thread safety (defvar *line-buffer* t) (#-sbcl defconstant #+sbcl defparameter $whitespace #.(coerce (list #\Space #\Tab #+allegro #\no-break_space #-allegro (code-char 160) #\Newline) 'string)) (#-sbcl defconstant #+sbcl defparameter $punctuation ".,!?;:()[]{}") (#-sbcl defconstant #+sbcl defparameter $parenthesis "()[]{}") (#-sbcl defconstant #+sbcl defparameter $left-punctuation "([{") (#-sbcl defconstant #+sbcl defparameter $right-punctuation ".,!?;:)]}") (#-sbcl defconstant #+sbcl defparameter $quote #+allegro (coerce '(#\" #\' #\left_double_quotation_mark #\right_double_quotation_mark #\left_single_quotation_mark #\right_single_quotation_mark #\left-pointing_double_angle_quotation_mark #\right-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $left-quote #+allegro (coerce '(#\" #\' #\left_double_quotation_mark #\left_single_quotation_mark #\left-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $right-quote #+allegro (coerce '(#\" #\' #\right_double_quotation_mark #\right_single_quotation_mark #\right-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $whitespace+punctuation (concat $whitespace $punctuation)) (#-sbcl defconstant #+sbcl defparameter $terminator ".?:!|") (#-sbcl defconstant #+sbcl defparameter $special-character "!?.<>ß@£$%&/=*-") (defun digit-p (char) (char<= #\0 char #\9)) (defun special-char-p (char) (find char $special-character)) (defun quote-p (char) (find char $quote)) (defun left-parenthesis-p (char) (find char "([{")) (defun left-quote-p (char) (find char $left-quote)) (defun right-quote-p (char) (find char $right-quote)) (defun whitespace-p (char) (find char $whitespace)) (defun punctuation-p (char) (find char $punctuation)) (defun left-punctuation-p (char) (find char $left-punctuation)) (defun right-punctuation-p (char) (find char $right-punctuation)) (defun whitespace-or-punctuation-p (char) (find char $whitespace+punctuation)) (defun parenthesis-p (char) (find char $parenthesis)) (defun comment-start-p (pos line) (and (< pos (1- (length line))) (char= (char line pos) #\/) (char= (char line (1+ pos)) #\*))) (defun sgml-tag-start-p (pos line) (char= (char line pos) #\<)) (defparameter *eof-magic-string* nil) (defmethod read-next-line ((stream stream)) (read-line stream nil :eof)) (defvar *separate-quotes-p* nil) (defun next-word (stream &optional (buffer *line-buffer*) in-comment-p) "Tokenizes the input stream and returns one (string) token or :newline, and file position as second value." 
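  ;; Sketch of the control flow: when POS has reached LENGTH, the next line is read from
  ;; the stream into the buffer and :NEWLINE is returned (NIL at end of input); otherwise
  ;; the next token is cut out of the buffered line. Runs of identical punctuation
  ;; characters are kept together, /* ... */ comments (and SGML tags when
  ;; *RECOGNIZE-SGML-TAGS-P* is set) are skipped by recursing, and the token string is
  ;; returned together with its position in the stream.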
(with-slots (line length pos stream-pos stream-line-length encoding) buffer (declare (fixnum length pos)) (unless (eq line :eof) (if (= length pos) (let ((new-line (read-next-line stream))) (unless (or (eq new-line :eof) (eq encoding $encoding)) (setf new-line (convert-string new-line encoding $encoding))) (setf stream-line-length (if (eq new-line :eof) 0 (1+ (length new-line)))) ;; hack!! (when (and (eq encoding :unix) (not (eq new-line :eof))) #+allegro (setf new-line (delete (code-char 13) new-line)) #+ignore(setf new-line (substitute #\» #\È new-line))) (setf line (cond ((null *eof-magic-string*) new-line) ((string= *eof-magic-string* new-line) :eof) (t new-line))) (incf stream-pos stream-line-length) ;; what about cr+lf? (unless (eq line :eof) (setf length (length line) pos 0) (values :newline (- stream-pos 1 stream-line-length)))) (unless nil ;;(eq line :eof) (let* ((start pos) (comment-p nil) (sgml-tag-p nil) (word-start (if in-comment-p 0 (position-if-not #'whitespace-p line :start start))) (word-end (when word-start #+debug(print (list word-start line)) (let ((first-char (char line word-start))) (cond (in-comment-p (setf comment-p t) (let ((comment-end-pos (search "*/" line :start2 (+ word-start 0)))) (when comment-end-pos (setf in-comment-p nil) (+ comment-end-pos 2)))) ((punctuation-p first-char) ;; keep together equal punctuation chars (position-if-not (lambda (c) (char= c first-char)) line :start word-start)) ((comment-start-p word-start line) (setf comment-p t) (or (search "*/" line :start2 (+ word-start 0)) (setf in-comment-p t)) length) ((and *recognize-sgml-tags-p* (sgml-tag-start-p word-start line)) (setf sgml-tag-p t) (1+ (position #\> line :start (1+ word-start)))) ;; separate hyphen from following word not ;; starting with lowercase; ;; seems to be direct speech marker ((and (char= #\- first-char) (< word-start (1- length)) (char/= (char line (1+ word-start)) #\-) (not (lower-case-p (char line (1+ word-start))))) (1+ word-start)) ((and *separate-quotes-p* (quote-p first-char)) #+ignore(position-if-not #'quote-p line :start word-start) (1+ word-start)) (t (let ((ws-pos (or (position-if #'whitespace-p line :start word-start) length))) (loop while (or (punctuation-p (char line (1- ws-pos))) (and (not (quote-p first-char)) (quote-p (char line (1- ws-pos))))) do (decf ws-pos)) ;; contractions (let ((contraction-pos (position #\/ line :start word-start :end ws-pos))) (cond ((not contraction-pos) ws-pos) (*separate-quotes-p* ;; for Logon (if (= contraction-pos word-start) (1+ contraction-pos) contraction-pos)) ((and (= contraction-pos (1+ word-start)) (< (+ contraction-pos 2) ws-pos) ;; eg. 
6/7-80 (not (digit-p (char line word-start)))) (1+ contraction-pos)) (t ws-pos)))))))))) (setf pos (or word-end length)) (cond (in-comment-p (next-word stream buffer in-comment-p) ; jump over :newline (next-word stream buffer in-comment-p)) ((or comment-p sgml-tag-p) (next-word stream buffer in-comment-p)) ((not word-start) (next-word stream buffer in-comment-p)) (t (values (subseq line word-start word-end) (+ stream-pos word-start (- stream-line-length))))))))))) #+ignore-yet (defmethod word-token-position ((chain token-chain) token) ) (#-sbcl defconstant #+sbcl defparameter $max-subtitle-length 7) (defmethod effective-token-value ((token token)) (or (token-normalized-value token) (token-value token))) ;; new 5.2.2002 (defmethod possible-terminator-p ((sentence sentence) (tokenizer tokenizer) token) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (or (null token) (let ((value (effective-token-value token))) (cond ((newlinep token) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token)) ((not (symbolp value)) (and (find-if (lambda (c) (find c $terminator)) value) (not (find-if-not #'whitespace-or-punctuation-p value)))) (t nil))))) #+old (defmethod possible-terminator-p ((sentence sentence) token) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (or (null token) (let ((value (effective-token-value token))) (cond ((newlinep token) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token)) ((not (symbolp value)) (and (find-if (lambda (c) (find c $terminator)) value) (not (find-if-not #'whitespace-or-punctuation-p value)))) (t nil))))) (defmethod possible-newline-terminator-p ((sentence sentence) token &optional prev-terminator) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (let ((value (effective-token-value token))) (and (eq value :newline) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token) (or ;; possible subtitle (let ((maybe-first-token (if prev-terminator (token-next prev-terminator) (first-token sentence)))) (and (eq (effective-token-value maybe-first-token) :newline) (<= (token-distance maybe-first-token token) (1+ $max-subtitle-length))) (not (eq (token-prev token) prev-terminator)) ;; *** TO DO: check that newline is first one in sentence ) ;; possible headline if empty line follows (and (eq (effective-token-value (token-prev token)) :newline) ;; but not if we already have signaled a possible headline ;; at the previous newline (not (eq (token-prev token) prev-terminator))))))) ; s/($muligOverskrift)\s+([$quotsParantes]*[-$lettersla\d$specLetters])/$1\| $2/; (defun left-possible-headline-end-p (token) "Checks if there is no terminator, #\, or #\; left to TOKEN" (cond ((null token) nil) ((newlinep token) (left-possible-headline-end-p (token-prev token))) ((insignificant-token-p token) (left-possible-headline-end-p (token-prev token))) ((not (find-if-not #'quote-p (effective-token-value token))) (left-possible-headline-end-p (token-prev token))) ((find-if (lambda (c) (or (find c $terminator) (find c ",;"))) (effective-token-value token)) nil) (t t))) (defun right-possible-headline-end-p (token) (cond ((null token) nil) ((newlinep token) (right-possible-headline-end-p (token-next token))) ((insignificant-token-p token) (right-possible-headline-end-p (token-next token))) ((find-if #'quote-p (effective-token-value token)) (right-possible-headline-end-p (token-next 
token))) ((find #\( (effective-token-value token)) ;; Perl: "()" (right-possible-headline-end-p (token-next token))) ((let ((c (char (effective-token-value token) 0))) (or (upper-case-p c) (special-char-p c) (digit-p c)))) (t nil))) (defun left-possible-terminator-p (token) (cond ((null token) nil) #+ignore ((symbolp (effective-token-value token)) nil) ((or (find #\) (effective-token-value token)) ; Perl: "()" (all-p #'quote-p (effective-token-value token)) (not (find-if-not (lambda (c) (find c $terminator)) ; something like "virkelig!?" (effective-token-value token)))) (left-possible-terminator-p (prev-str-token token))) ((let* ((str (effective-token-value token)) (c (find-if-not #'quote-p str :from-end t))) (or (lower-case-p c) (special-char-p c) (digit-p c) (and (upper-case-p c) (or (and (= 1 (length str)) (let ((prev (token-prev token))) (and prev (not (symbolp (effective-token-value prev))) ;; (or ...) ; ? $letters\d$specLetters ))) (< 1 (length str)))))) t) (t nil))) (defun right-possible-terminator-p (token) (cond ((null token) t) ((and (newlinep token) (newlinep (next-stream-token token))) t) ((and (not (newlinep token)) (not (find-if-not (lambda (c) (find c $terminator)) (effective-token-value token)))) nil) ; because next one is terminator ((or (newlinep token) ; ** should we say T if we encounter two successive :NEWLINEs? (find-if #'quote-p (effective-token-value token)) (find-if #'parenthesis-p (effective-token-value token))) (right-possible-terminator-p (next-stream-token token))) ((let ((c (char (effective-token-value token) 0))) (or ;(char= #\- c) ; ?? is contained in $special-characters (upper-case-p c) (special-char-p c) (digit-p c)))) (t nil))) #+test (let ((*tagger* *nbo-tagger*)) (print (title-p "dr.scient" t))) ;; checks if ABB is an abbreviation ending in period (defun non-terminating-abbreviation-p (abb) (let* ((length (length abb)) (str (make-string (+ length 2)))) (declare (dynamic-extent str)) ; no effect (dotimes (i length) (setf (char str i) (char abb i))) (setf (char str length) #\. (char str (1+ length)) #\:) (or (nth-value 1 (match-string (abbreviations *tagger*) str)) #+test (nth-value 1 (match-string (titles *tagger*) str)) (and (upper-case-p (char str 0)) (setf (char str 0) (char-downcase (char str 0))) (or (nth-value 1 (match-string (abbreviations *tagger*) str)) #+test (nth-value 1 (match-string (titles *tagger*) str))))))) (defun title-p (title &optional add-period-p) (let* ((length (length title)) (str (make-string (+ length (if add-period-p 2 1))))) (declare (dynamic-extent str)) ; no effect (dotimes (i length) (setf (char str i) (char title i))) (when add-period-p (setf (char str length) #\.) 
(incf length)) (setf (char str length) #\:) (nth-value 1 (match-string (titles *tagger*) str)))) ;(title-p "dr.scient" t) ;(title-p "dr" t) ;; could be more efficient, without consing (defun name-p (str &optional (accept-unknown-p t)) (let ((features (append (fullform-features str) (fullform-features (string-downcase str))))) (or (and accept-unknown-p (null features) (find-if #'alpha-char-p str)) (find-if (lambda (fs) (has-feature-p fs 'prop)) features) ;; try without hyphen (and (find #\- str) (let* ((str (remove #\- str)) (features (append (fullform-features str) (fullform-features (string-downcase str))))) (or (and accept-unknown-p (null features)) (find-if (lambda (fs) (has-feature-p fs 'prop)) features))))))) ; jumps over :newline etc (defun next-str-token (token) (when token (let ((next-token (token-next token))) (when next-token (if (stringp (effective-token-value next-token)) next-token (next-str-token next-token)))))) (defun prev-str-token (token) (when token (let ((prev-token (token-prev token))) (when prev-token (if (stringp (effective-token-value prev-token)) prev-token (prev-str-token prev-token)))))) (defmethod insignificant-token-p ((token token)) (or (eq (effective-token-value token) :insignificant) (equal (effective-token-value token) "¶"))) (defmethod next-stream-token ((token token)) "gets a new token from the token stream if next token has not been fetched yet" (let ((token (or (token-next token) (and (sentence-add-token (token-chain token) (tokenizer (token-chain token))) (token-next token))))) (when token (if (insignificant-token-p token) (next-stream-token token) token)))) #+test (untrace terminates-sentence-p token-value) (defun terminates-sentence-p (token) "Decides if a possibly terminating token terminates a sentence, using left and right neighborhood information" (flet ((add-period-to-prev (token) (let ((str (token-value (prev-str-token token)))) (setf (token-value (prev-str-token token)) (concat str "."))) nil)) (cond ((newlinep token) ;; possibly headline end (or (newlinep (next-stream-token token)) ; always headline if empty line follows (and (right-possible-headline-end-p token)))) ;; abbr and dot can't be separated by insignificant token ((and (string= (effective-token-value token) ".") (insignificant-token-p (token-prev token))) t) ((and (left-possible-terminator-p (prev-str-token token)) (right-possible-terminator-p (next-stream-token token))) ;; was: next-str-token (cond (;; abbreviation followed by name ;; *** TO DO: compound abbreviations (and (string= (effective-token-value token) ".") (or (and (non-terminating-abbreviation-p (effective-token-value (prev-str-token token))) (next-str-token token) (let ((next-str (effective-token-value (next-str-token token)))) (or (name-p next-str) (digit-p (char next-str 0)) ;; *** not all abbreviations can have digits next! ;; check for names of type "Rigus P. Ackerschwaan" (let ((prev-prev-str-token (prev-str-token (prev-str-token token)))) (and prev-prev-str-token (name-p (effective-token-value prev-prev-str-token)))) ;; adds period and returns NIL (add-period-to-prev token)))) ;; special treatment of "min." 
before number (and (string-equal (effective-token-value (prev-str-token token)) "min") (next-str-token token) (digit-p (char (effective-token-value (next-str-token token)) 0))))) nil) (;; title followed by name (and (string= (effective-token-value token) ".") (title-p (effective-token-value (prev-str-token token)) t) (token-next token) (or (name-p (effective-token-value (next-str-token token))) ;; adds period and returns NIL (add-period-to-prev token))) nil) ((string= (effective-token-value token) "...") (insert-token (token-prev token) (get-token :value ".." :chain (token-chain token) :position (token-stream-position (token-prev token)))) (setf (token-value token) ".") t) (t t))) (t nil)))) (defmethod split-at-token ((sentence sentence) token) (with-slots (cg) sentence (when (token-next token) (let ((next-sentence ;; use the rest to build a new sentence (get-sentence :first-token (token-next token) :last-token (last-token sentence) ;; new 21.09.2000 :stream (sentence-stream sentence) ;; new 10.01.2001 :sentence-class (class-of sentence) :cg cg ;;:feature-vector (feature-vector (multi-tagger cg)) ))) (setf (last-token sentence) token (token-next token) nil (token-prev (first-token next-sentence)) nil) (labels ((set-chain (token) (when token (setf (token-chain token) next-sentence) (set-chain (token-next token))))) (set-chain (first-token next-sentence))) next-sentence)))) (defmethod split-at-token ((sentence compare-sentence) token) (declare (ignore token)) (let ((next-sentence (call-next-method))) (when next-sentence (setf (compare-cg next-sentence) (compare-cg sentence)) next-sentence))) (defun newlinep (token) (and token (find (effective-token-value token) '(:newline "¶" :paragraph) :test #'equal))) ;(print-strings *titles*) ;(disambiguate-from-string "Du er dr. scient. eller Mag. art. og ingenting annet.") (defmethod sentence-add-token ((sentence sentence) (tokenizer tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (multiple-value-bind (word stream-pos) (next-word stream) (cond ((null word) nil) ((symbolp word) (add-token sentence word :position stream-pos)) ;; handle quotes ((quote-p (char word 0)) ; TO DO: handle left and right quotation separately (cond ((and (> (length word) 3) (quote-p (last-char word)) (punctuation-p (last-char word 2))) (let ((word-end (1+ (position-if-not #'punctuation-p word :from-end t :end (- (length word) 1))))) (add-token sentence "«" :position stream-pos) (add-token sentence (subseq word 1 word-end) :position (1+ stream-pos)) (add-token sentence (subseq word word-end (1- (length word))) :position (+ stream-pos word-end)) (add-token sentence "»" :position (+ stream-pos (length word) -1)))) ((or (< (length word) 3) (and (evenp (count-if #'quote-p word)))) (add-token sentence word :position stream-pos)) (t (add-token sentence "«" :position stream-pos) (add-token sentence (subseq word 1) :position (1+ stream-pos))))) ((quote-p (last-char word)) (add-token sentence (subseq word 0 (1- (length word))) :position stream-pos) (add-token sentence "»" :position (+ stream-pos (length word) -1))) (t (add-token sentence word :position stream-pos)))))) #+test (untrace token-attributes) (defmethod build-sentence ((sentence sentence) (tokenizer tokenizer) &optional prev-token terminator) "Returns a fully tokenized sentence." 
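  ;; Implements the scheme sketched in the file header: tokens are pulled in one at a time
  ;; (SENTENCE-ADD-TOKEN); a token recognized by POSSIBLE-TERMINATOR-P is only remembered
  ;; as a candidate, and the previous candidate is re-tested with TERMINATES-SENTENCE-P
  ;; once right context is available. On success the chain is split with SPLIT-AT-TOKEN,
  ;; and the finished sentence and the partially tokenized rest are returned as
  ;; multiple values.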
(let ((token (or (and prev-token (token-next prev-token)) (sentence-add-token sentence (tokenizer sentence))))) (cond ((null token) ;; new for logon (when (and terminator (string= (effective-token-value terminator) ".") (prev-str-token terminator) (next-str-token terminator)) ;; append period to prev. token (concatenate-tokens (prev-str-token terminator) terminator)) sentence) ((possible-terminator-p sentence tokenizer token) ;; we found a possible terminator (cond ;; if there is no previous possible terminator, continue ((null terminator) (if token (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token)) sentence)) (;; check whether the previous terminator really terminates the sentence (or (and (insignificant-token-p terminator) (consp (token-attributes terminator))) (terminates-sentence-p terminator) ;; has side effect! (and *terminate-on-following-insignificant-token-p* (insignificant-token-p (token-next terminator)))) ;; deal with headline (when (or (eq :newline (effective-token-value terminator)) (and (insignificant-token-p terminator) (consp (token-attributes terminator)))) (insert-token terminator (get-token :value :newline :chain sentence :position (token-stream-position terminator))) (setf (token-value terminator) "|")) (let ((new-sentence (split-at-token sentence terminator))) (values sentence new-sentence ;; final check for headline (when (and new-sentence (possible-headline-or-no-newline-p new-sentence token)) token) token))) (t ;; does not terminate: we go on (when (and (string= (effective-token-value terminator) ".") (prev-str-token terminator)) ;; append period to prev. token (concatenate-tokens (prev-str-token terminator) terminator)) (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token))))) (t (build-sentence sentence tokenizer token terminator))))) ;; for debugging (defmethod %print-sentence ((sentence sentence)) (terpri) (write-string "[ ") (labels ((walk (token) (when token (format t "~a " (effective-token-value token)) (walk (token-next token))))) (walk (first-token sentence))) (write-char #\]) sentence) (defmethod token-line-length ((token token)) "number of tokens in line starting from TOKEN" (let ((length 0)) (block counted (map-tokens (token-chain token) (lambda (token) (incf length) (when (eq (effective-token-value token) :newline) (return-from counted))) :start token :walk-function #'next-stream-token)) length)) (defmethod possible-headline-or-no-newline-p ((sentence sentence) token) (when token (let ((value (effective-token-value token))) (or (not (eq value :newline)) ; has been dealt with before ;; possible subtitle (and (token-prev token) ;; headline starts with :newline (eq (effective-token-value (first-token sentence)) :newline) ;; :newline is first one in sentence (eq (find-token sentence :newline :start (find-token-if sentence (lambda (value) (not (eq value :newline))) :key #'effective-token-value) :key #'effective-token-value) token) ;; subtitle has bounded length (<= (token-distance (first-token sentence) token) (1+ $max-subtitle-length)) ;; subtitle is shorter than next line (let* ((next-token (next-stream-token token)) (line-length (when next-token (token-line-length next-token)))) (and next-token (or (= line-length 1) (<= (token-distance (first-token sentence) token) line-length))))) ;; possible headline if empty line follows (and ;; but not if we already have signaled a possible headline ;; at 
         ;; the previous newline
         (token-prev token)
         ;(not (equal (effective-token-value (token-prev token)) "|"))
         (eq (effective-token-value (token-prev token)) :newline))))))

(defun ends-in-hyphen-p (token)
  (let ((str (effective-token-value token)))
    (and (stringp str)
         (find-if-not (lambda (c) (char= c #\-)) str) ; hyphen alone does not count
         (char= (char str (1- (length str))) #\-))))

; rewrite this one!
(defun word-exists-p (word)
  (let* ((length (length word))
         (str (make-string (1+ length))))
    (declare (dynamic-extent str)) ; no effect
    (dotimes (i length)
      (setf (char str i) (char word i)))
    (setf (char str length) #\:)
    (nth-value 1 (match-string (lexicon *tagger*) str))))

;; destructive!
(defun ndowncase-first (string)
  (setf (char string 0) (char-downcase (char string 0)))
  string)

(defun downcase-first (string)
  (if (upper-case-p (char string 0))
      (ndowncase-first (copy-seq string))
      string))

#||
(disambiguate-from-string "det er rom for bedre komprimering siden lemma- og kodelisten inneholder mye felles informasjon")
(disambiguate-from-string "det er rom for bedre komprimering siden fullforms-, lemma- og kodelisten inneholder mye felles informasjon")
(disambiguate-from-string "Det sitter en nisse på løvebrua.")
(disambiguate-from-string "Faen ta deg, din jævla \"dust\"!")
||#

(defmethod merge-quoted-subphrases ((sentence sentence))
  (labels ((merge-words (token)
             (cond ((null token) nil)
                   ((symbolp (effective-token-value token)) (merge-words (token-next token)))
                   ((eq token (last-token sentence)) nil)
                   ((not (find-if-not #'quote-p (effective-token-value token)))
                    (let ((next-quote-token (find-token-if sentence
                                                           (lambda (value) (and (stringp value) (find-if #'quote-p value)))
                                                           :start (token-next token)
                                                           :key #'effective-token-value)))
                      (when (and next-quote-token (search "\"-" (effective-token-value next-quote-token)))
                        (concatenate-tokens token next-quote-token)
                        (merge-words (token-next (or next-quote-token token))))))
                   (t (merge-words (token-next token))))))
    (merge-words (first-token sentence))
    ;;(map-tokens sentence #'print)
    sentence))

(defmethod merge-hyphenated-words ((sentence sentence))
  (when *merge-hyphenated-words-p*
    (labels ((merge-words (token)
               (cond ((null token) nil)
                     ((eq token (last-token sentence)) nil)
                     ((not (ends-in-hyphen-p token)) (merge-words (token-next token)))
                     ((eq :newline (effective-token-value (token-next token)))
                      (merge-words (merge-hyphenated-words token)))
                     (t (merge-words (token-next (merge-coordinations token)))))))
      (merge-words (first-token sentence))))
  sentence)

(defmethod merge-hyphenated-words ((token token))
  (let* (;; next token is a :newline token; have to jump over it
         (next-token (token-next (token-next token)))
         (next-word (when next-token (effective-token-value next-token))))
    (if (find next-word '("og" "eller") :test #'string-equal)
        ;; coordinated phrase
        (token-next (merge-coordinations token))
        (let* ((hyphen-word (effective-token-value token))
               (word (subseq hyphen-word 0 (1- (length hyphen-word))))
               (concatenation (concat word next-word)))
          (labels ((det-quant-p (word)
                     (or (token-match regex::*special-number-regexp* word)
                         (token-match regex::*area-measure-regexp* word)
                         (token-match regex::*rational-number-regexp* word))))
            (setf (token-value next-token)
                  (cond ((or ;; look up in lexicon
                          (word-exists-p concatenation)
                          (word-exists-p (concat (ndowncase-first word) next-word))
                          ;; TO DO: check if concatenation is compound!
) ;; if concatenation without hyphen exists, take it concatenation) ((or (det-quant-p word) (find-if (lambda (feature) ;;(print feature) (has-features-p (cdr feature) '(det kvant))) (lemma-and-features word))) (concat hyphen-word next-word)) ((most-probable-compound-last-chunk concatenation t) concatenation) (t ;; else combine and keep hyphen (concat hyphen-word next-word))))) ;; remove :newline token and prefix, return merged token (remove-token (remove-token token)))))) (defmethod merge-coordinations ((token token)) (labels ((try-merge (prev-token-type tk) (unless (null tk) (let ((next-tk (token-next tk))) (cond ((eq :newline (effective-token-value tk)) (try-merge prev-token-type next-tk)) ((and (eq :hyphen-word prev-token-type) (string= (effective-token-value tk) ",")) (try-merge :comma next-tk)) ((and (eq :hyphen-word prev-token-type) (find (effective-token-value tk) '("og" "eller") :test #'string-equal)) (try-merge :og/eller next-tk)) ((and (eq :comma prev-token-type) (ends-in-hyphen-p tk)) (try-merge :hyphen-word next-tk)) ((eq :og/eller prev-token-type) tk) (t nil)))))) (let ((last-coord-token (try-merge :hyphen-word (token-next token)))) (if last-coord-token (concatenate-tokens token last-coord-token :new-p t) token)))) (defmethod remove-symbol-tokens ((sentence sentence)) (with-slots (first-token last-token) sentence (cond ((null first-token) sentence) ((symbolp (effective-token-value first-token)) (setf first-token (remove-token first-token)) (remove-symbol-tokens sentence)) (t (labels ((remove-newline (token) (cond ((null token) nil) ((symbolp (effective-token-value token)) (multiple-value-bind (next prev) (remove-token token) (if next (remove-newline next) (setf last-token prev)))) (t (remove-newline (token-next token)))))) (remove-newline first-token) sentence))))) (defmethod tag-sentence ((sentence sentence) &key &allow-other-keys) (tag-sentence (first-token sentence)) ;; add sentence-end marker <<< (let ((last-string-token (if (stringp (effective-token-value (last-token sentence))) (last-token sentence) (prev-str-token (last-token sentence))))) (when last-string-token (dolist (features (token-features last-string-token)) (set-feature (cdr features) '<<<)))) sentence) (defparameter *tag-name-suffix-p* t) (defvar *normalize-word-list* nil) (defvar *tag-as-multi-word-expression* t) (defvar *tag-from-gazetteer* t) ;; main tagging function (defmethod tag-sentence ((token token) &key (iterate-p t) &allow-other-keys) (let ((next-token (cond ((stringp (token-value token)) (when *normalize-word-list* ;; If token has normalized-value its value is written to *normalize-word-list*, ;; else, normalized-value is set to the value in *normalize-word-list*. (cond ((null (token-normalized-value token)) #+disabled ;; fix this for merged words! 
(setf (token-normalized-value token) (dat:string-tree-get (cdr *normalize-word-list*) (token-value token)))) ((string= (token-normalized-value token) (token-value token)) (setf (token-normalized-value token) nil) (dat:string-tree-remove (cdr *normalize-word-list*) (token-value token))) ((string= (token-normalized-value token) "#") nil) (t (setf (dat:string-tree-get (cdr *normalize-word-list*) (token-value token)) (token-normalized-value token))))) (let ((next-token (or (tag-as-foreign-word token) ;; if language slot is not equal language (when *tag-as-multi-word-expression* (tag-as-multi-word-expression token iterate-p)) (tag-as-punctuation token) (tag-from-memory token :return-nil-p nil) (when *tag-from-gazetteer* (tag-from-gazetteer token)) ;; returns NIL (tag-from-special-list token) (tag-as-word token :no-subst-prop-p t) #+debug (progn (describe token) (print :ppppp) nil) (tag-as-word token :features-fn #'get-scarrie-features) ;;(progn (print token) nil) (tag-as-one-word-expression token) ;;(tag-as-compound token :hyphenated-only-p t) (tag-as-proper-noun token) #+:nny-parser (when *lookup-unknown-in-nny-lexicon-p* (tag-as-nny token)) ;; preliminary (unless *no-compounds-p* (tag-as-compound token) (tag-as-interjection token) (tag-as-proper-noun token :second-try t) (tag-as-symbol token) #+:nny-parser (when *lookup-unknown-in-nny-lexicon-p* (tag-as-nny-compound token))) ;; preliminary (when *no-compounds-p* (tag-as-proper-noun token :second-try t)) (tag-as-unknown token)))) (when *tag-name-suffix-p* ;; don't if word is tagged from gazetteer?? (tag-name-suffix token)) (when *token-memory* (let ((token (if (eq next-token t) token (or (token-prev next-token) token)))) (when token (memoize-token (multi-tagger-memory *token-memory*) ;; can't simply take TOKEN because some of the TAG-AS-...-functions ;; merge tokens, invalidating TOKEN. token :key :lemma+value)))) next-token)) (t (token-next token))))) (when iterate-p (tag-sentence next-token)))) ;; This word list can be bound locally by an individual tagging task. Words in that list override the main lexicon. ;; When bound to a word list should look like (name tree . reversed-tree). 
(defvar *special-word-list* nil) (defun tag-from-special-list (token) (when *special-word-list* (tag-as-word token :net (cddr *special-word-list*) :pn-at-sentence-start-p nil))) (defun tag-from-gazetteer (token) (with-slots (new-wordforms) *nbo-tagger* (or (and (upper-case-p (first-char (effective-token-value token))) (tag-as-name token)) #+test (tag-as-word token :net (cdr new-wordforms) :pn-at-sentence-start-p nil)) nil)) #+test (print (let ((*tagger* *nbo-tagger*)) (lemma-and-features "Addis Abeba" ;;"Oslo" :net (cdr (names *tagger*))))) #+test (with-slots (names) *nbo-tagger* (print (get-features "Og" :net (cdr names)))) (defun find-name (token) (with-slots (names) *tagger* ;;*nbo-tagger* (let* ((max-token nil) (features ())) (labels ((find-max-token (token str wc) (when token (let* ((extended-str (if str (concat str " " (effective-token-value token)) (effective-token-value token))) (ef (get-features extended-str :net (cdr names)))) (when ef (setf max-token token features ef)) (when (< (incf wc) $expression-max-length) (find-max-token (next-str-token token) extended-str wc)))))) (find-max-token token nil 0) (when *debug-mem* (when max-token (print (list :find-name max-token (mapcar (lambda (f) (code-features (cdr f))) features))))) (values max-token features))))) (defun subsumes-p (bw1 bw2) (loop for b1 across bw1 for b2 across bw2 always (or (zerop b1) (and (= 1 b1) (= 1 b2))))) ;;(print (subsumes-p #*10001 #*11001)) (defun tag-as-name (token) (multiple-value-bind (max-token features) (find-name token) (cond ((null max-token) nil) ((eq token max-token) (setf (token-features token) (append (token-features token) features)) (when *record-tokenization-p* (push :nm (token-used-rules token))) (or (token-next token) t)) (t (let ((conc-token (concatenate-tokens token max-token :new-p t :label :nm))) (setf (token-features conc-token) features) (when *record-tokenization-p* (push :nm (token-used-rules conc-token))) (or (token-next conc-token) t)))))) (defun tag-name-suffix (token) (with-slots (suffixes) *tagger* ;;(print (mapcar #'car (token-features token))) (dolist (f (token-features token)) ;;(print (list token (token-features token))) (destructuring-bind (lemma . 
bv) f (let* ((suffix (find-if (lambda (sx) (let ((sx (if (consp sx) (car sx) sx))) (if (char= (char sx 0) #\*) (and (= (length lemma) (1- (length sx))) (string-equal lemma sx :start2 1)) (and (> (length lemma) (length sx)) (string-equal lemma sx :start1 (- (length lemma) (length sx))))))) suffixes)) (suffix-tag (cond ((null suffix) nil) ((consp suffix) (cadr suffix)) (t suffix)))) (when suffix-tag (add-features bv (intern (string-upcase (concat "<*" suffix-tag ">")) :cgp)))))))) #+test (print (gethash '<*vei> (feature-table *nbo-tagger*))) (defun tag-as-foreign-word (token) (with-slots (language) token (when (and language (not (eq language (language *tagger*)))) (unless (token-features token) (setf (token-features token) (list (%code-features (effective-token-value token) 'foreign)))) (when *record-tokenization-p* (push :fgn (token-used-rules token))) (or (token-next token) t)))) #+:nny-parser (defun tag-as-nny (token) (let ((*tagger* *nny-tagger*)) (let ((next-token (tag-as-word token))) (when next-token (translate-features token *tagger* *nbo-tagger*) next-token)))) #+:nny-parser (defun tag-as-nny-compound (token) (let ((*tagger* *nny-tagger*)) (let ((next-token (tag-as-compound token))) (when next-token (translate-features token *tagger* *nbo-tagger*) next-token)))) (defmethod translate-features ((token token) from-tagger to-tagger) (let ((*tagger* from-tagger)) (dolist (w+fv (token-features token)) (when (cdr w+fv) (setf (cdr w+fv) (let* ((from-features (code-features (cdr w+fv))) (*tagger* to-tagger)) (apply #'encode-features (cons 'nynorsk (substitute 'ent 'eint from-features))))))))) ;; new 31.7.2001 (defun tag-from-memory (token &key return-nil-p) #+debug(print token) (when (and *token-memory* (sentence-start-p token) ;;(null (token-prev token)) (upper-case-p (char (effective-token-value token) 0))) (let* ((word (string-downcase (effective-token-value token))) (memoized-tokens (gethash word (token-table (multi-tagger-memory *token-memory*))))) (dolist (memoized-token memoized-tokens) ;; closed categories previously wrongly memoized as subst prop should not be tagget from memory (unless (find-if (lambda (pos) (loop for lemma.features in (lc-features memoized-token) thereis (and (cdr lemma.features) (has-feature-p (cdr lemma.features) pos)))) '(det konj subj adv prep pron)) (let* ((prop-feature (find-if (lambda (feature) (has-features-p (cdr feature) '(subst prop))) (token-features memoized-token))) (has-already-prop-feature-p (and prop-feature (find-if (lambda (feature) (has-features-p (cdr feature) '(subst prop))) (token-features token))))) (when (and prop-feature (not has-already-prop-feature-p)) ;; *** don't take over all features! (not gen!) (setf (token-features token) (cons prop-feature (token-features token)) (lc-features token) (lc-features memoized-token)) (when *debug-mem* (print (list :tag-from-memory token :tf (mapcar (lambda (f) (code-features (cdr f))) (token-features token)) :lcf (mapcar (lambda (f) (code-features (cdr f))) (lc-features token))))) ;; new 22.10.2003 (unless return-nil-p (return-from tag-from-memory (or (token-next token) t))))))) (token-features token))) nil) (defmethod combine-equal-features ((token t)) token) (defmethod combine-equal-features ((token token)) (let ((combined-p nil)) (loop for (form+features . rest) on (token-features token) for others = (collecting (loop for rest-features on rest do (destructuring-bind (form1 . 
features1) (car rest) (when (equal features1 (cdr form+features)) (collect form1) (setf (car rest) nil))))) when others do (setf combined-p t (car form+features) (cons (car form+features) others))) (when combined-p (setf (token-features token) (remove-if #'null (token-features token)))) token)) ; *sentence* #+old (defmethod tag-sentence ((token token)) (tag-sentence (or (tag-as-multi-word-expression token) (tag-as-punctuation token) (tag-as-word token) (tag-as-proper-noun token) (tag-as-compound token) (tag-as-interjection token) (tag-as-proper-noun token :second-try t) (tag-as-unknown token)))) (defmethod tag-sentence ((token t) &key &allow-other-keys) nil) ;; for testing #+buggy (defun tag-word (word) (let ((token (get-token :value word :prev (get-token :value word)))) (tag-sentence token) (token-features token))) ;; the TAG-AS-functions return, if successful, the next untagged token ;; (or T if there is no token left), else NIL ;; the longest match wins ;; if there is equalty, prefer number before expression before uncomprehensible (defun tag-as-multi-word-expression (token &optional (iterate-p t)) (multiple-value-bind (number-token number-features) (find-number token) (multiple-value-bind (exp-token exp-features) (find-expression token) (multiple-value-bind (unc-token unc-features) (find-uncomprehensible-expression token) (multiple-value-bind (max-token features) (if (or (token< number-token exp-token) (token< number-token unc-token)) (if (token< exp-token unc-token) (values unc-token unc-features) (values exp-token exp-features)) (values number-token number-features)) (when max-token (cond (iterate-p (let ((conc-token (concatenate-tokens token max-token :new-p t :label :mw))) (setf (token-features conc-token) (if (or (eq features number-features) (eq features unc-features)) (mapcar (lambda (f) (cons (effective-token-value conc-token) f)) features) features)) (when *record-tokenization-p* (pushnew :mw (token-used-rules conc-token))) (or (token-next conc-token) t))) (t (setf (token-features token) (if (or (eq features number-features) (eq features unc-features)) (mapcar (lambda (f) (cons (effective-token-value token) f)) features) features)) (or (token-next token) t))))))))) (defun tag-as-one-word-expression (token) (let ((ef (expression-features (effective-token-value token)))) (when ef (setf (token-features token) ef) (or (token-next token) t)))) ;; merges tokens and concatenates the token strings ;; If NEW-P is true, the concatenated token is a fresh token which is inserted into the chain (defun concatenate-tokens (first-token last-token &key new-p features label) #+debug(print (list :t first-token :m last-token)) (labels ((merge-tokens (token string) (cond ((not (eq token last-token)) (let* ((next-token (token-next token)) (next-value (effective-token-value next-token))) (merge-tokens (remove-token token) (if (or (eq :newline next-value) (insignificant-token-p next-token)) string (concat string (if (or (right-quote-token-p (next-str-token token)) (right-punctuation-token-p (next-str-token token)) (left-quote-p (last-char string))) "" " ") next-value))))) (new-p ;;(print (list :concatenating first-token last-token)) (let ((new-token (make-instance (class-of token) :chain (token-chain token) :prev (token-prev token) :next (token-next token) :expansion #-test (cons first-token last-token) #+test (cons (if (token-expansion first-token) (car (token-expansion first-token)) first-token) (if (token-expansion last-token) (cdr (token-expansion last-token)) last-token)) :value string 
:whitespace (token-whitespace first-token) :features (or features (token-features token)) :used-rules (list (or label :rx)) :position (token-stream-position first-token)))) ;;(print (list :-> new-token)) (if-let (prev (token-prev token)) (setf (token-next prev) new-token );;(token-prev token) nil) (setf (first-token (token-chain token)) new-token)) (if-let (next (token-next token)) (setf (token-prev next) new-token );;(token-next token) nil) (setf (last-token (token-chain token)) new-token)) )) (t (setf (token-value last-token) string (token-normalized-value last-token) nil (token-stream-position last-token) (token-stream-position first-token) (token-whitespace last-token) (token-whitespace first-token)) last-token)))) (merge-tokens first-token (effective-token-value first-token)))) ;; *sentence* ;; debug ;(defparameter *token* nil) ;; returns the rightmost of the successive tokens (starting with TOKEN) matching the regexp (defun token-match (regexp token &key end-test) (regex:string-match regexp token :exact t :maximal t :end-test end-test)) (defun token< (token1 token2) (or (and (null token1) token2) (and token1 token2 (or (eq token1 (token-prev token2)) (token< token1 (token-prev token2)))))) #+test (regex::string-match regex::*number-regexp* "10") #+bit-vectors-ignore (defun find-number (token) (let* ((last-matched-token (token-match regex::*number-regexp* token :end-test (lambda (token) (and token (equal (effective-token-value token) ".") (not (token-next token)))))) (max-token last-matched-token) (features ())) (when last-matched-token (push (cond ((and (eq last-matched-token token) (string= (effective-token-value token) "1")) (list 'det 'kvant 'ent)) ((char= (u:last-char (effective-token-value token)) #\.) (list 'adj ' 'fl)) (t (list 'det 'kvant 'fl))) features)) (labels ((check-regexp (regexp f &optional add-if-eq) (setf last-matched-token (token-match regexp token)) (cond ((and add-if-eq (eq max-token last-matched-token)) (push f features) #+ignore (setf features (append features f))) ((token< max-token last-matched-token) (setf features (list f) max-token last-matched-token)) (t nil)))) (check-regexp regex::*special-number-regexp* (list 'det 'kvant)) (check-regexp regex::*area-measure-regexp* (list 'det 'kvant)) (check-regexp regex::*rational-number-regexp* (list 'det 'kvant)) (check-regexp regex::*date-regexp* (list 'subst ') t) (check-regexp regex::*time-regexp* (list 'subst ') t) (check-regexp regex::*amount-regexp* (list 'det ') t) (check-regexp regex::*uppercase-roman-numeral-regexp* (list 'det ') t) (when (sentence-start-p token) (check-regexp regex::*sentence-start-roman-numeral-regexp* (list 'det ') t)) (values max-token features)))) (defun find-number (token) (let* ((last-matched-token (token-match regex::*number-regexp* token :end-test (lambda (token) (and token (equal (effective-token-value token) ".") (not (token-next token)))))) (max-token last-matched-token) (features ())) (when last-matched-token (push (cond ((and (eq last-matched-token token) (string= (effective-token-value token) "1")) (encode-features 'det 'kvant 'ent)) ((char= (last-char (effective-token-value token)) #\.) 
(encode-features 'adj ' 'fl)) (t (encode-features 'det 'kvant 'fl))) features)) (labels ((check-regexp (regexp f &optional add-if-eq) (setf last-matched-token (token-match regexp token)) (cond ((and add-if-eq (eq max-token last-matched-token)) (push f features) #+ignore (setf features (append features f))) ((token< max-token last-matched-token) (setf features (list f) max-token last-matched-token)) (t nil)))) (check-regexp regex::*special-number-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*area-measure-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*rational-number-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*date-regexp* (encode-features 'subst ') t) (check-regexp regex::*time-regexp* (encode-features 'subst ') t) (check-regexp regex::*amount-regexp* (encode-features 'det ') t) (check-regexp regex::*uppercase-roman-numeral-regexp* (encode-features 'det ') t) (when (sentence-start-p token) (check-regexp regex::*sentence-start-roman-numeral-regexp* (encode-features 'det ') t)) #+debug(print (list :token token :max max-token)) (values max-token features)))) ;; TO DO: eliminate consing (defun find-expression (token) (let* ((max-token nil) (features ())) (labels ((find-max-token (token str wc) (when token (let* ((extended-str (if str (concat str " " (effective-token-value token)) (effective-token-value token))) ;; check only if extended-str consists of more than one word (ef (when str (expression-features extended-str)))) (when ef (setf max-token token features ef)) (when (< (incf wc) $expression-max-length) (find-max-token (next-str-token token) extended-str wc)))))) (find-max-token token nil 0) (if (sentence-start-p token) ;; check if lowercase expression exists (let ((orig-max-token max-token) (orig-features features) (lc-str (downcase-first (effective-token-value token)))) (find-max-token (next-str-token token) lc-str 1) ;; compare found expressions for downcased and original string, return longest ;; match if length differs, or append features if matches are of equal length (cond ((eq max-token orig-max-token) (values max-token (append orig-features features))) ((token< orig-max-token max-token) (values max-token features)) (t (values orig-max-token orig-features)))) (values max-token features))))) ;; old ;; (dotimes (i 1000) (expression-features "i morgen")) took 505 milliseconds (0.505 seconds) to run. ;; 1,104,008 bytes of memory allocated. ;; new, with bit-vectors ;; (dotimes (i 1000) (expression-features "i morgen")) took 985 milliseconds (0.986 seconds) to run. ;; 1,240,016 bytes of memory allocated. ;; without bit-vector translation ;; (dotimes (i 1000) (expression-features "i morgen")) took 314 milliseconds (0.314 seconds) to run. ;; 944,000 bytes of memory allocated. ;; 8.4.2002 Allegro/Decentius: ;; 32 milliseconds ;; 347,000 bytes of memory allocated. ;; *** BUG: (d "i morgen") (defun expression-features (exp) "Returns the features of an expression." ;; TO DO: simplify this: put them into one net. 
(or (lemma-and-features exp :net (expressions *tagger*)) (lemma-and-features exp :net (abbreviations *tagger*)) (lemma-and-features exp :net (word-like-abbreviations *tagger*)) (lemma-and-features exp :net (titles *tagger*)) (lemma-and-features exp :net (symbols *tagger*)))) #+old (defun expression-features (exp) (mapcar (lambda (value) (let* ((colon-pos (position #\: value)) (lemma (subseq value 0 colon-pos)) (features (read-from-string (u:concat "(" (subseq value (1+ colon-pos)) ")")))) (cons lemma features))) ;; TO DO: simplify this: put them into one net. (or (string-values *expressions* exp) (string-values *abbreviations* exp) (string-values *word-like-abbreviations* exp) (string-values *titles* exp) (string-values *symbols* exp)))) ;; TO DO: account for quote nesting (defun find-uncomprehensible-expression (token) (when (not (find-if-not #'left-quote-p (effective-token-value token))) (let ((known 0) (unknown 0)) (labels ((test-token (token) (cond ((null token) ;; should not happen in well-formed text ;; means that quotes are not balanced and we reached the end ;; of the sentence before a right quote nil) ((find-if-not #'right-quote-p (effective-token-value token)) ;; OBS: we check also for compounds (if (or (word-p token) ;; TO DO: look up downcased etc. variants? (most-probable-compound-last-chunk (effective-token-value token))) (incf known) (incf unknown)) (test-token (next-str-token token))) ((> unknown known) ;; #-bit-vectors (values token (list (list 'subst 'prop))) (when *debug-mem* (print (list :find-uncomp-exp token '(subst prop)))) (values token (list (encode-features 'subst 'prop)))) (t nil)))) (test-token token))))) (defun %code-features (lemma &rest features) (cons lemma (apply #'encode-features features))) #+test (let ((*tagger* *nbo-tagger*)) (print (%code-features "fifi" 'subst 'foreign))) (defun all-p (predicate codes) (not (find-if-not predicate codes))) (defun tag-as-punctuation (token) (let* ((str (effective-token-value token)) (features (cond ((string= str ".") (list (%code-features "$." 'CLB '))) ((morph::all-p (lambda (c) (char= c #\.)) str) (list (%code-features "$..." 'CLB '))) ((string= str "|") (list (%code-features "$|" 'CLB '))) ((string= str "¶") (if (prev-str-token token) ;; headline if terminates sentence (list (%code-features "$¶" 'CLB ')) (list (%code-features "$¶")))) ((string= str ",") (list (%code-features "$," 'CLB ') (%code-features "$," '))) ((all-p (lambda (c) (char= c #\!)) str) (list (%code-features "$!" 'CLB '))) ((string= str ":") (list (%code-features "$:" 'CLB '))) ((string= str ";") (list (%code-features "$;" 'CLB '))) ((all-p (lambda (c) (char= c #\?)) str) (list (%code-features "$?" 'CLB '))) ((string= str "-") (list (%code-features "$-" '))) ((string= str "--") (list (%code-features "$--" '))) ((not (find-if-not #'quote-p str)) (list (%code-features "$\"" '))) ((string= str "(") (list (%code-features "$(" '))) ((string= str ")") (list (%code-features "$)" '))) ((string= str "[") (list (%code-features "$[" '))) ((string= str "]") (list (%code-features "$]" '))) (t nil)))) (when features (when *record-tokenization-p* (push :pct (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)))) #+bit-vectors-ignore (defun tag-as-punctuation (token) (let* ((str (effective-token-value token)) (features (cond ((string= str ".") (list (list "$." 'CLB '))) ((all-p (lambda (c) (char= c #\.)) str) (list (list "$..." 
'CLB '))) ((string= str "|") (list (list "$|" 'CLB '))) ((string= str ",") (list (list "$," 'CLB ') (list "$," '))) ((all-p (lambda (c) (char= c #\!)) str) (list (list "$!" 'CLB '))) ((string= str ":") (list (list "$:" 'CLB '))) ((string= str ";") (list (list "$;" 'CLB '))) ((all-p (lambda (c) (char= c #\?)) str) (list (list "$?" 'CLB '))) ((string= str "-") (list (list "$-" '))) ((string= str "--") (list (list "$--" '))) ((not (find-if-not #'quote-p str)) (list (list "$\"" '))) ((string= str "(") (list (list "$(" '))) ((string= str ")") (list (list "$)" '))) (t nil)))) (when features (when *record-tokenization-p* (push :pct (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)))) ;; preliminary (defun right-punctuation-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (<= 1 (length str)) (find (char str 0) $right-punctuation)))) ;; preliminary (defun quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (quote-p (char str 0))))) (defun left-quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (left-quote-p (char str 0))))) (defun quote-hyphen-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (string= "-" str)))) (defun left-parenthesis-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (left-parenthesis-p (char str 0))))) (defun right-quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (right-quote-p (char str 0))))) (defun stroke-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (or (and (= (length str) 1) (char= (char str 0) #\-)) (string= str "--"))))) (defparameter *sentence-start-p-fun* nil) (defun sentence-start-p (token) (or (null (prev-str-token token)) (and (null (prev-str-token (prev-str-token token))) (or (equal (effective-token-value (prev-str-token token)) "\"") (quote-token-p (prev-str-token token)) (stroke-token-p (prev-str-token token)) (and *sentence-start-p-fun* (funcall *sentence-start-p-fun* token)))))) (defun first-uppercase-p (str) (and (not (zerop (length str))) (upper-case-p (char str 0)))) ;; look up in lexicon; merge with the next function (defun word-p (token) (let* ((word (string-trim "\"" (effective-token-value token))) (features (get-features word))) ; TO DO: remove unnecessary consing here (when features (return-from word-p t)) (when (first-uppercase-p word) ;; word starts with capital letter (let ((first-lowercase-word (downcase-first word)) (lowercase-word (string-downcase word))) ;; lookup if we are not at sentence start or right after left quote (when (or (sentence-start-p token) ;; changed 22.09.2000 (left-parenthesis-token-p (prev-str-token token)) (quote-hyphen-token-p (prev-str-token token)) (left-quote-token-p (prev-str-token token))) (when (get-features first-lowercase-word) (return-from word-p t))) ;; more than one capital letter ; ** what if first is not capital? (when (string/= lowercase-word first-lowercase-word) (when (get-features lowercase-word) (return-from word-p t))) ;; word consists of exactly one capital letter (when (and (= (length word) 1) (string/= word lowercase-word)) (unless (or (sentence-start-p token) (quote-token-p (token-prev token))) (when (get-features first-lowercase-word) (return-from word-p t))) )) ;; TO DO: apostrophe (??) diacritics?? 
nil))) #+test (get-features "Olav") #+test (get-features "Dr.") #+test (get-features "dr.") ;; new 23.01.2001 (defun append-features (features1 features2) ;;(print (list :appending features1 features2)) (dolist (f features1) (pushnew f features2 :test #'equal)) features2) (defparameter *sentence-initial-lc-features-p* t) ;; look up in lexicon (defun tag-as-word (token &key (net (lexicon *tagger*)) (pn-at-sentence-start-p t) no-subst-prop-p (features-fn #'get-features)) (let* ((word (string-trim $quote (effective-token-value token))) (features (append-features (token-features token) (funcall features-fn word :net net)))) (when (first-uppercase-p word) ;; word starts with capital letter (let ((first-lowercase-word (ndowncase-first (copy-seq word))) (lowercase-word (string-downcase word)) (capitalized-word (when (sentence-uppercase-p (token-chain token)) (string-capitalize word)))) ;; lookup if we are not at sentence start or right after left quote (when (or (sentence-start-p token) (left-parenthesis-token-p (prev-str-token token)) (quote-hyphen-token-p (prev-str-token token)) (left-quote-token-p (prev-str-token token))) (setf features (append-features features (funcall features-fn first-lowercase-word :net net))) (when *sentence-initial-lc-features-p* (add-lowercase-features token))) ;; sentence was all uppercase (when capitalized-word (setf features (append-features features (funcall features-fn capitalized-word :net net)))) ;; more than one capital letter ; ** what if first is not capital? (when (string/= lowercase-word first-lowercase-word) (setf features (append-features features (funcall features-fn lowercase-word :net net)))) ;; word consists of exactly one capital letter (when (and (= (length word) 1) (string/= word lowercase-word)) (unless (or (sentence-start-p token) (quote-token-p (prev-str-token token))) (when *debug-mem* (print (list :prop-one-char token))) (unless (sentence-start-p token) (add-lowercase-features token)) (setf features (cons (cons word (encode-features 'subst 'prop)) features)) (setf features (append-features features (funcall features-fn first-lowercase-word :net net)))))) ;; TO DO: apostrophe (??) diacritics?? 
) (cond (features (when *record-tokenization-p* (push :wd (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)) ((and pn-at-sentence-start-p (sentence-start-p token) (let* ((word (effective-token-value token)) (hyphen-pos (position #\- word))) (not (and hyphen-pos (< hyphen-pos (1- (length word))) (lower-case-p (char word (1+ hyphen-pos))))))) ;; we suspect that it is an unrecognized name (when *debug-mem* (print (list :prop-unrec-name token))) (unless no-subst-prop-p (setf (token-features token) (list (cons word (encode-features 'subst 'prop))))) nil) (t nil)))) (defun suffix-p (str sfx) "checks if STR ends in SFX" (let ((start (- (length str) (length sfx)))) (and (<= 0 start) (string= str sfx :start1 start)))) (defun find-genitive-root (word) (cond ((suffix-p word "ss") nil) ((find-if (lambda (sfx) (suffix-p word sfx)) '("s'" "z'" "sch'" "sh'" "sj'" "x'" "c'")) (subseq word 0 (- (length word) 1))) ((suffix-p word "'s") (subseq word 0 (- (length word) 2))) ((suffix-p word "s") (subseq word 0 (- (length word) 1))) (t nil))) #+test (setf *cg* *nbo-cg* *tagger* *nbo-tagger*) (defun get-features (word &key (net (lexicon *tagger*))) (when net (let* ((gen-root (find-genitive-root word)) (lemma+features (lemma-and-features word :net net)) (gen-lemma+features (when gen-root (add-gen-feature (lemma-and-features gen-root :net net))))) (append lemma+features gen-lemma+features)))) (defun get-scarrie-features (word &key add-styles-p &allow-other-keys) (when *scarrie-net* (let* ((gen-root (find-genitive-root word)) (lemma+features (scarrie-lemma-and-features word :add-styles-p add-styles-p)) (gen-lemma+features (when gen-root (add-gen-feature (scarrie-lemma-and-features gen-root :add-styles-p add-styles-p))))) (append lemma+features gen-lemma+features)))) #+bit-vectors-ignore (defun add-gen-feature (features-list) ;; ... and remove readings that are incompatible with gen (u:collecting (dolist (features features-list) (cond ((find-if (lambda (f) (find f '(konj pron interj prep gen))) (cdr features)) nil) ((and (find 'verb (cdr features)) (find 'imp (cdr features))) nil) ((find 'gen (cdr features)) (u:collect features)) (t (u:collect ;(list* (car features) 'gen (cdr features)) (append features (list 'gen)))))))) (defun add-gen-feature (features-list) ;; ... and remove readings that are incompatible with gen (collecting (dolist (features features-list) (let ((feature-vector (cdr features))) (cond ((thereis-feature-p feature-vector '(konj pron interj prep gen)) ; *** why gen? 
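;; Readings that already carry KONJ, PRON, INTERJ, PREP or GEN are dropped from
;; the genitive expansion (this clause returns NIL) instead of receiving an
;; extra GEN feature.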
nil) ((has-features-p feature-vector '(verb imp)) nil) ((has-feature-code-p feature-vector (feature-code 'gen)) ; TO DO: change the others appropriately (collect features)) (t (set-feature feature-vector 'gen) #+ignore (collect (append features (list 'gen))) (collect features))))))) ;; TO DO: do some memoizing here (defmethod sentence-uppercase-p ((sentence sentence)) "Checks if whole sentence is in uppercase" (labels ((upper-p (token) (or (null token) (and (or (not (stringp (effective-token-value token))) (not (find-if #'lower-case-p (effective-token-value token)))) (upper-p (token-next token)))))) (upper-p (first-token sentence)))) #-bit-vector (defmethod proper-noun-p ((token token)) (find-if (lambda (features) (find 'prop (cdr features))) (token-features token))) #+bit-vector (defmethod proper-noun-p ((token token)) (find-if (lambda (features) (has-feature-p (cdr features) 'prop)) (token-features token))) #+old (defmethod add-lowercase-features ((token token)) nil) (defmethod add-lowercase-features ((token token)) #+debug(print (list :add-lc token)) (let* ((word (string-downcase (string-trim $quote (effective-token-value token)))) (features (or (get-features word) (tag-compound word)))) (when features ;; add feature 'prop as sign for starting proper noun phrase (loop for (w . fv) in features do (progn w (set-feature fv 'prop) #+ignore(set-feature fv 'lc))) (setf (lc-features token) features) (when *debug-mem* (print (list :add-lowercase-features token (token-chain token) :features (mapcar (lambda (f) (code-features (cdr f))) (token-features token)) :lc-features (mapcar (lambda (f) (code-features (cdr f))) (lc-features token)))))))) (defun tag-as-proper-noun (token &key second-try) #+debug(print (list :token token :secondp second-try)) (let ((word (string-trim $quote (effective-token-value token)))) (when ;; not at sentence start, ;; and second try, where we are less demanding, or: ;; word starts with uppercase, ;; does not contain hyphen followed by lowercase (or (and (not second-try) (not (sentence-start-p token)) (first-uppercase-p word) (or (not (sentence-uppercase-p (token-chain token))) (when-let (prev-token (prev-str-token token)) (proper-noun-p prev-token))) (let ((hyphen-pos (position #\- word))) (or (not hyphen-pos) (= hyphen-pos (1- (length word))) (not (lower-case-p (char word (1+ hyphen-pos))))))) (and second-try (sentence-start-p token))) (let ((gen-root (find-genitive-root word))) (cond ((not gen-root) ;; seems to be a proper noun (when *debug-mem* (print (list :prop-1 token))) (setf (token-features token) ;; bit-vector has to be fresh (list (cons word (encode-features 'subst 'prop)))) ;; add lowercase-features (when (or (not (sentence-start-p token)) #+ignore *sentence-initial-lc-features-p*) (add-lowercase-features token))) ((and (suffix-p word "s") (not (suffix-p word "ss"))) ;; gen or non-gen (when *debug-mem* (print (list :prop-2 token))) (setf (token-features token) (list (cons word (encode-features 'subst 'prop)) (cons gen-root (encode-features 'subst 'prop 'gen))))) (t ;; gen (when *debug-mem* (print (list :prop-gen token))) (setf (token-features token) (list (cons gen-root (encode-features 'subst 'prop 'gen))))))) (when *token-memory* (let* ((word (effective-token-value token)) (memoized-tokens (gethash (string-downcase word) (token-table (multi-tagger-memory *token-memory*))))) (dolist (memoized-token memoized-tokens) (when (and (upper-case-p (char (effective-token-value memoized-token) 0)) ;; is this sensible? 
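;; Memory merge: for each earlier token memoized under the same downcased form,
;; the readings just found for the current token are appended to the memoized
;; token's readings, provided the memoized token is itself capitalized and its
;; surface form differs from the current one (the surrounding checks).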
(not (string= (effective-token-value token) (effective-token-value memoized-token))) ;; or this? #+test (not (upper-case-p (char (effective-token-value token) 0)))) (setf (token-features memoized-token) (append-features (token-features token) (token-features memoized-token)))) ))) (when *record-tokenization-p* (push :pn (token-used-rules token))) (when (or (not (sentence-start-p token)) #+ignore *sentence-initial-lc-features-p*) (or (token-next token) t))))) ;(tag-word "St.Olaf's") ;(mt "TROSSET HIV-FAREN FOR Å FÅ BARN MED TINE ELLER STINE") ;(mt "DR. OLAV TRYGGE STORVIK") (defun grammar-code-to-features (str code &key features-as-list-p) (let ((str+code (concat str ":" (%alpha-to-decimal code)))) (declare (dynamic-extent str+code)) (lemma-and-features str+code :decompress-base str :features-as-list-p features-as-list-p :net (code-lexicon *tagger*)))) #|| (string-net::print-strings (lemmata-forms *tagger*)) (print (grammar-code-to-features "forvaltningssystemene" "810:4")) ;(grammar-code-to-features "base" "702:0") (let ((*tagger* (multi-tagger *nbo-cg*))) (grammar-code-to-features "base" "702:0")) (let ((*tagger* (multi-tagger (newest-cg :nbo)))) #-ignore (string-net::print-strings (code-lexicon *tagger*)) #+ignore (match-string (code-lexicon *tagger*) "nummer:è::è") #+ignore (Print (string-values (code-lexicon *tagger*) "fisk")) #+ignore (grammar-code-to-features "nummer" "835:0")) (%decode-str (%alpha-to-decimal "835:1")) (alpha-to-decimal "835:1") (decimal-to-alpha 835 1) (let ((*tagger* (multi-tagger *nbo-cg*))) (string-net::string-values (code-lexicon *tagger*) "nummer")) (print (%decode-str "ë::è")) "nummer:ë::è" "nummer:è::´" "nummer:!è::è" "nummer:!ë::è" "nummer:#è::´" ||# #+old (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) ;; TO DO: remove duplicates (collecting (dolist (code codes) (print code) (dolist (lemma+features (grammar-code-to-features sfx code)) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" lemma)) (append features (list "samset"))))))))))))) #+bit-vectors-ignore (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (u:collecting (dolist (code codes) ;; remove forl_...-codes (cmp. def. of v_fuge_fullform, v_fuge_lemma) (let ((colon-pos (position #\: code))) (dolist (lemma+features (if (= colon-pos 3) (grammar-code-to-features sfx code) (grammar-code-to-features (concat "-" sfx) (subseq code 2)))) (destructuring-bind (lemma . features) lemma+features (u:collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (append features (list 'samset))))))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (loop for f1 in (cdr fl1) and f2 in (cdr fl2) always (eq f1 f2)))))))))) #+ignore (let* ((*tagger* *nbo-tagger*) (*analyser-lexicon* (compound-analyser *tagger*))) ;;(morph::print-ranked-analyses "glitterheim") (print (tag-compound "glitterheim"))) (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx .
codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (collecting (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (set-feature features 'samset)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))))))) #+old (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when (print analysis) (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (collecting (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (set-feature features 'samset)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))))))) ;; used by UD-base (defun compound-lemma (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx)))) (lemma-list ())) (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (pushnew (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) lemma-list :test #'string=))) lemma-list))))) ;(tag-compound "edderkoppskall") ;; 517 ms #+test (time (with-open-file (stream "projects:cgp;multi-tagger;texts;korea.text") (map-sentences stream #'identity))) ;(tag-compound "kunnskap") ;(most-probable-compound-last-chunk "kunnskapsbase") ;(time (analyse-compound ",,,")) ;(analyse-compound "OLAV") ; (most-probable-compound-last-chunk "OLAV" t) (defun tag-as-compound (token &key hyphenated-only-p) (let ((word (effective-token-value token))) (when (and (or (not hyphenated-only-p) (let ((hyphen-pos (position #\- word))) (and hyphen-pos (< hyphen-pos (1- (length word))) (lower-case-p (char word (1+ hyphen-pos)))))) (> (length word) 3)) (let* ((word (if (and (quote-p (char word 0)) (quote-p (last-char word))) (string-trim $quote word) word)) (gen-root (find-genitive-root word)) (lemma+features (append-features (token-features token) (tag-compound word))) (gen-lemma+features (when gen-root (add-gen-feature (tag-compound gen-root))))) (when (or gen-lemma+features lemma+features) (setf (token-features token) (append lemma+features gen-lemma+features)) (when *record-tokenization-p* (pushnew :cp (token-used-rules token))) (or (token-next token) t)))))) ;; this is easier in Perl... *** use regexp! 
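;; Hypothetical usage sketch for COMPOUND-LEMMA above, following the #+test pattern
;; used elsewhere in this file; the actual analyses depend on the compound analyser
;; of *NBO-TAGGER*, so "laksebolle" is only assumed to decompose here.
#+test
(let* ((*tagger* *nbo-tagger*)
       (*analyser-lexicon* (compound-analyser *tagger*)))
  (print (compound-lemma "laksebolle")))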
(defun interjection-p (word &optional (start 0)) "checks for three consecutive vowels" (let ((vowel-pos (position-if (lambda (c) (morph::vowelp (char-downcase c))) word :start start))) (cond ((null vowel-pos) nil) ((> vowel-pos (- (length word) 3)) nil) ((and (morph::vowelp (char-downcase (char word (+ vowel-pos 1)))) (morph::vowelp (char-downcase (char word (+ vowel-pos 2))))) t) (t (interjection-p word (+ vowel-pos 2)))))) #+test (interjection-p "aaargh") (defun tag-as-interjection (token) (let ((word (effective-token-value token))) (when (interjection-p word) ;#-bit-vectors (setf (token-features token) (list (list word 'interj))) (setf (token-features token) (list (%code-features word 'interj))) (when *record-tokenization-p* (push :ij (token-used-rules token))) (or (token-next token) t)))) (defun tag-as-symbol (token) (let ((word (effective-token-value token))) (when (or (and (= (length word) 1) (find (char word 0) "+<>@*#$%&/")) (not (find-if-not (lambda (c) (find c "1234567890+/,.")) word :end (1- (length word))))) (setf (token-features token) (list (%code-features word 'symb))) (when *record-tokenization-p* (push :sy (token-used-rules token))) (or (token-next token) t)))) (defparameter *unknown* ()) ;; no conditions here (defun tag-as-unknown (token) (unless (find (effective-token-value token) *unknown* :test #'equal) (push (effective-token-value token) *unknown*)) (unless (token-features token) (setf (token-features token) (list (%code-features (effective-token-value token) 'ukjent)))) (when *record-tokenization-p* (push :uk (token-used-rules token))) (or (token-next token) t)) (defmethod do-pending-operations ((sentence sentence)) nil) (defmethod do-pending-operations ((sentence regexp-sentence)) (with-slots (pending-operations cg) sentence (labels ((do-operation () (when pending-operations (funcall (car pending-operations) cg sentence) (setf pending-operations (cdr pending-operations)) (do-operation)))) (do-operation)))) #+test (print (feature-vector (multi-tagger *nbo-cg*))) (defun map-sentences (stream fun &key (sentence-class *sentence-class*) (multitag-p t) compare-cg print-function token-print-fn (encoding $encoding)) (let ((*line-buffer* (make-instance 'line-buffer :encoding encoding)) (*sentence-class* sentence-class) (delay-queue (when *token-memory* (make-queue))) (print-queue (when *token-memory* (make-queue))) (token-count 0)) (loop with sentence = (get-sentence :stream stream :sentence-class sentence-class :encoding encoding :cg *cg* :compare-cg compare-cg ;;:feature-vector (feature-vector (multi-tagger *cg*)) ;; don't remove! ) and next-sentence and terminator and token do (cond ((null sentence) ;; empty the queues (cond ((front delay-queue) #+debug(print :null-sentence-delay) (when-let (sentence (pop-queue delay-queue)) (apply-sentence-mapping-function sentence fun :compare-cg compare-cg) #+debug(when (front (regexp-delay-queue *token-memory*)) (print :regexp-delay)) ;; It might happen that the sentence is pushed onto the regexp-delay-queue, ;; then we must not print it here. 
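;; Drain phase: when SENTENCE is NIL (input exhausted), the queues (only used when
;; *TOKEN-MEMORY* is set) are emptied one sentence per iteration, the delay queue
;; first, then the regexp delay queue, then the print queue; the surrounding LOOP
;; keeps going until all three are empty.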
(if (front (regexp-delay-queue *token-memory*)) (when-let (sentence (apply-sentence-mapping-function (pop-queue (regexp-delay-queue *token-memory*)) fun :compare-cg compare-cg)) (do-pending-operations sentence) #+orig (funcall print-function sentence :token-print-fn token-print-fn)) #+orig (funcall print-function sentence :token-print-fn token-print-fn)))) ((front (regexp-delay-queue *token-memory*)) #+debug(print :null-sentence-regexp) (when-let (sentence (apply-sentence-mapping-function (pop-queue (regexp-delay-queue *token-memory*)) fun :compare-cg compare-cg)) (do-pending-operations sentence) (funcall print-function sentence :token-print-fn token-print-fn))) ((front print-queue) (funcall print-function (pop-queue print-queue) :token-print-fn token-print-fn)))) ((null *token-memory*) (multiple-value-setq (sentence next-sentence terminator token) (build-sentence sentence (tokenizer sentence) token terminator)) (funcall print-function (apply-sentence-mapping-function (if multitag-p (tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))) sentence) fun :compare-cg compare-cg) :token-print-fn token-print-fn)) (t (multiple-value-setq (sentence next-sentence terminator token) (build-sentence sentence (tokenizer sentence) token terminator)) (incf token-count (sentence-length sentence)) (let ((s (tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))))) (enqueue s delay-queue) (enqueue s print-queue)) #+debug(print (list :token-count token-count :context-size (context-size (multi-tagger-memory *token-memory*)))) (when-let (sentence (and (> token-count (context-size (multi-tagger-memory *token-memory*))) (pop-queue delay-queue) (pop-queue print-queue))) (decf token-count (sentence-length sentence)) ;; +-1?? (setf *sentence* sentence) (when-let (sentence (apply-sentence-mapping-function sentence fun :compare-cg compare-cg)) (funcall print-function sentence :token-print-fn token-print-fn))))) while (or next-sentence (and *token-memory* (or (not (empty-queue-p delay-queue)) (not (empty-queue-p print-queue)) (not (empty-queue-p (regexp-delay-queue *token-memory*)))))) do (setf sentence next-sentence)))) ;; restore sentence as before multitagging (defun map-retag-sentence (sentence fun &key (multitag-p t) print-function token-print-fn &allow-other-keys) (labels ((walk (token) (when token (setf (token-features token) nil) (when (and (token-next token) (token-expansion (token-next token))) (setf (token-next token) (car (token-expansion (token-next token))) (token-prev (token-next token)) token)) (let ((nv (token-normalized-value token))) ;;(print (list :nv nv)) (cond ((null nv) nil) #+ignore ;; this doesn't work unless we update both index files when a new word is inserted ((find #\Space nv) (let ((words (split nv #\Space))) (setf (token-normalized-value token) (car words)) (dolist (word (cdr words)) (setf token (insert-token token (get-token :value "" :normalized-value word :chain sentence :position (token-stream-position token))))))) ((string= nv "#") (setf (sentence-array sentence) (let ((new-array (make-array 0 :fill-pointer t :adjustable t))) (loop with i = 0 for tk across (sentence-array sentence) ;; do (print (list tk token)) unless (eq tk token) do (vector-push-extend tk new-array)) new-array)) ;;(describe (sentence-array sentence)) ;;(setf (token-normalized-value token) :insignificant) ))) (walk (token-next token))))) (walk (first-token sentence))) (funcall print-function (apply-sentence-mapping-function (if multitag-p (tag-sentence sentence) sentence) fun) 
:token-print-fn token-print-fn)) ;; *sentence* ;;(regexp-delay-queue *memory*) (defmethod apply-sentence-mapping-function ((sentence sentence) fun &key &allow-other-keys) (funcall fun sentence #+ignore(tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))))) (defmethod apply-sentence-mapping-function ((sentence compare-sentence) fun &key compare-cg &allow-other-keys) (with-slots (first-token) sentence (labels ((copy-feature-list (features) (when features (destructuring-bind ((str . fv) . rest) features (cons (cons str (copy-seq fv)) (copy-feature-list rest))))) (copy-features (token) (when token (with-slots (next features compare-features used-rules compare-used-rules) token (setf compare-features (copy-feature-list features) compare-used-rules (copy-seq used-rules)) (copy-features next)))) (switch (token) (when token (with-slots (next features compare-features used-rules compare-used-rules) token (let ((temp features)) (setf features compare-features compare-features temp temp used-rules used-rules compare-used-rules compare-used-rules temp)) (switch next))))) ;;(tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))) (copy-features first-token) (funcall fun sentence) (switch first-token) (let ((*cg* compare-cg)) (funcall fun sentence)) (switch first-token) sentence))) #+test (with-open-file (in-stream "projects:cgp;multitagger;texts;test.text") (map-sentences in-stream #'print-sentence)) #+test (with-open-file (stream "projects:cgp;multitagger;texts;test.tag" :direction :output :if-exists :supersede) (with-open-file (in-stream "projects:cgp;multitagger;texts;test.text") (map-sentences in-stream (lambda (s) (print-sentence s stream))))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;delkorp.tag" :direction :output :if-exists :supersede) (with-open-file (in-stream "projects:cgp;multitagger;texts;delkorp.text") (map-sentences in-stream (lambda (s) (print-sentence s stream))))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;korea.text") (map-sentences stream #'print-sentence)) #+test (with-open-file (stream "projects:cgp;multi-tagger;texts;delkorp.text") (map-sentences stream #'print-sentence)) #+test (time (with-open-file (stream "projects:cgp;multitagger;texts;korea.text") (map-sentences stream #'identity))) (defmethod print-sentence ((sentence sentence) &rest rest &key (expand-tokens-p t) ;; print-lc-features (token-print-fn #'print-token) &allow-other-keys) "prints a tagged sentence in CG style" (let ((token-print-fn (or token-print-fn #'print-token))) (labels ((walk (token concat-token) #+debug(print (list token concat-token)) (cond ((null token) nil) ((and (or (eq expand-tokens-p t) (find expand-tokens-p (token-used-rules token))) (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (apply token-print-fn token :expand-tokens-p expand-tokens-p :concat-token concat-token rest) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil))) sentence) (defmethod sentence-token-length ((sentence sentence) &key (expand-tokens-p t)) (let ((length 0)) (labels ((walk (token concat-token) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) 
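;; TOKEN-EXPANSION is assumed to be a (first . last) cons delimiting the expanded
;; sub-chain; the walk runs from FIRST up to and including LAST.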
((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (incf length) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil)) length)) ;; destructive! (defun remove-feature-inclusions (features) (loop for f-tail on features when (car f-tail) do (block loop (loop for f-t-tail on (cdr f-tail) when (car f-t-tail) do (let ((l.f1 (car f-tail)) (l.f2 (car f-t-tail))) (cond ((string/= (car l.f1) (car l.f2)) nil) ((subsumes-p (cdr l.f1) (cdr l.f2)) (setf (car f-tail) nil) (return-from loop)) ((subsumes-p (cdr l.f2) (cdr l.f1)) (setf (car f-t-tail) nil)) (t nil)))))) features) (defmethod print-token ((token token) &key (stream *standard-output*) concat-token additional-features expand-tokens-p (print-features t) (print-rules t) print-lc-features (print-attributes t) (print-special-tokens-p t) &allow-other-keys) (let ((value (effective-token-value token)) (features (remove-feature-inclusions (or (token-features token) (when concat-token (token-features concat-token))))) (lc-features (when print-lc-features (or (lc-features token) (when concat-token (lc-features concat-token))))) (attributes (token-attributes token))) (cond ((stringp value) (let ((expanded-mark (cond ((not expand-tokens-p) "") ((not concat-token) "") ((eq token (car (token-expansion concat-token))) (if (eq token (cdr (token-expansion concat-token))) "* " "[ ")) ((eq token (cdr (token-expansion concat-token))) "] ") (t "| ")))) (if print-features (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "~a\"<~a>\" ~{ ~a~}" expanded-mark (if *debug-mem* token value) (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "~a\"<~a>\"" expanded-mark value))) (write-string value stream))) ; ** fix this! 
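;; Output format: the word-form line "<form>" written above is followed by one
;; tab-indented reading line per surviving analysis, i.e. TAB "lemma" tag1 tag2 ...,
;; matching the CG style mentioned in PRINT-SENTENCE's docstring.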
(when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (when print-features (dolist (fl features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~{ ~a~}~)" #\Tab (car fl) (code-features (cdr fl)) additional-features))) (dolist (fl lc-features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)" #\Tab (car fl) (code-features (cdr fl)))))) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes)))))) (defmethod print-token ((token control-token) &key (stream *standard-output*) concat-token (print-rules t) (print-attributes t) (print-special-tokens-p t) &allow-other-keys) (let ((value (effective-token-value token)) (features (token-features token #+ignore(or concat-token token))) (control-features (token-control-features token #+ignore(or concat-token token))) (attributes (token-attributes token))) (cond ((stringp value) (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "\"<~a>\" ~{ ~a~}" value (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "\"<~a>\"" value))) (when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (loop for fl in features for cfl in control-features when (car fl) do (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)~(~{ ~a~}~)" #\Tab (car fl) (code-features (cdr fl)) cfl)) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes)))))) #+old (defmethod print-sentence ((sentence sentence) &rest rest &key #||(stream *standard-output*) (print-features t) (print-rules t) (print-attributes t) (print-special-tokens-p t)||# (expand-tokens-p t) &allow-other-keys) "prints a tagged sentence in CG style" (labels ((walk (token concat-token) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (let ((value (effective-token-value token)) (features (token-features (or concat-token token))) (attributes (token-attributes token))) (cond ((stringp value) (if print-features (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "\"<~a>\" ~{ ~a~}" value (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "\"<~a>\"" value))) (write-string value stream)) ; ** fix this! 
(when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (when (and features print-features) (dolist (fl features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)" #\Tab (car fl) (code-features (cdr fl)))))) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes))))) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil)) sentence) #+test (let* ((*cg* *nbo-cg*) (*tagger* (multi-tagger *cg*))) (print (code-features #*00000000000010100000000000000000000001010100000000000000000000000000000101010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;tvang.text") (reset-buffer *line-buffer*) (loop for word = (next-word stream) while word do (print word))) #+test (u:with-file-lines (line "projects:cgp;texts;mtag_tag2616.txt") (unless (char= (char line 0) #\Tab) (format t "~a~%" (string-trim "\"" line)))) ;; somewhat hacky #+mcl (defun reconstruct-text (in-path out-path) (let ((char-count 0) (no-space nil) (left-quote nil) (stream (make-instance 'string-output-stream)) (sentence-end-pos 0)) (with-open-file (out-stream out-path :direction :output :if-exists :supersede :if-does-not-exist :create) (u:with-file-lines (line in-path) (cond ((string= line "") nil) ((char= (char line 0) #\Tab) (when (or (search "\"$.\" CLB" line) (search "\"$!\" CLB" line) (search "\"$?\" CLB" line)) (setf sentence-end-pos (fill-pointer (slot-value stream 'ccl::my-string))))) (t (setf line (subseq line 2 (- (length line) 2))) (cond ((string= line "$|") (setf char-count 0) (cond ((zerop sentence-end-pos) (terpri out-stream)) (t (write-line (subseq (slot-value stream 'ccl::my-string) 0 sentence-end-pos) out-stream) (terpri out-stream) (write-line (subseq (slot-value stream 'ccl::my-string) sentence-end-pos) out-stream) (terpri out-stream) (setf (fill-pointer (slot-value stream 'ccl::my-string)) 0))) #+ignore (write-line (get-output-stream-string stream) out-stream) (setf sentence-end-pos 0)) ((and (char= (char line 0) #\$) (find (char line 1) "([{")) (incf char-count) (write-char #\Space stream) (setf no-space t) (write-string (subseq line 1) stream)) ((string= line "$-") (incf char-count 2) (write-string " -" stream)) ((string= line "$\"") (setf left-quote (not left-quote)) (incf char-count) (cond (left-quote (write-string " \"" stream) (setf no-space t)) (t (write-char #\" stream)))) ((char= (char line 0) #\$) (incf char-count) (write-string (subseq line 1) stream)) (t (setf line (remove-stars line)) (when (> (incf char-count (1+ (length line))) 80) (setf sentence-end-pos 0 char-count (length line) no-space t) ;line break (write-line (get-output-stream-string stream) out-stream) #+ignore (terpri stream)) (if no-space (setf no-space nil) (write-char #\Space stream)) (write-string line stream)))))) (write-line (get-output-stream-string stream) out-stream) nil))) (defun remove-stars (word) (let ((star-found-p nil) (prev-star-p nil)) (loop for c across word for i from 0 do (cond ((char= c #\*) (setf star-found-p t prev-star-p t)) (prev-star-p (setf prev-star-p nil 
(char word i) (char-upcase c))) (t nil))) (if star-found-p (remove #\* word) word))) #+test (let ((*analyser-lexicon* (compound-analyser *nbo-tagger*))) (print (most-probable-compound-last-chunk "arkskriver"))) #+test (let ((*analyser-lexicon* (compound-analyser *nbo-tagger*))) (cgp::mt "En liten laksebolle.")) #+test (let ((*tagger* *nbo-tagger*) (*analyser-lexicon* (compound-analyser *nbo-tagger*))) (tag-compound "laksebolle")) :eof