;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2002. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.9 ;; (in-package :cgp) (defmethod print-sentence-start ((sentence sentence)) (let ((count 0)) (terpri) (block map (map-tokens sentence (lambda (token) (when (stringp (token-value token)) (write-string (token-value token)) (write-char #\Space) (incf count)) (when (= count 3) (return-from map))))))) (defmethod possible-terminator-p ((sentence sentence) (tokenizer xml-tokenizer) token) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (with-slots (paragraph-delimiter-elements headline-delimiter-elements in-sentence-elements stack) tokenizer #+debug(print (list (token-value token) stack)) #+debug(print-sentence-start sentence) (and #+test(not (some (lambda (ise) (find ise stack)) in-sentence-elements)) (or (null token) (and (insignificant-token-p token) (consp (token-attributes token)) #+ignore (print (list (token-attributes token) paragraph-delimiter-elements headline-delimiter-elements)) (or (find (caddr (token-attributes token)) paragraph-delimiter-elements) (find (caddr (token-attributes token)) headline-delimiter-elements))) (call-next-method))))) (defun xml-terminates-sentence-p (token paragraph-delimiter-elements headline-delimiter-elements) ;;(print-sentence-start (token-chain token)) ;;(print (list :terminator token)) (labels ((walk (token) (when token (with-slots (attributes) token ;;(print (cons token attributes)) (cond ((and (insignificant-token-p token) (consp attributes) (eq (car attributes) :%end) (or (find (caddr attributes) paragraph-delimiter-elements) (find (caddr attributes) headline-delimiter-elements))) t) ((stringp (token-value token)) nil) (t (walk (token-next token)))))))) (walk (token-next token)))) ;; TO DO: check if there are sentence-delimiter-elements. If not, do sentence recognition. (defmethod build-sentence ((sentence sentence) (tokenizer xml-tokenizer) &optional prev-token terminator) "Returns a fully tokenized sentence." (with-slots (sentence-delimiter-elements paragraph-delimiter-elements headline-delimiter-elements in-sentence-elements stack) tokenizer (let ((token (or (and prev-token (token-next prev-token)) (sentence-add-token sentence (tokenizer sentence))))) #+debug(print (cons :token token)) (if sentence-delimiter-elements (cond ((null token) sentence) ((and (insignificant-token-p token) (consp (token-attributes token)) (find (caddr (token-attributes token)) sentence-delimiter-elements)) ;; make a new sentence (let ((new-sentence (split-at-token sentence (token-prev token)))) (values sentence new-sentence token token))) (t (build-sentence sentence tokenizer token))) (cond ((null token) sentence) ((possible-terminator-p sentence tokenizer token) ;; we found a possible terminator (cond ;; if there is no previous possible terminator, continue ((null terminator) (if token (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token)) sentence)) (;; check whether the previous terminator really terminates the sentence (and (or (and (insignificant-token-p terminator) (consp (token-attributes terminator))) (xml-terminates-sentence-p terminator paragraph-delimiter-elements headline-delimiter-elements) (terminates-sentence-p terminator) ;; has side effect! (and *terminate-on-following-insignificant-token-p* (token-next terminator) (insignificant-token-p (token-next terminator)))) #+ignore (print (list (token-value terminator) stack)) #+ignore (not (some (lambda (ise) (find ise stack)) in-sentence-elements))) ;; deal with headline (let ((atts (token-attributes terminator))) (when (or ;;(eq :newline (token-value terminator)) (and (insignificant-token-p terminator) (consp atts) (eq (car atts) :%end) (find (caddr atts) headline-delimiter-elements))) ;;(print (list :terminator (token-attributes terminator))) (when (token-prev terminator) (insert-token (token-prev terminator) (get-token :value "|" :chain sentence))) #+ignore (setf (token-value terminator) "|"))) (let ((new-sentence (split-at-token sentence terminator))) (values sentence new-sentence ;; final check for headline (when (and new-sentence (possible-headline-or-no-newline-p new-sentence token)) token) token))) (t ;; does not terminate: we go on (when (and (string= (token-value terminator) ".") (prev-str-token terminator)) ;; append period to prev. token (concatenate-tokens (prev-str-token terminator) terminator)) (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token))))) (t (build-sentence sentence tokenizer token terminator))))))) ;;; EOF