;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-
;;
;; Copyright (C) Paul Meurer 2000 - 2007. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
;; Reimplementation in ANSI Common Lisp of the multi-tagger module
;; (Perl program) written by Lars Jørgen Tvedt, UiO 1999.
;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO
;; (Dokumentasjonsprosjektet; Tekstlaboratoriet).
;;
;; Short description of the algorithm
;;
;; Parsing consists of the following steps:
;; 1. Tokenization
;; 2. Determination of sentence boundaries
;; 3. Multitagging of tokens (including recognition of compounds)
;; 4. Disambiguation using the CG parser
;;
;; The tokenizer performs an initial tokenization of the input (NEXT-WORD), using a line
;; buffer which holds the last line read and the current position in that line.
;; In BUILD-SENTENCE, the tokens are successively read into a TOKEN-CHAIN (SENTENCE).
;; Each token that might represent a sentence boundary (newline, punctuation) is immediately
;; tested using left context only (no right context is available at that stage), and the
;; previously found candidate sentence terminator (if any) undergoes a final test, this
;; time using both left and right context (right context is now available up to the new
;; candidate terminator). If the token passes the test, we have found a sentence boundary:
;; the token chain is split at that token, the left part representing the fully tokenized
;; sentence and the right part a partially tokenized new sentence.
;; After a sentence is found, several adjustments are made to the tokenization, e.g.
;; words divided across lines are joined.
;;
;; In the next step, the tokens are multitagged (TAG-SENTENCE).
;;
;; Since the CG parser needs random access to the tokens in a sentence, whereas a token-chain
;; only provides sequential access, a vector (SENTENCE-ARRAY) containing the tokens is built.
;;
;; Thread safety is achieved by using object slots instead of globals (e.g. in "compounds.lisp")
;; or by rebinding globals dynamically to thread-owned objects (*line-buffer*).
;; Thread safety is essential in a web context where several users may access the
;; tagger simultaneously.
;;-------------------------------------------------------------------------------------
;; TO DO:
;; - parentheses and quotes after sentence-ending punctuation
;;
;; KNOWN BUGS:
;; - q. e. d.
;;
;; QUESTIONS:
;; "så" SBU missing in "delkorp.cor"
;; "da" pref missing
;; "i morgen" etc., but not "i dag"
;;-------------------------------------------------------------------------------------

(in-package :cgp)

(defparameter *terminate-on-following-insignificant-token-p* nil)
(defparameter *lookup-unknown-in-nny-lexicon-p* nil)
(defparameter *merge-hyphenated-words-p* t)
(defparameter *recognize-sgml-tags-p* nil)
(defparameter *no-compounds-p* nil)

(defvar *token-memory* nil)

(#-sbcl defconstant #+sbcl defparameter $encoding
        #+(and mcl (not openmcl)) :macintosh
        #+unix :unix
        #+mswindows :windows)

(defparameter $expression-max-length 5)

;; Table holding all registered Constraint Grammars
(defvar *cg-table* (make-hash-table :test #'equal :size 16))

(defvar *nbo-cg*)
(defvar *nny-cg*)

;; Object holding the CG rules. When parsing, dynamically bind *CG* to a grammar.
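;; For example, a caller (typically one per thread or web request) might set up the
;; dynamic state along these lines. This is only a sketch: it assumes that *NBO-CG*,
;; *NBO-TAGGER*, the lexica and DISAMBIGUATE-FROM-STRING have been loaded and
;; initialized elsewhere in the system.
#+test
(let ((*cg* *nbo-cg*)
      (*tagger* *nbo-tagger*)
      (*line-buffer* (make-instance 'line-buffer))) ; fresh buffer for thread safety
  (disambiguate-from-string "Du er dr. scient. eller Mag. art. og ingenting annet."))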
(defvar *cg* t) (defparameter *context-size* 1000) (defparameter *regexp-parser* nil) (defparameter *term-extractor* nil) (defparameter *np-recognizer* nil) ;; is set in term-extractor.lisp (defparameter *name-term-extractor* nil) ;; debug (defparameter *memory* nil) ;; for debugging (defvar *sentence* nil) (defparameter *record-tokenization-p* t) (defparameter *debug-mem* nil) ;; buffer holding the line being tokenized (defclass line-buffer () ((line :initform "" :accessor buffered-line) (length :initform 0 :accessor buffered-line-length) (pos :initform 0 :accessor buffered-line-position) (stream-pos :initform 0 :accessor stream-pos) (stream-line-length :initform 0 :accessor stream-line-length) (encoding :initarg :encoding :initform $encoding :reader encoding))) ;; Must be bound dynamically to a fresh buffer in order to achieve thread safety (defvar *line-buffer* t) (#-sbcl defconstant #+sbcl defparameter $whitespace #.(coerce (list #\Space #\Tab #+allegro #\no-break_space #-allegro (code-char 160) #\Newline) 'string)) (#-sbcl defconstant #+sbcl defparameter $punctuation ".,!?;:()[]{}") (#-sbcl defconstant #+sbcl defparameter $parenthesis "()[]{}") (#-sbcl defconstant #+sbcl defparameter $left-punctuation "([{") (#-sbcl defconstant #+sbcl defparameter $right-punctuation ".,!?;:)]}") (#-sbcl defconstant #+sbcl defparameter $quote #+allegro (coerce '(#\" #\' #\left_double_quotation_mark #\right_double_quotation_mark #\left_single_quotation_mark #\right_single_quotation_mark #\left-pointing_double_angle_quotation_mark #\right-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $left-quote #+allegro (coerce '(#\" #\' #\left_double_quotation_mark #\left_single_quotation_mark #\left-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $right-quote #+allegro (coerce '(#\" #\' #\right_double_quotation_mark #\right_single_quotation_mark #\right-pointing_double_angle_quotation_mark) 'string) #-allegro (coerce '(#\" #\') 'string)) (#-sbcl defconstant #+sbcl defparameter $whitespace+punctuation (concat $whitespace $punctuation)) (#-sbcl defconstant #+sbcl defparameter $terminator ".?:!|") (#-sbcl defconstant #+sbcl defparameter $special-character "!?.<>ß@£$%&/=*-") (defun digit-p (char) (char<= #\0 char #\9)) (defun special-char-p (char) (find char $special-character)) (defun quote-p (char) (find char $quote)) (defun left-parenthesis-p (char) (find char "([{")) (defun left-quote-p (char) (find char $left-quote)) (defun right-quote-p (char) (find char $right-quote)) (defun whitespace-p (char) (find char $whitespace)) (defun punctuation-p (char) (find char $punctuation)) (defun left-punctuation-p (char) (find char $left-punctuation)) (defun right-punctuation-p (char) (find char $right-punctuation)) (defun whitespace-or-punctuation-p (char) (find char $whitespace+punctuation)) (defun parenthesis-p (char) (find char $parenthesis)) (defun comment-start-p (pos line) (and (< pos (1- (length line))) (char= (char line pos) #\/) (char= (char line (1+ pos)) #\*))) (defun sgml-tag-start-p (pos line) (char= (char line pos) #\<)) (defparameter *eof-magic-string* nil) (defmethod read-next-line ((stream stream)) (read-line stream nil :eof)) (defvar *separate-quotes-p* nil) (defun next-word (stream &optional (buffer *line-buffer*) in-comment-p) "Tokenizes the input stream and returns one (string) token or :newline, and file position as second value." 
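  ;; Sketch of the control flow: when POS has reached LENGTH, the next line is read from
  ;; the stream into the buffer and :NEWLINE is returned (NIL at end of input); otherwise
  ;; the next token is cut out of the buffered line. Runs of identical punctuation
  ;; characters are kept together, /* ... */ comments (and SGML tags when
  ;; *RECOGNIZE-SGML-TAGS-P* is set) are skipped by recursing, and the token string is
  ;; returned together with its position in the stream.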
(with-slots (line length pos stream-pos stream-line-length encoding) buffer (declare (fixnum length pos)) (unless (eq line :eof) (if (= length pos) (let ((new-line (read-next-line stream))) (unless (or (eq new-line :eof) (eq encoding $encoding)) (setf new-line (convert-string new-line encoding $encoding))) (setf stream-line-length (if (eq new-line :eof) 0 (1+ (length new-line)))) ;; hack!! (when (and (eq encoding :unix) (not (eq new-line :eof))) #+allegro (setf new-line (delete (code-char 13) new-line)) #+ignore(setf new-line (substitute #\» #\È new-line))) (setf line (cond ((null *eof-magic-string*) new-line) ((string= *eof-magic-string* new-line) :eof) (t new-line))) (incf stream-pos stream-line-length) ;; what about cr+lf? (unless (eq line :eof) (setf length (length line) pos 0) (values :newline (- stream-pos 1 stream-line-length)))) (unless nil ;;(eq line :eof) (let* ((start pos) (comment-p nil) (sgml-tag-p nil) (word-start (if in-comment-p 0 (position-if-not #'whitespace-p line :start start))) (word-end (when word-start #+debug(print (list word-start line)) (let ((first-char (char line word-start))) (cond (in-comment-p (setf comment-p t) (let ((comment-end-pos (search "*/" line :start2 (+ word-start 0)))) (when comment-end-pos (setf in-comment-p nil) (+ comment-end-pos 2)))) ((punctuation-p first-char) ;; keep together equal punctuation chars (position-if-not (lambda (c) (char= c first-char)) line :start word-start)) ((comment-start-p word-start line) (setf comment-p t) (or (search "*/" line :start2 (+ word-start 0)) (setf in-comment-p t)) length) ((and *recognize-sgml-tags-p* (sgml-tag-start-p word-start line)) (setf sgml-tag-p t) (1+ (position #\> line :start (1+ word-start)))) ;; separate hyphen from following word not ;; starting with lowercase; ;; seems to be direct speech marker ((and (char= #\- first-char) (< word-start (1- length)) (char/= (char line (1+ word-start)) #\-) (not (lower-case-p (char line (1+ word-start))))) (1+ word-start)) ((and *separate-quotes-p* (quote-p first-char)) #+ignore(position-if-not #'quote-p line :start word-start) (1+ word-start)) (t (let ((ws-pos (or (position-if #'whitespace-p line :start word-start) length))) (loop while (or (punctuation-p (char line (1- ws-pos))) (and (not (quote-p first-char)) (quote-p (char line (1- ws-pos))))) do (decf ws-pos)) ;; contractions (let ((contraction-pos (position #\/ line :start word-start :end ws-pos))) (cond ((not contraction-pos) ws-pos) (*separate-quotes-p* ;; for Logon (if (= contraction-pos word-start) (1+ contraction-pos) contraction-pos)) ((and (= contraction-pos (1+ word-start)) (< (+ contraction-pos 2) ws-pos) ;; eg. 
6/7-80 (not (digit-p (char line word-start)))) (1+ contraction-pos)) (t ws-pos)))))))))) (setf pos (or word-end length)) (cond (in-comment-p (next-word stream buffer in-comment-p) ; jump over :newline (next-word stream buffer in-comment-p)) ((or comment-p sgml-tag-p) (next-word stream buffer in-comment-p)) ((not word-start) (next-word stream buffer in-comment-p)) (t (values (subseq line word-start word-end) (+ stream-pos word-start (- stream-line-length))))))))))) #+ignore-yet (defmethod word-token-position ((chain token-chain) token) ) (#-sbcl defconstant #+sbcl defparameter $max-subtitle-length 7) (defmethod effective-token-value ((token token)) (or (token-normalized-value token) (token-value token))) ;; new 5.2.2002 (defmethod possible-terminator-p ((sentence sentence) (tokenizer tokenizer) token) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (or (null token) (let ((value (effective-token-value token))) (cond ((newlinep token) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token)) ((not (symbolp value)) (and (find-if (lambda (c) (find c $terminator)) value) (not (find-if-not #'whitespace-or-punctuation-p value)))) (t nil))))) #+old (defmethod possible-terminator-p ((sentence sentence) token) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (or (null token) (let ((value (effective-token-value token))) (cond ((newlinep token) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token)) ((not (symbolp value)) (and (find-if (lambda (c) (find c $terminator)) value) (not (find-if-not #'whitespace-or-punctuation-p value)))) (t nil))))) (defmethod possible-newline-terminator-p ((sentence sentence) token &optional prev-terminator) "Checks if TOKEN is a possible terminator, using some left neighborhood information" (let ((value (effective-token-value token))) (and (eq value :newline) ;; check if left neighborhood makes headline possible (left-possible-headline-end-p token) (or ;; possible subtitle (let ((maybe-first-token (if prev-terminator (token-next prev-terminator) (first-token sentence)))) (and (eq (effective-token-value maybe-first-token) :newline) (<= (token-distance maybe-first-token token) (1+ $max-subtitle-length))) (not (eq (token-prev token) prev-terminator)) ;; *** TO DO: check that newline is first one in sentence ) ;; possible headline if empty line follows (and (eq (effective-token-value (token-prev token)) :newline) ;; but not if we already have signaled a possible headline ;; at the previous newline (not (eq (token-prev token) prev-terminator))))))) ; s/($muligOverskrift)\s+([$quotsParantes]*[-$lettersla\d$specLetters])/$1\| $2/; (defun left-possible-headline-end-p (token) "Checks if there is no terminator, #\, or #\; left to TOKEN" (cond ((null token) nil) ((newlinep token) (left-possible-headline-end-p (token-prev token))) ((insignificant-token-p token) (left-possible-headline-end-p (token-prev token))) ((not (find-if-not #'quote-p (effective-token-value token))) (left-possible-headline-end-p (token-prev token))) ((find-if (lambda (c) (or (find c $terminator) (find c ",;"))) (effective-token-value token)) nil) (t t))) (defun right-possible-headline-end-p (token) (cond ((null token) nil) ((newlinep token) (right-possible-headline-end-p (token-next token))) ((insignificant-token-p token) (right-possible-headline-end-p (token-next token))) ((find-if #'quote-p (effective-token-value token)) (right-possible-headline-end-p (token-next 
token))) ((find #\( (effective-token-value token)) ;; Perl: "()" (right-possible-headline-end-p (token-next token))) ((let ((c (char (effective-token-value token) 0))) (or (upper-case-p c) (special-char-p c) (digit-p c)))) (t nil))) (defun left-possible-terminator-p (token) (cond ((null token) nil) #+ignore ((symbolp (effective-token-value token)) nil) ((or (find #\) (effective-token-value token)) ; Perl: "()" (all-p #'quote-p (effective-token-value token)) (not (find-if-not (lambda (c) (find c $terminator)) ; something like "virkelig!?" (effective-token-value token)))) (left-possible-terminator-p (prev-str-token token))) ((let* ((str (effective-token-value token)) (c (find-if-not #'quote-p str :from-end t))) (or (lower-case-p c) (special-char-p c) (digit-p c) (and (upper-case-p c) (or (and (= 1 (length str)) (let ((prev (token-prev token))) (and prev (not (symbolp (effective-token-value prev))) ;; (or ...) ; ? $letters\d$specLetters ))) (< 1 (length str)))))) t) (t nil))) (defun right-possible-terminator-p (token) (cond ((null token) t) ((and (newlinep token) (newlinep (next-stream-token token))) t) ((and (not (newlinep token)) (not (find-if-not (lambda (c) (find c $terminator)) (effective-token-value token)))) nil) ; because next one is terminator ((or (newlinep token) ; ** should we say T if we encounter two successive :NEWLINEs? (find-if #'quote-p (effective-token-value token)) (find-if #'parenthesis-p (effective-token-value token))) (right-possible-terminator-p (next-stream-token token))) ((let ((c (char (effective-token-value token) 0))) (or ;(char= #\- c) ; ?? is contained in $special-characters (upper-case-p c) (special-char-p c) (digit-p c)))) (t nil))) #+test (let ((*tagger* *nbo-tagger*)) (print (title-p "dr.scient" t))) ;; checks if ABB is an abbreviation ending in period (defun non-terminating-abbreviation-p (abb) (let* ((length (length abb)) (str (make-string (+ length 2)))) (declare (dynamic-extent str)) ; no effect (dotimes (i length) (setf (char str i) (char abb i))) (setf (char str length) #\. (char str (1+ length)) #\:) (or (nth-value 1 (match-string (abbreviations *tagger*) str)) #+test (nth-value 1 (match-string (titles *tagger*) str)) (and (upper-case-p (char str 0)) (setf (char str 0) (char-downcase (char str 0))) (or (nth-value 1 (match-string (abbreviations *tagger*) str)) #+test (nth-value 1 (match-string (titles *tagger*) str))))))) (defun title-p (title &optional add-period-p) (let* ((length (length title)) (str (make-string (+ length (if add-period-p 2 1))))) (declare (dynamic-extent str)) ; no effect (dotimes (i length) (setf (char str i) (char title i))) (when add-period-p (setf (char str length) #\.) 
(incf length)) (setf (char str length) #\:) (nth-value 1 (match-string (titles *tagger*) str)))) ;(title-p "dr.scient" t) ;(title-p "dr" t) ;; could be more efficient, without consing (defun name-p (str &optional (accept-unknown-p t)) (let ((features (append (fullform-features str) (fullform-features (string-downcase str))))) (or (and accept-unknown-p (null features) (find-if #'alpha-char-p str)) (find-if (lambda (fs) (has-feature-p fs 'prop)) features) ;; try without hyphen (and (find #\- str) (let* ((str (remove #\- str)) (features (append (fullform-features str) (fullform-features (string-downcase str))))) (or (and accept-unknown-p (null features)) (find-if (lambda (fs) (has-feature-p fs 'prop)) features))))))) ; jumps over :newline etc (defun next-str-token (token) (when token (let ((next-token (token-next token))) (when next-token (if (stringp (effective-token-value next-token)) next-token (next-str-token next-token)))))) (defun prev-str-token (token) (when token (let ((prev-token (token-prev token))) (when prev-token (if (stringp (effective-token-value prev-token)) prev-token (prev-str-token prev-token)))))) (defmethod insignificant-token-p ((token token)) (or (eq (effective-token-value token) :insignificant) (equal (effective-token-value token) "¶"))) (defmethod next-stream-token ((token token)) "gets a new token from the token stream if next token has not been fetched yet" (let ((token (or (token-next token) (and (sentence-add-token (token-chain token) (tokenizer (token-chain token))) (token-next token))))) (when token (if (insignificant-token-p token) (next-stream-token token) token)))) #+test (untrace terminates-sentence-p token-value) (defun terminates-sentence-p (token) "Decides if a possibly terminating token terminates a sentence, using left and right neighborhood information" (flet ((add-period-to-prev (token) (let ((str (token-value (prev-str-token token)))) (setf (token-value (prev-str-token token)) (concat str "."))) nil)) (cond ((newlinep token) ;; possibly headline end (or (newlinep (next-stream-token token)) ; always headline if empty line follows (and (right-possible-headline-end-p token)))) ;; abbr and dot can't be separated by insignificant token ((and (string= (effective-token-value token) ".") (insignificant-token-p (token-prev token))) t) ((and (left-possible-terminator-p (prev-str-token token)) (right-possible-terminator-p (next-stream-token token))) ;; was: next-str-token (cond (;; abbreviation followed by name ;; *** TO DO: compound abbreviations (and (string= (effective-token-value token) ".") (or (and (non-terminating-abbreviation-p (effective-token-value (prev-str-token token))) (next-str-token token) (let ((next-str (effective-token-value (next-str-token token)))) (or (name-p next-str) (digit-p (char next-str 0)) ;; *** not all abbreviations can have digits next! ;; check for names of type "Rigus P. Ackerschwaan" (let ((prev-prev-str-token (prev-str-token (prev-str-token token)))) (and prev-prev-str-token (name-p (effective-token-value prev-prev-str-token)))) ;; adds period and returns NIL (add-period-to-prev token)))) ;; special treatment of "min." 
before number (and (string-equal (effective-token-value (prev-str-token token)) "min") (next-str-token token) (digit-p (char (effective-token-value (next-str-token token)) 0))))) nil) (;; title followed by name (and (string= (effective-token-value token) ".") (title-p (effective-token-value (prev-str-token token)) t) (token-next token) (or (name-p (effective-token-value (next-str-token token))) ;; adds period and returns NIL (add-period-to-prev token))) nil) ((string= (effective-token-value token) "...") (insert-token (token-prev token) (get-token :value ".." :chain (token-chain token) :position (token-stream-position (token-prev token)))) (setf (token-value token) ".") t) (t t))) (t nil)))) (defmethod split-at-token ((sentence sentence) token) (with-slots (cg) sentence (when (token-next token) (let ((next-sentence ;; use the rest to build a new sentence (get-sentence :first-token (token-next token) :last-token (last-token sentence) ;; new 21.09.2000 :stream (sentence-stream sentence) ;; new 10.01.2001 :sentence-class (class-of sentence) :cg cg ;;:feature-vector (feature-vector (multi-tagger cg)) ))) (setf (last-token sentence) token (token-next token) nil (token-prev (first-token next-sentence)) nil) (labels ((set-chain (token) (when token (setf (token-chain token) next-sentence) (set-chain (token-next token))))) (set-chain (first-token next-sentence))) next-sentence)))) (defmethod split-at-token ((sentence compare-sentence) token) (declare (ignore token)) (let ((next-sentence (call-next-method))) (when next-sentence (setf (compare-cg next-sentence) (compare-cg sentence)) next-sentence))) (defun newlinep (token) (and token (find (effective-token-value token) '(:newline "¶" :paragraph) :test #'equal))) ;(print-strings *titles*) ;(disambiguate-from-string "Du er dr. scient. eller Mag. art. og ingenting annet.") (defmethod sentence-add-token ((sentence sentence) (tokenizer tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (multiple-value-bind (word stream-pos) (next-word stream) (cond ((null word) nil) ((symbolp word) (add-token sentence word :position stream-pos)) ;; handle quotes ((quote-p (char word 0)) ; TO DO: handle left and right quotation separately (cond ((and (> (length word) 3) (quote-p (last-char word)) (punctuation-p (last-char word 2))) (let ((word-end (1+ (position-if-not #'punctuation-p word :from-end t :end (- (length word) 1))))) (add-token sentence "«" :position stream-pos) (add-token sentence (subseq word 1 word-end) :position (1+ stream-pos)) (add-token sentence (subseq word word-end (1- (length word))) :position (+ stream-pos word-end)) (add-token sentence "»" :position (+ stream-pos (length word) -1)))) ((or (< (length word) 3) (and (evenp (count-if #'quote-p word)))) (add-token sentence word :position stream-pos)) (t (add-token sentence "«" :position stream-pos) (add-token sentence (subseq word 1) :position (1+ stream-pos))))) ((quote-p (last-char word)) (add-token sentence (subseq word 0 (1- (length word))) :position stream-pos) (add-token sentence "»" :position (+ stream-pos (length word) -1))) (t (add-token sentence word :position stream-pos)))))) #+test (untrace token-attributes) (defmethod build-sentence ((sentence sentence) (tokenizer tokenizer) &optional prev-token terminator) "Returns a fully tokenized sentence." 
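  ;; Implements the scheme sketched in the file header: tokens are pulled in one at a time
  ;; (SENTENCE-ADD-TOKEN); a token recognized by POSSIBLE-TERMINATOR-P is only remembered
  ;; as a candidate, and the previous candidate is re-tested with TERMINATES-SENTENCE-P
  ;; once right context is available. On success the chain is split with SPLIT-AT-TOKEN,
  ;; and the finished sentence and the partially tokenized rest are returned as
  ;; multiple values.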
(let ((token (or (and prev-token (token-next prev-token)) (sentence-add-token sentence (tokenizer sentence))))) (cond ((null token) ;; new for logon (when (and terminator (string= (effective-token-value terminator) ".") (prev-str-token terminator) (next-str-token terminator)) ;; append period to prev. token (concatenate-tokens (prev-str-token terminator) terminator)) sentence) ((possible-terminator-p sentence tokenizer token) ;; we found a possible terminator (cond ;; if there is no previous possible terminator, continue ((null terminator) (if token (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token)) sentence)) (;; check whether the previous terminator really terminates the sentence (or (and (insignificant-token-p terminator) (consp (token-attributes terminator))) (terminates-sentence-p terminator) ;; has side effect! (and *terminate-on-following-insignificant-token-p* (insignificant-token-p (token-next terminator)))) ;; deal with headline (when (or (eq :newline (effective-token-value terminator)) (and (insignificant-token-p terminator) (consp (token-attributes terminator)))) (insert-token terminator (get-token :value :newline :chain sentence :position (token-stream-position terminator))) (setf (token-value terminator) "|")) (let ((new-sentence (split-at-token sentence terminator))) (values sentence new-sentence ;; final check for headline (when (and new-sentence (possible-headline-or-no-newline-p new-sentence token)) token) token))) (t ;; does not terminate: we go on (when (and (string= (effective-token-value terminator) ".") (prev-str-token terminator)) ;; append period to prev. token (concatenate-tokens (prev-str-token terminator) terminator)) (build-sentence sentence tokenizer token ;; final check for headline (when (possible-headline-or-no-newline-p sentence token) token))))) (t (build-sentence sentence tokenizer token terminator))))) ;; for debugging (defmethod %print-sentence ((sentence sentence)) (terpri) (write-string "[ ") (labels ((walk (token) (when token (format t "~a " (effective-token-value token)) (walk (token-next token))))) (walk (first-token sentence))) (write-char #\]) sentence) (defmethod token-line-length ((token token)) "number of tokens in line starting from TOKEN" (let ((length 0)) (block counted (map-tokens (token-chain token) (lambda (token) (incf length) (when (eq (effective-token-value token) :newline) (return-from counted))) :start token :walk-function #'next-stream-token)) length)) (defmethod possible-headline-or-no-newline-p ((sentence sentence) token) (when token (let ((value (effective-token-value token))) (or (not (eq value :newline)) ; has been dealt with before ;; possible subtitle (and (token-prev token) ;; headline starts with :newline (eq (effective-token-value (first-token sentence)) :newline) ;; :newline is first one in sentence (eq (find-token sentence :newline :start (find-token-if sentence (lambda (value) (not (eq value :newline))) :key #'effective-token-value) :key #'effective-token-value) token) ;; subtitle has bounded length (<= (token-distance (first-token sentence) token) (1+ $max-subtitle-length)) ;; subtitle is shorter than next line (let* ((next-token (next-stream-token token)) (line-length (when next-token (token-line-length next-token)))) (and next-token (or (= line-length 1) (<= (token-distance (first-token sentence) token) line-length))))) ;; possible headline if empty line follows (and ;; but not if we already have signaled a possible headline ;; at 
         ;; the previous newline
         (token-prev token)
         ;(not (equal (effective-token-value (token-prev token)) "|"))
         (eq (effective-token-value (token-prev token)) :newline))))))

(defun ends-in-hyphen-p (token)
  (let ((str (effective-token-value token)))
    (and (stringp str)
         (find-if-not (lambda (c) (char= c #\-)) str) ; hyphen alone does not count
         (char= (char str (1- (length str))) #\-))))

; rewrite this one!
(defun word-exists-p (word)
  (let* ((length (length word))
         (str (make-string (1+ length))))
    (declare (dynamic-extent str)) ; no effect
    (dotimes (i length)
      (setf (char str i) (char word i)))
    (setf (char str length) #\:)
    (nth-value 1 (match-string (lexicon *tagger*) str))))

;; destructive!
(defun ndowncase-first (string)
  (setf (char string 0) (char-downcase (char string 0)))
  string)

(defun downcase-first (string)
  (if (upper-case-p (char string 0))
      (ndowncase-first (copy-seq string))
      string))

#||
(disambiguate-from-string "det er rom for bedre komprimering siden lemma- og kodelisten inneholder mye felles informasjon")
(disambiguate-from-string "det er rom for bedre komprimering siden fullforms-, lemma- og kodelisten inneholder mye felles informasjon")
(disambiguate-from-string "Det sitter en nisse på løvebrua.")
(disambiguate-from-string "Faen ta deg, din jævla \"dust\"!")
||#

(defmethod merge-quoted-subphrases ((sentence sentence))
  (labels ((merge-words (token)
             (cond ((null token) nil)
                   ((symbolp (effective-token-value token)) (merge-words (token-next token)))
                   ((eq token (last-token sentence)) nil)
                   ((not (find-if-not #'quote-p (effective-token-value token)))
                    (let ((next-quote-token (find-token-if sentence
                                                           (lambda (value) (and (stringp value) (find-if #'quote-p value)))
                                                           :start (token-next token)
                                                           :key #'effective-token-value)))
                      (when (and next-quote-token (search "\"-" (effective-token-value next-quote-token)))
                        (concatenate-tokens token next-quote-token)
                        (merge-words (token-next (or next-quote-token token))))))
                   (t (merge-words (token-next token))))))
    (merge-words (first-token sentence))
    ;;(map-tokens sentence #'print)
    sentence))

(defmethod merge-hyphenated-words ((sentence sentence))
  (when *merge-hyphenated-words-p*
    (labels ((merge-words (token)
               (cond ((null token) nil)
                     ((eq token (last-token sentence)) nil)
                     ((not (ends-in-hyphen-p token)) (merge-words (token-next token)))
                     ((eq :newline (effective-token-value (token-next token)))
                      (merge-words (merge-hyphenated-words token)))
                     (t (merge-words (token-next (merge-coordinations token)))))))
      (merge-words (first-token sentence))))
  sentence)

(defmethod merge-hyphenated-words ((token token))
  (let* (;; next token is a :newline token; have to jump over it
         (next-token (token-next (token-next token)))
         (next-word (when next-token (effective-token-value next-token))))
    (if (find next-word '("og" "eller") :test #'string-equal)
        ;; coordinated phrase
        (token-next (merge-coordinations token))
        (let* ((hyphen-word (effective-token-value token))
               (word (subseq hyphen-word 0 (1- (length hyphen-word))))
               (concatenation (concat word next-word)))
          (labels ((det-quant-p (word)
                     (or (token-match regex::*special-number-regexp* word)
                         (token-match regex::*area-measure-regexp* word)
                         (token-match regex::*rational-number-regexp* word))))
            (setf (token-value next-token)
                  (cond ((or ;; look up in lexicon
                          (word-exists-p concatenation)
                          (word-exists-p (concat (ndowncase-first word) next-word))
                          ;; TO DO: check if concatenation is compound!
) ;; if concatenation without hyphen exists, take it concatenation) ((or (det-quant-p word) (find-if (lambda (feature) ;;(print feature) (has-features-p (cdr feature) '(det kvant))) (lemma-and-features word))) (concat hyphen-word next-word)) ((most-probable-compound-last-chunk concatenation t) concatenation) (t ;; else combine and keep hyphen (concat hyphen-word next-word))))) ;; remove :newline token and prefix, return merged token (remove-token (remove-token token)))))) (defmethod merge-coordinations ((token token)) (labels ((try-merge (prev-token-type tk) (unless (null tk) (let ((next-tk (token-next tk))) (cond ((eq :newline (effective-token-value tk)) (try-merge prev-token-type next-tk)) ((and (eq :hyphen-word prev-token-type) (string= (effective-token-value tk) ",")) (try-merge :comma next-tk)) ((and (eq :hyphen-word prev-token-type) (find (effective-token-value tk) '("og" "eller") :test #'string-equal)) (try-merge :og/eller next-tk)) ((and (eq :comma prev-token-type) (ends-in-hyphen-p tk)) (try-merge :hyphen-word next-tk)) ((eq :og/eller prev-token-type) tk) (t nil)))))) (let ((last-coord-token (try-merge :hyphen-word (token-next token)))) (if last-coord-token (concatenate-tokens token last-coord-token :new-p t) token)))) (defmethod remove-symbol-tokens ((sentence sentence)) (with-slots (first-token last-token) sentence (cond ((null first-token) sentence) ((symbolp (effective-token-value first-token)) (setf first-token (remove-token first-token)) (remove-symbol-tokens sentence)) (t (labels ((remove-newline (token) (cond ((null token) nil) ((symbolp (effective-token-value token)) (multiple-value-bind (next prev) (remove-token token) (if next (remove-newline next) (setf last-token prev)))) (t (remove-newline (token-next token)))))) (remove-newline first-token) sentence))))) (defmethod tag-sentence ((sentence sentence) &key &allow-other-keys) (tag-sentence (first-token sentence)) ;; add sentence-end marker <<< (let ((last-string-token (if (stringp (effective-token-value (last-token sentence))) (last-token sentence) (prev-str-token (last-token sentence))))) (when last-string-token (dolist (features (token-features last-string-token)) (set-feature (cdr features) '<<<)))) sentence) (defparameter *tag-name-suffix-p* t) (defvar *normalize-word-list* nil) (defvar *tag-as-multi-word-expression* t) (defvar *tag-from-gazetteer* t) ;; main tagging function (defmethod tag-sentence ((token token) &key (iterate-p t) &allow-other-keys) (let ((next-token (cond ((stringp (token-value token)) (when *normalize-word-list* ;; If token has normalized-value its value is written to *normalize-word-list*, ;; else, normalized-value is set to the value in *normalize-word-list*. (cond ((null (token-normalized-value token)) #+disabled ;; fix this for merged words! 
(setf (token-normalized-value token) (dat:string-tree-get (cdr *normalize-word-list*) (token-value token)))) ((string= (token-normalized-value token) (token-value token)) (setf (token-normalized-value token) nil) (dat:string-tree-remove (cdr *normalize-word-list*) (token-value token))) ((string= (token-normalized-value token) "#") nil) (t (setf (dat:string-tree-get (cdr *normalize-word-list*) (token-value token)) (token-normalized-value token))))) (let ((next-token (or (tag-as-foreign-word token) ;; if language slot is not equal language (when *tag-as-multi-word-expression* (tag-as-multi-word-expression token iterate-p)) (tag-as-punctuation token) (tag-from-memory token :return-nil-p nil) (when *tag-from-gazetteer* (tag-from-gazetteer token)) ;; returns NIL (tag-from-special-list token) (tag-as-word token :no-subst-prop-p t) #+debug (progn (describe token) (print :ppppp) nil) (tag-as-word token :features-fn #'get-scarrie-features) ;;(progn (print token) nil) (tag-as-one-word-expression token) ;;(tag-as-compound token :hyphenated-only-p t) (tag-as-proper-noun token) #+:nny-parser (when *lookup-unknown-in-nny-lexicon-p* (tag-as-nny token)) ;; preliminary (unless *no-compounds-p* (tag-as-compound token) (tag-as-interjection token) (tag-as-proper-noun token :second-try t) (tag-as-symbol token) #+:nny-parser (when *lookup-unknown-in-nny-lexicon-p* (tag-as-nny-compound token))) ;; preliminary (when *no-compounds-p* (tag-as-proper-noun token :second-try t)) (tag-as-unknown token)))) (when *tag-name-suffix-p* ;; don't if word is tagged from gazetteer?? (tag-name-suffix token)) (when *token-memory* (let ((token (if (eq next-token t) token (or (token-prev next-token) token)))) (when token (memoize-token (multi-tagger-memory *token-memory*) ;; can't simply take TOKEN because some of the TAG-AS-...-functions ;; merge tokens, invalidating TOKEN. token :key :lemma+value)))) next-token)) (t (token-next token))))) (when iterate-p (tag-sentence next-token)))) ;; This word list can be bound locally by an individual tagging task. Words in that list override the main lexicon. ;; When bound to a word list should look like (name tree . reversed-tree). 
(defvar *special-word-list* nil) (defun tag-from-special-list (token) (when *special-word-list* (tag-as-word token :net (cddr *special-word-list*) :pn-at-sentence-start-p nil))) (defun tag-from-gazetteer (token) (with-slots (new-wordforms) *nbo-tagger* (or (and (upper-case-p (first-char (effective-token-value token))) (tag-as-name token)) #+test (tag-as-word token :net (cdr new-wordforms) :pn-at-sentence-start-p nil)) nil)) #+test (print (let ((*tagger* *nbo-tagger*)) (lemma-and-features "Addis Abeba" ;;"Oslo" :net (cdr (names *tagger*))))) #+test (with-slots (names) *nbo-tagger* (print (get-features "Og" :net (cdr names)))) (defun find-name (token) (with-slots (names) *tagger* ;;*nbo-tagger* (let* ((max-token nil) (features ())) (labels ((find-max-token (token str wc) (when token (let* ((extended-str (if str (concat str " " (effective-token-value token)) (effective-token-value token))) (ef (get-features extended-str :net (cdr names)))) (when ef (setf max-token token features ef)) (when (< (incf wc) $expression-max-length) (find-max-token (next-str-token token) extended-str wc)))))) (find-max-token token nil 0) (when *debug-mem* (when max-token (print (list :find-name max-token (mapcar (lambda (f) (code-features (cdr f))) features))))) (values max-token features))))) (defun subsumes-p (bw1 bw2) (loop for b1 across bw1 for b2 across bw2 always (or (zerop b1) (and (= 1 b1) (= 1 b2))))) ;;(print (subsumes-p #*10001 #*11001)) (defun tag-as-name (token) (multiple-value-bind (max-token features) (find-name token) (cond ((null max-token) nil) ((eq token max-token) (setf (token-features token) (append (token-features token) features)) (when *record-tokenization-p* (push :nm (token-used-rules token))) (or (token-next token) t)) (t (let ((conc-token (concatenate-tokens token max-token :new-p t :label :nm))) (setf (token-features conc-token) features) (when *record-tokenization-p* (push :nm (token-used-rules conc-token))) (or (token-next conc-token) t)))))) (defun tag-name-suffix (token) (with-slots (suffixes) *tagger* ;;(print (mapcar #'car (token-features token))) (dolist (f (token-features token)) ;;(print (list token (token-features token))) (destructuring-bind (lemma . 
bv) f (let* ((suffix (find-if (lambda (sx) (let ((sx (if (consp sx) (car sx) sx))) (if (char= (char sx 0) #\*) (and (= (length lemma) (1- (length sx))) (string-equal lemma sx :start2 1)) (and (> (length lemma) (length sx)) (string-equal lemma sx :start1 (- (length lemma) (length sx))))))) suffixes)) (suffix-tag (cond ((null suffix) nil) ((consp suffix) (cadr suffix)) (t suffix)))) (when suffix-tag (add-features bv (intern (string-upcase (concat "<*" suffix-tag ">")) :cgp)))))))) #+test (print (gethash '<*vei> (feature-table *nbo-tagger*))) (defun tag-as-foreign-word (token) (with-slots (language) token (when (and language (not (eq language (language *tagger*)))) (unless (token-features token) (setf (token-features token) (list (%code-features (effective-token-value token) 'foreign)))) (when *record-tokenization-p* (push :fgn (token-used-rules token))) (or (token-next token) t)))) #+:nny-parser (defun tag-as-nny (token) (let ((*tagger* *nny-tagger*)) (let ((next-token (tag-as-word token))) (when next-token (translate-features token *tagger* *nbo-tagger*) next-token)))) #+:nny-parser (defun tag-as-nny-compound (token) (let ((*tagger* *nny-tagger*)) (let ((next-token (tag-as-compound token))) (when next-token (translate-features token *tagger* *nbo-tagger*) next-token)))) (defmethod translate-features ((token token) from-tagger to-tagger) (let ((*tagger* from-tagger)) (dolist (w+fv (token-features token)) (when (cdr w+fv) (setf (cdr w+fv) (let* ((from-features (code-features (cdr w+fv))) (*tagger* to-tagger)) (apply #'encode-features (cons 'nynorsk (substitute 'ent 'eint from-features))))))))) ;; new 31.7.2001 (defun tag-from-memory (token &key return-nil-p) #+debug(print token) (when (and *token-memory* (sentence-start-p token) ;;(null (token-prev token)) (upper-case-p (char (effective-token-value token) 0))) (let* ((word (string-downcase (effective-token-value token))) (memoized-tokens (gethash word (token-table (multi-tagger-memory *token-memory*))))) (dolist (memoized-token memoized-tokens) ;; closed categories previously wrongly memoized as subst prop should not be tagget from memory (unless (find-if (lambda (pos) (loop for lemma.features in (lc-features memoized-token) thereis (and (cdr lemma.features) (has-feature-p (cdr lemma.features) pos)))) '(det konj subj adv prep pron)) (let* ((prop-feature (find-if (lambda (feature) (has-features-p (cdr feature) '(subst prop))) (token-features memoized-token))) (has-already-prop-feature-p (and prop-feature (find-if (lambda (feature) (has-features-p (cdr feature) '(subst prop))) (token-features token))))) (when (and prop-feature (not has-already-prop-feature-p)) ;; *** don't take over all features! (not gen!) (setf (token-features token) (cons prop-feature (token-features token)) (lc-features token) (lc-features memoized-token)) (when *debug-mem* (print (list :tag-from-memory token :tf (mapcar (lambda (f) (code-features (cdr f))) (token-features token)) :lcf (mapcar (lambda (f) (code-features (cdr f))) (lc-features token))))) ;; new 22.10.2003 (unless return-nil-p (return-from tag-from-memory (or (token-next token) t))))))) (token-features token))) nil) (defmethod combine-equal-features ((token t)) token) (defmethod combine-equal-features ((token token)) (let ((combined-p nil)) (loop for (form+features . rest) on (token-features token) for others = (collecting (loop for rest-features on rest do (destructuring-bind (form1 . 
features1) (car rest) (when (equal features1 (cdr form+features)) (collect form1) (setf (car rest) nil))))) when others do (setf combined-p t (car form+features) (cons (car form+features) others))) (when combined-p (setf (token-features token) (remove-if #'null (token-features token)))) token)) ; *sentence* #+old (defmethod tag-sentence ((token token)) (tag-sentence (or (tag-as-multi-word-expression token) (tag-as-punctuation token) (tag-as-word token) (tag-as-proper-noun token) (tag-as-compound token) (tag-as-interjection token) (tag-as-proper-noun token :second-try t) (tag-as-unknown token)))) (defmethod tag-sentence ((token t) &key &allow-other-keys) nil) ;; for testing #+buggy (defun tag-word (word) (let ((token (get-token :value word :prev (get-token :value word)))) (tag-sentence token) (token-features token))) ;; the TAG-AS-functions return, if successful, the next untagged token ;; (or T if there is no token left), else NIL ;; the longest match wins ;; if there is equalty, prefer number before expression before uncomprehensible (defun tag-as-multi-word-expression (token &optional (iterate-p t)) (multiple-value-bind (number-token number-features) (find-number token) (multiple-value-bind (exp-token exp-features) (find-expression token) (multiple-value-bind (unc-token unc-features) (find-uncomprehensible-expression token) (multiple-value-bind (max-token features) (if (or (token< number-token exp-token) (token< number-token unc-token)) (if (token< exp-token unc-token) (values unc-token unc-features) (values exp-token exp-features)) (values number-token number-features)) (when max-token (cond (iterate-p (let ((conc-token (concatenate-tokens token max-token :new-p t :label :mw))) (setf (token-features conc-token) (if (or (eq features number-features) (eq features unc-features)) (mapcar (lambda (f) (cons (effective-token-value conc-token) f)) features) features)) (when *record-tokenization-p* (pushnew :mw (token-used-rules conc-token))) (or (token-next conc-token) t))) (t (setf (token-features token) (if (or (eq features number-features) (eq features unc-features)) (mapcar (lambda (f) (cons (effective-token-value token) f)) features) features)) (or (token-next token) t))))))))) (defun tag-as-one-word-expression (token) (let ((ef (expression-features (effective-token-value token)))) (when ef (setf (token-features token) ef) (or (token-next token) t)))) ;; merges tokens and concatenates the token strings ;; If NEW-P is true, the concatenated token is a fresh token which is inserted into the chain (defun concatenate-tokens (first-token last-token &key new-p features label) #+debug(print (list :t first-token :m last-token)) (labels ((merge-tokens (token string) (cond ((not (eq token last-token)) (let* ((next-token (token-next token)) (next-value (effective-token-value next-token))) (merge-tokens (remove-token token) (if (or (eq :newline next-value) (insignificant-token-p next-token)) string (concat string (if (or (right-quote-token-p (next-str-token token)) (right-punctuation-token-p (next-str-token token)) (left-quote-p (last-char string))) "" " ") next-value))))) (new-p ;;(print (list :concatenating first-token last-token)) (let ((new-token (make-instance (class-of token) :chain (token-chain token) :prev (token-prev token) :next (token-next token) :expansion #-test (cons first-token last-token) #+test (cons (if (token-expansion first-token) (car (token-expansion first-token)) first-token) (if (token-expansion last-token) (cdr (token-expansion last-token)) last-token)) :value string 
:whitespace (token-whitespace first-token) :features (or features (token-features token)) :used-rules (list (or label :rx)) :position (token-stream-position first-token)))) ;;(print (list :-> new-token)) (if-let (prev (token-prev token)) (setf (token-next prev) new-token );;(token-prev token) nil) (setf (first-token (token-chain token)) new-token)) (if-let (next (token-next token)) (setf (token-prev next) new-token );;(token-next token) nil) (setf (last-token (token-chain token)) new-token)) )) (t (setf (token-value last-token) string (token-normalized-value last-token) nil (token-stream-position last-token) (token-stream-position first-token) (token-whitespace last-token) (token-whitespace first-token)) last-token)))) (merge-tokens first-token (effective-token-value first-token)))) ;; *sentence* ;; debug ;(defparameter *token* nil) ;; returns the rightmost of the successive tokens (starting with TOKEN) matching the regexp (defun token-match (regexp token &key end-test) (regex:string-match regexp token :exact t :maximal t :end-test end-test)) (defun token< (token1 token2) (or (and (null token1) token2) (and token1 token2 (or (eq token1 (token-prev token2)) (token< token1 (token-prev token2)))))) #+test (regex::string-match regex::*number-regexp* "10") #+bit-vectors-ignore (defun find-number (token) (let* ((last-matched-token (token-match regex::*number-regexp* token :end-test (lambda (token) (and token (equal (effective-token-value token) ".") (not (token-next token)))))) (max-token last-matched-token) (features ())) (when last-matched-token (push (cond ((and (eq last-matched-token token) (string= (effective-token-value token) "1")) (list 'det 'kvant 'ent)) ((char= (u:last-char (effective-token-value token)) #\.) (list 'adj ' 'fl)) (t (list 'det 'kvant 'fl))) features)) (labels ((check-regexp (regexp f &optional add-if-eq) (setf last-matched-token (token-match regexp token)) (cond ((and add-if-eq (eq max-token last-matched-token)) (push f features) #+ignore (setf features (append features f))) ((token< max-token last-matched-token) (setf features (list f) max-token last-matched-token)) (t nil)))) (check-regexp regex::*special-number-regexp* (list 'det 'kvant)) (check-regexp regex::*area-measure-regexp* (list 'det 'kvant)) (check-regexp regex::*rational-number-regexp* (list 'det 'kvant)) (check-regexp regex::*date-regexp* (list 'subst ') t) (check-regexp regex::*time-regexp* (list 'subst ') t) (check-regexp regex::*amount-regexp* (list 'det ') t) (check-regexp regex::*uppercase-roman-numeral-regexp* (list 'det ') t) (when (sentence-start-p token) (check-regexp regex::*sentence-start-roman-numeral-regexp* (list 'det ') t)) (values max-token features)))) (defun find-number (token) (let* ((last-matched-token (token-match regex::*number-regexp* token :end-test (lambda (token) (and token (equal (effective-token-value token) ".") (not (token-next token)))))) (max-token last-matched-token) (features ())) (when last-matched-token (push (cond ((and (eq last-matched-token token) (string= (effective-token-value token) "1")) (encode-features 'det 'kvant 'ent)) ((char= (last-char (effective-token-value token)) #\.) 
(encode-features 'adj ' 'fl)) (t (encode-features 'det 'kvant 'fl))) features)) (labels ((check-regexp (regexp f &optional add-if-eq) (setf last-matched-token (token-match regexp token)) (cond ((and add-if-eq (eq max-token last-matched-token)) (push f features) #+ignore (setf features (append features f))) ((token< max-token last-matched-token) (setf features (list f) max-token last-matched-token)) (t nil)))) (check-regexp regex::*special-number-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*area-measure-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*rational-number-regexp* (encode-features 'det 'kvant)) (check-regexp regex::*date-regexp* (encode-features 'subst ') t) (check-regexp regex::*time-regexp* (encode-features 'subst ') t) (check-regexp regex::*amount-regexp* (encode-features 'det ') t) (check-regexp regex::*uppercase-roman-numeral-regexp* (encode-features 'det ') t) (when (sentence-start-p token) (check-regexp regex::*sentence-start-roman-numeral-regexp* (encode-features 'det ') t)) #+debug(print (list :token token :max max-token)) (values max-token features)))) ;; TO DO: eliminate consing (defun find-expression (token) (let* ((max-token nil) (features ())) (labels ((find-max-token (token str wc) (when token (let* ((extended-str (if str (concat str " " (effective-token-value token)) (effective-token-value token))) ;; check only if extended-str consists of more than one word (ef (when str (expression-features extended-str)))) (when ef (setf max-token token features ef)) (when (< (incf wc) $expression-max-length) (find-max-token (next-str-token token) extended-str wc)))))) (find-max-token token nil 0) (if (sentence-start-p token) ;; check if lowercase expression exists (let ((orig-max-token max-token) (orig-features features) (lc-str (downcase-first (effective-token-value token)))) (find-max-token (next-str-token token) lc-str 1) ;; compare found expressions for downcased and original string, return longest ;; match if length differs, or append features if matches are of equal length (cond ((eq max-token orig-max-token) (values max-token (append orig-features features))) ((token< orig-max-token max-token) (values max-token features)) (t (values orig-max-token orig-features)))) (values max-token features))))) ;; old ;; (dotimes (i 1000) (expression-features "i morgen")) took 505 milliseconds (0.505 seconds) to run. ;; 1,104,008 bytes of memory allocated. ;; new, with bit-vectors ;; (dotimes (i 1000) (expression-features "i morgen")) took 985 milliseconds (0.986 seconds) to run. ;; 1,240,016 bytes of memory allocated. ;; without bit-vector translation ;; (dotimes (i 1000) (expression-features "i morgen")) took 314 milliseconds (0.314 seconds) to run. ;; 944,000 bytes of memory allocated. ;; 8.4.2002 Allegro/Decentius: ;; 32 milliseconds ;; 347,000 bytes of memory allocated. ;; *** BUG: (d "i morgen") (defun expression-features (exp) "Returns the features of an expression." ;; TO DO: simplify this: put them into one net. 
(or (lemma-and-features exp :net (expressions *tagger*)) (lemma-and-features exp :net (abbreviations *tagger*)) (lemma-and-features exp :net (word-like-abbreviations *tagger*)) (lemma-and-features exp :net (titles *tagger*)) (lemma-and-features exp :net (symbols *tagger*)))) #+old (defun expression-features (exp) (mapcar (lambda (value) (let* ((colon-pos (position #\: value)) (lemma (subseq value 0 colon-pos)) (features (read-from-string (u:concat "(" (subseq value (1+ colon-pos)) ")")))) (cons lemma features))) ;; TO DO: simplify this: put them into one net. (or (string-values *expressions* exp) (string-values *abbreviations* exp) (string-values *word-like-abbreviations* exp) (string-values *titles* exp) (string-values *symbols* exp)))) ;; TO DO: account for quote nesting (defun find-uncomprehensible-expression (token) (when (not (find-if-not #'left-quote-p (effective-token-value token))) (let ((known 0) (unknown 0)) (labels ((test-token (token) (cond ((null token) ;; should not happen in well-formed text ;; means that quotes are not balanced and we reached the end ;; of the sentence before a right quote nil) ((find-if-not #'right-quote-p (effective-token-value token)) ;; OBS: we check also for compounds (if (or (word-p token) ;; TO DO: look up downcased etc. variants? (most-probable-compound-last-chunk (effective-token-value token))) (incf known) (incf unknown)) (test-token (next-str-token token))) ((> unknown known) ;; #-bit-vectors (values token (list (list 'subst 'prop))) (when *debug-mem* (print (list :find-uncomp-exp token '(subst prop)))) (values token (list (encode-features 'subst 'prop)))) (t nil)))) (test-token token))))) (defun %code-features (lemma &rest features) (cons lemma (apply #'encode-features features))) #+test (let ((*tagger* *nbo-tagger*)) (print (%code-features "fifi" 'subst 'foreign))) (defun all-p (predicate codes) (not (find-if-not predicate codes))) (defun tag-as-punctuation (token) (let* ((str (effective-token-value token)) (features (cond ((string= str ".") (list (%code-features "$." 'CLB '))) ((morph::all-p (lambda (c) (char= c #\.)) str) (list (%code-features "$..." 'CLB '))) ((string= str "|") (list (%code-features "$|" 'CLB '))) ((string= str "¶") (if (prev-str-token token) ;; headline if terminates sentence (list (%code-features "$¶" 'CLB ')) (list (%code-features "$¶")))) ((string= str ",") (list (%code-features "$," 'CLB ') (%code-features "$," '))) ((all-p (lambda (c) (char= c #\!)) str) (list (%code-features "$!" 'CLB '))) ((string= str ":") (list (%code-features "$:" 'CLB '))) ((string= str ";") (list (%code-features "$;" 'CLB '))) ((all-p (lambda (c) (char= c #\?)) str) (list (%code-features "$?" 'CLB '))) ((string= str "-") (list (%code-features "$-" '))) ((string= str "--") (list (%code-features "$--" '))) ((not (find-if-not #'quote-p str)) (list (%code-features "$\"" '))) ((string= str "(") (list (%code-features "$(" '))) ((string= str ")") (list (%code-features "$)" '))) ((string= str "[") (list (%code-features "$[" '))) ((string= str "]") (list (%code-features "$]" '))) (t nil)))) (when features (when *record-tokenization-p* (push :pct (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)))) #+bit-vectors-ignore (defun tag-as-punctuation (token) (let* ((str (effective-token-value token)) (features (cond ((string= str ".") (list (list "$." 'CLB '))) ((all-p (lambda (c) (char= c #\.)) str) (list (list "$..." 
'CLB '))) ((string= str "|") (list (list "$|" 'CLB '))) ((string= str ",") (list (list "$," 'CLB ') (list "$," '))) ((all-p (lambda (c) (char= c #\!)) str) (list (list "$!" 'CLB '))) ((string= str ":") (list (list "$:" 'CLB '))) ((string= str ";") (list (list "$;" 'CLB '))) ((all-p (lambda (c) (char= c #\?)) str) (list (list "$?" 'CLB '))) ((string= str "-") (list (list "$-" '))) ((string= str "--") (list (list "$--" '))) ((not (find-if-not #'quote-p str)) (list (list "$\"" '))) ((string= str "(") (list (list "$(" '))) ((string= str ")") (list (list "$)" '))) (t nil)))) (when features (when *record-tokenization-p* (push :pct (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)))) ;; preliminary (defun right-punctuation-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (<= 1 (length str)) (find (char str 0) $right-punctuation)))) ;; preliminary (defun quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (quote-p (char str 0))))) (defun left-quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (left-quote-p (char str 0))))) (defun quote-hyphen-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (string= "-" str)))) (defun left-parenthesis-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (left-parenthesis-p (char str 0))))) (defun right-quote-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (= (length str) 1) (right-quote-p (char str 0))))) (defun stroke-token-p (token) (let ((str (effective-token-value token))) (and (stringp str) (or (and (= (length str) 1) (char= (char str 0) #\-)) (string= str "--"))))) (defparameter *sentence-start-p-fun* nil) (defun sentence-start-p (token) (or (null (prev-str-token token)) (and (null (prev-str-token (prev-str-token token))) (or (equal (effective-token-value (prev-str-token token)) "\"") (quote-token-p (prev-str-token token)) (stroke-token-p (prev-str-token token)) (and *sentence-start-p-fun* (funcall *sentence-start-p-fun* token)))))) (defun first-uppercase-p (str) (and (not (zerop (length str))) (upper-case-p (char str 0)))) ;; look up in lexicon; merge with the next function (defun word-p (token) (let* ((word (string-trim "\"" (effective-token-value token))) (features (get-features word))) ; TO DO: remove unnecessary consing here (when features (return-from word-p t)) (when (first-uppercase-p word) ;; word starts with capital letter (let ((first-lowercase-word (downcase-first word)) (lowercase-word (string-downcase word))) ;; lookup if we are not at sentence start or right after left quote (when (or (sentence-start-p token) ;; changed 22.09.2000 (left-parenthesis-token-p (prev-str-token token)) (quote-hyphen-token-p (prev-str-token token)) (left-quote-token-p (prev-str-token token))) (when (get-features first-lowercase-word) (return-from word-p t))) ;; more than one capital letter ; ** what if first is not capital? (when (string/= lowercase-word first-lowercase-word) (when (get-features lowercase-word) (return-from word-p t))) ;; word consists of exactly one capital letter (when (and (= (length word) 1) (string/= word lowercase-word)) (unless (or (sentence-start-p token) (quote-token-p (token-prev token))) (when (get-features first-lowercase-word) (return-from word-p t))) )) ;; TO DO: apostrophe (??) diacritics?? 
nil))) #+test (get-features "Olav") #+test (get-features "Dr.") #+test (get-features "dr.") ;; new 23.01.2001 (defun append-features (features1 features2) ;;(print (list :appending features1 features2)) (dolist (f features1) (pushnew f features2 :test #'equal)) features2) (defparameter *sentence-initial-lc-features-p* t) ;; look up in lexicon (defun tag-as-word (token &key (net (lexicon *tagger*)) (pn-at-sentence-start-p t) no-subst-prop-p (features-fn #'get-features)) (let* ((word (string-trim $quote (effective-token-value token))) (features (append-features (token-features token) (funcall features-fn word :net net)))) (when (first-uppercase-p word) ;; word starts with capital letter (let ((first-lowercase-word (ndowncase-first (copy-seq word))) (lowercase-word (string-downcase word)) (capitalized-word (when (sentence-uppercase-p (token-chain token)) (string-capitalize word)))) ;; lookup if we are not at sentence start or right after left quote (when (or (sentence-start-p token) (left-parenthesis-token-p (prev-str-token token)) (quote-hyphen-token-p (prev-str-token token)) (left-quote-token-p (prev-str-token token))) (setf features (append-features features (funcall features-fn first-lowercase-word :net net))) (when *sentence-initial-lc-features-p* (add-lowercase-features token))) ;; sentence was all uppercase (when capitalized-word (setf features (append-features features (funcall features-fn capitalized-word :net net)))) ;; more than one capital letter ; ** what if first is not capital? (when (string/= lowercase-word first-lowercase-word) (setf features (append-features features (funcall features-fn lowercase-word :net net)))) ;; word consists of exactly one capital letter (when (and (= (length word) 1) (string/= word lowercase-word)) (unless (or (sentence-start-p token) (quote-token-p (prev-str-token token))) (when *debug-mem* (print (list :prop-one-char token))) (unless (sentence-start-p token) (add-lowercase-features token)) (setf features (cons (cons word (encode-features 'subst 'prop)) features)) (setf features (append-features features (funcall features-fn first-lowercase-word :net net)))))) ;; TO DO: apostrophe (??) diacritics?? 
) (cond (features (when *record-tokenization-p* (push :wd (token-used-rules token))) (setf (token-features token) features) (or (token-next token) t)) ((and pn-at-sentence-start-p (sentence-start-p token) (let* ((word (effective-token-value token)) (hyphen-pos (position #\- word))) (not (and hyphen-pos (< hyphen-pos (1- (length word))) (lower-case-p (char word (1+ hyphen-pos))))))) ;; we suspect that it is an unrecognized name (when *debug-mem* (print (list :prop-unrec-name token))) (unless no-subst-prop-p (setf (token-features token) (list (cons word (encode-features 'subst 'prop))))) nil) (t nil)))) (defun suffix-p (str sfx) "checks if STR ends in SFX" (let ((start (- (length str) (length sfx)))) (and (<= 0 start) (string= str sfx :start1 start)))) (defun find-genitive-root (word) (cond ((suffix-p word "ss") nil) ((find-if (lambda (sfx) (suffix-p word sfx)) '("s'" "z'" "sch'" "sh'" "sj'" "x'" "c'")) (subseq word 0 (- (length word) 1))) ((suffix-p word "'s") (subseq word 0 (- (length word) 2))) ((suffix-p word "s") (subseq word 0 (- (length word) 1))) (t nil))) #+test (setf *cg* *nbo-cg* *tagger* *nbo-tagger*) (defun get-features (word &key (net (lexicon *tagger*))) (when net (let* ((gen-root (find-genitive-root word)) (lemma+features (lemma-and-features word :net net)) (gen-lemma+features (when gen-root (add-gen-feature (lemma-and-features gen-root :net net))))) (append lemma+features gen-lemma+features)))) (defun get-scarrie-features (word &key add-styles-p &allow-other-keys) (when *scarrie-net* (let* ((gen-root (find-genitive-root word)) (lemma+features (scarrie-lemma-and-features word :add-styles-p add-styles-p)) (gen-lemma+features (when gen-root (add-gen-feature (scarrie-lemma-and-features gen-root :add-styles-p add-styles-p))))) (append lemma+features gen-lemma+features)))) #+bit-vectors-ignore (defun add-gen-feature (features-list) ;; ... and remove readings that are incompatible with gen (u:collecting (dolist (features features-list) (cond ((find-if (lambda (f) (find f '(konj pron interj prep gen))) (cdr features)) nil) ((and (find 'verb (cdr features)) (find 'imp (cdr features))) nil) ((find 'gen (cdr features)) (u:collect features)) (t (u:collect ;(list* (car features) 'gen (cdr features)) (append features (list 'gen)))))))) (defun add-gen-feature (features-list) ;; ... and remove readings that are incompatible with gen (collecting (dolist (features features-list) (let ((feature-vector (cdr features))) (cond ((thereis-feature-p feature-vector '(konj pron interj prep gen)) ; *** why gen? 
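;; Readings that already carry KONJ, PRON, INTERJ, PREP or GEN are dropped from
;; the genitive expansion (this clause returns NIL) instead of receiving an
;; extra GEN feature.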
nil) ((has-features-p feature-vector '(verb imp)) nil) ((has-feature-code-p feature-vector (feature-code 'gen)) ; TO DO: change the others appropriately (collect features)) (t (set-feature feature-vector 'gen) #+ignore (collect (append features (list 'gen))) (collect features))))))) ;; TO DO: do some memoizing here (defmethod sentence-uppercase-p ((sentence sentence)) "Checks if whole sentence is in uppercase" (labels ((upper-p (token) (or (null token) (and (or (not (stringp (effective-token-value token))) (not (find-if #'lower-case-p (effective-token-value token)))) (upper-p (token-next token)))))) (upper-p (first-token sentence)))) #-bit-vector (defmethod proper-noun-p ((token token)) (find-if (lambda (features) (find 'prop (cdr features))) (token-features token))) #+bit-vector (defmethod proper-noun-p ((token token)) (find-if (lambda (features) (has-feature-p (cdr features) 'prop)) (token-features token))) #+old (defmethod add-lowercase-features ((token token)) nil) (defmethod add-lowercase-features ((token token)) #+debug(print (list :add-lc token)) (let* ((word (string-downcase (string-trim $quote (effective-token-value token)))) (features (or (get-features word) (tag-compound word)))) (when features ;; add feature 'prop as sign for starting proper noun phrase (loop for (w . fv) in features do (progn w (set-feature fv 'prop) #+ignore(set-feature fv 'lc))) (setf (lc-features token) features) (when *debug-mem* (print (list :add-lowercase-features token (token-chain token) :features (mapcar (lambda (f) (code-features (cdr f))) (token-features token)) :lc-features (mapcar (lambda (f) (code-features (cdr f))) (lc-features token)))))))) (defun tag-as-proper-noun (token &key second-try) #+debug(print (list :token token :secondp second-try)) (let ((word (string-trim $quote (effective-token-value token)))) (when ;; not at sentence start, ;; and second try, where we are less demanding, or: ;; word starts with uppercase, ;; does not contain hyphen followed by lowercase (or (and (not second-try) (not (sentence-start-p token)) (first-uppercase-p word) (or (not (sentence-uppercase-p (token-chain token))) (when-let (prev-token (prev-str-token token)) (proper-noun-p prev-token))) (let ((hyphen-pos (position #\- word))) (or (not hyphen-pos) (= hyphen-pos (1- (length word))) (not (lower-case-p (char word (1+ hyphen-pos))))))) (and second-try (sentence-start-p token))) (let ((gen-root (find-genitive-root word))) (cond ((not gen-root) ;; seems to be a proper noun (when *debug-mem* (print (list :prop-1 token))) (setf (token-features token) ;; bit-vector has to be fresh (list (cons word (encode-features 'subst 'prop)))) ;; add lowercase-features (when (or (not (sentence-start-p token)) #+ignore *sentence-initial-lc-features-p*) (add-lowercase-features token))) ((and (suffix-p word "s") (not (suffix-p word "ss"))) ;; gen or non-gen (when *debug-mem* (print (list :prop-2 token))) (setf (token-features token) (list (cons word (encode-features 'subst 'prop)) (cons gen-root (encode-features 'subst 'prop 'gen))))) (t ;; gen (when *debug-mem* (print (list :prop-gen token))) (setf (token-features token) (list (cons gen-root (encode-features 'subst 'prop 'gen))))))) (when *token-memory* (let* ((word (effective-token-value token)) (memoized-tokens (gethash (string-downcase word) (token-table (multi-tagger-memory *token-memory*))))) (dolist (memoized-token memoized-tokens) (when (and (upper-case-p (char (effective-token-value memoized-token) 0)) ;; is this sensible? 
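;; Memory merge: for each earlier token memoized under the same downcased form,
;; the readings just found for the current token are appended to the memoized
;; token's readings, provided the memoized token is itself capitalized and its
;; surface form differs from the current one (the surrounding checks).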
(not (string= (effective-token-value token) (effective-token-value memoized-token))) ;; or this? #+test (not (upper-case-p (char (effective-token-value token) 0)))) (setf (token-features memoized-token) (append-features (token-features token) (token-features memoized-token)))) ))) (when *record-tokenization-p* (push :pn (token-used-rules token))) (when (or (not (sentence-start-p token)) #+ignore *sentence-initial-lc-features-p*) (or (token-next token) t))))) ;(tag-word "St.Olaf's") ;(mt "TROSSET HIV-FAREN FOR Å FÅ BARN MED TINE ELLER STINE") ;(mt "DR. OLAV TRYGGE STORVIK") (defun grammar-code-to-features (str code &key features-as-list-p) (let ((str+code (concat str ":" (%alpha-to-decimal code)))) (declare (dynamic-extent str+code)) (lemma-and-features str+code :decompress-base str :features-as-list-p features-as-list-p :net (code-lexicon *tagger*)))) #|| (string-net::print-strings (lemmata-forms *tagger*)) (print (grammar-code-to-features "forvaltningssystemene" "810:4")) ;(grammar-code-to-features "base" "702:0") (let ((*tagger* (multi-tagger *nbo-cg*))) (grammar-code-to-features "base" "702:0")) (let ((*tagger* (multi-tagger (newest-cg :nbo)))) #-ignore (string-net::print-strings (code-lexicon *tagger*)) #+ignore (match-string (code-lexicon *tagger*) "nummer:è::è") #+ignore (Print (string-values (code-lexicon *tagger*) "fisk")) #+ignore (grammar-code-to-features "nummer" "835:0")) (%decode-str (%alpha-to-decimal "835:1")) (alpha-to-decimal "835:1") (decimal-to-alpha 835 1) (let ((*tagger* (multi-tagger *nbo-cg*))) (string-net::string-values (code-lexicon *tagger*) "nummer")) (print (%decode-str "ë::è")) "nummer:ë::è" "nummer:è::´" "nummer:!è::è" "nummer:!ë::è" "nummer:#è::´" ||# #+old (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) ;; TO DO: remove duplicates (collecting (dolist (code codes) (print code) (dolist (lemma+features (grammar-code-to-features sfx code)) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" lemma)) (append features (list "samset"))))))))))))) #+bit-vectors-ignore (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (u:collecting (dolist (code codes) ;; remove forl_...-codes (cmp. def. of v_fuge_fullform, v_fuge_lemma) (let ((colon-pos (position #\: code))) (dolist (lemma+features (if (= colon-pos 3) (grammar-code-to-features sfx code) (grammar-code-to-features (concat "-" sfx) (subseq code 2)))) (destructuring-bind (lemma . features) lemma+features (u:collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (append features (list 'samset))))))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (loop for f1 in (cdr fl1) and f2 in (cdr fl2) always (eq f1 f2)))))))))) #+ignore (let* ((*tagger* *nbo-tagger*) (*analyser-lexicon* (compound-analyser *tagger*))) ;;(morph::print-ranked-analyses "glitterheim") (print (tag-compound "glitterheim"))) (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx .
codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (collecting (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (set-feature features 'samset)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))))))) #+old (defun tag-compound (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when (print analysis) (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx))))) (remove-duplicates (collecting (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (collect (cons (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) (set-feature features 'samset)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))))))) ;; used by UD-base (defun compound-lemma (word) (let ((analysis (most-probable-compound-last-chunk word t))) (when analysis (destructuring-bind (sfx . codes) analysis (let ((pfx (subseq word 0 (- (length word) (length sfx)))) (lemma-list ())) (dolist (lemma+features codes) (destructuring-bind (lemma . features) lemma+features (pushnew (concat pfx (string-left-trim "-" (if (find-if #'lower-case-p word) lemma (string-upcase lemma)))) lemma-list :test #'string=))) lemma-list))))) ;(tag-compound "edderkoppskall") ;; 517 ms #+test (time (with-open-file (stream "projects:cgp;multi-tagger;texts;korea.text") (map-sentences stream #'identity))) ;(tag-compound "kunnskap") ;(most-probable-compound-last-chunk "kunnskapsbase") ;(time (analyse-compound ",,,")) ;(analyse-compound "OLAV") ; (most-probable-compound-last-chunk "OLAV" t) (defun tag-as-compound (token &key hyphenated-only-p) (let ((word (effective-token-value token))) (when (and (or (not hyphenated-only-p) (let ((hyphen-pos (position #\- word))) (and hyphen-pos (< hyphen-pos (1- (length word))) (lower-case-p (char word (1+ hyphen-pos)))))) (> (length word) 3)) (let* ((word (if (and (quote-p (char word 0)) (quote-p (last-char word))) (string-trim $quote word) word)) (gen-root (find-genitive-root word)) (lemma+features (append-features (token-features token) (tag-compound word))) (gen-lemma+features (when gen-root (add-gen-feature (tag-compound gen-root))))) (when (or gen-lemma+features lemma+features) (setf (token-features token) (append lemma+features gen-lemma+features)) (when *record-tokenization-p* (pushnew :cp (token-used-rules token))) (or (token-next token) t)))))) ;; this is easier in Perl... *** use regexp! 
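;; Hypothetical usage sketch for COMPOUND-LEMMA above, following the #+test pattern
;; used elsewhere in this file; the actual analyses depend on the compound analyser
;; of *NBO-TAGGER*, so "laksebolle" is only assumed to decompose here.
#+test
(let* ((*tagger* *nbo-tagger*)
       (*analyser-lexicon* (compound-analyser *tagger*)))
  (print (compound-lemma "laksebolle")))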
(defun interjection-p (word &optional (start 0)) "checks for three consecutive vowels" (let ((vowel-pos (position-if (lambda (c) (morph::vowelp (char-downcase c))) word :start start))) (cond ((null vowel-pos) nil) ((> vowel-pos (- (length word) 3)) nil) ((and (morph::vowelp (char-downcase (char word (+ vowel-pos 1)))) (morph::vowelp (char-downcase (char word (+ vowel-pos 2))))) t) (t (interjection-p word (+ vowel-pos 2)))))) #+test (interjection-p "aaargh") (defun tag-as-interjection (token) (let ((word (effective-token-value token))) (when (interjection-p word) ;#-bit-vectors (setf (token-features token) (list (list word 'interj))) (setf (token-features token) (list (%code-features word 'interj))) (when *record-tokenization-p* (push :ij (token-used-rules token))) (or (token-next token) t)))) (defun tag-as-symbol (token) (let ((word (effective-token-value token))) (when (or (and (= (length word) 1) (find (char word 0) "+<>@*#$%&/")) (not (find-if-not (lambda (c) (find c "1234567890+/,.")) word :end (1- (length word))))) (setf (token-features token) (list (%code-features word 'symb))) (when *record-tokenization-p* (push :sy (token-used-rules token))) (or (token-next token) t)))) (defparameter *unknown* ()) ;; no conditions here (defun tag-as-unknown (token) (unless (find (effective-token-value token) *unknown* :test #'equal) (push (effective-token-value token) *unknown*)) (unless (token-features token) (setf (token-features token) (list (%code-features (effective-token-value token) 'ukjent)))) (when *record-tokenization-p* (push :uk (token-used-rules token))) (or (token-next token) t)) (defmethod do-pending-operations ((sentence sentence)) nil) (defmethod do-pending-operations ((sentence regexp-sentence)) (with-slots (pending-operations cg) sentence (labels ((do-operation () (when pending-operations (funcall (car pending-operations) cg sentence) (setf pending-operations (cdr pending-operations)) (do-operation)))) (do-operation)))) #+test (print (feature-vector (multi-tagger *nbo-cg*))) (defun map-sentences (stream fun &key (sentence-class *sentence-class*) (multitag-p t) compare-cg print-function token-print-fn (encoding $encoding)) (let ((*line-buffer* (make-instance 'line-buffer :encoding encoding)) (*sentence-class* sentence-class) (delay-queue (when *token-memory* (make-queue))) (print-queue (when *token-memory* (make-queue))) (token-count 0)) (loop with sentence = (get-sentence :stream stream :sentence-class sentence-class :encoding encoding :cg *cg* :compare-cg compare-cg ;;:feature-vector (feature-vector (multi-tagger *cg*)) ;; don't remove! ) and next-sentence and terminator and token do (cond ((null sentence) ;; empty the queues (cond ((front delay-queue) #+debug(print :null-sentence-delay) (when-let (sentence (pop-queue delay-queue)) (apply-sentence-mapping-function sentence fun :compare-cg compare-cg) #+debug(when (front (regexp-delay-queue *token-memory*)) (print :regexp-delay)) ;; It might happen that the sentence is pushed onto the regexp-delay-queue, ;; then we must not print it here. 
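;; Drain phase: when SENTENCE is NIL (input exhausted), the queues (only used when
;; *TOKEN-MEMORY* is set) are emptied one sentence per iteration, the delay queue
;; first, then the regexp delay queue, then the print queue; the surrounding LOOP
;; keeps going until all three are empty.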
(if (front (regexp-delay-queue *token-memory*)) (when-let (sentence (apply-sentence-mapping-function (pop-queue (regexp-delay-queue *token-memory*)) fun :compare-cg compare-cg)) (do-pending-operations sentence) #+orig (funcall print-function sentence :token-print-fn token-print-fn)) #+orig (funcall print-function sentence :token-print-fn token-print-fn)))) ((front (regexp-delay-queue *token-memory*)) #+debug(print :null-sentence-regexp) (when-let (sentence (apply-sentence-mapping-function (pop-queue (regexp-delay-queue *token-memory*)) fun :compare-cg compare-cg)) (do-pending-operations sentence) (funcall print-function sentence :token-print-fn token-print-fn))) ((front print-queue) (funcall print-function (pop-queue print-queue) :token-print-fn token-print-fn)))) ((null *token-memory*) (multiple-value-setq (sentence next-sentence terminator token) (build-sentence sentence (tokenizer sentence) token terminator)) (funcall print-function (apply-sentence-mapping-function (if multitag-p (tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))) sentence) fun :compare-cg compare-cg) :token-print-fn token-print-fn)) (t (multiple-value-setq (sentence next-sentence terminator token) (build-sentence sentence (tokenizer sentence) token terminator)) (incf token-count (sentence-length sentence)) (let ((s (tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))))) (enqueue s delay-queue) (enqueue s print-queue)) #+debug(print (list :token-count token-count :context-size (context-size (multi-tagger-memory *token-memory*)))) (when-let (sentence (and (> token-count (context-size (multi-tagger-memory *token-memory*))) (pop-queue delay-queue) (pop-queue print-queue))) (decf token-count (sentence-length sentence)) ;; +-1?? (setf *sentence* sentence) (when-let (sentence (apply-sentence-mapping-function sentence fun :compare-cg compare-cg)) (funcall print-function sentence :token-print-fn token-print-fn))))) while (or next-sentence (and *token-memory* (or (not (empty-queue-p delay-queue)) (not (empty-queue-p print-queue)) (not (empty-queue-p (regexp-delay-queue *token-memory*)))))) do (setf sentence next-sentence)))) ;; restore sentence as before multitagging (defun map-retag-sentence (sentence fun &key (multitag-p t) print-function token-print-fn &allow-other-keys) (labels ((walk (token) (when token (setf (token-features token) nil) (when (and (token-next token) (token-expansion (token-next token))) (setf (token-next token) (car (token-expansion (token-next token))) (token-prev (token-next token)) token)) (let ((nv (token-normalized-value token))) ;;(print (list :nv nv)) (cond ((null nv) nil) #+ignore ;; this doesn't work unless we update both index files when a new word is inserted ((find #\Space nv) (let ((words (split nv #\Space))) (setf (token-normalized-value token) (car words)) (dolist (word (cdr words)) (setf token (insert-token token (get-token :value "" :normalized-value word :chain sentence :position (token-stream-position token))))))) ((string= nv "#") (setf (sentence-array sentence) (let ((new-array (make-array 0 :fill-pointer t :adjustable t))) (loop with i = 0 for tk across (sentence-array sentence) ;; do (print (list tk token)) unless (eq tk token) do (vector-push-extend tk new-array)) new-array)) ;;(describe (sentence-array sentence)) ;;(setf (token-normalized-value token) :insignificant) ))) (walk (token-next token))))) (walk (first-token sentence))) (funcall print-function (apply-sentence-mapping-function (if multitag-p (tag-sentence sentence) sentence) fun) 
:token-print-fn token-print-fn)) ;; *sentence* ;;(regexp-delay-queue *memory*) (defmethod apply-sentence-mapping-function ((sentence sentence) fun &key &allow-other-keys) (funcall fun sentence #+ignore(tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))))) (defmethod apply-sentence-mapping-function ((sentence compare-sentence) fun &key compare-cg &allow-other-keys) (with-slots (first-token) sentence (labels ((copy-feature-list (features) (when features (destructuring-bind ((str . fv) . rest) features (cons (cons str (copy-seq fv)) (copy-feature-list rest))))) (copy-features (token) (when token (with-slots (next features compare-features used-rules compare-used-rules) token (setf compare-features (copy-feature-list features) compare-used-rules (copy-seq used-rules)) (copy-features next)))) (switch (token) (when token (with-slots (next features compare-features used-rules compare-used-rules) token (let ((temp features)) (setf features compare-features compare-features temp temp used-rules used-rules compare-used-rules compare-used-rules temp)) (switch next))))) ;;(tag-sentence (merge-quoted-subphrases (merge-hyphenated-words sentence))) (copy-features first-token) (funcall fun sentence) (switch first-token) (let ((*cg* compare-cg)) (funcall fun sentence)) (switch first-token) sentence))) #+test (with-open-file (in-stream "projects:cgp;multitagger;texts;test.text") (map-sentences in-stream #'print-sentence)) #+test (with-open-file (stream "projects:cgp;multitagger;texts;test.tag" :direction :output :if-exists :supersede) (with-open-file (in-stream "projects:cgp;multitagger;texts;test.text") (map-sentences in-stream (lambda (s) (print-sentence s stream))))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;delkorp.tag" :direction :output :if-exists :supersede) (with-open-file (in-stream "projects:cgp;multitagger;texts;delkorp.text") (map-sentences in-stream (lambda (s) (print-sentence s stream))))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;korea.text") (map-sentences stream #'print-sentence)) #+test (with-open-file (stream "projects:cgp;multi-tagger;texts;delkorp.text") (map-sentences stream #'print-sentence)) #+test (time (with-open-file (stream "projects:cgp;multitagger;texts;korea.text") (map-sentences stream #'identity))) (defmethod print-sentence ((sentence sentence) &rest rest &key (expand-tokens-p t) ;; print-lc-features (token-print-fn #'print-token) &allow-other-keys) "prints a tagged sentence in CG style" (let ((token-print-fn (or token-print-fn #'print-token))) (labels ((walk (token concat-token) #+debug(print (list token concat-token)) (cond ((null token) nil) ((and (or (eq expand-tokens-p t) (find expand-tokens-p (token-used-rules token))) (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (apply token-print-fn token :expand-tokens-p expand-tokens-p :concat-token concat-token rest) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil))) sentence) (defmethod sentence-token-length ((sentence sentence) &key (expand-tokens-p t)) (let ((length 0)) (labels ((walk (token concat-token) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) 
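;; TOKEN-EXPANSION is assumed to be a (first . last) cons delimiting the expanded
;; sub-chain; the walk runs from FIRST up to and including LAST.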
((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (incf length) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil)) length)) ;; destructive! (defun remove-feature-inclusions (features) (loop for f-tail on features when (car f-tail) do (block loop (loop for f-t-tail on (cdr f-tail) when (car f-t-tail) do (let ((l.f1 (car f-tail)) (l.f2 (car f-t-tail))) (cond ((string/= (car l.f1) (car l.f2)) nil) ((subsumes-p (cdr l.f1) (cdr l.f2)) (setf (car f-tail) nil) (return-from loop)) ((subsumes-p (cdr l.f2) (cdr l.f1)) (setf (car f-t-tail) nil)) (t nil)))))) features) (defmethod print-token ((token token) &key (stream *standard-output*) concat-token additional-features expand-tokens-p (print-features t) (print-rules t) print-lc-features (print-attributes t) (print-special-tokens-p t) &allow-other-keys) (let ((value (effective-token-value token)) (features (remove-feature-inclusions (or (token-features token) (when concat-token (token-features concat-token))))) (lc-features (when print-lc-features (or (lc-features token) (when concat-token (lc-features concat-token))))) (attributes (token-attributes token))) (cond ((stringp value) (let ((expanded-mark (cond ((not expand-tokens-p) "") ((not concat-token) "") ((eq token (car (token-expansion concat-token))) (if (eq token (cdr (token-expansion concat-token))) "* " "[ ")) ((eq token (cdr (token-expansion concat-token))) "] ") (t "| ")))) (if print-features (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "~a\"<~a>\" ~{ ~a~}" expanded-mark (if *debug-mem* token value) (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "~a\"<~a>\"" expanded-mark value))) (write-string value stream))) ; ** fix this! 
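;; Output format: the word-form line "<form>" written above is followed by one
;; tab-indented reading line per surviving analysis, i.e. TAB "lemma" tag1 tag2 ...,
;; matching the CG style mentioned in PRINT-SENTENCE's docstring.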
(when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (when print-features (dolist (fl features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~{ ~a~}~)" #\Tab (car fl) (code-features (cdr fl)) additional-features))) (dolist (fl lc-features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)" #\Tab (car fl) (code-features (cdr fl)))))) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes)))))) (defmethod print-token ((token control-token) &key (stream *standard-output*) concat-token (print-rules t) (print-attributes t) (print-special-tokens-p t) &allow-other-keys) (let ((value (effective-token-value token)) (features (token-features token #+ignore(or concat-token token))) (control-features (token-control-features token #+ignore(or concat-token token))) (attributes (token-attributes token))) (cond ((stringp value) (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "\"<~a>\" ~{ ~a~}" value (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "\"<~a>\"" value))) (when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (loop for fl in features for cfl in control-features when (car fl) do (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)~(~{ ~a~}~)" #\Tab (car fl) (code-features (cdr fl)) cfl)) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes)))))) #+old (defmethod print-sentence ((sentence sentence) &rest rest &key #||(stream *standard-output*) (print-features t) (print-rules t) (print-attributes t) (print-special-tokens-p t)||# (expand-tokens-p t) &allow-other-keys) "prints a tagged sentence in CG style" (labels ((walk (token concat-token) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token) ;; avoid nesting (not concat-token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (let ((value (effective-token-value token)) (features (token-features (or concat-token token))) (attributes (token-attributes token))) (cond ((stringp value) (if print-features (cond ((and print-rules (token-used-rules token)) (destructuring-bind (mt-rule &rest rules) (reverse (token-used-rules token)) (format stream "\"<~a>\" ~{ ~a~}" value (if mt-rule (cons mt-rule (mapcar #'rule-string rules)) (mapcar #'rule-string rules))))) ((and print-attributes attributes) (format stream "~a~{ ~a~}" value attributes)) (t (format stream "\"<~a>\"" value))) (write-string value stream)) ; ** fix this! 
(when (and *record-discarded-correct-readings-p* (correct-reading-discarded-p token)) (write-string " " stream)) (when (and features print-features) (dolist (fl features) (when (car fl) (terpri stream) (format stream "~c\"~a\" ~(~{~a~^ ~}~)" #\Tab (car fl) (code-features (cdr fl)))))) (terpri stream)) ((insignificant-token-p token) (when print-special-tokens-p #+ignore-yet(format stream "~a~{ ~a~}~%" value attributes) (format stream "~a~%" attributes))))) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil)) sentence) #+test (let* ((*cg* *nbo-cg*) (*tagger* (multi-tagger *cg*))) (print (code-features #*00000000000010100000000000000000000001010100000000000000000000000000000101010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))) #+test (with-open-file (stream "projects:cgp;multitagger;texts;tvang.text") (reset-buffer *line-buffer*) (loop for word = (next-word stream) while word do (print word))) #+test (u:with-file-lines (line "projects:cgp;texts;mtag_tag2616.txt") (unless (char= (char line 0) #\Tab) (format t "~a~%" (string-trim "\"" line)))) ;; somewhat hacky #+mcl (defun reconstruct-text (in-path out-path) (let ((char-count 0) (no-space nil) (left-quote nil) (stream (make-instance 'string-output-stream)) (sentence-end-pos 0)) (with-open-file (out-stream out-path :direction :output :if-exists :supersede :if-does-not-exist :create) (u:with-file-lines (line in-path) (cond ((string= line "") nil) ((char= (char line 0) #\Tab) (when (or (search "\"$.\" CLB" line) (search "\"$!\" CLB" line) (search "\"$?\" CLB" line)) (setf sentence-end-pos (fill-pointer (slot-value stream 'ccl::my-string))))) (t (setf line (subseq line 2 (- (length line) 2))) (cond ((string= line "$|") (setf char-count 0) (cond ((zerop sentence-end-pos) (terpri out-stream)) (t (write-line (subseq (slot-value stream 'ccl::my-string) 0 sentence-end-pos) out-stream) (terpri out-stream) (write-line (subseq (slot-value stream 'ccl::my-string) sentence-end-pos) out-stream) (terpri out-stream) (setf (fill-pointer (slot-value stream 'ccl::my-string)) 0))) #+ignore (write-line (get-output-stream-string stream) out-stream) (setf sentence-end-pos 0)) ((and (char= (char line 0) #\$) (find (char line 1) "([{")) (incf char-count) (write-char #\Space stream) (setf no-space t) (write-string (subseq line 1) stream)) ((string= line "$-") (incf char-count 2) (write-string " -" stream)) ((string= line "$\"") (setf left-quote (not left-quote)) (incf char-count) (cond (left-quote (write-string " \"" stream) (setf no-space t)) (t (write-char #\" stream)))) ((char= (char line 0) #\$) (incf char-count) (write-string (subseq line 1) stream)) (t (setf line (remove-stars line)) (when (> (incf char-count (1+ (length line))) 80) (setf sentence-end-pos 0 char-count (length line) no-space t) ;line break (write-line (get-output-stream-string stream) out-stream) #+ignore (terpri stream)) (if no-space (setf no-space nil) (write-char #\Space stream)) (write-string line stream)))))) (write-line (get-output-stream-string stream) out-stream) nil))) (defun remove-stars (word) (let ((star-found-p nil) (prev-star-p nil)) (loop for c across word for i from 0 do (cond ((char= c #\*) (setf star-found-p t prev-star-p t)) (prev-star-p (setf prev-star-p nil 
(char word i) (char-upcase c))) (t nil))) (if star-found-p (remove #\* word) word))) #+test (let ((*analyser-lexicon* (compound-analyser *nbo-tagger*))) (print (most-probable-compound-last-chunk "arkskriver"))) #+test (let ((*analyser-lexicon* (compound-analyser *nbo-tagger*))) (cgp::mt "En liten laksebolle.")) #+test (let ((*tagger* *nbo-tagger*) (*analyser-lexicon* (compound-analyser *nbo-tagger*))) (tag-compound "laksebolle")) :eof