;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-

;; Copyright (C) Paul Meurer 2001 - 2007. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
;;-------------------------------------------------------------------------------------
;; TO DO:
;;-------------------------------------------------------------------------------------

(in-package :cgp)

(use-package :zebu)
(use-package :lxml)

(eval-when (:compile-toplevel :load-toplevel :execute)
  (#-sbcl defconstant #+sbcl defparameter +non-word-punctuation-chars+
    '(#\; #\/ #\! #\? #\"
      #\left_double_quotation_mark #\right_double_quotation_mark
      #\left_single_quotation_mark #\right_single_quotation_mark
      #\left-pointing_double_angle_quotation_mark #\right-pointing_double_angle_quotation_mark
      #\ethiopic_full_stop #\ethiopic_comma #\ethiopic_semicolon #\ethiopic_colon
      #\ethiopic_preface_colon #\ethiopic_question_mark #\ethiopic_paragraph_separator
      #\ethiopic_wordspace))
  (#-sbcl defconstant #+sbcl defparameter +non-word-chars+
    (list* #\Newline #\Linefeed #\Return #\Tab +non-word-punctuation-chars+))
  (#-sbcl defconstant #+sbcl defparameter +punctuation-chars+
    (list* #\. #\: #\, #\Tab +non-word-punctuation-chars+)))

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser (concat "fifi" (string #\ethiopic_comma) " lolo")
                  :grammar (find-grammar "plain-tokenizer")))))

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser "3.4" :grammar (find-grammar "plain-tokenizer")))))

#+test
(print (zebu::grammar-identifier-start-chars-V (find-grammar "plain-tokenizer")))

#+test
(compile-tokenizer *plain-tokenizer* :force-p t)

;;(format t "[\\-~{~c~}]+" cgp::+punctuation-chars+)

(defparameter *plain-tokenizer* (make-instance 'plain-tokenizer))

#+test
(compile-tokenizer *plain-tokenizer*)

(defmethod compile-tokenizer ((tokenizer plain-tokenizer) &key force-p)
  (when (or force-p (not (find-grammar "plain-tokenizer")))
    (let ((*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb")
          (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file))))

(defparameter *pretokenized-tokenizer* (make-instance 'pretokenized-tokenizer))

(defmethod sentence-add-token ((sentence sentence) (tokenizer pretokenized-tokenizer))
  "appends token(s) from stream and returns the last token added"
  (with-slots (stream) sentence
    (with-slots (token-delimiter attributes-delimiter insignificant-token-fn string-mappings)
        tokenizer
      (let ((token-string
             (if (char= token-delimiter #\Newline)
                 (read-next-line stream)
                 (with-output-to-string (str-stream)
                   ;; preliminary
                   (loop for c = (read-char stream nil nil)
                         while (and c (char/= c token-delimiter))
                         do (write-char c str-stream))))))
        (setf token-string (gethash token-string string-mappings token-string))
        (cond ((or (null token-string) (eq token-string :eof))
               nil)
              ((funcall insignificant-token-fn token-string)
               (add-token sentence :insignificant :attributes token-string)
               (sentence-add-token sentence tokenizer))
              ((stringp token-string)
               (destructuring-bind (token-value . token-attributes)
                   (string-parse token-string :whitespace (list attributes-delimiter))
                 (add-token sentence token-value :attributes token-attributes)))
              (t
               (add-token sentence token-string)))))))

;;; -------------------------------- XML-tokenizer -------------------------------------

(defclass xml-tokenizer (tokenizer)
  ((zebu-grammar-name :initform "xml-parser" :initarg :zebu-grammar-name :reader zebu-grammar-name)
   (encoding :initform :utf-8 :initarg :encoding :accessor encoding)
   (stack :initform () :accessor xml-tokenizer-stack)
   (language-stack :initform () :accessor xml-language-stack)
   (tumbler :initform () :accessor xml-tokenizer-tumbler)
   ;; list of simplified path expressions indicating which elements to include in tagging.
   ;; () means all.
   (include-path :initform () :initarg :include-path :reader include-path)
   ;; list of simplified path expressions indicating which elements to exclude in tagging
   (exclude-path :initform () :initarg :exclude-path :reader exclude-path)
   ;; list of simplified path expressions indicating which tokens to mark as insignificant in tagging
   (insignificant-path :initform () :initarg :insignificant-path :reader insignificant-path)
   (parsed-token :initform nil :accessor parsed-token)
   (word-tokens :initform () :accessor word-tokens)
   (sentence-delimiter-elements :initform nil :initarg :sentence-delimiter-elements
                                :accessor sentence-delimiter-elements)
   (paragraph-delimiter-elements :initform nil :initarg :paragraph-delimiter-elements
                                 :accessor paragraph-delimiter-elements)
   (headline-delimiter-elements :initform nil :initarg :headline-delimiter-elements
                                :accessor headline-delimiter-elements)
   (in-sentence-elements :initform nil :initarg :in-sentence-elements :accessor in-sentence-elements)
   (entity-list :initform () :initarg :entity-list :accessor entity-list)
   (normalized-value-function :initform nil :initarg :normalized-value-function
                              :accessor normalized-value-function)
   (error :initform nil :accessor tokenizer-error)))

(defmethod initialize-instance :after ((tokenizer xml-tokenizer) &key &allow-other-keys)
  (compile-tokenizer tokenizer))

(defmethod compile-tokenizer ((tokenizer xml-tokenizer) &key force-p)
  (when (or force-p (not (find-grammar "xml-parser" #+ignore "xml-tree-walker")))
    (lxml::compile-xml-parser)
    #+ignore
    (let ((*package* (find-package :xml))
          (*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:xml;xml-tree-walker.zb")
          (tab-file "projects:xml;xml-tree-walker.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file)))
  (when (or force-p (not (find-grammar "plain-tokenizer")))
    (let ((*package* (find-package :xml))
          (*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb")
          (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file))))

;; locally bind to the function that digests the token
#+moved ;; to xml-parser.lisp
(defvar *token-function* #'print)

#+moved ;; to xml-parser.lisp
(defun lxml::add-token (token)
  (funcall *token-function* token))

;; fix this!
(defun lxml::add-plain-token (token &optional type)
  (funcall *token-function* token type))

(defun add-plain-token (token &optional type)
  (funcall *token-function* token type))

;; this is preliminary!!
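;; Illustrative sketch: how an XML-TOKENIZER is typically driven via DISAMBIGUATE-STREAM
;; with include/exclude path filtering, following the #+test calls further down in this
;; file. The file name and the :|note| element used here are placeholders, not real
;; project data.
#+test
(with-open-file (stream "projects:cgp;texts;sample.xml")
  (disambiguate-stream 'xml-tokenizer stream
                       :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)
                                             :include-path (:|text|)
                                             :exclude-path (:|note|))
                       :tagging-niveau :morphological-disambiguation
                       :cg (gethash "nbo" *cg-table*)
                       :print-function (lambda (s &key stream &allow-other-keys)
                                         (print-sentence s :print-rules nil
                                                         :expand-tokens-p t
                                                         :stream stream))))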
(defun token-included-p (stack include-path exclude-path)
  (and (or (null include-path)
           (loop for path in include-path thereis (find path stack)))
       (not (loop for path in exclude-path thereis (find path stack)))))

#+test
(print (token-included-p '(:a :b :c) '(:b) '(:a)))

#+test
(compile-tokenizer *plain-tokenizer* :force-p t)

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser "ha-ha — +3-4 2,3 hoho 1.2.1998 3/4 jaja/23 gaga�á��¢ lolo"
                  ;": Men det er jo "
                  ;;"Dette er en setning."
                  :grammar (find-grammar "plain-tokenizer")))))

;; not needed
(defun get-word-positions (word-list string)
  (collecting
    (labels ((get-position (word-list pos)
               (when word-list
                 (let ((pos (search (car word-list) string :start2 pos)))
                   (collect pos)
                   (get-position (cdr word-list)
                                 (+ pos (length (car word-list))))))))
      (get-position word-list 0))))

#+test
(compile-tokenizer (make-instance 'xml-tokenizer) :force-p t)

#+test
(print (get-word-positions '("Dette" "er" "en" "setning" ".") "Dette er en setning."))

;;(format t "~:[~;~a~]" 3 3)
;;(format t "~@[~a:~]" 3)
;;(format t "~{~d~^ ~}~@[:~d~]" '(1 2 0) 0)
;;(unintern 'add-token :cgp)

;; should make this obsolete!
(defun %translate-language-code (code-string)
  (cond ((null code-string) nil)
        ((string-equal code-string "nob") :nbo)
        ((string-equal code-string "nno") :nny)
        (t (intern (string-upcase code-string) :keyword))))

#+ignore
(lxml::compile-xml-parser)

(defun decode (string encoding &optional entity-list)
  (case encoding
    (:utf-8 (utf-8-decode string t entity-list))
    (otherwise ;; preliminary
     (if entity-list
         (decode-entities string entity-list)
         string))))

#+test
(print (decode-entities "gaga&name;fifi" '("name" "lolo")))

(defun decode-entities (string external-entities)
  (with-output-to-string (stream)
    (let ((length (length string))
          (pos 0))
      (labels ((decode ()
                 (unless (= pos length)
                   (let* ((char (char string pos)))
                     (cond ((char= char #\&)
                            (let* ((ent-end (position #\; string :start (1+ pos)))
                                   ;; missing: hex entities &#x...;
                                   (ent (subseq string (1+ pos) ent-end)))
                              (incf pos (- ent-end pos -1))
                              (let ((char (entity-to-char ent)))
                                (cond (char
                                       (write-char (entity-to-char ent) stream))
                                      (t
                                       (let ((ent-res (cadr (member ent external-entities :test #'string=))))
                                         (cond (ent-res
                                                (write-string ent-res stream))
                                               (t
                                                ;; if the entity can't be resolved, write the plain entity
                                                (write-char #\& stream)
                                                (write-string ent stream)
                                                (write-char #\; stream)))))))))
                           (t
                            (incf pos)
                            (write-char char stream))))
                   (decode))))
        (decode)))))

;; destructive!
(defun decode-xml-token (xml-token encoding &optional entity-list)
  (when (or encoding entity-list)
    (loop for tail on xml-token
          when (stringp (car tail))
          do (setf (car tail) (decode (car tail) encoding entity-list))))
  xml-token)

#+test
(print (decode-xml-token '(:%STRING 4413 "&priimek;") nil '("ime" "priimek")))

#+test
(print (corpus::update-document (corpus::get-document corpus::*tigrinya-corpus* "hadas-ertra-01")
                                :force-p t))

;;(defvar *parser-error* nil)

;; TO DO: for ASK: extract normalized-value from orth of preceding sic element
(defmethod sentence-add-token ((sentence sentence) (tokenizer xml-tokenizer))
  "appends token(s) from stream and returns the last token added"
  (with-slots (stream) sentence
    (with-slots (encoding insignificant-token-fn sentence-delimiter-elements
                 paragraph-delimiter-elements headline-delimiter-elements
                 stack language-stack tumbler include-path exclude-path insignificant-path
                 word-tokens entity-list normalized-value-function) tokenizer
      (let ((word+pos (pop word-tokens))
            (pos nil))
(cond (word+pos (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :language (find-if-not #'null language-stack) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :insignificant-p (loop for path in insignificant-path thereis (find path stack)) #+ignore(format nil "~{~d~^ ~}~@[:~d~]" (reverse tumbler) (cdr word+pos))) (unless word-tokens (incf (car tumbler))))) (t (let ((xml-token nil)) #+allegro (mp::without-scheduling (mp:process-wait "waiting-for-token" ;; #'parsed-token tokenizer (lambda () (or (tokenizer-error tokenizer) (parsed-token tokenizer)))) (setf xml-token (parsed-token tokenizer))) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) ;;(let ((xml-token (parsed-token tokenizer))) (when (tokenizer-error tokenizer) ;; catch error from parse thread and signal in disambiguate-stream() (error (tokenizer-error tokenizer))) (setf (parsed-token tokenizer) nil) (unless (eq xml-token :eof) (destructuring-bind (tag-type xml-file-pos content . attributes) xml-token (cond ((eq tag-type :%xml-declaration) ;; read out character encoding (let ((enc (getf attributes :|encoding|))) (when enc (let ((enc (intern (string-upcase enc) :keyword))) (ecase enc ((:utf-8 :iso-8859-1) (setf encoding enc)))))) (sentence-add-token sentence tokenizer)) ((eq tag-type :%string) (cond ((token-included-p stack include-path exclude-path) (setf pos xml-file-pos) (setf word-tokens (collecting (let ((*token-function* (lambda (token type) #+debug(print (list (map 'list (lambda (c) c) token) type)) (let ((decoded-token token #+old(decode token encoding))) (case type ((:word :number :brace) (collect (cons decoded-token pos)) #+debug(print (cons token encoding)) (incf pos (length (if (eq encoding :utf-8) (utf-8-encode token) token)))) (:punctuation ;; group equal chars together (let ((start 0) (start-pos pos)) (loop for i from 0 for c across decoded-token do (when (char/= c (char decoded-token start)) (let ((chunk (subseq decoded-token start i))) (collect (cons chunk pos)) (incf pos (length (if (eq encoding :utf-8) (utf-8-encode chunk) chunk))) (setf start i)))) (if (zerop start) (collect (cons decoded-token pos)) (collect (cons (subseq decoded-token start) pos))) (setf pos (+ start-pos (length (if (eq encoding :utf-8) (utf-8-encode token) token)))))) (otherwise #+debug(print (list :token token :length (length token))) (incf pos #+old(length token) (length (if (eq encoding :utf-8) (utf-8-encode token) token))))))))) ;; old: we decode entities immediately, but postpone eventual UTF-8 ;; decoding until plain-tokenizer has run. 
#+debug(print (list :xml-token xml-token)) #+old(decode-xml-token xml-token nil entity-list) (decode-xml-token xml-token encoding entity-list) #+debug (print (list :plain (caddr xml-token))) (read-parser (caddr xml-token) ;;(replace-quotes (caddr xml-token)) :grammar (find-grammar "plain-tokenizer"))))) (let ((word+pos (pop word-tokens))) ;; return (cond (word+pos #+debug(unless (cdr word+pos) (print (list :no-pos word+pos))) (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :language (find-if-not #'null language-stack) :insignificant-p (loop for path in insignificant-path thereis (find path stack))) (unless word-tokens (incf (car tumbler))))) (t (when tumbler (incf (car tumbler))) (add-token sentence :newline :position pos))))) (t (decode-xml-token xml-token encoding) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (when tumbler (incf (car tumbler))) (sentence-add-token sentence tokenizer)))) ((eq tag-type :%end) (unless (eq content (car stack)) (error "Mismatch between parsed elt and elt on stack: ~s : ~s" content stack)) (pop stack) (pop tumbler) (pop language-stack) (when tumbler (incf (car tumbler))) ;; return also when a paragraph or headline delimiter is reached (cond ((and (token-included-p stack include-path exclude-path) (or (find content paragraph-delimiter-elements) (find content headline-delimiter-elements))) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) (t (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))) ((find content sentence-delimiter-elements) (decode-xml-token xml-token encoding) (push 0 tumbler) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (push content stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) ((find tag-type '(:%empty :%comment)) (decode-xml-token xml-token encoding) (incf (car tumbler)) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)) (t (decode-xml-token xml-token encoding) (push 0 tumbler) (push content stack) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))))))))))) #+old (defmethod sentence-add-token ((sentence sentence) (tokenizer xml-tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (with-slots (encoding insignificant-token-fn sentence-delimiter-elements paragraph-delimiter-elements headline-delimiter-elements stack language-stack tumbler include-path exclude-path word-tokens entity-list normalized-value-function) tokenizer (let ((word+pos (pop word-tokens)) (pos nil)) (cond (word+pos (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :language (find-if-not #'null language-stack) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) #+ignore(format nil "~{~d~^ ~}~@[:~d~]" (reverse tumbler) (cdr word+pos))) (unless word-tokens (incf (car tumbler))))) (t #+allegro (mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) (let ((xml-token (parsed-token tokenizer))) (setf (parsed-token 
tokenizer) nil) (unless (eq xml-token :eof) (destructuring-bind (tag-type xml-file-pos content . attributes) xml-token (cond ((eq tag-type :%xml-declaration) ;; read out character encoding (let ((enc (getf attributes :|encoding|))) (when enc (let ((enc (intern (string-upcase enc) :keyword))) (ecase enc ((:utf-8 :iso-8859-1) (setf encoding enc)))))) (sentence-add-token sentence tokenizer)) ((eq tag-type :%string) (cond ((token-included-p stack include-path exclude-path) (setf pos xml-file-pos) (setf word-tokens (collecting (let ((*token-function* (lambda (token type) #+debug(print (list token type)) (let ((decoded-token (decode token encoding))) (case type ((:word :number :brace) (collect (cons decoded-token pos)) (incf pos (length token))) (:punctuation ;; group equal chars together (let ((start 0)) (loop for i from 0 for c across decoded-token do (when (char/= c (char decoded-token start)) (let ((chunk (subseq decoded-token start i))) (collect (cons chunk pos)) ;; fix this: increment should be taken rel. to token, ;; not decoded-token! (incf pos (length (utf-8-encode chunk))) ;; fixme: works only for utf-8!! ;;(print (list :start (- pos (utf-8-encode chunk)) :pos pos :chunk chunk :enc (utf-8-encode chunk))) ;; (incf pos (- i start)) ;; ******** old (setf start i)))) (if (zerop start) (progn #+debug(print (list :token token :length (length token))) (collect (cons decoded-token pos))) (collect (cons (subseq decoded-token start) pos))) (incf pos (- (length token) start)))) (otherwise #+debug(print (list :token token :length (length token))) (incf pos (length token)))))))) ;; we decode entities immediately, but postpone eventual UTF-8 ;; decoding until plain-tokenizer has run. #+debug(print (list :xml-token xml-token)) (decode-xml-token xml-token nil entity-list) #+debug (print (list :plain (caddr xml-token))) (read-parser (replace-quotes (caddr xml-token)) :grammar (find-grammar "plain-tokenizer"))))) (let ((word+pos (pop word-tokens))) ;; return (cond (word+pos #+debug(unless (cdr word+pos) (print (list :no-pos word+pos))) (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :language (find-if-not #'null language-stack)) (unless word-tokens (incf (car tumbler))))) (t (when tumbler (incf (car tumbler))) (add-token sentence :newline :position pos))))) (t (decode-xml-token xml-token encoding) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (when tumbler (incf (car tumbler))) (sentence-add-token sentence tokenizer)))) ((eq tag-type :%end) (unless (eq content (car stack)) (error "Mismatch between parsed elt and elt on stack: ~s : ~s" content stack)) (pop stack) (pop tumbler) (pop language-stack) (when tumbler (incf (car tumbler))) ;; return also when a paragraph or headline delimiter is reached (cond ((and (token-included-p stack include-path exclude-path) (or (find content paragraph-delimiter-elements) (find content headline-delimiter-elements))) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) (t (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))) ((find content sentence-delimiter-elements) (decode-xml-token xml-token encoding) (push 0 tumbler) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (push content stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) ((find 
tag-type '(:%empty :%comment)) (decode-xml-token xml-token encoding) (incf (car tumbler)) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)) (t (decode-xml-token xml-token encoding) (push 0 tumbler) (push content stack) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))))))))))) ;; replace non-8 bit quotes by 8 bit quotes (defun replace-quotes (utf-8-str) ;; utf-8-str #+ignore (subst-substrings utf-8-str (list (utf-8-encode (string #\left_double_quotation_mark)) (utf-8-encode (string #\" #+ignore #\left-pointing_double_angle_quotation_mark)) (utf-8-encode (string #\right_double_quotation_mark)) (utf-8-encode (string #\" #+ignore #\right-pointing_double_angle_quotation_mark))))) #+test (compile-tokenizer (make-instance 'xml-tokenizer) :force-p t) #+test (let ((*package* (find-package :xml)) (*token-function* #'print)) (zebu::xml-file-parser "projects:semantic-mirrors;corpus;MA1T-4.xml" :grammar (find-grammar "xml-tree-walker") :verbose nil)) #+test (let ((*package* (find-package :xml)) (*token-function* #'print)) (zebu::xml-file-parser "projects:semantic-mirrors;corpus;MA1T-4.xml" :grammar (find-grammar "xml-tree-walker") :verbose nil)) ;;(print (zb::read-parser "— Dette er en setning." :grammar (find-grammar "xml-parser"))) #+test (compile-tokenizer *plain-tokenizer*) #+test (let ((*package* (find-package :xml))) (compile-tokenizer *xml-tokenizer*)) ;;(print (find-grammar "xml-tree-walker")) #+test (with-open-file (stream "projects:semantic-mirrors;corpus;MA1T-4.xml") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream stream)))) #+ignore (lxml::compile-xml-parser) #+test (with-open-file (stream "/usr/local/cwb/corpora/ask/0067TB_KT.xml") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream stream)))) ;; *sentence* #|| (xml::compile-xml-parser) (let ((xml::*serialize* :xml-file-pos)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "~/test.xml" :grammar (find-grammar "xml-parser") :verbose nil)) (let ((*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;test.zb") (tab-file "projects:cgp;multi-tagger;test.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file) (read-parser "gaga" :grammar (find-grammar "test") :junk-allowed nil) ) (progn (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (read-parser "Hei. Dette er en liten test, men i dag er den 2.7.1999. 3 4." 
:grammar (find-grammar "plain-tokenizer") :junk-allowed nil))) (progn (compile-tokenizer *xml-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:mlcd;texts;norm-1736.xml" :grammar (find-grammar "xml-tokenizer") :verbose nil)))) (print (find-grammar "xml-tokenizer")) (time (progn (compile-tokenizer *plain-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:cgp;training;delkorp.txt" :grammar (find-grammar "plain-tokenizer") :verbose nil))))) (defparameter *tokenizer-grammar* (find-grammar "plain-tokenizer")) (with-input-from-string (stream "Dette er en liten text.") (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (zebu::file-parser-aux stream #'error nil (find-grammar "plain-tokenizer") nil))) (print (find-grammar "plain-tokenizer")) (print (find-grammar "xml-parser")) (let ((*tokenizer* *pretokenized-tokenizer*) (*terminate-on-following-insignificant-token-p* t) (*merge-hyphenated-words-p* nil) (*lookup-unknown-in-nny-lexicon-p* t)) (with-open-file (out-stream "projects:cgp;texts;magnhild.dis" :direction :output :if-exists :supersede :if-does-not-exist :create) (with-open-file (stream "projects:cgp;texts;magnhild") (disambiguate-stream *tokenizer* stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream out-stream)))))) (with-open-file (stream "projects:cgp;texts;magnhild-test") (disambiguate-stream 'pretokenized-tokenizer stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream *standard-output*)))) (let ((xml::*serialize* t)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "/usr/local/cwb/corpora/nsb/documents/xml/enpc/nonfict/nbo/ROB1T.xml" :grammar (find-grammar "xml-parser") :verbose nil)) (let ((xml::*serialize* t)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "projects:cgp;texts;text.prov" :grammar (find-grammar "xml-parser") :verbose nil)) (with-open-file (out-stream "projects:cgp;texts;text-prov.out" :direction :output :if-exists :supersede) (with-open-file (stream "projects:cgp;texts;text.prov") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs ;;'(:sentence-delimiter-elements (:|s|)) '(:sentence-delimiter-elements (:|s| :|S|) :include-path (:|text| :|TEXT|) :exclude-path ()) :tagging-niveau :morphological-disambiguation :cg (gethash "nny" *cg-table*) :print-function (lambda (s) #+ignore (print s) (print-sentence-xml s :print-rules nil :print-sentence-elts-p nil :word-elt :WORD :fresh-line-before-word-p t :expand-tokens-p t :stream out-stream ))))) ||# :eof
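;; Illustrative expectations for %TRANSLATE-LANGUAGE-CODE, derived from its definition
;; above: the Norwegian codes "nob"/"nno" map to the internal :nbo/:nny language
;; keywords, anything else is interned as an upcased keyword, and NIL passes through.
#+test
(print (list (%translate-language-code "nob")   ; => :NBO
             (%translate-language-code "nno")   ; => :NNY
             (%translate-language-code "eng")   ; => :ENG
             (%translate-language-code nil)))   ; => NIL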