;;;   -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*-
;;
;; Copyright (C) Paul Meurer 1999 - 2004. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
(in-package :cgp)
(setf (logical-pathname-translations "oslo-tekster")
  '(("**;*.*" "/home/kristinh/tekster/**/*.*")))
(setf (logical-pathname-translations "till-tekster")
      '(("**;*.*" "/home/till/tekster/**/*.*")))
(setf (logical-pathname-translations "helgeh-tekster")
      '(("**;*.*" "/home/helgeh/tekster/**/*.*")))
#+old
(defparameter *oslo-corpus-dirs* '("oslo-corpus:bm;aviser;txt;*.renset"
				  "oslo-corpus:bm;sakprosa;txt;*.renset"
				  "oslo-corpus:bm;skj-litt;txt;*.renset"))
#+old
(defparameter *oslo-corpus-dirs* '("tekster:**;*.xml"
				   "tekster:**;*.txt"
				   "tekster:**;*.renset"))
(defparameter *corpus-dirs* '(:oslo ("oslo-tekster:**;*.xml"
				     "oslo-tekster:**;*.txt"
				     "oslo-tekster:**;*.renset")
			      :till ("till-tekster:**;*.xml"
				     "till-tekster:**;*.txt")
			      :helgeh ("helgeh-tekster:**;*.xml"
				     "helgeh-tekster:**;*.txt")))
#+test
(print (directory "tekster:**;*.txt"))
#+test
(print (namestring #p"oslo-corpus:bm;gaga.lolo"))
#+test
(print (pathname-directory #p"oslo-corpus:bm;fifi;gaga.lolo"))
(defun map-corpus-dirs (fun corpus-dirs)
  (dolist (dir corpus-dirs)
    (dolist (in-file (directory dir))
      (let ((out-file (concat (namestring in-file) ".syn")))
	(unless (probe-file out-file)
	  (funcall fun in-file out-file))))))
#+test
(print (merge-pathnames "fifi.lolo" #p"/home/kristinh/tekster/v01_no/turist/11.no.txt"))
   
#+test
(map-corpus-dirs
 (lambda (in-file out-file dir)
   (print (list in-file out-file)))
 *oslo-corpus-dirs*)
;; (defparameter *break-tagging-p* nil)
(defparameter *tag-processes* (list :oslo nil :till nil :helgeh nil))
(defparameter *log-strings* (list :oslo "" :till "" :helgeh ""))
#+test
(with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede)
  #m(?xml :version "1.0" :encoding "utf-8" :standalone "yes")
  #m(TAGGER-PARAMETERS
     (SENTENCE-DELIMITER-ELEMENTS
      (ELEMENT "s")
      (ELEMENT "S"))
     (INCLUDE-PATH 
      (ELEMENT "text")
      (ELEMENT "TEXT"))
     (EXCLUDE-PATH)))
#+test
(with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede)
  #m(tagger-parameters
     (sentence-delimiter-elements)
     (include-path 
      (element "rota"))
     (exclude-path)))
#+test
((:TAGGER-PARAMETERS)
 ((:SENTENCE-DELIMITER-ELEMENTS)
  ((:ELEMENT) "s")
  ((:ELEMENT) "S"))
 ((:INCLUDE-PATH)
  ((:ELEMENT) "text")
  ((:ELEMENT) "TEXT"))
 ((:EXCLUDE-PATH)))
#+test
(print (parse-tagger-parameters #p"/home/kristinh/tekster/xml/tagger-parameters.xml"))
#+test
(print (parse-tagger-parameters #p"/home/paul/temp.xml"))
(defun parse-tagger-parameters (file)
  (unless (find-grammar "xml-parser")
    (lxml::compile-xml-parser))
  (let* ((xml (cadar (zebu::xml-file-parser (print file)
					    :grammar (find-grammar "xml-parser")
					    :verbose nil)))
	 (parameters ()))
    (dolist (params (cdr xml))
      (when (listp params)
	(destructuring-bind ((key) . values) params
	  (setf (getf parameters (intern (string-upcase key) :keyword))
	    (mapcar (lambda (val-elt) (intern (print (cadr val-elt)) :keyword)) values)))))
    (print parameters)))
;; main routine
(defun start-corpus-tagger (&key user print-lc-features)
  (unless (getf *tag-processes* user)
    (setf (getf *tag-processes* user)
      (mp:process-run-function "tag-corpus"
	(lambda ()
	  (let ((*package* (find-package :cgp)))
	    (setf (getf *log-strings* user) "")
	    (setf (getf *log-strings* user) (format nil "Starting at ~a.~%" (now)))
	    (map-corpus-dirs
	     (lambda (in-file out-file)
	       (let ((param-file (merge-pathnames "tagger-parameters.xml" in-file)))
		 (unless (equal param-file in-file)
		   (handler-case
		       (progn
			 (format *standard-output* "~%
Tagging ~a ... " in-file)
			 (setf (getf *log-strings* user)
			   (concat (getf *log-strings* user) (format nil "~%
Tagging ~a ... " in-file)))
			 (let* ((path-components (pathname-directory in-file))
				(*package* :cgp))
			   (if (find "xml" path-components :test #'equal)
			       (with-open-file (in-stream in-file :direction :input)
				 (with-open-file (out-stream (concat out-file ".tmp")
						  :direction :output :if-exists :supersede
						  :if-does-not-exist :create)
				   (disambiguate-stream 'xml-tokenizer in-stream
							:tokenizer-initargs
							(if (probe-file param-file)
							    (parse-tagger-parameters param-file)
							  '(:sentence-delimiter-elements (:|s| :|S|)
							    :include-path (:|text| :|TEXT|)
							    :exclude-path ()))
							:tagging-niveau (if (find "synt" path-components :test #'equal)
									    :syntactic-disambiguation
									    :morphological-disambiguation)
							:cg (if (find "nny" path-components :test #'equal)
								(gethash "nny" *cg-table*)
							      (gethash "nbo" *cg-table*))
							:print-function
							(lambda (s &key token-print-fn)
							  (declare (ignore token-print-fn))
							  (print-sentence-xml s
									      :print-rules nil
									      :print-sentence-elts-p t ;;nil
									      :word-elt :|word|
									      :fresh-line-before-word-p t
									      :expand-tokens-p t
									      :print-lc-features print-lc-features
									      :stream out-stream)))))
			     (disambiguate-file in-file (concat out-file ".tmp")
						:tagging-niveau :syntactic-disambiguation
						:cg (if (find "nny" path-components :test #'equal)
							(gethash "nny" *cg-table*)
						      (gethash "nbo" *cg-table*))
						#+ignore(newest-cg :nbo)
						:print-function
						(lambda (s &key stream token-print-fn)
						  (declare (ignore token-print-fn))
						  (print-sentence s
								  :stream stream
								  :print-features t
								  :print-rules nil
								  :print-attributes t
								  :print-lc-features print-lc-features
								  :print-special-tokens-p nil
								  :expand-tokens-p nil)))))
			 (rename-file (concat out-file ".tmp") out-file)
			 (setf (getf *log-strings* user)
			   (concat (getf *log-strings* user) (format nil "done (~a)." (now)))))
		     (error (cond)
		       (setf (getf *log-strings* user) (concat (getf *log-strings* user) (format nil "~%
Error: ~a
" cond))))))))
	     (getf *corpus-dirs* user)))
	  (stop-corpus-tagger :user user))))))
(defun stop-corpus-tagger (&key user)
  (when (getf *tag-processes* user)
    (mp::process-kill (getf *tag-processes* user))
    (setf (getf *tag-processes* user) nil)
    (with-open-file (log-stream (ecase user
				  (:till "till-tekster:tagger.log")
				  (:oslo "oslo-tekster:tagger.log")
				  (:helgeh "helgeh-tekster:tagger.log")
				  )     
		     :direction :output :if-exists :append :if-does-not-exist :create)
      (write-string (getf *log-strings* user) log-stream))
    (setf (getf *log-strings* user) "")))
#+test
(start-corpus-tagger :user :till)
#+test
(print (getf *tag-processes* :till))
#+test
(stop-corpus-tagger :user :till)
(defmethod tag-corpus-page ((request http-request) entity &key user-key)
  (with-html-response (request entity stream (user update stop start print-lc-features))
    (let ((user-key (or user-key (intern (string-upcase user) :keyword))))
      (cond (update
	     nil)
	    (stop
	     (stop-corpus-tagger :user user-key))
	    (start
	     (start-corpus-tagger :user user-key :print-lc-features print-lc-features)))
    #m(html
       (head (title "Tagg korpuset"))
       (body
	((form:method "post")
	 (input/ :type "hidden" :name "user" :value #S(string-downcase (symbol-name user-key)))
	 (p (input/ :type "submit" :name "start" :value "Start taggingen"))
	 (p (input/ :type "submit" :name "stop" :value "Avbryt taggingen"))
	 (p (input/ :type "submit" :name "update" :value "Oppdater"))
	 (p ((input :type "checkbox" :name "print-lc-features") "legg til LC-trekk")))
	(br/)
	(nobr #L(write-string (getf *log-strings* user-key) stream)))))))
(publish :path "/cl/cgp/tag-oslo-corpus.html"
	 :content-type "text/html"
	 :authorizer *authorizer*
	 :function (lambda (request entity) (tag-corpus-page request entity :user-key :oslo)))
(publish :path "/cl/cgp/tag-till-corpus.html"
	 :content-type "text/html"
	 :authorizer *authorizer*
	 :function (lambda (request entity) (tag-corpus-page request entity :user-key :till)))
(publish :path "/cl/cgp/tag-helgeh-corpus.html"
	 :content-type "text/html"
	 :authorizer *authorizer*
	 :function (lambda (request entity) (tag-corpus-page request entity :user-key :helgeh)))
#+test
(print (probe-file "lisp:projects;cgp;texts;vaartl-1.renset"))
#+test
(print (probe-file "/Users/paul/lisp/projects/cgp/texts/vaartl-1.renset"))
#+test
(print (directory "lisp:projects;cgp;texts;vaartl_*.renset"))
;;; EOF