;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*-
;;
;; Copyright (C) Paul Meurer 1999 - 2004. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
(in-package :cgp)
(setf (logical-pathname-translations "oslo-tekster")
'(("**;*.*" "/home/kristinh/tekster/**/*.*")))
(setf (logical-pathname-translations "till-tekster")
'(("**;*.*" "/home/till/tekster/**/*.*")))
(setf (logical-pathname-translations "helgeh-tekster")
'(("**;*.*" "/home/helgeh/tekster/**/*.*")))
#+old
(defparameter *oslo-corpus-dirs* '("oslo-corpus:bm;aviser;txt;*.renset"
"oslo-corpus:bm;sakprosa;txt;*.renset"
"oslo-corpus:bm;skj-litt;txt;*.renset"))
#+old
(defparameter *oslo-corpus-dirs* '("tekster:**;*.xml"
"tekster:**;*.txt"
"tekster:**;*.renset"))
(defparameter *corpus-dirs* '(:oslo ("oslo-tekster:**;*.xml"
"oslo-tekster:**;*.txt"
"oslo-tekster:**;*.renset")
:till ("till-tekster:**;*.xml"
"till-tekster:**;*.txt")
:helgeh ("helgeh-tekster:**;*.xml"
"helgeh-tekster:**;*.txt")))
#+test
(print (directory "tekster:**;*.txt"))
#+test
(print (namestring #p"oslo-corpus:bm;gaga.lolo"))
#+test
(print (pathname-directory #p"oslo-corpus:bm;fifi;gaga.lolo"))
(defun map-corpus-dirs (fun corpus-dirs)
(dolist (dir corpus-dirs)
(dolist (in-file (directory dir))
(let ((out-file (concat (namestring in-file) ".syn")))
(unless (probe-file out-file)
(funcall fun in-file out-file))))))
#+test
(print (merge-pathnames "fifi.lolo" #p"/home/kristinh/tekster/v01_no/turist/11.no.txt"))
#+test
(map-corpus-dirs
(lambda (in-file out-file dir)
(print (list in-file out-file)))
*oslo-corpus-dirs*)
;; (defparameter *break-tagging-p* nil)
(defparameter *tag-processes* (list :oslo nil :till nil :helgeh nil))
(defparameter *log-strings* (list :oslo "" :till "" :helgeh ""))
#+test
(with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede)
#m(?xml :version "1.0" :encoding "utf-8" :standalone "yes")
#m(TAGGER-PARAMETERS
(SENTENCE-DELIMITER-ELEMENTS
(ELEMENT "s")
(ELEMENT "S"))
(INCLUDE-PATH
(ELEMENT "text")
(ELEMENT "TEXT"))
(EXCLUDE-PATH)))
#+test
(with-open-file (stream "/home/kristinh/tekster/xml/tagger-parameters.xml" :direction :output :if-exists :supersede)
#m(tagger-parameters
(sentence-delimiter-elements)
(include-path
(element "rota"))
(exclude-path)))
#+test
((:TAGGER-PARAMETERS)
((:SENTENCE-DELIMITER-ELEMENTS)
((:ELEMENT) "s")
((:ELEMENT) "S"))
((:INCLUDE-PATH)
((:ELEMENT) "text")
((:ELEMENT) "TEXT"))
((:EXCLUDE-PATH)))
#+test
(print (parse-tagger-parameters #p"/home/kristinh/tekster/xml/tagger-parameters.xml"))
#+test
(print (parse-tagger-parameters #p"/home/paul/temp.xml"))
(defun parse-tagger-parameters (file)
(unless (find-grammar "xml-parser")
(lxml::compile-xml-parser))
(let* ((xml (cadar (zebu::xml-file-parser (print file)
:grammar (find-grammar "xml-parser")
:verbose nil)))
(parameters ()))
(dolist (params (cdr xml))
(when (listp params)
(destructuring-bind ((key) . values) params
(setf (getf parameters (intern (string-upcase key) :keyword))
(mapcar (lambda (val-elt) (intern (print (cadr val-elt)) :keyword)) values)))))
(print parameters)))
;; main routine
(defun start-corpus-tagger (&key user print-lc-features)
(unless (getf *tag-processes* user)
(setf (getf *tag-processes* user)
(mp:process-run-function "tag-corpus"
(lambda ()
(let ((*package* (find-package :cgp)))
(setf (getf *log-strings* user) "")
(setf (getf *log-strings* user) (format nil "Starting at ~a.~%" (now)))
(map-corpus-dirs
(lambda (in-file out-file)
(let ((param-file (merge-pathnames "tagger-parameters.xml" in-file)))
(unless (equal param-file in-file)
(handler-case
(progn
(format *standard-output* "~%
Tagging ~a ... " in-file)
(setf (getf *log-strings* user)
(concat (getf *log-strings* user) (format nil "~%
Tagging ~a ... " in-file)))
(let* ((path-components (pathname-directory in-file))
(*package* :cgp))
(if (find "xml" path-components :test #'equal)
(with-open-file (in-stream in-file :direction :input)
(with-open-file (out-stream (concat out-file ".tmp")
:direction :output :if-exists :supersede
:if-does-not-exist :create)
(disambiguate-stream 'xml-tokenizer in-stream
:tokenizer-initargs
(if (probe-file param-file)
(parse-tagger-parameters param-file)
'(:sentence-delimiter-elements (:|s| :|S|)
:include-path (:|text| :|TEXT|)
:exclude-path ()))
:tagging-niveau (if (find "synt" path-components :test #'equal)
:syntactic-disambiguation
:morphological-disambiguation)
:cg (if (find "nny" path-components :test #'equal)
(gethash "nny" *cg-table*)
(gethash "nbo" *cg-table*))
:print-function
(lambda (s &key token-print-fn)
(declare (ignore token-print-fn))
(print-sentence-xml s
:print-rules nil
:print-sentence-elts-p t ;;nil
:word-elt :|word|
:fresh-line-before-word-p t
:expand-tokens-p t
:print-lc-features print-lc-features
:stream out-stream)))))
(disambiguate-file in-file (concat out-file ".tmp")
:tagging-niveau :syntactic-disambiguation
:cg (if (find "nny" path-components :test #'equal)
(gethash "nny" *cg-table*)
(gethash "nbo" *cg-table*))
#+ignore(newest-cg :nbo)
:print-function
(lambda (s &key stream token-print-fn)
(declare (ignore token-print-fn))
(print-sentence s
:stream stream
:print-features t
:print-rules nil
:print-attributes t
:print-lc-features print-lc-features
:print-special-tokens-p nil
:expand-tokens-p nil)))))
(rename-file (concat out-file ".tmp") out-file)
(setf (getf *log-strings* user)
(concat (getf *log-strings* user) (format nil "done (~a)." (now)))))
(error (cond)
(setf (getf *log-strings* user) (concat (getf *log-strings* user) (format nil "~%
Error: ~a
" cond))))))))
(getf *corpus-dirs* user)))
(stop-corpus-tagger :user user))))))
(defun stop-corpus-tagger (&key user)
(when (getf *tag-processes* user)
(mp::process-kill (getf *tag-processes* user))
(setf (getf *tag-processes* user) nil)
(with-open-file (log-stream (ecase user
(:till "till-tekster:tagger.log")
(:oslo "oslo-tekster:tagger.log")
(:helgeh "helgeh-tekster:tagger.log")
)
:direction :output :if-exists :append :if-does-not-exist :create)
(write-string (getf *log-strings* user) log-stream))
(setf (getf *log-strings* user) "")))
#+test
(start-corpus-tagger :user :till)
#+test
(print (getf *tag-processes* :till))
#+test
(stop-corpus-tagger :user :till)
(defmethod tag-corpus-page ((request http-request) entity &key user-key)
(with-html-response (request entity stream (user update stop start print-lc-features))
(let ((user-key (or user-key (intern (string-upcase user) :keyword))))
(cond (update
nil)
(stop
(stop-corpus-tagger :user user-key))
(start
(start-corpus-tagger :user user-key :print-lc-features print-lc-features)))
#m(html
(head (title "Tagg korpuset"))
(body
((form:method "post")
(input/ :type "hidden" :name "user" :value #S(string-downcase (symbol-name user-key)))
(p (input/ :type "submit" :name "start" :value "Start taggingen"))
(p (input/ :type "submit" :name "stop" :value "Avbryt taggingen"))
(p (input/ :type "submit" :name "update" :value "Oppdater"))
(p ((input :type "checkbox" :name "print-lc-features") "legg til LC-trekk")))
(br/)
(nobr #L(write-string (getf *log-strings* user-key) stream)))))))
(publish :path "/cl/cgp/tag-oslo-corpus.html"
:content-type "text/html"
:authorizer *authorizer*
:function (lambda (request entity) (tag-corpus-page request entity :user-key :oslo)))
(publish :path "/cl/cgp/tag-till-corpus.html"
:content-type "text/html"
:authorizer *authorizer*
:function (lambda (request entity) (tag-corpus-page request entity :user-key :till)))
(publish :path "/cl/cgp/tag-helgeh-corpus.html"
:content-type "text/html"
:authorizer *authorizer*
:function (lambda (request entity) (tag-corpus-page request entity :user-key :helgeh)))
#+test
(print (probe-file "lisp:projects;cgp;texts;vaartl-1.renset"))
#+test
(print (probe-file "/Users/paul/lisp/projects/cgp/texts/vaartl-1.renset"))
#+test
(print (directory "lisp:projects;cgp;texts;vaartl_*.renset"))
;;; EOF