;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-

;; Copyright (C) Paul Meurer 2001 - 2007. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
;;-------------------------------------------------------------------------------------
;; TO DO:
;;-------------------------------------------------------------------------------------

(in-package :cgp)

(use-package :zebu)
(use-package :lxml)

(eval-when (:compile-toplevel :load-toplevel :execute)
  (#-sbcl defconstant #+sbcl defparameter +non-word-punctuation-chars+
    '(#\; #\/ #\! #\? #\"
      #\left_double_quotation_mark #\right_double_quotation_mark
      #\left_single_quotation_mark #\right_single_quotation_mark
      #\left-pointing_double_angle_quotation_mark #\right-pointing_double_angle_quotation_mark
      #\ethiopic_full_stop #\ethiopic_comma #\ethiopic_semicolon #\ethiopic_colon
      #\ethiopic_preface_colon #\ethiopic_question_mark #\ethiopic_paragraph_separator
      #\ethiopic_wordspace))
  (#-sbcl defconstant #+sbcl defparameter +non-word-chars+
    (list* #\Newline #\Linefeed #\Return #\Tab +non-word-punctuation-chars+))
  (#-sbcl defconstant #+sbcl defparameter +punctuation-chars+
    (list* #\. #\: #\, #\Tab +non-word-punctuation-chars+)))

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser (concat "fifi" (string #\ethiopic_comma) " lolo")
                  :grammar (find-grammar "plain-tokenizer")))))

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser "3.4" :grammar (find-grammar "plain-tokenizer")))))

#+test
(print (zebu::grammar-identifier-start-chars-V (find-grammar "plain-tokenizer")))

#+test
(compile-tokenizer *plain-tokenizer* :force-p t)

;;(format t "[\\-~{~c~}]+" cgp::+punctuation-chars+)

(defparameter *plain-tokenizer* (make-instance 'plain-tokenizer))

#+test
(compile-tokenizer *plain-tokenizer*)

(defmethod compile-tokenizer ((tokenizer plain-tokenizer) &key force-p)
  (when (or force-p (not (find-grammar "plain-tokenizer")))
    (let ((*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb")
          (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file))))

(defparameter *pretokenized-tokenizer* (make-instance 'pretokenized-tokenizer))

(defmethod sentence-add-token ((sentence sentence) (tokenizer pretokenized-tokenizer))
  "appends token(s) from stream and returns the last token added"
  (with-slots (stream) sentence
    (with-slots (token-delimiter attributes-delimiter insignificant-token-fn string-mappings)
        tokenizer
      (let ((token-string
             (if (char= token-delimiter #\Newline)
                 (read-next-line stream)
                 (with-output-to-string (str-stream)
                   ;; preliminary
                   (loop for c = (read-char stream nil nil)
                         while (and c (char/= c token-delimiter))
                         do (write-char c str-stream))))))
        (setf token-string (gethash token-string string-mappings token-string))
        (cond ((or (null token-string) (eq token-string :eof))
               nil)
              ((funcall insignificant-token-fn token-string)
               (add-token sentence :insignificant :attributes token-string)
               (sentence-add-token sentence tokenizer))
              ((stringp token-string)
               (destructuring-bind (token-value . token-attributes)
                   (string-parse token-string :whitespace (list attributes-delimiter))
                 (add-token sentence token-value :attributes token-attributes)))
              (t
               (add-token sentence token-string)))))))

;;; -------------------------------- XML-tokenizer -------------------------------------

(defclass xml-tokenizer (tokenizer)
  ((zebu-grammar-name :initform "xml-parser" :initarg :zebu-grammar-name :reader zebu-grammar-name)
   (encoding :initform :utf-8 :initarg :encoding :accessor encoding)
   (stack :initform () :accessor xml-tokenizer-stack)
   (language-stack :initform () :accessor xml-language-stack)
   (tumbler :initform () :accessor xml-tokenizer-tumbler)
   ;; list of simplified path expressions indicating which elements to include in tagging.
   ;; () means all.
   (include-path :initform () :initarg :include-path :reader include-path)
   ;; list of simplified path expressions indicating which elements to exclude in tagging
   (exclude-path :initform () :initarg :exclude-path :reader exclude-path)
   ;; list of simplified path expressions indicating which tokens to mark as insignificant in tagging
   (insignificant-path :initform () :initarg :insignificant-path :reader insignificant-path)
   (parsed-token :initform nil :accessor parsed-token)
   (word-tokens :initform () :accessor word-tokens)
   (sentence-delimiter-elements :initform nil :initarg :sentence-delimiter-elements
                                :accessor sentence-delimiter-elements)
   (paragraph-delimiter-elements :initform nil :initarg :paragraph-delimiter-elements
                                 :accessor paragraph-delimiter-elements)
   (headline-delimiter-elements :initform nil :initarg :headline-delimiter-elements
                                :accessor headline-delimiter-elements)
   (in-sentence-elements :initform nil :initarg :in-sentence-elements :accessor in-sentence-elements)
   (entity-list :initform () :initarg :entity-list :accessor entity-list)
   (normalized-value-function :initform nil :initarg :normalized-value-function
                              :accessor normalized-value-function)
   (error :initform nil :accessor tokenizer-error)))

(defmethod initialize-instance :after ((tokenizer xml-tokenizer) &key &allow-other-keys)
  (compile-tokenizer tokenizer))

(defmethod compile-tokenizer ((tokenizer xml-tokenizer) &key force-p)
  (when (or force-p (not (find-grammar "xml-parser" #+ignore "xml-tree-walker")))
    (lxml::compile-xml-parser)
    #+ignore
    (let ((*package* (find-package :xml))
          (*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:xml;xml-tree-walker.zb")
          (tab-file "projects:xml;xml-tree-walker.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file)))
  (when (or force-p (not (find-grammar "plain-tokenizer")))
    (let ((*package* (find-package :xml))
          (*warn-conflicts* t)
          (*allow-conflicts* t)
          (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb")
          (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab"))
      (zebu-compile-file zb-file :output-file tab-file)
      (zebu-load-file tab-file))))

;; locally bind to the function that digests the token
#+moved ;; to xml-parser.lisp
(defvar *token-function* #'print)

#+moved ;; to xml-parser.lisp
(defun lxml::add-token (token)
  (funcall *token-function* token))

;; fix this!
(defun lxml::add-plain-token (token &optional type)
  (funcall *token-function* token type))

(defun add-plain-token (token &optional type)
  (funcall *token-function* token type))

;; this is preliminary!!
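;; Illustrative sketch: how an XML-TOKENIZER is typically driven via DISAMBIGUATE-STREAM
;; with include/exclude path filtering, following the #+test calls further down in this
;; file. The file name and the :|note| element used here are placeholders, not real
;; project data.
#+test
(with-open-file (stream "projects:cgp;texts;sample.xml")
  (disambiguate-stream 'xml-tokenizer stream
                       :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)
                                             :include-path (:|text|)
                                             :exclude-path (:|note|))
                       :tagging-niveau :morphological-disambiguation
                       :cg (gethash "nbo" *cg-table*)
                       :print-function (lambda (s &key stream &allow-other-keys)
                                         (print-sentence s :print-rules nil
                                                         :expand-tokens-p t
                                                         :stream stream))))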
(defun token-included-p (stack include-path exclude-path)
  (and (or (null include-path)
           (loop for path in include-path thereis (find path stack)))
       (not (loop for path in exclude-path thereis (find path stack)))))

#+test
(print (token-included-p '(:a :b :c) '(:b) '(:a)))

#+test
(compile-tokenizer *plain-tokenizer* :force-p t)

#+test
(print
 (collecting
   (let ((*token-function* (lambda (token type) (collect (cons token type)))))
     (read-parser "ha-ha — +3-4 2,3 hoho 1.2.1998 3/4 jaja/23 gaga�á��¢ lolo"
                  ;": Men det er jo "
                  ;;"Dette er en setning."
                  :grammar (find-grammar "plain-tokenizer")))))

;; not needed
(defun get-word-positions (word-list string)
  (collecting
    (labels ((get-position (word-list pos)
               (when word-list
                 (let ((pos (search (car word-list) string :start2 pos)))
                   (collect pos)
                   (get-position (cdr word-list)
                                 (+ pos (length (car word-list))))))))
      (get-position word-list 0))))

#+test
(compile-tokenizer (make-instance 'xml-tokenizer) :force-p t)

#+test
(print (get-word-positions '("Dette" "er" "en" "setning" ".") "Dette er en setning."))

;;(format t "~:[~;~a~]" 3 3)
;;(format t "~@[~a:~]" 3)
;;(format t "~{~d~^ ~}~@[:~d~]" '(1 2 0) 0)
;;(unintern 'add-token :cgp)

;; should make this obsolete!
(defun %translate-language-code (code-string)
  (cond ((null code-string) nil)
        ((string-equal code-string "nob") :nbo)
        ((string-equal code-string "nno") :nny)
        (t (intern (string-upcase code-string) :keyword))))

#+ignore
(lxml::compile-xml-parser)

(defun decode (string encoding &optional entity-list)
  (case encoding
    (:utf-8 (utf-8-decode string t entity-list))
    (otherwise ;; preliminary
     (if entity-list
         (decode-entities string entity-list)
         string))))

#+test
(print (decode-entities "gaga&name;fifi" '("name" "lolo")))

(defun decode-entities (string external-entities)
  (with-output-to-string (stream)
    (let ((length (length string))
          (pos 0))
      (labels ((decode ()
                 (unless (= pos length)
                   (let* ((char (char string pos)))
                     (cond ((char= char #\&)
                            (let* ((ent-end (position #\; string :start (1+ pos)))
                                   ;; missing: hex entities &#x...;
                                   (ent (subseq string (1+ pos) ent-end)))
                              (incf pos (- ent-end pos -1))
                              (let ((char (entity-to-char ent)))
                                (cond (char
                                       (write-char (entity-to-char ent) stream))
                                      (t
                                       (let ((ent-res (cadr (member ent external-entities :test #'string=))))
                                         (cond (ent-res
                                                (write-string ent-res stream))
                                               (t
                                                ;; if the entity can't be resolved, write the plain entity
                                                (write-char #\& stream)
                                                (write-string ent stream)
                                                (write-char #\; stream)))))))))
                           (t
                            (incf pos)
                            (write-char char stream))))
                   (decode))))
        (decode)))))

;; destructive!
(defun decode-xml-token (xml-token encoding &optional entity-list)
  (when (or encoding entity-list)
    (loop for tail on xml-token
          when (stringp (car tail))
          do (setf (car tail) (decode (car tail) encoding entity-list))))
  xml-token)

#+test
(print (decode-xml-token '(:%STRING 4413 "&priimek;") nil '("ime" "priimek")))

#+test
(print (corpus::update-document (corpus::get-document corpus::*tigrinya-corpus* "hadas-ertra-01")
                                :force-p t))

;;(defvar *parser-error* nil)

;; TO DO: for ASK: extract normalized-value from orth of preceding sic element
(defmethod sentence-add-token ((sentence sentence) (tokenizer xml-tokenizer))
  "appends token(s) from stream and returns the last token added"
  (with-slots (stream) sentence
    (with-slots (encoding insignificant-token-fn sentence-delimiter-elements
                 paragraph-delimiter-elements headline-delimiter-elements
                 stack language-stack tumbler include-path exclude-path insignificant-path
                 word-tokens entity-list normalized-value-function) tokenizer
      (let ((word+pos (pop word-tokens))
            (pos nil))
(cond (word+pos (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :language (find-if-not #'null language-stack) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :insignificant-p (loop for path in insignificant-path thereis (find path stack)) #+ignore(format nil "~{~d~^ ~}~@[:~d~]" (reverse tumbler) (cdr word+pos))) (unless word-tokens (incf (car tumbler))))) (t (let ((xml-token nil)) #+allegro (mp::without-scheduling (mp:process-wait "waiting-for-token" ;; #'parsed-token tokenizer (lambda () (or (tokenizer-error tokenizer) (parsed-token tokenizer)))) (setf xml-token (parsed-token tokenizer))) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) ;;(let ((xml-token (parsed-token tokenizer))) (when (tokenizer-error tokenizer) ;; catch error from parse thread and signal in disambiguate-stream() (error (tokenizer-error tokenizer))) (setf (parsed-token tokenizer) nil) (unless (eq xml-token :eof) (destructuring-bind (tag-type xml-file-pos content . attributes) xml-token (cond ((eq tag-type :%xml-declaration) ;; read out character encoding (let ((enc (getf attributes :|encoding|))) (when enc (let ((enc (intern (string-upcase enc) :keyword))) (ecase enc ((:utf-8 :iso-8859-1) (setf encoding enc)))))) (sentence-add-token sentence tokenizer)) ((eq tag-type :%string) (cond ((token-included-p stack include-path exclude-path) (setf pos xml-file-pos) (setf word-tokens (collecting (let ((*token-function* (lambda (token type) #+debug(print (list (map 'list (lambda (c) c) token) type)) (let ((decoded-token token #+old(decode token encoding))) (case type ((:word :number :brace) (collect (cons decoded-token pos)) #+debug(print (cons token encoding)) (incf pos (length (if (eq encoding :utf-8) (utf-8-encode token) token)))) (:punctuation ;; group equal chars together (let ((start 0) (start-pos pos)) (loop for i from 0 for c across decoded-token do (when (char/= c (char decoded-token start)) (let ((chunk (subseq decoded-token start i))) (collect (cons chunk pos)) (incf pos (length (if (eq encoding :utf-8) (utf-8-encode chunk) chunk))) (setf start i)))) (if (zerop start) (collect (cons decoded-token pos)) (collect (cons (subseq decoded-token start) pos))) (setf pos (+ start-pos (length (if (eq encoding :utf-8) (utf-8-encode token) token)))))) (otherwise #+debug(print (list :token token :length (length token))) (incf pos #+old(length token) (length (if (eq encoding :utf-8) (utf-8-encode token) token))))))))) ;; old: we decode entities immediately, but postpone eventual UTF-8 ;; decoding until plain-tokenizer has run. 
#+debug(print (list :xml-token xml-token)) #+old(decode-xml-token xml-token nil entity-list) (decode-xml-token xml-token encoding entity-list) #+debug (print (list :plain (caddr xml-token))) (read-parser (caddr xml-token) ;;(replace-quotes (caddr xml-token)) :grammar (find-grammar "plain-tokenizer"))))) (let ((word+pos (pop word-tokens))) ;; return (cond (word+pos #+debug(unless (cdr word+pos) (print (list :no-pos word+pos))) (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :language (find-if-not #'null language-stack) :insignificant-p (loop for path in insignificant-path thereis (find path stack))) (unless word-tokens (incf (car tumbler))))) (t (when tumbler (incf (car tumbler))) (add-token sentence :newline :position pos))))) (t (decode-xml-token xml-token encoding) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (when tumbler (incf (car tumbler))) (sentence-add-token sentence tokenizer)))) ((eq tag-type :%end) (unless (eq content (car stack)) (error "Mismatch between parsed elt and elt on stack: ~s : ~s" content stack)) (pop stack) (pop tumbler) (pop language-stack) (when tumbler (incf (car tumbler))) ;; return also when a paragraph or headline delimiter is reached (cond ((and (token-included-p stack include-path exclude-path) (or (find content paragraph-delimiter-elements) (find content headline-delimiter-elements))) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) (t (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))) ((find content sentence-delimiter-elements) (decode-xml-token xml-token encoding) (push 0 tumbler) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (push content stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) ((find tag-type '(:%empty :%comment)) (decode-xml-token xml-token encoding) (incf (car tumbler)) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)) (t (decode-xml-token xml-token encoding) (push 0 tumbler) (push content stack) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))))))))))) #+old (defmethod sentence-add-token ((sentence sentence) (tokenizer xml-tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (with-slots (encoding insignificant-token-fn sentence-delimiter-elements paragraph-delimiter-elements headline-delimiter-elements stack language-stack tumbler include-path exclude-path word-tokens entity-list normalized-value-function) tokenizer (let ((word+pos (pop word-tokens)) (pos nil)) (cond (word+pos (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :language (find-if-not #'null language-stack) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) #+ignore(format nil "~{~d~^ ~}~@[:~d~]" (reverse tumbler) (cdr word+pos))) (unless word-tokens (incf (car tumbler))))) (t #+allegro (mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) (let ((xml-token (parsed-token tokenizer))) (setf (parsed-token 
tokenizer) nil) (unless (eq xml-token :eof) (destructuring-bind (tag-type xml-file-pos content . attributes) xml-token (cond ((eq tag-type :%xml-declaration) ;; read out character encoding (let ((enc (getf attributes :|encoding|))) (when enc (let ((enc (intern (string-upcase enc) :keyword))) (ecase enc ((:utf-8 :iso-8859-1) (setf encoding enc)))))) (sentence-add-token sentence tokenizer)) ((eq tag-type :%string) (cond ((token-included-p stack include-path exclude-path) (setf pos xml-file-pos) (setf word-tokens (collecting (let ((*token-function* (lambda (token type) #+debug(print (list token type)) (let ((decoded-token (decode token encoding))) (case type ((:word :number :brace) (collect (cons decoded-token pos)) (incf pos (length token))) (:punctuation ;; group equal chars together (let ((start 0)) (loop for i from 0 for c across decoded-token do (when (char/= c (char decoded-token start)) (let ((chunk (subseq decoded-token start i))) (collect (cons chunk pos)) ;; fix this: increment should be taken rel. to token, ;; not decoded-token! (incf pos (length (utf-8-encode chunk))) ;; fixme: works only for utf-8!! ;;(print (list :start (- pos (utf-8-encode chunk)) :pos pos :chunk chunk :enc (utf-8-encode chunk))) ;; (incf pos (- i start)) ;; ******** old (setf start i)))) (if (zerop start) (progn #+debug(print (list :token token :length (length token))) (collect (cons decoded-token pos))) (collect (cons (subseq decoded-token start) pos))) (incf pos (- (length token) start)))) (otherwise #+debug(print (list :token token :length (length token))) (incf pos (length token)))))))) ;; we decode entities immediately, but postpone eventual UTF-8 ;; decoding until plain-tokenizer has run. #+debug(print (list :xml-token xml-token)) (decode-xml-token xml-token nil entity-list) #+debug (print (list :plain (caddr xml-token))) (read-parser (replace-quotes (caddr xml-token)) :grammar (find-grammar "plain-tokenizer"))))) (let ((word+pos (pop word-tokens))) ;; return (cond (word+pos #+debug(unless (cdr word+pos) (print (list :no-pos word+pos))) (prog1 (add-token sentence (car word+pos) :position (cdr word+pos) :normalized-value (when normalized-value-function (funcall normalized-value-function sentence (last-token sentence))) :language (find-if-not #'null language-stack)) (unless word-tokens (incf (car tumbler))))) (t (when tumbler (incf (car tumbler))) (add-token sentence :newline :position pos))))) (t (decode-xml-token xml-token encoding) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (when tumbler (incf (car tumbler))) (sentence-add-token sentence tokenizer)))) ((eq tag-type :%end) (unless (eq content (car stack)) (error "Mismatch between parsed elt and elt on stack: ~s : ~s" content stack)) (pop stack) (pop tumbler) (pop language-stack) (when tumbler (incf (car tumbler))) ;; return also when a paragraph or headline delimiter is reached (cond ((and (token-included-p stack include-path exclude-path) (or (find content paragraph-delimiter-elements) (find content headline-delimiter-elements))) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) (t (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))) ((find content sentence-delimiter-elements) (decode-xml-token xml-token encoding) (push 0 tumbler) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (push content stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos)) ((find 
tag-type '(:%empty :%comment)) (decode-xml-token xml-token encoding) (incf (car tumbler)) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)) (t (decode-xml-token xml-token encoding) (push 0 tumbler) (push content stack) (push (%translate-language-code (getf attributes :|lang|)) language-stack) (add-token sentence :insignificant :attributes xml-token :position xml-file-pos) (sentence-add-token sentence tokenizer)))))))))))) ;; replace non-8 bit quotes by 8 bit quotes (defun replace-quotes (utf-8-str) ;; utf-8-str #+ignore (subst-substrings utf-8-str (list (utf-8-encode (string #\left_double_quotation_mark)) (utf-8-encode (string #\" #+ignore #\left-pointing_double_angle_quotation_mark)) (utf-8-encode (string #\right_double_quotation_mark)) (utf-8-encode (string #\" #+ignore #\right-pointing_double_angle_quotation_mark))))) #+test (compile-tokenizer (make-instance 'xml-tokenizer) :force-p t) #+test (let ((*package* (find-package :xml)) (*token-function* #'print)) (zebu::xml-file-parser "projects:semantic-mirrors;corpus;MA1T-4.xml" :grammar (find-grammar "xml-tree-walker") :verbose nil)) #+test (let ((*package* (find-package :xml)) (*token-function* #'print)) (zebu::xml-file-parser "projects:semantic-mirrors;corpus;MA1T-4.xml" :grammar (find-grammar "xml-tree-walker") :verbose nil)) ;;(print (zb::read-parser "— Dette er en setning." :grammar (find-grammar "xml-parser"))) #+test (compile-tokenizer *plain-tokenizer*) #+test (let ((*package* (find-package :xml))) (compile-tokenizer *xml-tokenizer*)) ;;(print (find-grammar "xml-tree-walker")) #+test (with-open-file (stream "projects:semantic-mirrors;corpus;MA1T-4.xml") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream stream)))) #+ignore (lxml::compile-xml-parser) #+test (with-open-file (stream "/usr/local/cwb/corpora/ask/0067TB_KT.xml") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream stream)))) ;; *sentence* #|| (xml::compile-xml-parser) (let ((xml::*serialize* :xml-file-pos)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "~/test.xml" :grammar (find-grammar "xml-parser") :verbose nil)) (let ((*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;test.zb") (tab-file "projects:cgp;multi-tagger;test.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file) (read-parser "gaga" :grammar (find-grammar "test") :junk-allowed nil) ) (progn (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (read-parser "Hei. Dette er en liten test, men i dag er den 2.7.1999. 3 4." 
:grammar (find-grammar "plain-tokenizer") :junk-allowed nil))) (progn (compile-tokenizer *xml-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:mlcd;texts;norm-1736.xml" :grammar (find-grammar "xml-tokenizer") :verbose nil)))) (print (find-grammar "xml-tokenizer")) (time (progn (compile-tokenizer *plain-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:cgp;training;delkorp.txt" :grammar (find-grammar "plain-tokenizer") :verbose nil))))) (defparameter *tokenizer-grammar* (find-grammar "plain-tokenizer")) (with-input-from-string (stream "Dette er en liten text.") (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (zebu::file-parser-aux stream #'error nil (find-grammar "plain-tokenizer") nil))) (print (find-grammar "plain-tokenizer")) (print (find-grammar "xml-parser")) (let ((*tokenizer* *pretokenized-tokenizer*) (*terminate-on-following-insignificant-token-p* t) (*merge-hyphenated-words-p* nil) (*lookup-unknown-in-nny-lexicon-p* t)) (with-open-file (out-stream "projects:cgp;texts;magnhild.dis" :direction :output :if-exists :supersede :if-does-not-exist :create) (with-open-file (stream "projects:cgp;texts;magnhild") (disambiguate-stream *tokenizer* stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream out-stream)))))) (with-open-file (stream "projects:cgp;texts;magnhild-test") (disambiguate-stream 'pretokenized-tokenizer stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream *standard-output*)))) (let ((xml::*serialize* t)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "/usr/local/cwb/corpora/nsb/documents/xml/enpc/nonfict/nbo/ROB1T.xml" :grammar (find-grammar "xml-parser") :verbose nil)) (let ((xml::*serialize* t)) #+ignore(xml::compile-xml-parser) (zb::xml-file-parser "projects:cgp;texts;text.prov" :grammar (find-grammar "xml-parser") :verbose nil)) (with-open-file (out-stream "projects:cgp;texts;text-prov.out" :direction :output :if-exists :supersede) (with-open-file (stream "projects:cgp;texts;text.prov") (disambiguate-stream 'xml-tokenizer stream :tokenizer-initargs ;;'(:sentence-delimiter-elements (:|s|)) '(:sentence-delimiter-elements (:|s| :|S|) :include-path (:|text| :|TEXT|) :exclude-path ()) :tagging-niveau :morphological-disambiguation :cg (gethash "nny" *cg-table*) :print-function (lambda (s) #+ignore (print s) (print-sentence-xml s :print-rules nil :print-sentence-elts-p nil :word-elt :WORD :fresh-line-before-word-p t :expand-tokens-p t :stream out-stream ))))) ||# :eof
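;; Illustrative expectations for %TRANSLATE-LANGUAGE-CODE, derived from its definition
;; above: the Norwegian codes "nob"/"nno" map to the internal :nbo/:nny language
;; keywords, anything else is interned as an upcased keyword, and NIL passes through.
#+test
(print (list (%translate-language-code "nob")   ; => :NBO
             (%translate-language-code "nno")   ; => :NNY
             (%translate-language-code "eng")   ; => :ENG
             (%translate-language-code nil)))   ; => NIL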