;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; Copyright (C) Paul Meurer 2001. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.0 ;;------------------------------------------------------------------------------------- ;; TO DO: ;;------------------------------------------------------------------------------------- (in-package :cgp) (use-package :zebu) (defclass tokenizer () ()) (defclass plain-tokenizer (tokenizer) ()) (defmethod initialize-instance :after ((tokenizer plain-tokenizer) &key &allow-other-keys) (compile-tokenizer tokenizer)) (defmethod compile-tokenizer ((tokenizer pretokenized-tokenizer) &key force-p) (declare (ignore force-p)) nil) (defmethod compile-tokenizer ((tokenizer plain-tokenizer) &key force-p) (when (or force-p (not (find-grammar "plain-tokenizer"))) (let ((*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb") (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file)))) (defvar *tokenizer-stream*) (defun put-token (string) (write-line string *tokenizer-stream*) (values)) (defun build-parsed-key-value-pair (key value) (list (intern (string-upcase key) :keyword) value)) ;; reads tokenized material (defclass pretokenized-tokenizer (tokenizer) ((token-delimiter :initform #\Newline :initarg :token-delimiter :reader token-delimiter) (attributes-delimiter :initform #\Tab :initarg :attributes-delimiter :reader attributes-delimiter) (insignificant-token-fn :initform (lambda (str) (and (not (zerop (length str))) (member (char str 0) '(#\< #\{)))) :initarg :insignificant-token-fn :accessor insignificant-token-fn))) (defparameter *tokenizer* (make-instance 'tokenizer)) (defparameter *plain-tokenizer* (make-instance 'plain-tokenizer)) (defparameter *pretokenized-tokenizer* (make-instance 'pretokenized-tokenizer)) (defmethod sentence-add-token ((sentence sentence) (tokenizer pretokenized-tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (with-slots (token-delimiter attributes-delimiter insignificant-token-fn) tokenizer (let ((token-string (if (char= token-delimiter #\Newline) (read-next-line stream) (with-output-to-string (str-stream) ;; preliminary (loop for c = (read-char stream nil nil) while (and c (char/= c token-delimiter)) do (write-char c str-stream)))))) (cond ((eq token-string :eof) nil) ((funcall insignificant-token-fn token-string) (add-token sentence :insignificant :attributes token-string) (sentence-add-token sentence tokenizer)) (t (destructuring-bind (token-value . token-attributes) (string-parse token-string :whitespace (list attributes-delimiter)) (add-token sentence token-value :attributes token-attributes)))))))) ;;; -------------------------------- XML-tokenizer ------------------------------------- (defclass xml-tokenizer (tokenizer) ((parsed-token :initform nil :accessor parsed-token) (word-tokens :initform () :accessor word-tokens) (sentence-delimiter-elements :initform nil :initarg :sentence-delimiter-elements :accessor sentence-delimiter-elements))) (defmethod initialize-instance :after ((tokenizer xml-tokenizer) &key &allow-other-keys) (compile-tokenizer tokenizer)) (defmethod compile-tokenizer ((tokenizer xml-tokenizer) &key force-p) (when (or force-p (not (find-grammar "xml-tree-walker"))) (let ((*package* (find-package :xml)) (*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:xml;xml-tree-walker.zb") (tab-file "projects:xml;xml-tree-walker.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file))) (when (or force-p (not (find-grammar "plain-tokenizer"))) (let ((*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb") (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file)))) ;; locally bind to the function that digests the token (defvar *token-function* #'print) (defun xml::add-token (token) (funcall *token-function* token)) (defun add-plain-token (token) (funcall *token-function* token)) (defmethod sentence-add-token ((sentence sentence) (tokenizer xml-tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (let ((word (pop (word-tokens tokenizer)))) (cond (word (add-token sentence word)) (t (with-slots (insignificant-token-fn sentence-delimiter-elements) tokenizer (mp:process-wait "waiting-for-token" #'parsed-token tokenizer) (let ((xml-token (parsed-token tokenizer))) (setf (parsed-token tokenizer) nil) (cond ((eq xml-token :eof) nil) ((stringp xml-token) (setf (word-tokens tokenizer) (collecting (let ((*token-function* (lambda (token) (collect token)))) (read-parser xml-token :grammar (find-grammar "plain-tokenizer"))))) (add-token sentence (pop (word-tokens tokenizer)))) ((and (listp xml-token) (find (car xml-token) sentence-delimiter-elements)) (add-token sentence :insignificant :attributes xml-token)) (t (add-token sentence :insignificant :attributes xml-token) (sentence-add-token sentence tokenizer)))))))))) #+test (let ((*package* (find-package :xml)) (*token-function* #'print)) (zebu::xml-file-parser "projects:semantic-mirrors;corpus;MA1T-4.xml" :grammar (find-grammar "xml-tree-walker") :verbose nil)) #+test (compile-tokenizer *plain-tokenizer*) #+test (let ((*package* (find-package :xml))) (compile-tokenizer *xml-tokenizer*)) ;;(print (find-grammar "xml-tree-walker")) #+test (with-open-file (stream "projects:semantic-mirrors;corpus;MA1T-4.xml") (disambiguate-stream 'xml-tokenizer stream *standard-output* :tokenizer-initargs '(:sentence-delimiter-elements (:|s|)) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream stream)))) ;; *sentence* #|| (let ((*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;test.zb") (tab-file "projects:cgp;multi-tagger;test.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file) (read-parser "gaga" :grammar (find-grammar "test") :junk-allowed nil) ) (progn (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (read-parser "Hei. Dette er en liten test, men i dag er den 2.7.1999. 3 4." :grammar (find-grammar "plain-tokenizer") :junk-allowed nil))) (progn (compile-tokenizer *xml-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:mlcd;texts;norm-1736.xml" :grammar (find-grammar "xml-tokenizer") :verbose nil)))) (print (find-grammar "xml-tokenizer")) (time (progn (compile-tokenizer *plain-tokenizer*) (with-open-file (stream "projects:cgp;training;zebu-test.out" :direction :output :if-exists :supersede) (let ((*tokenizer-stream* stream)) (file-parser "projects:cgp;training;delkorp.txt" :grammar (find-grammar "plain-tokenizer") :verbose nil))))) (defparameter *tokenizer-grammar* (find-grammar "plain-tokenizer")) (with-input-from-string (stream "Dette er en liten text.") (compile-tokenizer *plain-tokenizer*) (let ((*tokenizer-stream* t)) (zebu::file-parser-aux stream #'error nil (find-grammar "plain-tokenizer") nil))) (print (find-grammar "plain-tokenizer")) (let ((*tokenizer* *pretokenized-tokenizer*) (*terminate-on-following-insignificant-token-p* t) (*merge-hyphenated-words-p* nil) (*lookup-unknown-in-nny-lexicon-p* t)) (with-open-file (out-stream "projects:cgp;texts;magnhild.dis" :direction :output :if-exists :supersede :if-does-not-exist :create) (with-open-file (stream "projects:cgp;texts;magnhild") (disambiguate-stream *tokenizer* stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key stream &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream out-stream)))))) (with-open-file (stream "projects:cgp;texts;magnhild") (disambiguate-stream 'pretokenized-tokenizer stream :tagging-niveau :morphological-disambiguation :cg (gethash "nbo-mai" *cg-table*) :print-function (lambda (s &key &allow-other-keys) (print-sentence s :print-rules nil :expand-tokens-p t :stream *standard-output*)))) ||# ;;; EOF