;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; Copyright (C) Paul Meurer 2001 - 2007. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, UNIFOB, University of Bergen ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;;------------------------------------------------------------------------------------- (in-package :cgp) (defclass tokenizer () ((sentence-start-p-fun :initform nil :initarg :sentence-start-p-fun :reader sentence-start-p-fun))) (defclass plain-tokenizer (tokenizer) ()) (defmethod initialize-instance :after ((tokenizer tokenizer) &key &allow-other-keys) (compile-tokenizer tokenizer)) (defmethod compile-tokenizer ((tokenizer tokenizer) &key force-p) (declare (ignore force-p)) nil) (defvar *tokenizer-stream*) (defun put-token (string) (write-line string *tokenizer-stream*) (values)) (defun build-parsed-key-value-pair (key value) (list (intern (string-upcase key) :keyword) value)) ;; reads tokenized material (defclass pretokenized-tokenizer (tokenizer) ((token-delimiter :initform #\Newline :initarg :token-delimiter :reader token-delimiter) (attributes-delimiter :initform #\Tab :initarg :attributes-delimiter :reader attributes-delimiter) (string-mappings :initform (make-hash-table :test #'equal) :accessor string-mappings) (insignificant-token-fn :initform (lambda (str) (and (stringp str) (not (zerop (length str))) (member (char str 0) '(#\< #\{)))) :initarg :insignificant-token-fn :accessor insignificant-token-fn))) (defmethod initialize-instance :after ((tokenizer pretokenized-tokenizer) &key string-mappings &allow-other-keys) (loop for (str replacement) on string-mappings by #'cddr do (setf (gethash str (string-mappings tokenizer)) replacement))) (defclass pretagged-tokenizer (tokenizer) ()) (defmethod sentence-add-token ((sentence sentence) (tokenizer pretagged-tokenizer)) (with-slots (cg stream) sentence (let* ((line (read-line stream nil nil))) (cond ((null line) nil) ((string= line "\"<" :end1 2) (let* ((sentence-end-p nil) (word-end (search ">\"" line)) (word (subseq line 2 word-end))) (collecting-into (readings control-features) (loop until (member (peek-char nil stream nil nil) '(#\" nil)) do (let ((parsed-string (string-parse (read-line stream nil nil) :whitespace #.(format nil " ~c" #\Tab) :delimiter-pairs '((#\" . #\"))))) (when (car parsed-string) (destructuring-bind (lemma . features) parsed-string (let ((features (mapcar (lambda (f) (intern (string-upcase f) :cgp)) features))) (when (find '<<< features) (setf sentence-end-p t)) (multiple-value-bind (feature-vector unknown-features) (apply #'encode-features features) (collect-into readings (cons (string-trim "\"" lemma) feature-vector)) (collect-into control-features unknown-features))))))) ;;(when (car control-features) (print control-features)) (add-token sentence word :token-class 'control-token :features readings :control-features control-features :used-rules '(()))) (not sentence-end-p))) (t (warn "Wrong format: line = '~a'~%" line) t))))) (defmethod build-sentence ((sentence sentence) (tokenizer pretagged-tokenizer) &optional prev-token terminator) (declare (ignore prev-token terminator)) ;;(print (cons :build-sentence sentence)) (loop while (sentence-add-token sentence tokenizer)) (values sentence (when (peek-char nil (sentence-stream sentence) nil nil) (get-sentence :stream (sentence-stream sentence) :sentence-class (class-of sentence) :cg *cg* :feature-vector (feature-vector (multi-tagger *cg*)))))) #+test (with-open-file (stream "/home/paul/test.in") (let* ((*cg* (gethash "nbo" *cg-table*)) (*tagger* (multi-tagger *cg*))) (disambiguate-stream 'pretagged-tokenizer stream :tagging-niveau :named-entity-recognition-only :print-function (lambda (s) (print-sentence s :expand-tokens-p nil)) :context-size 1000))) #+test (with-open-file (stream "/home/kristinh/sammen_ny.nvncor") (let* ((*cg* (gethash "nbo-navn" *cg-table*)) (*tagger* (multi-tagger *cg*))) (disambiguate-stream 'pretagged-tokenizer stream :tagging-niveau :named-entity-recognition-only :print-function (lambda (s) (print-sentence s :expand-tokens-p nil)) ;;:context-size 1000 ))) #+test (with-open-file (stream "projects:cgp;training;navn-test.in") (let* ((*cg* (gethash "nbo-navn" *cg-table*)) (*tagger* (multi-tagger *cg*))) (disambiguate-stream 'pretagged-tokenizer stream :tagging-niveau :named-entity-recognition-only :print-function (lambda (s) (print-sentence s :expand-tokens-p nil)) ;;:context-size 1000 ))) ;; Doesn't really work for ne-recognition of pre-tagged text because lc-features are not preserved. ;; (Maybe other problems too.) (defmethod disambiguate-stream ((tokenizer pretagged-tokenizer) in-stream &key (print-function #'print-sentence) mapping-function ;; (mapping-function #'map-sentences) (pretagging-niveau :morphological-disambiguation) (tagging-niveau :morphological-disambiguation) message-fn (cg *cg*) token-print-fn context-size &allow-other-keys) (let ((*cg* cg) (*tagger* (multi-tagger cg)) (*sentence-class* (if (find tagging-niveau '(:named-entity-disambiguation :syntactic-named-entity-mapping :syntactic-disambiguation-named-entity-mapping :syntactic-named-entity-disambiguation :term-extraction :term-ne-extraction :named-entity-mapping-only)) 'regexp-sentence *sentence-class*)) (*token-class* (case tagging-niveau ((:named-entity-disambiguation :syntactic-named-entity-mapping :syntactic-disambiguation-named-entity-mapping :syntactic-named-entity-disambiguation :term-extraction :term-ne-extraction) 'regexp-token) (:named-entity-mapping-only 'control-token) (otherwise *token-class*))) (*token-memory* (when context-size (make-instance 'memory :context-size context-size)))) #+debug(setf *memory* *token-memory*) (funcall (or mapping-function #'map-sentences) in-stream (lambda (s) (when (and message-fn *sentence-count* (zerop (mod *sentence-count* 1))) (funcall message-fn (format nil "~4d sentences disambiguated." *sentence-count*))) (ecase tagging-niveau ((:none :multi-tagging) s) (:morphological-disambiguation (disambiguate (initialize-sentence-array s :sort-cohorts-p nil) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence))) (:syntactic-mapping (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'map-sentence))) (:syntactic-disambiguation (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'map-sentence #'s-disambiguate-sentence (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 1)) (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 3)) #'s-disambiguate-sentence))) (:named-entity-mapping (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'named-entity-map-sentence ))) (:named-entity-disambiguation (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence ;;#'regexp-merge-sentence (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil)) #'named-entity-map-sentence #'named-entity-disambiguate-sentence #'h-named-entity-disambiguate-sentence #'named-entity-disambiguate-sentence))) (:term-extraction (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence ;;#'regexp-merge-sentence (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil :regexp-parser *term-extractor* :use-lc-features-p nil :first-uppercase-p nil :label :te)) #|| #'named-entity-map-sentence #'named-entity-disambiguate-sentence #'h-named-entity-disambiguate-sentence #'named-entity-disambiguate-sentence ||#))) (:term-ne-extraction (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil :regexp-parser *name-term-extractor* :label :te)) (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil :regexp-parser *term-extractor* :use-lc-features-p nil :first-uppercase-p nil :label :te)) #|| #'named-entity-map-sentence #'named-entity-disambiguate-sentence #'h-named-entity-disambiguate-sentence #'named-entity-disambiguate-sentence ||#))) (:syntactic-named-entity-disambiguation (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'regexp-merge-sentence #'map-sentence #'s-disambiguate-sentence (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 1)) (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 3)) #'s-disambiguate-sentence ;;#'h-s-disambiguate-sentence ;;#'h-s-disambiguate-sentence-3 (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil)) #'named-entity-map-sentence #'named-entity-disambiguate-sentence #'h-named-entity-disambiguate-sentence #'named-entity-disambiguate-sentence ))) (:syntactic-named-entity-mapping (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'regexp-merge-sentence #'map-sentence #'s-disambiguate-sentence (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 1)) (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 3)) #'s-disambiguate-sentence #'map-sentence (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil)) #'named-entity-map-sentence ))) (:syntactic-disambiguation-named-entity-mapping (disambiguate (initialize-sentence-array s) (list #'disambiguate-sentence #'h-disambiguate-sentence #'disambiguate-sentence #'regexp-merge-sentence #'map-sentence #'s-disambiguate-sentence (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 1)) (lambda (cg sentence) (h-s-disambiguate-sentence cg sentence 3)) #'s-disambiguate-sentence (lambda (cg sentence) (regexp-merge-sentence cg sentence :enqueue-sentence-p nil)) #'named-entity-map-sentence ))) (:named-entity-recognition-only (disambiguate (initialize-sentence-array s) (list #'named-entity-map-sentence #'named-entity-disambiguate-sentence #'h-named-entity-disambiguate-sentence #'named-entity-disambiguate-sentence ))))) :multitag-p nil :sentence-class *sentence-class* :print-function print-function :token-print-fn token-print-fn))) (defparameter *tokenizer* (make-instance 'tokenizer)) :eof