;; -*- Mode: lisp; Syntax: ansi-common-lisp; Package: cgp; Base: 10; Readtable: augmented-readtable -*- ;; ;; Copyright (C) Paul Meurer 2002-2007. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, UNIFOB University of Bergen ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;;------------------------------------------------------------------------------------- ;; PROBLEMS: ;; ;;------------------------------------------------------------------------------------- (in-package :cgp) (defparameter *user-wordlist-table* (make-hash-table :test #'equal)) (defparameter *normalize-wordlist-table* (make-hash-table :test #'equal)) (defparameter *wordlist-directory* "projects:cgp;nets;") #+test (print (get-user-wordlist "test")) (defmethod get-user-wordlist ((tagger multi-tagger) name &key file force-reload-p (restore-p t)) (let* ((name (string-downcase name)) (file (or file (concat *wordlist-directory* name ".txt")))) (or (and (not force-reload-p) (gethash name *user-wordlist-table*)) (setf (gethash name *user-wordlist-table*) (cons name (load-wordforms tagger file :restore-p restore-p)))))) (defun get-normalize-wordlist (name &key file force-reload-p) (let ((file (or file (concat *wordlist-directory* name "-normalize.txt")))) (or (and (not force-reload-p) (gethash name *normalize-wordlist-table*)) (setf (gethash name *normalize-wordlist-table*) (cons name (if (probe-file file) (dat::load-string-tree file) (dat::make-string-tree))))))) (defun store-normalize-wordlist (name tree &key file) (let ((file (or file (concat *wordlist-directory* name "-normalize.txt")))) (dat::store-string-tree tree file))) (defmethod add-new-wordform ((tagger multi-tagger) lemma word features &key net #+not-yet overrides-p (net-accessor #'new-wordforms)) (let* ((net (or net (funcall net-accessor tagger) (cond ((eq net-accessor #'new-wordforms) (setf (new-wordforms tagger) (cons (make-instance 'string-net::list-string-net) (make-instance 'string-net::list-string-net)))) ((eq net-accessor #'names) (setf (names tagger) (cons (make-instance 'string-net::list-string-net) (make-instance 'string-net::list-string-net)))) (t (error "Accessor function ~s does not exist." net-accessor))))) (fvector (bit-vector-to-string (apply 'encode-features (feature-vector tagger) (mapcar (lambda (str) (intern (string-upcase str) :cgp)) (if (listp features) features (split features #\Space)))))) (lemma (restore-string (copy-seq lemma))) (word (restore-string (copy-seq word)))) #+debug(print (list :adding word lemma fvector)) (string-net::add-string (car net) (concat lemma ":" (string-net::compress-string word lemma) ":" fvector)) (string-net::add-string (cdr net) (concat word ":" (string-net::compress-string lemma word) #+not-yet(if overrides-p "o:" ":") ":" fvector)))) (defmethod store-wordforms ((tagger multi-tagger) net file) #-debug(print (list :file file)) (let ((*tagger* tagger)) (with-open-file (stream file :direction :output :if-exists :supersede :external-format :iso-8859-1) (nmap-strings net (lambda (string) (destructuring-bind (lemma word features) (split string #\: 3) (write-string lemma stream) (write-char #\: stream) (write-string (string-net::decompress-string word lemma) stream) (write-char #\: stream) (format stream "~(~{~a~^ ~}~)~%" (code-features (string-to-bit-vector features) (feature-vector tagger))))))))) (defmethod store-wordforms ((tagger menota-multi-tagger) net file) #-debug(print (list :file file)) (let ((*tagger* tagger)) (with-open-file (stream file :direction :output :if-exists :supersede :external-format :iso-8859-1) (nmap-strings net (lambda (string) (destructuring-bind (source mode user lemma word features) (split string #\: 6) #||(write-string lemma stream) (write-char #\: stream) (write-string (string-net::decompress-string word lemma) stream) (write-char #\: stream)||# (format stream "~a:~a:~a:~a:~a:~(~{~a~^ ~}~)~%" source mode user lemma (string-net::decompress-string word lemma) (code-features (string-to-bit-vector features) (feature-vector tagger))))))))) (defmethod load-wordforms ((tagger multi-tagger) file &key (restore-p t)) (let* ((*tagger* tagger) (net (make-instance 'string-net::list-string-net)) (inverse-net (make-instance 'string-net::list-string-net)) (net-cons (cons net inverse-net))) (when (and file (probe-file file)) (with-file-lines (line file :external-format :iso-8859-1) (let ((string (if restore-p (restore-string line) line))) (destructuring-bind (lemma word features) (split string #\: 3) (add-new-wordform tagger lemma word features :net net-cons ;; :net-accessor net-accessor ) #+ignore (when build-inverse-net-p (let ((word (decompress-string word lemma))) (string-net::add-string inverse-net (concat word ":" (string-net::compress-string lemma word) ":" features)))) #+ignore (string-net::add-string net string))))) #+ignore (if build-inverse-net-p (cons net inverse-net) net) net-cons)) (defmethod load-wordforms ((tagger menota-multi-tagger) file &key (restore-p t)) (let* ((*tagger* tagger) (net (make-instance 'string-net::list-string-net)) (inverse-net (make-instance 'string-net::list-string-net)) (net-cons (cons net inverse-net))) (when (and file (probe-file file)) (with-file-lines (line file :external-format :iso-8859-1) (let ((string (if restore-p (restore-string line) line))) (destructuring-bind (source mode user lemma word features) (split string #\: 6) (add-new-wordform tagger lemma word features :net net-cons :source source :mode mode :user user))))) net-cons)) (defmethod add-uncompressed-word-list ((tagger multi-tagger) file &key (net-accessor #'new-wordforms)) (with-file-lines (line file :external-format :iso-8859-1) (unless (not (find-if-not (lambda (c) (eq c #\Space)) line)) (destructuring-bind (lemma . features) (string-parse line :delimiter-pairs '(("\"" . "\"")) :whitespace " ") (add-new-wordform tagger (string-trim "\"" lemma) (string-trim "\"" lemma) features :net-accessor net-accessor))))) #+test (print (string-parse "\"Antigua og Barbuda\" subst prop " ;;:brace-pairs '((#\" . #\")) :delimiter-pairs '(("\"" . "\"")) ;;:whitespace " " )) #+test (print (directory "projects:cgp;nets;lists;*.txt")) #+test (let ((*tagger* *nbo-tagger*)) (with-slots (names) *tagger* (setf names nil)) (dolist (list (directory "projects:cgp;nets;lists;*.txt")) (add-uncompressed-word-list *tagger* list :net-accessor #'names))) #+test (setf ;;(new-wordforms *nbo-tagger*) (load-wordforms "projects:cgp;nets;nbo-new-wordforms.txt") (names *nbo-tagger*) (load-wordforms "projects:cgp;nets;nbo-new-names.txt") ) ;;(defparameter *nbo-new-wordforms-net* (load-wordforms "projects:cgp;nets;nbo-new-wordforms.txt")) ;;(defparameter *nny-new-wordforms-net* (load-wordforms "projects:cgp;nets;nny-new-wordforms.txt")) #+test (store-new-wordforms *nbo-new-wordforms-net* "projects:cgp;nets;nbo-new-wordforms.txt") #+test (store-new-wordforms *nny-new-wordforms-net* "projects:cgp;nets;nny-new-wordforms.txt") #+test (let ((net (car (new-wordforms *nbo-tagger*)))) (nmap-strings net #'print (string-net::string-subnet (car (new-wordforms *nbo-tagger*)) "lærgutt:"))) (defun remove-lemma (net-cons lemma) (let ((restored-lemma (restore-string (copy-seq lemma)))) remove ;; from inverse net (nmap-strings (car net-cons) (lambda (compressed-word+features) (destructuring-bind (compressed-word fvector) (split compressed-word+features #\: 2) (let ((word (decompress-string compressed-word lemma))) (string-net::remove-branch (cdr net-cons) (let ((restored-word (restore-string (copy-seq word)))) (concat restored-word ":" (string-net::compress-string restored-lemma restored-word) ":" fvector)))))) (string-net::string-subnet (car net-cons) (concat lemma ":"))) ;; remove from net (string-net::remove-branch (car net-cons) (concat restored-lemma ":")))) #+ignore (setf (new-wordforms *nbo-tagger*) (load-wordforms "nbo-new-wordforms.txt")) (defun translate-string (string) (loop for i from 0 for c across string do (setf (char string i) (string-net::translate-char c))) string) (defun restore-string (string) (loop for i from 0 for c across string do (setf (char string i) (string-net::restore-char c))) string) #+ignore (let ((count 0) (net (make-instance 'string-net::list-string-net))) (block map (nmap-strings (lexicon tagger) (lambda (string) (destructuring-bind (word compressed-lemma &optional fvector) (split string #\: 3) (let ((lemma (decompress-string compressed-lemma word))) (print (list word lemma fvector)) (string-net::add-string net (concat lemma ":" (string-net::compress-string word lemma) ":" fvector)))) (when (= (incf count) 20) (return-from map))))) (minimize-net net) (string-net::write-string-net net net-file)) #+test (string-net::print-strings *nbo-new-wordforms-net*) ;;; EOF