;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2000 - 2004. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Reimplementation in ANSI Common Lisp of the compound analyser for Norwegian (C program) ;; written by Helge Hauglin and Lars Jørgen Tvedt, UiO 1999 ;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO ;; (Dokumentasjonsprosjektet; Tekstlaboratoriet) ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;; - CLOSify (partially done) ;; - get rid of ranking-info structure, lazy eval instead ;; - rewrite the whole thing as a fsa! ;; ;; QUESTIONS: ;; - why aren't numbers lemmata? ;;------------------------------------------------------------------------------------- (in-package "CGP") (defconstant $alpha-base 36) (defvar *code-table* (make-hash-table)) (defparameter *analyser-version* nil) (defvar *%strings* (make-array 0 :adjustable t :fill-pointer t)) (defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t)) (defun %vector-pop (vector) (let* ((fill (fill-pointer vector))) (declare (fixnum fill)) (unless (zerop fill) (decf fill) (setf (fill-pointer vector) fill) (aref vector fill)))) #+copy (defmacro %with-string ((string) &body body) `(let ((,string (or (%vector-pop *%strings*) (make-array 0 :element-type #+mcl 'base-character #-mcl 'character :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,string) 0) (vector-push-extend ,string *%strings*)))) #+copy (defmacro %with-byte-array ((array) &body body) `(let ((,array (or (%vector-pop *%byte-arrays*) (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,array) 0) (vector-push-extend ,array *%byte-arrays*)))) (defun vowelp (c) (find c "aeiouyæøå")) (defun long-chunk-p (chunk) (let* ((first-vowel-group-pos (position-if #'vowelp chunk)) (middel-cons-group-pos (when first-vowel-group-pos (position-if-not #'vowelp chunk :start first-vowel-group-pos)))) (when middel-cons-group-pos (find-if #'vowelp chunk :start middel-cons-group-pos)))) (defun codes (chunk) (cddr chunk)) (defun chunk (chunk) (cadr chunk)) (defun chunk-and-codes (chunk) (cdr chunk)) ;(analyse-compound "femogsytti") #+test (let ((*tagger* *nbo-tagger*)) (print (analyse-compound "fattig-Norge"))) #+test (let ((*tagger* *nbo-tagger*)) #-test (print (analyse-compound "kratere")) (print-ranked-analyses "kratere")) ;; sjekk_ledd (defun check-chunk (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and (all-p #'adjective-p codes) (not (zerop (length analysis))) (all-p #'adverb-p (codes (aref analysis 0)))) nil) ((and last-p (or (all-p #'adverb-p codes) (all-p #'code-98x-p codes))) nil) ((and (not last-p) (not (find-if #'previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) (defun all-have-features-p (codes &rest features) (not (find-if-not (lambda (code) (has-features-p code features)) codes))) (defun one-has-features-p (codes &rest features) (find-if (lambda (code) (has-features-p code features)) codes)) (defun check-chunk-features (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and ;;(all-p #'adjective-p codes) (all-have-features-p codes 'adj) (not (zerop (length analysis))) ;;(all-p #'adverb-p (codes (aref analysis 0))) (all-have-features-p (codes (aref analysis 0)) 'adv)) nil) ((and last-p (or (all-have-features-p codes 'adv) ;; (all-p #'adverb-p codes) #+later(all-p #'code-98x-p codes) )) nil) ((and (not last-p) (not (find-if #'f-previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) ;; alfa2decimal (defun alpha-value (char) (cond ((char<= #\0 char #\9) (- (char-code char) (char-code #\0))) ((char<= #\A char #\Z) (- (+ (char-code char) 10) (char-code #\A))) ((char<= #\a char #\z) (- (+ (char-code char) 10) (char-code #\a))) (t nil))) (defun alpha-to-decimal (alpha) (let ((code 0) (form nil)) (loop with colon = nil for c across alpha do (cond ((char= c #\:) (setf colon t form 0)) (colon (setf form (+ (* form 10) (alpha-value c)))) (t (setf code (+ (* code $alpha-base) (alpha-value c)))))) (values code form))) ;; the inverse: (multiple-value-call #'decimal-to-alpha (alpha-to-decimal x)) == x (defun decimal-to-alpha (code form) (let ((char-list ())) (labels ((walk (code) (unless (zerop code) (let ((cd (mod code $alpha-base))) (if (< cd 10) (push (code-char (+ (char-code #\0) cd)) char-list) (push (code-char (+ (char-code #\a) -10 cd)) char-list)) (walk (floor code 36)))))) (walk code) (u:concat (coerce char-list 'string) ":" (write-to-string form))))) (defun read-code-file (file) (clrhash *code-table*) (let ((intervals ())) (u:with-file-lines (line file) #+(or :pcl :unix)(setf line (string-right-trim (string (code-char 13)) line)) (unless (or (zerop (length line)) (char= (char line 0) #\#)) (destructuring-bind (cat-str alpha-code-start alpha-code-end &optional form-start form-end) (u:string-parse line :whitespace " ") (let ((class (intern (nsubstitute #\- #\_ cat-str) :keyword)) (code-start (alpha-to-decimal alpha-code-start)) (code-end (alpha-to-decimal alpha-code-end))) (cond (form-start (push (list code-start code-end class (parse-integer form-start) (parse-integer form-end)) intervals) (push (list code-start code-end (parse-integer form-start) (parse-integer form-end)) (gethash class *code-table*))) (t (push (list code-start code-end class) intervals) (push (list code-start code-end) (gethash class *code-table*)))))))) (sort intervals (lambda (x y) (if (= (car x) (car y)) (> (cadr x) (cadr y)) (< (car x) (car y))))))) (defvar *code-intervals* (read-code-file "projects:cgp;multi-tagger;fuge-code.tab")) (defun code-to-class (alpha-codes) (let ((class-list ())) (dolist (alpha-code alpha-codes) (multiple-value-bind (code form) (alpha-to-decimal alpha-code) (labels ((iterate (intervals) (destructuring-bind (start end class &optional form-start form-end) (car intervals) (when (<= start code) (when (and (<= code end) (or (null form) (null form-start) (<= form-start form form-end))) (pushnew class class-list)) (iterate (cdr intervals)))))) (iterate *code-intervals*)))) class-list)) (defun code-in-class-p (class alpha-code) (if (symbolp alpha-code) ; fix for junctures (eq class alpha-code) (multiple-value-bind (code form) (alpha-to-decimal alpha-code) (let ((intervals (gethash class *code-table*))) (when (find-if (lambda (interval) (and (<= (car interval) code (cadr interval)) (or (null form) (null (cddr interval)) (<= (caddr interval) form (cadddr interval))))) intervals) t))))) (defun e-juncture-p (codes) (if (null *analyser-version*) (code-in-class-p :e-juncture (car codes)) (when (find-if (lambda (code) (and (listp code) (find :e-juncture code))) codes) t))) (defun s-juncture-p (codes) ;;(Print (list :codes codes)) (if (null *analyser-version*) (code-in-class-p :s-juncture (car codes)) (when (find-if (lambda (code) (and (listp code) (find :s-juncture code))) codes) t))) (defun juncture-p (codes) (or (e-juncture-p codes) (s-juncture-p codes))) ;;(defun unknown-p (code) (code-in-class-p :unknown code)) (defun verb-p (code) (code-in-class-p :verb code)) (defun noun-p (code) (code-in-class-p :substantiv code)) (defun real-noun-p (code) (code-in-class-p :ekte-substantiv code)) (defun adjective-p (code) (code-in-class-p :adjektiv code)) (defun adverb-p (code) (code-in-class-p :adverb code)) (defun norv-participle-p (code) (code-in-class-p :norv-partisipp code)) (defun norsv-participle-p (code) (code-in-class-p :norsv-partisipp code)) (defun superlative-p (code) (code-in-class-p :superlativ code)) ; ?? (defun norv-p (code) (code-in-class-p :norv code)) (defun norsv-p (code) (code-in-class-p :norsv code)) (defun imp-verb-p (code) (code-in-class-p :impverb code)) (defun inf-verb-p (code) (code-in-class-p :infverb code)) (defun pfx-p (code) (code-in-class-p :prefiks code)) ; fixed ;(defun sfx-p (code) (code-in-class-p :suffiks code)) (defun sfx-p (code) (or (code-in-class-p :suffiks code) (sfx-verb-arbitrary-prev-p code) (sfx-noun-imp-verb-prev-p code) (sfx-noun-adj-prev-p code) (sfx-noun-noun/imp-verb-prev-p code) (sfx-noun-arbitrary-prev-p code) (sfx-adj-noun-prev-p code) (sfx-adv-noun-prev-p code) (sfx-adj-imp-verb-prev-p code) (sfx-adj-inf-verb/noun-prev-p code) (sfx-adv-noun-prev-p code))) (defun sfx-verb-arbitrary-prev-p (code) (code-in-class-p :suffiks-verb-vilk-forledd code)) (defun sfx-noun-imp-verb-prev-p (code) (code-in-class-p :suffiks-subst-impverb-forledd code)) (defun sfx-noun-adj-prev-p (code) (code-in-class-p :suffiks-subst-adj-forledd code)) (defun sfx-noun-noun/imp-verb-prev-p (code) (code-in-class-p :suffiks-subst-subst-el-impverb-forledd code)) (defun sfx-noun-arbitrary-prev-p (code) (code-in-class-p :suffiks-subst-vilk-forledd code)) (defun sfx-adj-noun-prev-p (code) (code-in-class-p :suffiks-adj-subst-forledd code)) (defun sfx-adv-noun-prev-p (code) (code-in-class-p :suffiks-adv-subst-forledd code)) (defun sfx-adj-imp-verb-prev-p (code) (code-in-class-p :suffiks-adj-impverb-forledd code)) (defun sfx-adj-inf-verb/noun-prev-p (code) (code-in-class-p :suffiks-adj-infverb-el-subst-forledd code)) (defun sfx-adv-noun-prev-p (code) (code-in-class-p :suffiks-adv-subst-forledd code)) (defun code-1000-p (code) (code-in-class-p :kode-1000 code)) (defun number-og-p (code) (code-in-class-p :tall-og code)) (defun number-p (code) (code-in-class-p :tall code)) (defun big-number-p (code) (code-in-class-p :stort-tall code)) (defun code-98x-p (code) (code-in-class-p :kode-98x code)) (defun participle-p (code) (or (norv-participle-p code) (norsv-participle-p code))) (defun affix-p (code) (or (pfx-p code) (sfx-p code))) (defun code-1000-plus-og-p (code) (or (code-1000-p code) (number-og-p code))) (defun previous-chunk-p (code) (or (real-noun-p code) (verb-p code) (adjective-p code) (adverb-p code) (participle-p code) (code-1000-plus-og-p code) (number-p code) ;; *** ADDED! (affix-p code))) (defun f-previous-chunk-p (code) (or (has-feature-p code 'subst) ;; ?? (real-noun-p code) (has-feature-p code 'verb) ;; (verb-p code) (has-feature-p code 'adj) ;; (adjective-p code) (has-feature-p code 'adv) ;; (adverb-p code) (has-feature-p code ') ;; (participle-p code) (has-feature-p code ') #+later(code-1000-plus-og-p code) ;?? (has-feature-p code 'kvant) ;; (number-p code) ;; *** ADDED! (has-feature-p code ') #+later(affix-p code))) (defun lemma-p (alpha-code) (multiple-value-bind (code form) (alpha-to-decimal alpha-code) (declare (ignore code)) (eql form 0))) (defun f-lemma-p (word compressed-lemma bv) (declare (ignore word)) #+debug (let ((*package* (find-package :cgp))) (print (list* word compressed-lemma (code-features bv)))) (and (string= compressed-lemma "") (cond ((has-feature-p bv 'verb) (has-feature-p bv 'inf)) ((has-feature-p bv 'subst) (has-feature-p bv 'ent)) ((has-feature-p bv 'adj) (and (has-feature-p bv 'sg) (has-feature-p bv 'pos))) (t t)))) (defun fullform-p (code) (not (lemma-p code))) (defun filter-analysis (analysis) (declare (ignore analysis)) t) (defun all-p (predicate codes) (not (find-if-not predicate codes))) ; betingelser_fuge_e (defun e-juncture-allowed-p (chunk codes analysis) (and ; either no previous chunk or at least two of them (/= (fill-pointer analysis) 1) ; (C+)V+C+ (let* ((vowel-pos (position-if (lambda (c) (vowelp c)) chunk)) (cons-pos (when vowel-pos (position-if-not (lambda (c) (vowelp c)) chunk :start vowel-pos))) (next-vowel-pos (when cons-pos (position-if (lambda (c) (vowelp c)) chunk :start cons-pos)))) (and cons-pos (not next-vowel-pos))) ; noun or unknown (or (null codes) (find-if #'noun-p codes)))) (defun f-e-juncture-allowed-p (chunk codes analysis) (and ; either no previous chunk or at least two of them (/= (fill-pointer analysis) 1) ; (C+)V+C+ (let* ((vowel-pos (position-if (lambda (c) (vowelp c)) chunk)) (cons-pos (when vowel-pos (position-if-not (lambda (c) (vowelp c)) chunk :start vowel-pos))) (next-vowel-pos (when cons-pos (position-if (lambda (c) (vowelp c)) chunk :start cons-pos)))) (and cons-pos (not next-vowel-pos))) ; noun or unknown (or (null codes) (one-has-features-p codes 'subst) ;; (find-if #'noun-p codes) ))) (defun s-juncture-allowed-p (chunk codes analysis) (and ; is noun (find-if #'noun-p codes) ; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (or (< length 2) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2))))) ; either chunk is not first chunk or the last consonant group does not contain #\s (or (> (fill-pointer analysis) 0) (let ((last-s-pos (position #\s chunk :from-end t))) (or (not last-s-pos) (position-if (lambda (c) (vowelp c)) chunk :start last-s-pos)))))) (defun f-s-juncture-allowed-p (chunk codes analysis) (and ;; is noun (one-has-features-p codes 'subst) ;; (find-if #'noun-p codes) ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (or (< length 2) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2))))) ;; either chunk is not first chunk or the last consonant group does not contain #\s (or (> (fill-pointer analysis) 0) (let ((last-s-pos (position #\s chunk :from-end t))) (or (not last-s-pos) (position-if (lambda (c) (vowelp c)) chunk :start last-s-pos)))))) #|| ; 570 ms (time (dotimes (i 10) (tag-compound "kjempeklatrestevneorganisasjonskomite"))) (let ((*analyser-version* :regexp)) (print-ranked-analyses "sildesalgslag")) (print-ranked-analyses "kjempe-klatresko") (analyse-compound "Klatresko") (analyse-compound "ibsenstykke") (setf *tagger* *nbo-tagger*) (let ((*tagger* *nbo-tagger*)) (print (tag-compound "futureskontrakter"))) (let ((*tagger* *nbo-tagger*)) (print (tag-compound "vest-afrikansk"))) (print-ranked-analyses "fattignorge") (print-ranked-analyses "fattigNorge") (print-ranked-analyses "billeoppslag") (print-ranked-analyses "enebærbuskspill") (print-ranked-analyses "vareheissjakt") (print-ranked-analyses "ekskjæreste") (print-ranked-analyses "uamerikansk") (print-ranked-analyses "diskonterbar") (print-ranked-analyses "gammelmannsaktig") (print-ranked-analyses "policymessig") (print-ranked-analyses "aldershjemsenhet") (let ((*tagger* *nbo-tagger*)) (print-ranked-analyses "bananmodneri")) (let ((*tagger* *nbo-tagger*)) (print-ranked-analyses "fyxynografi")) (tag-compound "fyxynografi") (string-values *fullforms* "grafi") (setf *tagger* *nbo-tagger*) ||# (defclass compound-analyser () ((current-analysis :initform (make-array 0 :fill-pointer t :adjustable t) :accessor current-analysis) (ranking :initform (make-array 0 :fill-pointer t :adjustable t) :accessor ranking) (compound-chunks :initform (make-array 0 :fill-pointer t :adjustable t) :accessor compound-chunks) (analysis-tree :initform nil :accessor analysis-tree))) (defparameter *compound-analyser-pool* ()) (defun get-compound-analyser () (let ((analyser (or (pop *compound-analyser-pool*) (make-instance 'compound-analyser)))) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (setf (fill-pointer current-analysis) 0 (fill-pointer ranking) 0 (fill-pointer compound-chunks) 0 analysis-tree nil)) analyser)) (defmacro with-compound-analyser ((analyser) &body body) `(let ((,analyser (get-compound-analyser))) ; thread safe! (multiple-value-prog1 (progn ,@body) (push ,analyser *compound-analyser-pool*)))) (defun analyse-compound (string) (with-compound-analyser (analyser) (analysis-tree (%analyse-compound analyser string)))) ;; TO DO: analyse from last space if string is multi-word. (defmethod %analyse-compound ((analyser compound-analyser) string) (if (null *analyser-version*) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (dotimes (i (1+ (length string))) (vector-push-extend nil compound-chunks)) ; does not cons unnecessarily (let ((hyphen-pos (position #\- string))) (setf analysis-tree (or (%sub-analyse-compound analyser string (if hyphen-pos (1+ hyphen-pos) 0)) (and (upper-case-p (char string 0)) (%sub-analyse-compound analyser (string-downcase string) (if hyphen-pos (1+ hyphen-pos) 0))) ;; if there is no valid analysis with known chunks collect all analyses ;; with unknown prefix containing at least one vowel (let ((vowel-pos (position-if #'vowelp string))) (when vowel-pos (collecting (loop for start from (1+ vowel-pos) to (1- (length string)) with count = -1 for analysis = (%sub-analyse-compound analyser string start t count) do (when analysis (collect (car analysis)) (incf count (caaar analysis))))))))))) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (dotimes (i (1+ (length string))) (vector-push-extend nil compound-chunks)) ; does not cons unnecessarily (let ((hyphen-pos (position #\- string))) (setf analysis-tree (or (%sub-analyse-compound-f analyser string (if hyphen-pos (1+ hyphen-pos) 0)) (and (upper-case-p (char string 0)) (%sub-analyse-compound-f analyser (string-downcase string) (if hyphen-pos (1+ hyphen-pos) 0))) ;; if there is no valid analysis with known chunks collect all analyses ;; with unknown prefix containing at least one vowel (let ((vowel-pos (position-if #'vowelp string))) (when vowel-pos (collecting #+old (loop for start from (1+ vowel-pos) to (- (length string) 2) with count = -1 with key = 0 for analysis = (%sub-analyse-compound-f analyser string (print start) :unknown-first-p t :count count) do (when analysis (collect (car analysis)) (incf count (caaar analysis)))) (loop for start from (1+ vowel-pos) to (- (length string) 2) with count = -1 and key = 0 and analysis do (multiple-value-bind (analysis cnt) (%sub-analyse-compound-f analyser string start :unknown-first-p t :count count) (when analysis (collect (car analysis)) (if cnt (setf count cnt) (incf count (caaar analysis)))))))))))))) analyser) #|| ;(analyse-compound "barne-TV") ;(analyse-compound "fem-seks") ;(analyse-compound "steiro") ;(%analyse-compound "ro" 0 *fullforms* *lemmata* t) (analyse-compound "fyxynografienes") (print-ranked-analyses "fyxynografienes") (print-ranked-analyses "motkulturelle") (print (analyse-compound "barneforsikring")) ;; *** FEIL ranking! (string-values *fullforms* "Norge") ||# (defmethod %sub-analyse-compound ((analyser compound-analyser) string start &optional unknown-first-p (count -1)) (with-slots (current-analysis ranking compound-chunks) analyser (let* ((word-end-marker #\:) (length (length string)) (memory (make-array (1+ length) :initial-element t)) (l-net (list-net (lemmata *tagger*))) (ff-net (list-net (fullforms *tagger*)))) (labels ((walk-memoized (analysis) ; change name! (dolist (sub-analysis analysis) (vector-push-extend (car sub-analysis) current-analysis) (if (cdr sub-analysis) (walk-memoized (cdr sub-analysis)) (vector-push-extend (compute-ranking-info (incf count) analyser) ranking)) (decf (fill-pointer current-analysis)))) (analyse (start) (let ((memoized-analysis (aref memory start))) (if (eq memoized-analysis t) ; means NOT memoized! ** change to :not-memoized or so (setf (aref memory start) (collecting (labels ((walk (pos ff-n l-n) (let* ((ff-sub-node (when (< pos length) (find (string-net::restore-char (char string pos)) ff-n :key #'car))) (ff-end-marker-node (find word-end-marker ff-n :key #'car)) (l-sub-node (when (and l-n (< pos length)) (find (string-net::restore-char (char string pos)) l-n :key #'car))) (l-end-marker-node (when l-n (find word-end-marker l-n :key #'car)))) (when ff-sub-node (walk (1+ pos) (cdr ff-sub-node) (cdr l-sub-node))) (when ff-end-marker-node ;; then, extract codes using the remaining sub-network (let ((address (find word-end-marker ff-n :key #'car)) (codes ()) (chunk (subseq string start pos))) (nmap-strings (fullforms *tagger*) (lambda (code) (push (copy-seq code) codes)) address) (when (check-chunk current-analysis chunk codes (= pos length)) (let ((chunk+codes (cons chunk codes))) (vector-push-extend (cons nil chunk+codes) current-analysis) ;; check if this is not first chunk (let ((analysis (filter-two chunk codes (analyse pos) current-analysis nil unknown-first-p))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (if analysis (reduce #'+ analysis :key #'caar) 1) chunk+codes) analysis))) (decf (fill-pointer current-analysis))))))) ;; the same for lemmata, but last chunk must be fullform (when (and l-end-marker-node (< pos length)) (let ((address (find word-end-marker l-n :key #'car)) (codes ()) (chunk (subseq string start pos))) (nmap-strings (lemmata *tagger*) (lambda (code) (push (copy-seq code) codes)) address) (when (check-chunk current-analysis chunk codes nil) ;; no juncture (let ((chunk+codes (cons chunk codes))) (vector-push-extend (cons nil chunk+codes) current-analysis) (let ((analysis (filter-two chunk codes (analyse pos) current-analysis nil unknown-first-p))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons (if analysis (reduce #'+ analysis :key #'caar) 1) chunk+codes) analysis))) (decf (fill-pointer current-analysis)))) ;; juncture (unless (or (find-if #'adverb-p codes) (= pos length)) (cond ((and (char= (char string pos) #\e) (e-juncture-allowed-p chunk codes current-analysis)) (let ((chunk+codes (cons chunk codes)) (juncture (list "e" :e-juncture))) (vector-push-extend (cons nil chunk+codes) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (filter-two chunk codes (analyse (1+ pos)) current-analysis :e-juncture unknown-first-p))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let ((branch-count (reduce #'+ analysis :key #'caar))) (collect (list (cons branch-count chunk+codes) (cons (cons branch-count juncture) analysis)))))))) ;; code duplication! ((and (char= (char string pos) #\s) (s-juncture-allowed-p chunk codes current-analysis)) (let ((chunk+codes (cons chunk codes)) (juncture (list "s" :s-juncture))) (vector-push-extend (cons nil chunk+codes) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (filter-two chunk codes (analyse (1+ pos)) current-analysis :s-juncture unknown-first-p))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let ((branch-count (reduce #'+ analysis :key #'caar))) (collect (list (cons branch-count chunk+codes) (cons (cons branch-count juncture) analysis)))))))))))))))) (walk start ff-net l-net)))) (values memoized-analysis t))))) (let ((analyses (if (zerop start) (analyse start) (progn (vector-push-extend (list nil (subseq string 0 start)) current-analysis) (let* ((sub-analysis (if unknown-first-p (filter-two (subseq string start) nil (analyse start) current-analysis nil unknown-first-p) (analyse start))) (branch-count (count-analyses sub-analysis))) (prog1 (when sub-analysis (list (cons (list branch-count (subseq string 0 start)) sub-analysis))) (decf (fill-pointer current-analysis)))))))) (walk-memoized analyses) #+debug(print analyses) analyses))))) (defmacro do-chunk-readings ((node lemma feature-bv) &body body) (with-gensyms (%string %array code) `(%with-string (,%string) (%with-byte-array (,%array) (string-net::nmap-string+array lexicon (lambda (,lemma ,code) (declare (ignorable ,lemma ,code)) (let ((,feature-bv (array-to-bit-vector ,code))) ,@body)) ,%string ,%array ,node #\:))))) (defmethod %sub-analyse-compound-f ((analyser compound-analyser) string start &key unknown-first-p (count -1)) (if (eq *analyser-version* :regexp) (%sub-analyse-compound-regexp analyser string start :unknown-first-p unknown-first-p :count count) (with-slots (current-analysis ranking compound-chunks) analyser (let* ((word-end-marker #\:) (length (length string)) (memory (make-array (1+ length) :initial-element :not-memoized)) (lexicon (lexicon *tagger*)) (lex-net (list-net lexicon))) (labels ((analyse (start) (let ((memoized-analysis (aref memory start))) (if (eq memoized-analysis :not-memoized) (setf (aref memory start) (collecting (labels ((walk (pos lex-n) (let* ((lex-sub-node (when (< pos length) (find (string-net::restore-char (char string pos)) lex-n :key #'car))) (lex-end-marker-node (find word-end-marker lex-n :key #'car))) (when lex-sub-node (walk (1+ pos) (cdr lex-sub-node))) (when lex-end-marker-node ;; then, extract codes using the remaining sub-network (let ((lex-address (find word-end-marker lex-n :key #'car)) (features ()) (chunk (subseq string start pos))) (do-chunk-readings (lex-address lemma bv) (when (or (= pos length) (f-lemma-p chunk lemma bv)) (push bv features))) (when (check-chunk-features current-analysis chunk features (= pos length)) ;; no juncture (let ((chunk+features (cons chunk features))) (vector-push-extend (cons nil chunk+features) current-analysis) (let ((analysis (filter-two-features chunk features (analyse pos) current-analysis nil unknown-first-p))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (if analysis (reduce #'+ analysis :key #'caar) 1) chunk+features) analysis))) (decf (fill-pointer current-analysis)))) ;; juncture (unless (or (one-has-features-p features 'adv) ;; (find-if #'adverb-p codes) (= pos length)) (cond ((and (char= (char string pos) #\e) (f-e-juncture-allowed-p chunk features current-analysis)) (let ((chunk+features (cons chunk features)) (juncture (list "e" :e-juncture))) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (filter-two-features chunk features (analyse (1+ pos)) current-analysis :e-juncture unknown-first-p))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let ((branch-count (reduce #'+ analysis :key #'caar))) (collect (list (cons branch-count chunk+features) (cons (cons branch-count juncture) analysis)))))))) ;; code duplication! ((and (char= (char string pos) #\s) (f-s-juncture-allowed-p chunk features current-analysis)) (let ((chunk+features (cons chunk features)) (juncture (list "s" :s-juncture))) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (filter-two-features chunk features (analyse (1+ pos)) current-analysis :s-juncture unknown-first-p))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let ((branch-count (reduce #'+ analysis :key #'caar))) (collect (list (cons branch-count chunk+features) (cons (cons branch-count juncture) analysis)))))))))))))))) (walk start lex-net)))) (values memoized-analysis t)))) (walk-memoized (analysis) ; change name! (dolist (sub-analysis analysis) (vector-push-extend (car sub-analysis) current-analysis) (if (cdr sub-analysis) (walk-memoized (cdr sub-analysis)) (vector-push-extend (compute-f-ranking-info (incf count) analyser) ranking)) (decf (fill-pointer current-analysis))))) (let ((analyses (if (zerop start) (analyse start) (progn (vector-push-extend (list nil (subseq string 0 start)) current-analysis) (let* ((sub-analysis (if unknown-first-p (filter-two-features (subseq string start) nil (analyse start) current-analysis nil unknown-first-p) (analyse start))) (branch-count (count-analyses sub-analysis))) (prog1 (when sub-analysis (list (cons (list branch-count (subseq string 0 start)) sub-analysis))) (decf (fill-pointer current-analysis)))))))) (walk-memoized analyses) #+debug(print analyses) analyses)))))) ;;;; New regexp pruning of chunking analyses ;; (eval-when (:execute :load-toplevel :compile-toplevel) (fsa::def-string-test-function pre-e-juncture (chunk) (let* ((vowel-pos (position-if (lambda (c) (vowelp c)) chunk)) (cons-pos (when vowel-pos (position-if-not (lambda (c) (vowelp c)) chunk :start vowel-pos))) (next-vowel-pos (when cons-pos (position-if (lambda (c) (vowelp c)) chunk :start cons-pos)))) (and cons-pos (not next-vowel-pos)))) (fsa::def-string-test-function pre-s-juncture (chunk) ;; chunk is not first chunk ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2))))) (fsa::def-string-test-function immediate-pre-s-juncture (chunk) ;; the last consonant group does not contain #\s ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2)) (let ((last-s-pos (position #\s chunk :from-end t))) (or (not last-s-pos) (position-if (lambda (c) (vowelp c)) chunk :start last-s-pos)))))) #+test (print pre-e-juncture) (defparameter *compound-regexp* (make-instance 'feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or ukjent (:and subst ub ent) (:and verb inf) (:and det kvant) (:seq (:and subst ub ent ,immediate-pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture)) ;; inner chunks (:* (:or (:and subst ub ent) (:and verb inf) (:seq (:and subst ub ent ,pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture))) ;; last chunk subst) ;; number (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" konj) (:and det kvant))))))) (defun cp-boolean-list-delta-get (features-list state delta) (let ((relation (fsa::relation-get state delta)) (result ())) (when relation (let* ((features features-list) (states (fsa::relation-subsumed-get features relation))) ;; STATE ist a list of state id + new features list (cp. fsa::KEY-SUBSUME) (dolist (state states) (pushnew (append state (list features)) result :test #'equal))) result))) #+test (setf *tagger* *nbo-tagger*) #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "fiskesluse")) #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "beatlesplate")) (defmethod %sub-analyse-compound-regexp ((analyser compound-analyser) string start &key (count -1) &allow-other-keys) (with-slots (current-analysis ranking compound-chunks) analyser (let* ((word-end-marker #\:) (length (length string)) (memory (make-array (1+ length) :initial-element :not-memoized)) (lexicon (lexicon *tagger*)) (lex-net (list-net lexicon))) (labels ((analyse (start) (let ((memoized-analysis (aref memory start))) (if (eq memoized-analysis :not-memoized) (setf (aref memory start) (collecting (labels ((walk (pos lex-n) (let* ((lex-sub-node (when (< pos length) (find (string-net::restore-char (char string pos)) lex-n :key #'car))) (lex-end-marker-node (find word-end-marker lex-n :key #'car))) (when lex-sub-node (walk (1+ pos) (cdr lex-sub-node))) (when lex-end-marker-node ;; then, extract codes using the remaining sub-network (let ((lex-node (find word-end-marker lex-n :key #'car)) (features ()) (chunk (subseq string start pos))) ;;(print chunk) (when (or (> (- pos start) 2) (and (= (- pos start) 2) (find chunk '("en" "og" "to" "ni" "år" "gå" "øl") :test #'string-equal))) (do-chunk-readings (lex-node lemma bv) (when (find-if (lambda (f) (has-feature-p bv f)) (if (= pos length) '(subst adj adv kvant) '(subst adv konj det verb kvant))) (push (cons (decompress-string lemma chunk) bv) features)) #+debug(let ((*package* (find-package :cgp))) (print (list* chunk (decompress-string lemma chunk) (code-features bv))))) (when features ;; no juncture (let ((chunk+features (cons chunk features))) (vector-push-extend (cons nil chunk+features) current-analysis) (let ((analysis (analyse pos))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (* (if analysis (reduce #'+ analysis :key #'caar) 1) (length features)) chunk+features) analysis))) (decf (fill-pointer current-analysis)))) ;; juncture (unless (= pos length) (let ((char (char string pos))) (when (find char "es") (let ((chunk+features (cons chunk features)) (juncture (if (char= char #\e) (list "e" (list "e" :e-juncture)) (list "s" (list "s" :s-juncture))))) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (analyse (1+ pos)))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let* ((sub-branch-count (reduce #'+ analysis :key #'caar)) (branch-count (* (length features) sub-branch-count))) (collect (list (cons branch-count chunk+features) (cons (cons sub-branch-count juncture) analysis))))))))))))))))) (walk start lex-net)))) (values memoized-analysis t))))) (let ((analyses (if (zerop start) (analyse start) (progn (vector-push-extend (list nil (subseq string 0 start)) current-analysis) (let* ((sub-analysis (analyse start)) (branch-count (count-analyses sub-analysis))) (prog1 (when sub-analysis (list (cons (list branch-count (subseq string 0 start) #+old(list (subseq string 0 start) :unknown) (cons (subseq string 0 start) (encode-features 'ukjent)) ) sub-analysis))) (decf (fill-pointer current-analysis)))))))) (when analyses #+debug(print analyses) (let ((pruned-analyses (prune-analyses *compound-regexp* analyses :key (1+ count)))) #+debug(print (list :unknown (subseq string 0 start) :count (1+ count) :pa pruned-analyses)) (labels ((walk-memoized (analysis) ; change name! (dolist (sub-analysis analysis) (destructuring-bind (nkey chunk . readings) (car sub-analysis) (dolist (reading readings) ;;(print (list :pushing nkey chunk (if (consp reading) (cdr reading) (list reading)))) (vector-push-extend (list nkey chunk (if (consp reading) (cdr reading) (list reading))) current-analysis) (if (cdr sub-analysis) (walk-memoized (cdr sub-analysis)) ;; COUNT is a hash key (progn (when (find (incf count) pruned-analyses) #+debug(print :valid) (vector-push-extend (compute-f-ranking-info count analyser) ranking)) #+debug (print current-analysis))) (decf (fill-pointer current-analysis))))))) (walk-memoized analyses) #+debug(print ranking) #+debug(print analyses) (values (when pruned-analyses analyses) count))))))))) (defun print-ranked-analyses-regexp (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-codes-regexp (ranking-info-position (aref ranking i)) analysis-tree t))))))) #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "barnemat")) #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "fisikesluse")) (defmethod prune-analyses ((fr feature-regexp) analyses &key (key 0) &allow-other-keys) (collecting (with-slots (dfa) fr (let ((delta (fsa:fsa-delta dfa)) (start-state (fsa:fsa-start-state dfa))) (labels ((walk (state analyses readings key chunking) #+debug(print chunking) ;; CHUNKING is for debugging only (if readings (let* ((reading (car readings)) (analysis (car analyses)) (chunk (cadar analysis)) (sub-count (/ (caar analysis) (length (cddar analysis)))) (features (if (consp (cdr reading)) reading (cons (car reading) (code-features (cdr reading))))) (new-states (cp-boolean-list-delta-get features state delta))) #+debug(print features) (dolist (new-state+morph new-states) (destructuring-bind (new-state new-features . morph) new-state+morph (declare (ignore morph)) (cond ((cdr analysis) (destructuring-bind (nkey ch . readings) (caadr analysis) (declare (ignore ch)) (walk new-state (cdr analysis) readings key (cons (cons sub-count chunk) chunking)))) ((fsa:set-member-p new-state (fsa:fsa-final-states dfa)) #+debug(print (cons key (reverse (cons (cons sub-count chunk) chunking)))) (collect key)) (t nil)))) (cond ((cdr readings) (walk state analyses (cdr readings) (+ sub-count key) chunking)) ((cdr analyses) (walk state (cdr analyses) nil (+ sub-count key) chunking)) (t nil))) (destructuring-bind (nkey chunk . readings) (caar analyses) (walk state analyses readings key chunking))))) (walk start-state analyses nil key ())))))) #|| (((3 "fiske" #) ((3 "sluse" # # #))) ((1 "fiske" #) ((1 "s" :S-JUNCTURE) ((1 "luse" #)))) ((3 "fisk" #) ((3 "e" :E-JUNCTURE) ((3 "sluse" # # #))))) ||# (defun nth-analysis-regexp (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (cons chunk (nth-analysis-regexp n (cdar analysis))))) (nth-analysis-regexp (- n branch-count) (cdr analysis)))))) (defun nth-analysis-codes-regexp (n analysis &optional features-as-list-p) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" ;;(print (list :n n analysis)) (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (let ((reading (nth f-number f-codes))) (cons (cons chunk (if features-as-list-p (if (consp (cdr reading)) reading (cons (car reading) (code-features (cdr reading)))) reading)) (nth-analysis-codes-regexp n (cdar analysis) features-as-list-p))))) (nth-analysis-codes-regexp (- n branch-count) (cdr analysis) features-as-list-p))))) #+test (let ((*analyser-version* :regexp)) (print (nth-analysis-codes-regexp 5 (analyse-compound "barnevakt") t))) ;; LEKS_S_VERB_FINAL (defun final-lexical-s-verb-p (analysis) "checks if last chunk is a verb starting with #\s, without preceding juncture" (let* ((length (length analysis)) (last-chunk (aref analysis (1- length))) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and (char= (char (chunk last-chunk) 0) #\s) (if (null *analyser-version*) (all-p #'verb-p (codes last-chunk)) (all-have-features-p (codes last-chunk) 'verb)) (not (and next-to-last-chunk (juncture-p (codes next-to-last-chunk))))))) ;; LEKS_VERB_INITIAL (defun initial-lexical-verb-p (analysis) (and (if (null *analyser-version*) (all-p #'verb-p (codes (aref analysis 0))) (all-have-features-p (codes (aref analysis 0)) 'verb)) (or (= (length analysis) 1) (not (juncture-p (codes (aref analysis 1))))))) (defun final-s-juncture-p (analysis) (let* ((length (length analysis)) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and next-to-last-chunk (s-juncture-p (codes next-to-last-chunk))))) (defun initial-e-juncture-p (analysis) ;;(print :e-j) (and (> (length analysis) 2) (e-juncture-p (codes (aref analysis 1))))) (defun initial-s-juncture-p (analysis) (and (> (length analysis) 2) (s-juncture-p (codes (aref analysis 1))))) ;; *** get rid of it (defstruct ranking-info position effective-length juncture-count unknown-first-chunk-p final-lexical-s-verb-p initial-lexical-verb-p final-s-juncture-p initial-e-juncture-p initial-s-juncture-p possibly-noun-p last-chunk-length first-chunk-compound-p long-first-chunk-p) (defun compute-ranking-info (position analyser) (with-slots (current-analysis compound-chunks) analyser (let ((effective-length (effective-length current-analysis))) (make-ranking-info ;; position in the analysis tree :position position ;; length not counting junctures :effective-length effective-length ;; number of junctures :juncture-count (- (length current-analysis) effective-length) ;; unknown first chunk? :unknown-first-chunk-p (null (codes (aref current-analysis 0))) ;; final lexical s-verb? :final-lexical-s-verb-p (final-lexical-s-verb-p current-analysis) ;; initial lexical verb? :initial-lexical-verb-p (initial-lexical-verb-p current-analysis) ;; final s-juncture? :final-s-juncture-p (final-s-juncture-p current-analysis) ;; initial e-juncture? :initial-e-juncture-p (initial-e-juncture-p current-analysis) ;; initial e-juncture? :initial-s-juncture-p (initial-s-juncture-p current-analysis) ;; last chunk might be noun? :possibly-noun-p (find-if #'noun-p (codes (aref current-analysis (1- (length current-analysis))))) ;; length of last chunk :last-chunk-length (length (chunk (aref current-analysis (1- (length current-analysis))))) ;; first chunk has elsewhere been analysed as compound? :first-chunk-compound-p (aref compound-chunks (length (chunk (aref current-analysis 0)))) :long-first-chunk-p (long-chunk-p (chunk (aref current-analysis 0))))))) (defun compute-f-ranking-info (position analyser) (with-slots (current-analysis compound-chunks) analyser (let ((effective-length (effective-length current-analysis))) (make-ranking-info ;; position in the analysis tree :position position ;; length not counting junctures :effective-length effective-length ;; number of junctures :juncture-count (- (length current-analysis) effective-length) ;; unknown first chunk? :unknown-first-chunk-p (null (codes (aref current-analysis 0))) ;; final lexical s-verb? :final-lexical-s-verb-p (final-lexical-s-verb-p current-analysis) ;; initial lexical verb? :initial-lexical-verb-p (initial-lexical-verb-p current-analysis) ;; final s-juncture? :final-s-juncture-p (final-s-juncture-p current-analysis) ;; initial e-juncture? :initial-e-juncture-p (initial-e-juncture-p current-analysis) ;; initial e-juncture? :initial-s-juncture-p (initial-s-juncture-p current-analysis) ;; last chunk might be noun? :possibly-noun-p ;;(find-if #'noun-p (codes (aref current-analysis (1- (length current-analysis))))) (find-if (lambda (code) (has-feature-p code 'subst)) (codes (aref current-analysis (1- (length current-analysis))))) ;; length of last chunk :last-chunk-length (length (chunk (aref current-analysis (1- (length current-analysis))))) ;; first chunk has elsewhere been analysed as compound? :first-chunk-compound-p (aref compound-chunks (length (chunk (aref current-analysis 0)))) ;;:long-first-chunk-p (long-chunk-p (chunk (aref current-analysis 0))) )))) (defun regexp-compare-juncture-s-word (analyser n1 n2) (with-slots (analysis-tree compound-chunks) analyser (labels ((next (a n) (when a (let ((branch-count (caaar a))) (if (< n branch-count) (multiple-value-bind (f-number n) (floor n (if (cdar a) (reduce #'+ (cdar a) :key #'caar) 1)) (values (car a) f-number n)) (next (cdr a) (- n branch-count)))))) (compare (n1 n2 f-n1 f-n2 pos1 pos2 a1 a2 sj1 sj2) (let* ((features1 (nth f-n1 (cddar a1))) (features2 (nth f-n2 (cddar a2))) (chunk1 (car features1)) (chunk2 (car features2)) (codes1 (cdr features1)) (codes2 (cdr features2))) #+debug(print (list :f1 features1 :f2 features2)) (cond ((and (= pos1 pos2) (s-juncture-p codes1) (char= (char chunk2 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a1) n1) ;; advance a1 (compare n n2 f-n f-n2 (1+ pos1) pos2 a a2 t nil))) ((and (= pos1 pos2) (s-juncture-p codes2) (char= (char chunk1 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a2) n2) ;; advance a2 (compare n1 n f-n1 f-n pos1 (1+ pos2) a1 a nil t))) ((and sj1 ;; condition we wanted to find (string= chunk1 chunk2 :start2 1) (compound-p compound-chunks chunk1 pos1)) t) ((and sj2 ;; condition we wanted to find (string= chunk1 chunk2 :start1 1) (compound-p compound-chunks chunk2 pos2)) nil) ((= pos1 pos2) ;; that's the only case in which we may have reached the end of the string (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) (if (and a1 a2) (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) (+ pos2 (length chunk2)) a1 a2 nil nil) :incomparable)))) ((< pos1 pos2) (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance a1 (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) pos2 a1 a2 nil nil))) ((> pos1 pos2) (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) ; advance a2 (compare n1 n2 f-n1 f-n2 pos1 (+ pos2 (length chunk2)) a1 a2 nil nil))) (t (error "a condition left: ~a" (list n1 n2 pos1 pos2 a1 a2 sj1 sj2))))))) (multiple-value-bind (a1 f-n1 n1) (next analysis-tree n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next analysis-tree n2) (compare n1 n2 f-n1 f-n2 0 0 a1 a2 nil nil)))))) (defun compare-juncture-s-word (analyser n1 n2) (with-slots (analysis-tree compound-chunks) analyser (labels ((next (a n) (when a (let ((branch-count (caaar a))) (if (< n branch-count) (values (car a) n) (next (cdr a) (- n branch-count)))))) (compare (n1 n2 pos1 pos2 a1 a2 sj1 sj2) ;;(print (list :ca1 (car a1) :ca2 (car a2))) (let ((chunk1 (chunk (car a1))) (chunk2 (chunk (car a2))) (codes1 (codes (car a1))) (codes2 (codes (car a2)))) (cond ((and (= pos1 pos2) (s-juncture-p codes1) (char= (char chunk2 0) #\s)) (multiple-value-bind (a n) (next (cdr a1) n1) ; advance a1 (compare n n2 (1+ pos1) pos2 a a2 t nil))) ((and (= pos1 pos2) (s-juncture-p codes2) (char= (char chunk1 0) #\s)) (multiple-value-bind (a n) (next (cdr a2) n2) ; advance a2 (compare n1 n pos1 (1+ pos2) a1 a nil t))) ((and sj1 ; condition we wanted to find (string= chunk1 chunk2 :start2 1) (compound-p compound-chunks chunk1 pos1)) t) ((and sj2 ; condition we wanted to find (string= chunk1 chunk2 :start1 1) (compound-p compound-chunks chunk2 pos2)) nil) ((= pos1 pos2) ; that's the only case in which we may have reached the end of the string (multiple-value-bind (a1 n1) (next (cdr a1) n1) ; advance both (multiple-value-bind (a2 n2) (next (cdr a2) n2) (if (and a1 a2) (compare n1 n2 (+ pos1 (length chunk1)) (+ pos2 (length chunk2)) a1 a2 nil nil) :incomparable)))) ((< pos1 pos2) (multiple-value-bind (a1 n1) (next (cdr a1) n1) ; advance a1 (compare n1 n2 (+ pos1 (length chunk1)) pos2 a1 a2 nil nil))) ((> pos1 pos2) (multiple-value-bind (a2 n2) (next (cdr a2) n2) ; advance a2 (compare n1 n2 pos1 (+ pos2 (length chunk2)) a1 a2 nil nil))) (t (error "a condition left: ~a" (list n1 n2 pos1 pos2 a1 a2 sj1 sj2))))))) (multiple-value-bind (a1 n1) (next analysis-tree n1) ; advance both (multiple-value-bind (a2 n2) (next analysis-tree n2) (compare n1 n2 0 0 a1 a2 nil nil)))))) (defun compound-p (compound-chunks chunk chunk-start) (or (> chunk-start 0) ; strange condition! (aref compound-chunks (+ chunk-start (length chunk))))) (defun rank-analyses (analyser) (with-slots (ranking) analyser #+debug(print ranking) (sort ranking (lambda (ri1 ri2) (block rank (cond ((< (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank t)) ((> (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank nil))) ;; equal effective length ;; check if diff in juncture count is > 1 (let ((jc1 (ranking-info-juncture-count ri1)) (jc2 (ranking-info-juncture-count ri2))) (cond ((< jc1 (1- jc2)) (return-from rank t)) ((> jc1 (1+ jc2)) (return-from rank nil)) ;; diff in juncture count is 1 ((/= jc1 jc2) (let ((flsv1 (ranking-info-final-lexical-s-verb-p ri1)) (flsv2 (ranking-info-final-lexical-s-verb-p ri2))) (when (and (not flsv1) flsv2 ; <1> (ranking-info-final-s-juncture-p ri1)) (return-from rank t)) (when (and flsv1 (not flsv2) (ranking-info-final-s-juncture-p ri2)) (return-from rank nil)) (if (not (ranking-info-unknown-first-chunk-p ri1)) ; <2> (let ((comp (if (eq *analyser-version* :regexp) (regexp-compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2)) (compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2))))) (if (not (eq comp :incomparable)) (return-from rank comp) (let ((ilv1 (ranking-info-initial-lexical-verb-p ri1)) (ilv2 (ranking-info-initial-lexical-verb-p ri2))) #+debug(print (list ilv1 ri1 ilv2 ri2)) (cond ((and (not ilv1) ilv2 (ranking-info-initial-e-juncture-p ri1)) (return-from rank t)) ((and ilv1 (not ilv2) (ranking-info-initial-e-juncture-p ri1)) (return-from rank nil)))))) (let ((long-chunk1-p (and (ranking-info-long-first-chunk-p ri1) (ranking-info-initial-s-juncture-p ri1))) (long-chunk2-p (and (ranking-info-long-first-chunk-p ri2) (ranking-info-initial-s-juncture-p ri2)))) (cond ((and long-chunk1-p (not long-chunk2-p)) (return-from rank t)) ((and (not long-chunk1-p) long-chunk2-p) (return-from rank nil))))) (return-from rank (< jc1 jc2)))) ;; no difference in number of junctures ((and (ranking-info-possibly-noun-p ri1) (not (ranking-info-possibly-noun-p ri2))) (return-from rank t)) ((and (not (ranking-info-possibly-noun-p ri1)) (ranking-info-possibly-noun-p ri2)) (return-from rank nil)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (ranking-info-first-chunk-compound-p ri1) (not (ranking-info-first-chunk-compound-p ri2))) (return-from rank t)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (not (ranking-info-first-chunk-compound-p ri1)) (ranking-info-first-chunk-compound-p ri2)) (return-from rank nil)) (t (return-from rank (> (ranking-info-last-chunk-length ri1) (ranking-info-last-chunk-length ri2)))))) nil))))) ;(print-ranked-analyses "sanddekka") (defun print-ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis (ranking-info-position (aref ranking i)) analysis-tree))))))) (defun most-probable-compound-last-chunk (word &optional nil-if-fullform-p) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) ;; *** make this nicer! (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking 0)) analysis-tree))))) #+test (("abonnement" ("811:0" "810:0" "801:0" "800:0")) ("s" (:S-JUNCTURE) ("s" :S-JUNCTURE)) ("biblioteket" ("811:2" "810:2" "801:2" "800:2") ("bibliotek" SUBST APPELL NØYT BE ENT))) (defun most-probable-compound (word &optional nil-if-fullform-p (divider "+")) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) (let ((prefix "") (features ())) (loop for (chunk . rest) on (nth-analysis-codes (ranking-info-position (aref ranking 0)) analysis-tree nil) do (cond (rest (setf prefix (concat prefix (car chunk) (if (char= (last-char (car chunk)) #\-) "" (or divider ""))))) (t (dolist (suffix-analysis (cddr chunk)) (push (cons (concat prefix (car suffix-analysis)) (cdr suffix-analysis)) features)) (setf prefix (concat prefix (car chunk)))))) (values prefix features)))))) #+test (print (most-probable-compound "abonnementsbibliotek" t)) (defun ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking i)) analysis-tree))))))) #|| (setf *tagger* *nbo-tagger*) (print (most-probable-compound-last-chunk "flaggermuseksklusjon")) (most-probable-compound-last-chunk "fyxynografiene" t) (tag-compound "Oslo-taggeren") (grammar-code-to-features "-grafi" "23700:1") (print (analyse-compound "femtiåtte")) (fullform-features "-grafi") (fullform-values "-grafi") #+test (ranked-analyses "antioksidant") #+test (print-ranked-analyses "asfaltdekkete") #+test (print-ranked-analyses "massesammenst¯t") #+test (print-ranked-analyses "sjanselikhetsombudsmann") #+test (print-ranked-analyses "Oslo-taggeren") #+test (print (analyse-compound "femtiåtte")) #+test (analyse-compound "steiro") ||# #+test ; (0.351 seconds) (0.111 seconds) (time (analyse-compound "tradisjonsrik")) ; 1.505 s, 83,512 bytes without memoizing ; 0.159 s, 17,384 bytes with memoizing ; 0.169 s, 21,408 bytes with filter-two ; 0.183 s, 32,144 bytes ; 0.087 s, 36,312 bytes with string-net optimized ; G4 ; ca. 50 ms unoptimized (get-bits ...) ; 25 ms optimized #+test (time (progn (analyse-compound "sildesalgslag") nil)) #+test (analyse-compound "fiskebo") #+test (analyse-compound "borettslag") #+test (analyse-compound "olav") (defun count-analyses (analysis) (reduce #'+ analysis :key #'caar)) #+test (count-analyses (analyse-compound "sildesalgslag")) (defun nth-analysis (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (cons (chunk (caar analysis)) (nth-analysis n (cdar analysis))) (nth-analysis (- n branch-count) (cdr analysis)))))) (defun nth-analysis-last-chunk-and-codes (n analysis) "Same algorithm as in NTH-ANALYSIS, but returns the last chunk and all of its codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (or (nth-analysis-last-chunk-and-codes n (cdar analysis)) (chunk-and-codes (caar analysis))) (nth-analysis-last-chunk-and-codes (- n branch-count) (cdr analysis)))))) #+old (defun nth-analysis-codes (n analysis) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (chunk-and-codes (caar analysis)) (list* (cons chunk (mapcar (lambda (f-code) (if (stringp f-code) (cons f-code (grammar-code-to-features chunk f-code :features-as-list-p t)) (list (string-downcase f-code)))) f-codes)) (nth-analysis-codes n (cdar analysis)))) (nth-analysis-codes (- n branch-count) (cdr analysis)))))) (defun nth-analysis-codes (n analysis &optional (features-as-list-p t)) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (chunk-and-codes (caar analysis)) (list* (list* chunk f-codes (remove-duplicates (collecting (dolist (f-code f-codes) (collect-append (if (stringp f-code) (grammar-code-to-features chunk f-code :features-as-list-p features-as-list-p) (list (list chunk f-code)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))) (nth-analysis-codes n (cdar analysis) features-as-list-p))) (nth-analysis-codes (- n branch-count) (cdr analysis) features-as-list-p))))) #+test (print (nth-analysis-codes 1 (analyse-compound "fiskerinæring"))) #+test (print (nth-analysis 1 (analyse-compound "sildesalgslag"))) #+test (print-all-analyses (analyse-compound "kosehøne")) ;; test function; not needed (defun print-all-analyses (analysis) (let ((n (count-analyses analysis))) (dotimes (i n) (print (nth-analysis i analysis))))) (defun effective-length (analysis) (let ((i 0)) (loop for chunk across analysis unless (juncture-p (codes chunk)) do (incf i)) i)) (defun previous-chunk (analysis) (let ((length (length analysis))) (if (juncture-p (codes (aref analysis (- length 2)))) (aref analysis (- length 3)) (aref analysis (- length 2))))) ;(analyse-compound "seksogfemti") ;(analyse-compound "tusenniogtretti") ;(analyse-compound "Steiro") ;(analyse-compound "beckettstykke") ;(print-ranked-analyses "tusenniogtretti") ;(analyse-compound "fattig-Norge") ;(%analyse-compound "Norge" 0) ;(analyse-compound "fem-seks") ;(tag-compound "debilografienes") ;(analyse-compound "debilografienes") ;(get-features "Norge") ;(string-values (fullforms *tagger*) "seks") ;; CURRENT-ANALYSIS is a vector containing the current partial compound analysis (defun filter-two (chunk codes analysis current-analysis &optional juncture unknown-first-p last-p) #+debug (print (list chunk codes analysis current-analysis)) (let* ((removed nil) (filtered-analysis (collecting (dolist (sub-analysis analysis) (let ((next-codes (codes (car sub-analysis))) (next-chunk (chunk (car sub-analysis)))) (if (or (and (or unknown-first-p (> (effective-length current-analysis) 1)) (or (< (length next-chunk) 3) (find-if (lambda (chunk) (and (not (juncture-p (codes chunk))) (< (length (chunk chunk)) 3))) current-analysis)) (not (and (or (not unknown-first-p) (and unknown-first-p (not last-p))) (or (string= chunk "og") (string= chunk "en") (string= chunk "to") ;; *** added (string= chunk "ni") )))) (and codes (all-p #'fullform-p codes) ; ?? ;; (not (and (find-if #'imp-verb-p codes) (find-if #'noun-p next-codes))) ;; *** fix for numbers (not (find-if #'number-p codes))) (and (not unknown-first-p) ; <1> (= (effective-length current-analysis) 1) (find-if (lambda (code) (or (adjective-p code) (adverb-p code))) codes) (all-p #'fullform-p codes) (not (find-if #'participle-p next-codes))) (and (not unknown-first-p) ; <2> (= (length current-analysis) 1) (find-if #'superlative-p codes) (not (find-if #'noun-p next-codes))) (and (all-p #'sfx-p next-codes) ;; (or (and juncture (string/= next-chunk "aktig") (string/= next-chunk "messig")) (and (all-p #'sfx-noun-adj-prev-p next-codes) (not (find-if #'adjective-p codes))) ;; (zerop (length current-analysis)) (and (all-p #'sfx-adj-noun-prev-p next-codes) (not (find-if #'noun-p codes))) (and (all-p #'sfx-adv-noun-prev-p next-codes) (not (find-if #'noun-p codes))) (and (all-p #'sfx-noun-imp-verb-prev-p next-codes) (not (find-if #'imp-verb-p codes))) (and (all-p #'sfx-noun-noun/imp-verb-prev-p next-codes) (not (find-if (lambda (c) (or (noun-p c) (imp-verb-p c))) codes))) (and (all-p #'sfx-adj-imp-verb-prev-p next-codes) (not (find-if #'imp-verb-p codes))) ;; ** this occurs twice in C code! (and (all-p #'sfx-adj-inf-verb/noun-prev-p next-codes) (not (find-if (lambda (c) (or (inf-verb-p c) (noun-p c))) codes))))) ()) (setf removed t) (collect sub-analysis))))))) (if removed filtered-analysis analysis))) (defun filter-two-features (chunk features analysis current-analysis &optional juncture unknown-first-p last-p) #+debug (print (list :chunk chunk features analysis current-analysis)) (let* ((removed nil) (filtered-analysis (collecting (dolist (sub-analysis analysis) (let ((next-features (codes (car sub-analysis))) (next-chunk (chunk (car sub-analysis)))) (if (or (and (or unknown-first-p (> (effective-length current-analysis) 1)) (or (< (length next-chunk) 3) (find-if (lambda (chunk) (and (not (juncture-p (codes chunk))) (< (length (chunk chunk)) 3))) current-analysis)) (not (and (or (not unknown-first-p) (and unknown-first-p (not last-p))) (or (string= chunk "og") (string= chunk "en") (string= chunk "to") ;; *** added (string= chunk "ni") )))) (all-have-features-p features 'det) ;; new (one-has-features-p features 'pron) ;; new #+later (and features #+later(all-p #'fullform-p features) ; ?? ;; (not (and (one-has-features-p features 'imp) ;;(find-if #'imp-verb-p features) (one-has-features-p next-features 'subst) ;; (find-if #'noun-p next-features) )) ;; *** fix for numbers (not (or (one-has-features-p features 'kvant) ;; (find-if #'number-p features) (one-has-features-p features ')))) (and (not unknown-first-p) ; <1> (= (effective-length current-analysis) 1) (or (one-has-features-p features 'adj) (one-has-features-p features 'adv)) ;;(find-if (lambda (code) (or (adjective-p code) (adverb-p code))) features) #+later(all-p #'fullform-p features) ;;(not (find-if #'participle-p next-features)) (not (or (one-has-features-p features ' (one-has-features-p features '))))) (and (not unknown-first-p) ; <2> (= (length current-analysis) 1) (one-has-features-p features 'sup) ;; (find-if #'superlative-p features) (not (one-has-features-p next-features 'subst) ;; (find-if #'noun-p next-features) )) #+later (and (all-p #'sfx-p next-features) ;; (or (and juncture (string/= next-chunk "aktig") (string/= next-chunk "messig")) (and (all-p #'sfx-noun-adj-prev-p next-features) (not (one-has-features-p features 'adj) ;; (find-if #'adjective-p features) )) ;; (zerop (length current-analysis)) (and (all-p #'sfx-adj-noun-prev-p next-features) (not (one-has-features-p features 'subst) ;; (find-if #'noun-p features) )) (and (all-p #'sfx-adv-noun-prev-p next-features) (not (one-has-features-p features 'subst) ;; (find-if #'noun-p features) )) (and (all-p #'sfx-noun-imp-verb-prev-p next-features) (not (one-has-features-p features 'imp) ;; (find-if #'imp-verb-p features) )) (and (all-p #'sfx-noun-noun/imp-verb-prev-p next-features) (not ;;(find-if (lambda (c) (or (noun-p c) (imp-verb-p c))) features) (or (one-has-features-p features 'subst) (one-has-features-p features 'imp)))) (and (all-p #'sfx-adj-imp-verb-prev-p next-features) (not (one-has-features-p features 'imp) ;; (find-if #'imp-verb-p features) )) ;; ** this occurs twice in C code! (and (all-p #'sfx-adj-inf-verb/noun-prev-p next-features) (not (find-if (lambda (c) (or (inf-verb-p c) (noun-p c))) features))))) ()) (setf removed t) (collect sub-analysis))))))) (if removed filtered-analysis analysis))) ;(tag-compound "syttenhundretreogtredve") ;(get-features "o") #|| (print-ranked-analyses "bilmotstanderen") (print-ranked-analyses "barneskje") (print-ranked-analyses "beatlesplater") (print-ranked-analyses "beckettstykke") ||# :EOF