;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2000 - 2004. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Reimplementation in ANSI Common Lisp of the compound analyser for Norwegian (C program) ;; written by Helge Hauglin and Lars Jørgen Tvedt, UiO 1999 ;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO ;; (Dokumentasjonsprosjektet; Tekstlaboratoriet) ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;; - CLOSify (partially done) ;; - get rid of ranking-info structure, lazy eval instead ;; - rewrite the whole thing as a fsa! DONE. ;; ;;------------------------------------------------------------------------------------- (in-package "CGP") (defclass compound-analyser-lexicon () ((morph-feature-vector :initarg :morph-feature-vector :reader morph-feature-vector) (morph-feature-table :initarg :morph-feature-table :reader morph-feature-table) (fullform-net :initarg :fullform-net :reader fullform-net))) (defparameter *bm-analyser-lexicon* (let ((mfv #("Noun" "Gen" "Def" "Indef" "Sg" "Masc" "Pl" "Neut" "MF" "Adj" "Verb" "MFN" "SP" "Pos" "PastPart" "Def/Pl" "Sup" "Fem" "NoDef" "Comp" "SForm" "Pres" "Infin" "PresPart" "Past" "Prop" "Impv" "Adv" "Prep" "Interj" "Num" "Card" "Aux" "Conj" "Pron" "DetPron" "Ord" "Acc" "Nom" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Unknown"))) (make-instance 'compound-analyser-lexicon :morph-feature-vector mfv :morph-feature-table (let ((mft (dat::make-string-tree))) (loop for i from 0 for f across mfv do (setf (dat:string-tree-get mft (string-downcase f)) i))) :fullform-net (string-net::read-net "~/lisp/projects/xle/morph/bm-morph.net" :translate-p nil)))) (defvar *analyser-lexicon*) (defvar *%strings* (make-array 0 :adjustable t :fill-pointer t)) (defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t)) (defun %vector-pop (vector) (let* ((fill (fill-pointer vector))) (declare (fixnum fill)) (unless (zerop fill) (decf fill) (setf (fill-pointer vector) fill) (aref vector fill)))) (defmacro %with-string ((string) &body body) `(let ((,string (or (%vector-pop *%strings*) (make-array 0 :element-type #+mcl 'base-character #-mcl 'character :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,string) 0) (vector-push-extend ,string *%strings*)))) (defmacro %with-byte-array ((array) &body body) `(let ((,array (or (%vector-pop *%byte-arrays*) (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,array) 0) (vector-push-extend ,array *%byte-arrays*)))) (defun vowelp (c) (find c "aeiouyæøå")) (defun long-chunk-p (chunk) (let* ((first-vowel-group-pos (position-if #'vowelp chunk)) (middel-cons-group-pos (when first-vowel-group-pos (position-if-not #'vowelp chunk :start first-vowel-group-pos)))) (when middel-cons-group-pos (find-if #'vowelp chunk :start middel-cons-group-pos)))) (defun codes (chunk) (cddr chunk)) (defun chunk (chunk) (cadr chunk)) (defun chunk-and-codes (chunk) (cdr chunk)) ;; sjekk_ledd (defun check-chunk (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and (all-p #'adjective-p codes) (not (zerop (length analysis))) (all-p #'adverb-p (codes (aref analysis 0)))) nil) ((and last-p (or (all-p #'adverb-p codes) (all-p #'code-98x-p codes))) nil) ((and (not last-p) (not (find-if #'previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) (defmethod %has-feature-p ((lexicon compound-analyser-lexicon) feature-vector feature) (with-slots (morph-feature-table) lexicon (let ((code (dat:string-tree-get morph-feature-table (string-downcase feature)))) (when code (= 1 (sbit feature-vector (the fixnum code))))))) (defmethod %has-features-p ((lexicon compound-analyser-lexicon) feature-vector features) (loop for feature in features always (%has-feature-p lexicon feature-vector feature))) (defmethod %all-have-features-p ((lexicon compound-analyser-lexicon) codes &rest features) (not (find-if-not (lambda (code) (%has-features-p lexicon code features)) codes))) (defmethod %one-has-features-p ((lexicon compound-analyser-lexicon) codes &rest features) (find-if (lambda (code) (%has-features-p lexicon code features)) codes)) #+obsolete (defun check-chunk-features (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and ;;(all-p #'adjective-p codes) (all-have-features-p codes 'adj) (not (zerop (length analysis))) ;;(all-p #'adverb-p (codes (aref analysis 0))) (all-have-features-p (codes (aref analysis 0)) 'adv)) nil) ((and last-p (or (all-have-features-p codes 'adv) ;; (all-p #'adverb-p codes) #+later(all-p #'code-98x-p codes) )) nil) ((and (not last-p) (not (find-if #'f-previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) (defun e-juncture-p (codes) (when (find-if (lambda (code) (and (listp code) (find :e-juncture code))) codes) t)) (defun s-juncture-p (codes) (when (find-if (lambda (code) (and (listp code) (find :s-juncture code))) codes) t)) (defun juncture-p (codes) (or (e-juncture-p codes) (s-juncture-p codes))) #+obsolete (defun f-previous-chunk-p (code) (or (has-feature-p code 'subst) ;; ?? (real-noun-p code) (has-feature-p code 'verb) ;; (verb-p code) (has-feature-p code 'adj) ;; (adjective-p code) (has-feature-p code 'adv) ;; (adverb-p code) (has-feature-p code ') ;; (participle-p code) (has-feature-p code ') #+later(code-1000-plus-og-p code) ;?? (has-feature-p code 'kvant) ;; (number-p code) ;; *** ADDED! (has-feature-p code ') #+later(affix-p code))) (defun f-lemma-p (word compressed-lemma bv) (declare (ignore word)) (and (string= compressed-lemma "") (cond ((has-feature-p bv 'verb) (has-feature-p bv 'inf)) ((has-feature-p bv 'subst) (has-feature-p bv 'ent)) ((has-feature-p bv 'adj) (and (has-feature-p bv 'sg) (has-feature-p bv 'pos))) (t t)))) (defun filter-analysis (analysis) (declare (ignore analysis)) t) (defun all-p (predicate codes) (not (find-if-not predicate codes))) #|| ; 570 ms (time (dotimes (i 10) (tag-compound "kjempeklatrestevneorganisasjonskomite"))) (let ((*analyser-version* :regexp)) (print-ranked-analyses "sildesalgslag")) (print-ranked-analyses "kjempe-klatresko") (analyse-compound "Klatresko") (analyse-compound "ibsenstykke") (setf *tagger* *nbo-tagger*) (let ((*tagger* *nbo-tagger*)) (print (tag-compound "futureskontrakter"))) (let ((*tagger* *nbo-tagger*)) (print (tag-compound "vest-afrikansk"))) (print-ranked-analyses "fattignorge") (print-ranked-analyses "fattigNorge") (print-ranked-analyses "billeoppslag") (print-ranked-analyses "enebærbuskspill") (print-ranked-analyses "vareheissjakt") (print-ranked-analyses "ekskjæreste") (print-ranked-analyses "uamerikansk") (print-ranked-analyses "diskonterbar") (print-ranked-analyses "gammelmannsaktig") (print-ranked-analyses "policymessig") (print-ranked-analyses "aldershjemsenhet") (let ((*tagger* *nbo-tagger*)) (print-ranked-analyses "bananmodneri")) (let ((*tagger* *nbo-tagger*)) (print-ranked-analyses "fyxynografi")) (tag-compound "fyxynografi") (string-values *fullforms* "grafi") (setf *tagger* *nbo-tagger*) ||# (defclass compound-analyser () ((current-analysis :initform (make-array 0 :fill-pointer t :adjustable t) :accessor current-analysis) (ranking :initform (make-array 0 :fill-pointer t :adjustable t) :accessor ranking) (compound-chunks :initform (make-array 0 :fill-pointer t :adjustable t) :accessor compound-chunks) (analysis-tree :initform nil :accessor analysis-tree))) (defparameter *compound-analyser-pool* ()) (defun get-compound-analyser () (let ((analyser (or (pop *compound-analyser-pool*) (make-instance 'compound-analyser)))) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (setf (fill-pointer current-analysis) 0 (fill-pointer ranking) 0 (fill-pointer compound-chunks) 0 analysis-tree nil)) analyser)) (defmacro with-compound-analyser ((analyser) &body body) `(let ((,analyser (get-compound-analyser))) ; thread safe! (multiple-value-prog1 (progn ,@body) (push ,analyser *compound-analyser-pool*)))) (defun analyse-compound (string) (with-compound-analyser (analyser) (analysis-tree (%analyse-compound analyser string)))) ;; TO DO: analyse from last space if string is multi-word. (defmethod %analyse-compound ((analyser compound-analyser) string) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (dotimes (i (1+ (length string))) (vector-push-extend nil compound-chunks)) ; does not cons unnecessarily (let ((hyphen-pos (position #\- string))) (setf analysis-tree (or (%sub-analyse-compound-regexp analyser string (if hyphen-pos (1+ hyphen-pos) 0)) (and (upper-case-p (char string 0)) (%sub-analyse-compound-regexp analyser (string-downcase string) (if hyphen-pos (1+ hyphen-pos) 0))) ;; if there is no valid analysis with known chunks collect all analyses ;; with unknown prefix containing at least one vowel (let ((vowel-pos (position-if #'vowelp string))) (when vowel-pos (collecting (loop for start from (1+ vowel-pos) to (- (length string) 2) with count = -1 and key = 0 and analysis do (multiple-value-bind (analysis cnt) (%sub-analyse-compound-regexp analyser string start :unknown-first-p t :count count) (when analysis (collect (car analysis)) (if cnt (setf count cnt) (incf count (caaar analysis))))))))))))) analyser) (defvar *net* nil) ;; cp. morph-server-sockets.lisp (defmethod byte-array-to-bit-vector ((lexicon compound-analyser-lexicon) array) (declare (optimize (speed 3) (safety 0))) (let ((feature-bv (make-array (length (morph-feature-vector lexicon)) :element-type 'bit :initial-element 0))) (loop for byte across array for i fixnum from 0 do (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit feature-bv (the fixnum (+ (the fixnum (* i 8)) j))) 1)))) feature-bv)) (defmethod byte-array-to-bit-vector ((lexicon t) array) (array-to-bit-vector array)) (defmacro do-chunk-readings ((lexicon node lemma feature-bv) &body body) (with-gensyms (%string %array code) `(%with-string (,%string) (%with-byte-array (,%array) (string-net::nmap-string+array (fullform-net ,lexicon) (lambda (,lemma ,code) (declare (ignorable ,lemma ,code)) (let ((,feature-bv (byte-array-to-bit-vector ,lexicon ,code))) ,@body)) ,%string ,%array ,node #\:))))) ;;;; New regexp pruning of chunking analyses (fsa::def-string-test-function pre-e-juncture (chunk) (let* ((vowel-pos (position-if (lambda (c) (vowelp c)) chunk)) (cons-pos (when vowel-pos (position-if-not (lambda (c) (vowelp c)) chunk :start vowel-pos))) (next-vowel-pos (when cons-pos (position-if (lambda (c) (vowelp c)) chunk :start cons-pos)))) (and cons-pos (not next-vowel-pos)))) (fsa::def-string-test-function pre-s-juncture (chunk) ;; chunk is not first chunk ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2))))) (fsa::def-string-test-function immediate-pre-s-juncture (chunk) ;; the last consonant group does not contain #\s ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2)) (let ((last-s-pos (position #\s chunk :from-end t))) (or (not last-s-pos) (position-if (lambda (c) (vowelp c)) chunk :start last-s-pos)))))) #+test (print pre-e-juncture) #+old ;; cgp (defparameter *compound-regexp* (make-instance 'feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or ukjent (:and subst ub ent) (:and verb inf) (:and det kvant) (:seq (:and subst ub ent ,immediate-pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture)) ;; inner chunks (:* (:or (:and subst ub ent) (:and verb inf) (:seq (:and subst ub ent ,pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture))) ;; last chunk subst) ;; number (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" adv) (:and det kvant))))))) #+test (print-ranked-analyses-regexp "sommerhatt") (defparameter *compound-regexp* (make-instance 'feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or ukjent (:and Noun Indef Sg) (:and Verb Infin) #+not-yet(:and det kvant) (:seq (:and Noun Indef Sg ,immediate-pre-s-juncture) :s-juncture) (:seq (:and Noun Indef Sg ,pre-e-juncture) :e-juncture)) ;; inner chunks (:* (:or (:and Noun Indef Sg);;(:and subst ub ent) (:and Verb Infin) (:seq (:and Noun Indef Sg ,pre-s-juncture) :s-juncture) (:seq (:and Noun Indef Sg ,pre-e-juncture) :e-juncture))) ;; last chunk Noun) ;; number ;; Problem: det kvant has no counterpart in norgram #+not-yet (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" adv) (:and det kvant))))))) (defun cp-boolean-list-delta-get (features-list state delta) (let ((relation (fsa::relation-get state delta)) (result ())) (when relation (let* ((features features-list) (states (fsa::relation-subsumed-get features relation))) ;; STATE ist a list of state id + new features list (cp. fsa::KEY-SUBSUME) (dolist (state states) (pushnew (append state (list features)) result :test #'equal))) result))) #+test (setf *tagger* *nbo-tagger*) #+test (print-ranked-analyses-regexp "fiskesluse") #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "beatlesplate")) #+test (setf *analyser-lexicon* *bm-analyser-lexicon*) #+test (print-ranked-analyses-regexp "sommerhatt") (defmethod %sub-analyse-compound-regexp ((analyser compound-analyser) string start &key (count -1) net &allow-other-keys) (with-slots (current-analysis ranking compound-chunks) analyser (let* ((word-end-marker #\:) (length (length string)) (memory (make-array (1+ length) :initial-element :not-memoized)) (lexicon (fullform-net *analyser-lexicon*)) ;; (or net *net* (lexicon *tagger*))) (lex-net (list-net lexicon))) (labels ((analyse (start) (let ((memoized-analysis (aref memory start))) (if (eq memoized-analysis :not-memoized) (setf (aref memory start) (collecting (labels ((walk (pos lex-n) (let* ((lex-sub-node (when (< pos length) (find (string-net::restore-char (char string pos)) lex-n :key #'car))) (lex-end-marker-node (find word-end-marker lex-n :key #'car))) (when lex-sub-node (walk (1+ pos) (cdr lex-sub-node))) (when lex-end-marker-node ;; then, extract codes using the remaining sub-network (let ((lex-node (find word-end-marker lex-n :key #'car)) (features ()) (chunk (subseq string start pos))) #+debug(print (list :start start :pos pos :chunk chunk)) (when (or (> (- pos start) 2) (and (= (- pos start) 2) (find chunk '("en" "to" "ni" "år" "gå" "øl") :test #'string-equal))) (do-chunk-readings (*analyser-lexicon* lex-node lemma bv) #+debug(print (list (decompress-string lemma chunk) bv)) (when t #+disabled(find-if (lambda (f) (has-feature-p bv f)) (if (= pos length) '(subst adj adv kvant) '(subst adv verb kvant))) (push (cons (decompress-string lemma chunk) bv) features)) #+debug(let ((*package* (find-package :cgp))) (print (list* chunk (decompress-string lemma chunk) (bv-features *analyser-lexicon* bv))))) (when features ;; no juncture (let ((chunk+features (cons chunk features))) (vector-push-extend (cons nil chunk+features) current-analysis) (let ((analysis (analyse pos))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (* (if analysis (reduce #'+ analysis :key #'caar) 1) (length features)) chunk+features) analysis))) (decf (fill-pointer current-analysis)))) ;; juncture (unless (= pos length) (let ((char (char string pos))) (when (find char "es") (let ((chunk+features (cons chunk features)) (juncture (if (char= char #\e) (list "e" (list "e" :e-juncture)) (list "s" (list "s" :s-juncture))))) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (let ((analysis (analyse (1+ pos)))) (decf (fill-pointer current-analysis) 2) (when analysis (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let* ((sub-branch-count (reduce #'+ analysis :key #'caar)) (branch-count (* (length features) sub-branch-count))) (collect (list (cons branch-count chunk+features) (cons (cons sub-branch-count juncture) analysis))))))))))))))))) (walk start lex-net)))) (values memoized-analysis t))))) (let ((analyses (if (zerop start) (analyse start) (progn (vector-push-extend (list nil (subseq string 0 start)) current-analysis) (let* ((sub-analysis (analyse start)) (branch-count (count-analyses sub-analysis))) (prog1 (when sub-analysis (list (cons (list branch-count (subseq string 0 start) #+old(list (subseq string 0 start) :unknown) (cons (subseq string 0 start) (bv-encode-features *analyser-lexicon* 'Unknown))) sub-analysis))) (decf (fill-pointer current-analysis)))))))) (when analyses #+debug(print (list :analyses analyses)) (let ((pruned-analyses (prune-analyses *compound-regexp* analyses :key (1+ count)))) #+debug(print (list :unknown (subseq string 0 start) :count (1+ count) :pa pruned-analyses)) (labels ((walk-memoized (analysis) ; change name! (dolist (sub-analysis analysis) (destructuring-bind (nkey chunk . readings) (car sub-analysis) (dolist (reading readings) ;;(print (list :pushing nkey chunk (if (consp reading) (cdr reading) (list reading)))) (vector-push-extend (list nkey chunk (if (consp reading) (cdr reading) (list reading))) current-analysis) (if (cdr sub-analysis) (walk-memoized (cdr sub-analysis)) ;; COUNT is a hash key (progn (when (find (incf count) pruned-analyses) #+debug(print :valid) (vector-push-extend (compute-f-ranking-info *analyser-lexicon* count analyser) ranking)) #+debug (print current-analysis))) (decf (fill-pointer current-analysis))))))) (walk-memoized analyses) #+debug(print ranking) #+debug(print analyses) (values (when pruned-analyses analyses) count))))))))) (defun print-ranked-analyses-regexp (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-codes-regexp (ranking-info-position (aref ranking i)) analysis-tree t))))))) #+test (print-ranked-analyses-regexp "barnemat") #+test (print-ranked-analyses-regexp "fiskesluse") (defmethod bv-features ((lexicon compound-analyser-lexicon) bv &key symbolp) (with-slots (morph-feature-vector) lexicon (collecting (loop for pos from 0 for bit across bv do (when (= bit 1) (collect (if symbolp (intern (string-upcase (svref morph-feature-vector pos)) :cgp) (svref morph-feature-vector pos)))))))) ;; still missing: unknown-features (defmethod bv-encode-features ((lexicon compound-analyser-lexicon) &rest features) (with-slots (morph-feature-vector) lexicon (let ((feature-bv (make-array (length morph-feature-vector) :element-type 'bit :initial-element 0)) (unknown-features ())) (loop for f across morph-feature-vector for code from 0 when (find f features :test #'equal) do (setf (sbit feature-bv code) 1)) feature-bv))) #+test (print-ranked-analyses-regexp "sommerhatt") (defmethod prune-analyses ((fr feature-regexp) analyses &key (key 0) &allow-other-keys) (collecting (with-slots (dfa) fr (let ((delta (fsa:fsa-delta dfa)) (start-state (fsa:fsa-start-state dfa))) (labels ((walk (state analyses readings key chunking) #+debug(print chunking) ;; CHUNKING is for debugging only (if readings (let* ((reading (car readings)) (analysis (car analyses)) (chunk (cadar analysis)) (sub-count (/ (caar analysis) (length (cddar analysis)))) (features (if (consp (cdr reading)) reading (cons (car reading) (bv-features *analyser-lexicon* (cdr reading) :symbolp t)))) (new-states (cp-boolean-list-delta-get features state delta))) #+debug(print (list :features features :reading reading)) (dolist (new-state+morph new-states) (destructuring-bind (new-state new-features . morph) new-state+morph (declare (ignore morph)) (cond ((cdr analysis) (destructuring-bind (nkey ch . readings) (caadr analysis) (declare (ignore ch)) (walk new-state (cdr analysis) readings key (cons (cons sub-count chunk) chunking)))) ((fsa:set-member-p new-state (fsa:fsa-final-states dfa)) #+debug(print (cons key (reverse (cons (cons sub-count chunk) chunking)))) (collect key)) (t nil)))) (cond ((cdr readings) (walk state analyses (cdr readings) (+ sub-count key) chunking)) ((cdr analyses) (walk state (cdr analyses) nil (+ sub-count key) chunking)) (t nil))) (destructuring-bind (nkey chunk . readings) (caar analyses) (walk state analyses readings key chunking))))) (walk start-state analyses nil key ())))))) (defun nth-analysis-regexp (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (cons chunk (nth-analysis-regexp n (cdar analysis))))) (nth-analysis-regexp (- n branch-count) (cdr analysis)))))) (defun nth-analysis-codes-regexp (n analysis &optional features-as-list-p) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" ;;(print (list :n n analysis)) (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (let ((reading (nth f-number f-codes))) (cons (cons chunk (if features-as-list-p (if (consp (cdr reading)) reading (cons (car reading) (bv-features *analyser-lexicon* (cdr reading)))) reading)) (nth-analysis-codes-regexp n (cdar analysis) features-as-list-p))))) (nth-analysis-codes-regexp (- n branch-count) (cdr analysis) features-as-list-p))))) #+test (print (nth-analysis-codes-regexp 5 (analyse-compound "barnevakt") t)) ;; LEKS_S_VERB_FINAL (defmethod final-lexical-s-verb-p ((lexicon compound-analyser-lexicon) analysis) "checks if last chunk is a verb starting with #\s, without preceding juncture" (let* ((length (length analysis)) (last-chunk (aref analysis (1- length))) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and (char= (char (chunk last-chunk) 0) #\s) (%all-have-features-p lexicon (codes last-chunk) 'verb) (not (and next-to-last-chunk (juncture-p (codes next-to-last-chunk))))))) ;; LEKS_VERB_INITIAL (defmethod initial-lexical-verb-p ((lexicon compound-analyser-lexicon) analysis) (and (%all-have-features-p lexicon (codes (aref analysis 0)) 'verb) (or (= (length analysis) 1) (not (juncture-p (codes (aref analysis 1))))))) (defun final-s-juncture-p (analysis) (let* ((length (length analysis)) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and next-to-last-chunk (s-juncture-p (codes next-to-last-chunk))))) (defun initial-e-juncture-p (analysis) ;;(print :e-j) (and (> (length analysis) 2) (e-juncture-p (codes (aref analysis 1))))) (defun initial-s-juncture-p (analysis) (and (> (length analysis) 2) (s-juncture-p (codes (aref analysis 1))))) ;; *** get rid of it (defstruct ranking-info position effective-length juncture-count unknown-first-chunk-p final-lexical-s-verb-p initial-lexical-verb-p final-s-juncture-p initial-e-juncture-p initial-s-juncture-p possibly-noun-p last-chunk-length first-chunk-compound-p long-first-chunk-p) (defmethod compute-f-ranking-info ((lexicon compound-analyser-lexicon) position analyser) (with-slots (current-analysis compound-chunks) analyser (let ((effective-length (effective-length current-analysis))) (make-ranking-info ;; position in the analysis tree :position position ;; length not counting junctures :effective-length effective-length ;; number of junctures :juncture-count (- (length current-analysis) effective-length) ;; unknown first chunk? :unknown-first-chunk-p (null (codes (aref current-analysis 0))) ;; final lexical s-verb? :final-lexical-s-verb-p (final-lexical-s-verb-p lexicon current-analysis) ;; initial lexical verb? :initial-lexical-verb-p (initial-lexical-verb-p lexicon current-analysis) ;; final s-juncture? :final-s-juncture-p (final-s-juncture-p current-analysis) ;; initial e-juncture? :initial-e-juncture-p (initial-e-juncture-p current-analysis) ;; initial e-juncture? :initial-s-juncture-p (initial-s-juncture-p current-analysis) ;; last chunk might be noun? :possibly-noun-p ;;(find-if #'noun-p (codes (aref current-analysis (1- (length current-analysis))))) (find-if (lambda (code) (%has-feature-p lexicon code 'Noun)) (codes (aref current-analysis (1- (length current-analysis))))) ;; length of last chunk :last-chunk-length (length (chunk (aref current-analysis (1- (length current-analysis))))) ;; first chunk has elsewhere been analysed as compound? :first-chunk-compound-p (aref compound-chunks (length (chunk (aref current-analysis 0)))) ;;:long-first-chunk-p (long-chunk-p (chunk (aref current-analysis 0))) )))) (defun regexp-compare-juncture-s-word (analyser n1 n2) (with-slots (analysis-tree compound-chunks) analyser (labels ((next (a n) (when a (let ((branch-count (caaar a))) (if (< n branch-count) (multiple-value-bind (f-number n) (floor n (if (cdar a) (reduce #'+ (cdar a) :key #'caar) 1)) (values (car a) f-number n)) (next (cdr a) (- n branch-count)))))) (compare (n1 n2 f-n1 f-n2 pos1 pos2 a1 a2 sj1 sj2) (let* ((features1 (nth f-n1 (cddar a1))) (features2 (nth f-n2 (cddar a2))) (chunk1 (car features1)) (chunk2 (car features2)) (codes1 (cdr features1)) (codes2 (cdr features2))) #+debug(print (list :f1 features1 :f2 features2)) (cond ((and (= pos1 pos2) (s-juncture-p codes1) (char= (char chunk2 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a1) n1) ;; advance a1 (compare n n2 f-n f-n2 (1+ pos1) pos2 a a2 t nil))) ((and (= pos1 pos2) (s-juncture-p codes2) (char= (char chunk1 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a2) n2) ;; advance a2 (compare n1 n f-n1 f-n pos1 (1+ pos2) a1 a nil t))) ((and sj1 ;; condition we wanted to find (string= chunk1 chunk2 :start2 1) (compound-p compound-chunks chunk1 pos1)) t) ((and sj2 ;; condition we wanted to find (string= chunk1 chunk2 :start1 1) (compound-p compound-chunks chunk2 pos2)) nil) ((= pos1 pos2) ;; that's the only case in which we may have reached the end of the string (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) (if (and a1 a2) (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) (+ pos2 (length chunk2)) a1 a2 nil nil) :incomparable)))) ((< pos1 pos2) (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance a1 (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) pos2 a1 a2 nil nil))) ((> pos1 pos2) (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) ; advance a2 (compare n1 n2 f-n1 f-n2 pos1 (+ pos2 (length chunk2)) a1 a2 nil nil))) (t (error "a condition left: ~a" (list n1 n2 pos1 pos2 a1 a2 sj1 sj2))))))) (multiple-value-bind (a1 f-n1 n1) (next analysis-tree n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next analysis-tree n2) (compare n1 n2 f-n1 f-n2 0 0 a1 a2 nil nil)))))) (defun compound-p (compound-chunks chunk chunk-start) (or (> chunk-start 0) ; strange condition! (aref compound-chunks (+ chunk-start (length chunk))))) (defun rank-analyses (analyser) (with-slots (ranking) analyser #+debug(print ranking) (sort ranking (lambda (ri1 ri2) (block rank (cond ((< (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank t)) ((> (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank nil))) ;; equal effective length ;; check if diff in juncture count is > 1 (let ((jc1 (ranking-info-juncture-count ri1)) (jc2 (ranking-info-juncture-count ri2))) (cond ((< jc1 (1- jc2)) (return-from rank t)) ((> jc1 (1+ jc2)) (return-from rank nil)) ;; diff in juncture count is 1 ((/= jc1 jc2) (let ((flsv1 (ranking-info-final-lexical-s-verb-p ri1)) (flsv2 (ranking-info-final-lexical-s-verb-p ri2))) (when (and (not flsv1) flsv2 ; <1> (ranking-info-final-s-juncture-p ri1)) (return-from rank t)) (when (and flsv1 (not flsv2) (ranking-info-final-s-juncture-p ri2)) (return-from rank nil)) (if (not (ranking-info-unknown-first-chunk-p ri1)) ; <2> (let ((comp (if (eq *analyser-version* :regexp) (regexp-compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2)) (compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2))))) (if (not (eq comp :incomparable)) (return-from rank comp) (let ((ilv1 (ranking-info-initial-lexical-verb-p ri1)) (ilv2 (ranking-info-initial-lexical-verb-p ri2))) #+debug(print (list ilv1 ri1 ilv2 ri2)) (cond ((and (not ilv1) ilv2 (ranking-info-initial-e-juncture-p ri1)) (return-from rank t)) ((and ilv1 (not ilv2) (ranking-info-initial-e-juncture-p ri1)) (return-from rank nil)))))) (let ((long-chunk1-p (and (ranking-info-long-first-chunk-p ri1) (ranking-info-initial-s-juncture-p ri1))) (long-chunk2-p (and (ranking-info-long-first-chunk-p ri2) (ranking-info-initial-s-juncture-p ri2)))) (cond ((and long-chunk1-p (not long-chunk2-p)) (return-from rank t)) ((and (not long-chunk1-p) long-chunk2-p) (return-from rank nil))))) (return-from rank (< jc1 jc2)))) ;; no difference in number of junctures ((and (ranking-info-possibly-noun-p ri1) (not (ranking-info-possibly-noun-p ri2))) (return-from rank t)) ((and (not (ranking-info-possibly-noun-p ri1)) (ranking-info-possibly-noun-p ri2)) (return-from rank nil)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (ranking-info-first-chunk-compound-p ri1) (not (ranking-info-first-chunk-compound-p ri2))) (return-from rank t)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (not (ranking-info-first-chunk-compound-p ri1)) (ranking-info-first-chunk-compound-p ri2)) (return-from rank nil)) (t (return-from rank (> (ranking-info-last-chunk-length ri1) (ranking-info-last-chunk-length ri2)))))) nil))))) ;(print-ranked-analyses "sanddekka") (defun print-ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis (ranking-info-position (aref ranking i)) analysis-tree))))))) (defun most-probable-compound-last-chunk (word &optional nil-if-fullform-p) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) ;; *** make this nicer! (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking 0)) analysis-tree))))) (defun most-probable-compound (word &optional nil-if-fullform-p (divider "+")) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) (let ((prefix "") (features ())) (loop for (chunk . rest) on (nth-analysis-codes (ranking-info-position (aref ranking 0)) analysis-tree nil) do (cond (rest (setf prefix (concat prefix (car chunk) (if (char= (last-char (car chunk)) #\-) "" (or divider ""))))) (t (dolist (suffix-analysis (cddr chunk)) (push (cons (concat prefix (car suffix-analysis)) (cdr suffix-analysis)) features)) (setf prefix (concat prefix (car chunk)))))) (values prefix features)))))) #+test (print (most-probable-compound "abonnementsbibliotek" t)) (defun ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking i)) analysis-tree))))))) (defun count-analyses (analysis) (reduce #'+ analysis :key #'caar)) #+test (count-analyses (analyse-compound "sildesalgslag")) (defun nth-analysis (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (cons (chunk (caar analysis)) (nth-analysis n (cdar analysis))) (nth-analysis (- n branch-count) (cdr analysis)))))) (defun nth-analysis-last-chunk-and-codes (n analysis) "Same algorithm as in NTH-ANALYSIS, but returns the last chunk and all of its codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (or (nth-analysis-last-chunk-and-codes n (cdar analysis)) (chunk-and-codes (caar analysis))) (nth-analysis-last-chunk-and-codes (- n branch-count) (cdr analysis)))))) (defun nth-analysis-codes (n analysis &optional (features-as-list-p t)) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (chunk-and-codes (caar analysis)) (list* (list* chunk f-codes (remove-duplicates (collecting (dolist (f-code f-codes) (collect-append (if (stringp f-code) (grammar-code-to-features chunk f-code :features-as-list-p features-as-list-p) (list (list chunk f-code)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))) (nth-analysis-codes n (cdar analysis) features-as-list-p))) (nth-analysis-codes (- n branch-count) (cdr analysis) features-as-list-p))))) #+test (print (nth-analysis-codes 1 (analyse-compound "fiskerinæring"))) #+test (print (nth-analysis 1 (analyse-compound "sildesalgslag"))) #+test (print-all-analyses (analyse-compound "kosehøne")) ;; test function; not needed (defun print-all-analyses (analysis) (let ((n (count-analyses analysis))) (dotimes (i n) (print (nth-analysis i analysis))))) (defun effective-length (analysis) (let ((i 0)) (loop for chunk across analysis unless (juncture-p (codes chunk)) do (incf i)) i)) (defun previous-chunk (analysis) (let ((length (length analysis))) (if (juncture-p (codes (aref analysis (- length 2)))) (aref analysis (- length 3)) (aref analysis (- length 2))))) ;(analyse-compound "seksogfemti") ;(analyse-compound "tusenniogtretti") ;(analyse-compound "Steiro") ;(analyse-compound "beckettstykke") ;(print-ranked-analyses "tusenniogtretti") ;(analyse-compound "fattig-Norge") ;(%analyse-compound "Norge" 0) ;(analyse-compound "fem-seks") ;(tag-compound "debilografienes") ;(analyse-compound "debilografienes") ;(get-features "Norge") ;(string-values (fullforms *tagger*) "seks") ;; CURRENT-ANALYSIS is a vector containing the current partial compound analysis :eof