;;; -*- Mode: LISP; Package: MORPH-SERVER; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*- ;; Tables for CG -> Norgram mapping, (in-package :morph-server) (defparameter *lemma-features-mapping-table* '( ;; hun, ho. [pronomen, normal: nom | akk] (("pron pers 3 ent fem hum") ("+Pron" "+Fem")) ;; den, denne, noen, enhver, ingen. [pronomen, enkel] (("pron pers 3 ent mask fem") (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg")) ("noen" "ingen") (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg") ("+DetPron" "+MFN" "+Pl"))) ;; han, 'n. [pronomen, normal: nom | akk] (("pron pers 3 ent mask hum") ("+Pron" "+Masc")) ;; jeg. [pronomen, normal: nom | akk] (("pron pers 1 ent hum") ("+Pron" "+MFN")) ;; vi. [pronomen, normal: nom | akk] (("pron pers 1 fl hum") ("+Pron" "+MFN")) ;; du. [pronomen, normal: nom | akk] (("pron pers 2 ent hum") ("+Pron" "+MFN")) ;; dere. [pronomen, normal: nom | akk] (("pron pers 2 fl hum") ("+Pron" "+MFN")) ;; inga. [pronomen, enkel] (("pron pers 3 ent fem") ("+DetPron" "+Fem" "+Sg")) ;; det, dette, noe, intet, alt, slikt, sånt, somt. [pronomen, enkel] (("pron pers 3 ent nøyt") (("+Pron" "+Neut" "+Nom") ("+Pron" "+Neut" "+Acc")) (("dette" "denne") ("noe" "noen") ("intet" "ingen") ("alt" "all") ("slikt" "slik") ("sånt" "sånn")) ("+DetPron" "+Neut" "+Sg")) ;; I. [pronomen, normal: nom | akk] (("pron pers 3 fl høflig") ("+Pron" "+MFN")) ;; man. [pronomen, enkel] (("pron pers ent hum nom") ("+Pron" "+MFN" "+Nom")) ;; forrige, neste, samme. [determinativ, enkel] (("det dem be ") (("+Adj" "+Pos" "+MF" "+Indef" "+Sg") ("+Adj" "+Pos" "+Neut" "+Indef" "+Sg") ("+Adj" "+Pos" "+MFN" "+Def/Pl"))) ;; denne, den. [determinativ, enkel] (("det dem fem ent") ("+DetPron" "+Fem" "+Sg")) ;; denne, den. [determinativ, enkel] (("det dem mask ent") ("+DetPron" "+Masc" "+Sg")) ;; dette, det. [determinativ, enkel] (("det dem nøyt ent") ("+DetPron" "+Neut" "+Sg")) ;; eneste. [determinativ, enkel] (("det kvant be ") (("+Adj" "+Pos" "+MF" "+Indef" "+Sg") ("+Adj" "+Pos" "+Neut" "+Indef" "+Sg") ("+Adj" "+Pos" "+MFN" "+Def/Pl"))) ;; ene. [determinativ, enkel] (("det kvant be ent") (("+Adj" "+Pos" "+MF" "+Indef" "+Sg") ("+Adj" "+Pos" "+Neut" "+Indef" "+Sg") ("+Adj" "+Pos" "+MFN" "+Def/Pl"))) ;; de, disse, noen, alle, begge, ingen, samtlige, somme. [pronomen, enkel] ;; de [pronomen, normal: nom | akk] (("pron pers 3 fl") () (("de" "den" :no-sub) ("disse" "denne") ("alle" "all")) ("+DetPron" "+MFN" "+Pl") ("de") ("+Pron" "+MFN" "+Nom") ("noen" "ingen") (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg") ("+DetPron" "+MFN" "+Pl")) ("begge" "samtlige") (("+Adj" "+Pos" "+MF" "+Indef" "+Sg") ("+Adj" "+Pos" "+Neut" "+Indef" "+Sg") ("+Adj" "+Pos" "+MFN" "+Def/Pl")) ("somme") (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+MFN" "+Pl") ("+Adj" "+Pos" "+MF" "+Indef" "+Sg") ("+Adj" "+Pos" "+Neut" "+Indef" "+Sg") ("+Adj" "+Pos" "+MFN" "+Def/Pl"))) ;; ei, en. [pronomen, enkel] (("pron pers ent hum") () ("en") ("+Det" "+Art" "+Masc") (("ei" "en")) ("+Det" "+Art" "+Fem")) ;; sic! ei: en+Det+Art+Masc ;; hinannen, hverandre. [pronomen, enkel] (("pron res fl hum") (("+DetPron" "+MFN" "+Pl") ("+DetPron" "+Masc" "+Sg"))) ;; hvis, hvems. [pronomen, enkel] (("pron sp poss hum") () (("hvis" "hvem") ("hvems" "hvem")) (("+DetPron" "+Masc" "+Sg" "+Gen") ("+DetPron" "+Fem" "+Sg" "+Gen") ("+DetPron" "+MFN" "+Pl" "+Gen"))) ;; bartolomeusmess (92) [substantiv, ubøy] (("subst fem appell ubøy") ("+Noun" "+Fem" "+Indef" "+Sg")) ;; Somewhat inhomogeneous ;; besserwissen (256) [substantiv, ubøy] (("subst mask appell ubøy") ("+Noun" "+Masc" "+Indef" "+Sg")) ;; bengali (320) [substantiv, ubøy] (("subst nøyt appell ubøy") ("+Noun" "+Neut" "+Indef" "+Sg")) ;; annen. [determinativ, : mask ub ent | fem ub ent | nøyt ub ent | be ent | fl] (("det dem ") ("+DetPron")) ;; disse. [determinativ, enkel] (("det dem fl") ("+DetPron" "+MFN" "+Pl")) ;; egen. [determinativ, : mask ub ent | fem ub ent | nøyt ub ent | be ent | fl] (("det forst ") ("+Adj" "+Pos")) ;; sic! two analyses in Norgram ;; egen. [determinativ, enkel] (("det forst ") ("+Adj" "+Pos" "+MF" "+Indef" "+Sg")) ;; ? ;; eders, Deres. [determinativ: mask ent | fem ent | nøyt ent | fl] (("det poss høflig") (("+Pron" "+Masc" "+Sg") ("+Pron" "+Fem" "+Sg") ("+Pron" "+Neut" "+Sg") ("+Pron" "+MFN" "+Pl"))) ;; hverandres. [determinativ: mask ent | fem ent | nøyt ent | fl] (("det poss res") () (("hverandres" "hverandres" :no-sub) ("+DetPron" "+MFN" "+Pl" "+Gen"))) ;; hvem, hvo. [pronomen, enkel] (("pron sp hum") (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg") ("+DetPron" "+MFN" "+Pl"))) ;; bero (500) [substantiv, ubøy] (("subst appell ubøy") ("+Noun")) ;; Inhomogeneous. Solution: look features up in Norgram? ;; beskaffenhet (21032) [substantiv: ent ub | ent be | fl ub | fl be] (("subst fem appell") ("+Noun" "+Fem")) ;; Aagot (326) [substantiv, ubøy] (("subst fem prop") ("+Prop" "+Fem" "+Indef" "+Sg")) ;; beskaffenhet (57300) [substantiv: ent ub | ent be | fl ub | fl be] (("subst mask appell") ("+Noun" "+Masc") ("i") (:excl)) ;; Ola (656) [substantiv, ubøy] (("subst mask prop") ("+Prop" "+Masc" "+Indef" "+Sg")) ;; beslag (25058) [substantiv: ent ub | ent be | fl ub | fl be] (("subst nøyt appell") ("+Noun" "+Neut")) ;; ABC-teatret (127) [substantiv, ubøy] (("subst nøyt prop") ("+Prop" "+Neut")) ;; Jesu, Pauli, Kristi. [substantiv, ubøy] (("subst prop gen") (("+Prop" "+Indef" "+Sg" "+Gen")) (("Jesu" "Jesus") ("Pauli" "Paulus") ("Kristi" "Kristus")) ("+Prop" "+Masc" "+Indef" "+Sg" "+Gen")) ;; for, og, men, eller, så. [konjunksjon] (("CLB konj") ("+Conj") ("og" "eller") ("+Conj" "+Coord")) ;; fort, langt, lenge, nær, nødig, ofte, svint, titt. [adjektiv, adverbtype: pos | komp | sup] (("adj ") ("+Adv") (("nær" "nær" :no-sub)) (("+Prep") ("+Adv"))) ;; attende [adjektiv: pos m/f ub ent | pos fl | pos be ent | pos nøyt ub ent | komp | sup ub | sup be] (("adj " :no-sub) ("+Num" "+Ord" "+MFN" "+NoDef" "+SP")) ;; selveste, sjølveste, selve. [determinativ, enkel] (("det be") () (("selve" "selv")) ("+Adj" "+Pos""+MFN" "+Def/Pl") (("selveste" "selv") ("sjølveste" "sjølv")) ("+Adj" "+Sup""+MFN" "+Def" "+SP")) ;; hin, slik, sådan, sånn [determinativ: mask ent | fem ent | nøyt ent | fl] (("det dem") () (("hin" "hin" :no-sub)) (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg")) ("slik") ("+DetPron") ("sådan" "sånn") ("+Adj" "+Pos" "+MF" "+Indef" "+Sg")) ;; all, annenhver, atten, begge, elleve, en, en, enhver, fem, femten, femti, fire, fjartan, fjorten, førti, ;; halvannen, halvhundre, hundre, hundreogén, hver, ingen, mang, ni, nitten, nitti, noen, null, samtlige, seks, ;; seksten, seksti, sju, somme, somt, sytten, sytti, ti, tjue, to, tolv, tre, tretten, tretti, tusen, zero, ;; åtte, åtti, ørten, hundre og én, tjueen, tredve, tyve, syv, en, én, no', noe. ;; [determinativ: mask ent | fem ent | nøyt ent | fl] ;; [determinativ, enkel] (("det kvant") ("+Num" "+Card" "+MFN" "+NoDef") ("en" "én") ("+Num" "+Card" "+Indef")) ;; + ;; deres, din, hans, hennes, min, sin, vår, eders. [determinativ: mask ent | fem ent | nøyt ent | fl] (("det poss") ("+Pron")) ;; hvilken. [determinativ: mask ent | fem ent | nøyt ent | fl] (("det sp") ("+DetPron")) ;; både, enten, verken, hverken. [konjunksjon] (("konj ") ("+Conj")) ;; allting, ingenting. [pronomen, enkel] (("pron ent") () ("allting") ("+Noun" "+Neut" "+Indef" "+SP") ("ingenting") ("+Prop" "+Masc" "+Indef" "+Sg")) ;; seg. [pronomen, normal: nom | akk] (("pron refl") ("+Pron" "+MFN")) ;; hva, hvilken, hvilke, hvilket. [pronomen, enkel] (("pron sp") () (("hva" "hvem")) ("+DetPron" "+Neut" "+Sg") (("hvilken")) (("+DetPron" "+Masc" "+Sg") ("+DetPron" "+Fem" "+Sg")) (("hvilke" "hvilken")) ("+DetPron" "+MFN" "+Pl") (("hvilket" "hvilken")) ("+DetPron" "+Neut" "+Sg")) ;; mon. [subjunksjon] (("sbu ") ("+Adv")) ;; facts [substantiv: ent ub | ent be | fl ub | fl be] (("subst appell") ("+Noun")) ;; kråketær [substantiv: ent ub | ent be | fl ub | fl be] (("subst fem") ("+Noun")) ;; ? ;; Vårherre, Nordkalotten, a-ha, Alta [substantiv, ubøy] (("subst prop") ("+Prop" "+Masc" "+Indef" "+Sg")) ;; ? Vårherre+Prop+Masc+Indef+Sg, Alta+Prop+Masc+Indef+Sg *** all are masc indef sg??? ;; fordundre, nåde, vorde, vredes, bære. [verb, ubøy] ;; (("verb ubøy") ("+Verb")) ;; ? missing in Norgram ?? ;; besk [adjektiv: pos m/f ub ent | pos fl | pos be ent | pos nøyt ub ent | komp | sup ub | sup be] ;; besk [adjektiv, ulik m/f: pos m/f ub ent | pos fem ub ent | pos nøyt ub ent | pos be ent | pos fl | komp | sup ub | sup be] (("adj") ("+Adj")) ;; betids [adverb] (("adv") ("+Adv")) ;; å [infinitivsmerke] (("inf-merke") ("+PartInf")) ;; bevares, basta [interjeksjon] (("interj") ("+Interj")) ;; eller, og, men, samt, et. [konjunksjon] (("konj") ("+Conj" "+Coord") ("men" "samt") ("+Conj")) ;; bent fram, innenfor [preposisjon] (("prep") ("+Prep") ("av" "i" "på" "under" "forbi" "gjennom" "ved") (("+Prep") ("+Adv"))) ;; best som [subjunksjon] (("sbu") ("+Conj")) ;; beskatte [verb, normal: inf | pres | inf pres pass | pret | perf-part | adj nøyt ub ent | adj m/f ub ent | adj be ent | adj fl | adj | imp] ;; beskatte [verb, s-verb: inf | pres | pret | perf-part | imp ] (("verb") ("+Verb") ("skulle" "måtte" "burde" "kunne") ("+Aux") ("være" "bli" "ha" "ville") (("+Verb") ("+Aux")) (:infl "inf" "pres") (("+Verb" "+Infin") ("+Verb" "+Pres"))))) (defparameter *lemma-features-class-index-table* (let ((index (dat:make-string-tree))) (dolist (mapping *lemma-features-mapping-table*) (let ((features (split (caar mapping) #\Space))) (push (list (cdr features) mapping) (dat:string-tree-get index (car features))))) index)) (defparameter *inflectional-features-mapping-table* '(("adj") ;; only together with ("" "+PastPart") ("be" "+Def") ("ent" "+Sg") ("fl" "+Pl") ("m/f" "+MF") ("ub" "+Indef") ("nøyt" "+Neut") ("" "+PresPart" "+MFN") ("akk" "+Acc") ;;("gen" "+Gen") ("fem" "+Fem") ("imp" "+Impv") ("" "+SForm") ("inf" "+Infin") ("pres" "+Pres") ("pass" "+SForm") ("komp" "+Comp" "+MFN" "+NoDef" "+SP") ("mask" "+Masc") ("nom" "+Nom") ("perf-part" "+PastPart") ("pos" "+Pos") ("pret" "+Past") ("sup" "+Sup" "+MFN" "+SP"))) (defparameter *norgram-features* (let ((features ())) (dolist (mapping *lemma-features-mapping-table*) (loop for fl in (cdr mapping) by #'cddr do (if (listp (car fl)) (dolist (sub-fl fl) (dolist (f sub-fl) (pushnew f features :test #'equal))) (dolist (f fl) (pushnew f features :test #'equal))))) (dolist (mapping *inflectional-features-mapping-table*) (dolist (f (cdr mapping)) (pushnew f features :test #'equal))) (list* "+Punct" "+Comma" "+Token" "+Sent" features))) #+test (let* ((cgp::*cg* cgp::*nbo-cg*) (cgp::*tagger* (cgp::multi-tagger cgp::*cg*)) (*package* (find-package :cgp)) (*print-case* :downcase)) (print (cgp::lemma-and-features "spiste" :features-as-list-p t))) #+test (("spise" adj be ent tr1 i1 tr11 rl9 pa3 pr6 tr4) ("spise" adj fl tr1 i1 tr11 rl9 pa3 pr6 tr4) ("spise" verb pret tr1 i1 tr11 rl9 pa3 pr6 tr4)) #+test (print (readings-norgram-features "spiste" '(("verb" "adj be ent tr1 i1" "spise" "-") ("verb" "adj fl" "spise" "-") ;;("verb" "pret" "spise" "-") ))) #+test (print (readings-norgram-features "størst" '(("adj" "sup ub" "stor" "-")))) #+test (print (readings-norgram-features "spiste" '(("verb" (adj be ent) "spise" "-") ("verb" "adj fl" "spise" "-") ;;("verb" "pret" "spise" "-") ))) #|| (("-" "spise" "+Verb" "+PastPart" "+Def" "+Sg" "+MFN")) (("-" "spise" "+Verb" "+PastPart" "+Pl" "+MFN")) (("-" "spise" "+Verb" "+PastPart" "+MFN" "+Def/Pl")) ||# (defun cg-to-norgram-features (word cg-features) (let ((readings (collecting (dolist (reading cg-features) (when reading (collect (destructuring-bind (lemma . bv) reading (let* ((features (cgp::code-features bv)) ;; perfect participles get pos ADJ in Norsk ordbank; ;; we do need VERB as pos of the lemma (class (if (find ' features) 'verb (car features)))) #+debug(let ((*package* (find-package :cgp))) (print features)) (list class (cdr features) lemma "-"))))))))) #+debug(print (list :mnf (cgp::ner-norgram-features word cg-features))) (let ((features (mapcar #'cdr (readings-norgram-features word readings)))) ;; remove lemma-id #+debug(print (list :rnf features)) features))) #+test (let* ((cgp::*cg* cgp::*nbo-cg*) (cgp::*tagger* (cgp::multi-tagger cgp::*cg*)) (*package* (find-package :cgp)) (*print-case* :downcase)) (print (cg-to-norgram-features "Norge" (cgp::lemma-and-features "Norge" :features-as-list-p nil)))) #+test (print (get-mapping '(subst (prop mask gen) "Per" "-"))) (defun get-mapping (reading) #+debug(print reading) (destructuring-bind (ordklasse boy-tekst grunnform lemma-id) reading (let ((mappings (dat:string-tree-get *lemma-features-class-index-table* (string-downcase ordklasse))) (mapping-list nil)) #+debug(print mappings) (loop for (key-features mapping) in mappings when (and (not (find-if-not (lambda (f) (find f boy-tekst :test #'string-equal)) key-features)) (>= (length mapping) (length mapping-list))) do (setf mapping-list mapping)) #+ignore (when (find 'cgp::samset boy-tekst) (Print (list :mapping-list mapping-list)) #+tets (setf mapping-list (append mapping-list (list "+Samset")))) mapping-list))) (defun readings-norgram-features (word readings &key genitive-p lemma-numbers-p clump-numbers-p sql-p) (merge-features (collecting (dolist (reading readings) (destructuring-bind (ordklasse boy-tekst grunnform clump-id) reading (let ((mapping-list (if (listp boy-tekst) ;; comes from CG bv (get-mapping reading) (find-if (lambda (mapping) (string-equal ordklasse (caar mapping))) *lemma-features-mapping-table*))) ;;(genitive-p (or genitive-p (and (listp boy-tekst) (find 'cgp::gen boy-tekst)))) (word+lemma nil)) #+debug(print (list :ordklasse ordklasse :infl boy-tekst :mapping-list mapping-list :genitive-p genitive-p)) (when (cddr mapping-list) (loop for (wordlist mapping) on (cddr mapping-list) by #'cddr when (let ((w+l (or (find grunnform wordlist :key (lambda (word-lemma) (if (listp word-lemma) (car word-lemma) word-lemma)) :test #'string=) (and (eq (car wordlist) :infl) (loop for f in (cdr wordlist) always (find f (if (listp boy-tekst) boy-tekst (split boy-tekst #\Space)) :test #'string-equal)) wordlist)))) (when w+l (setf word+lemma w+l) (not (find :excl mapping)))) do #+debug(print (list :wordlist wordlist :mapping mapping)) (dolist (norgram-features (ensure-list-of-lists mapping)) (collect (remove-duplicates (append (list clump-id ;;lemma-id (if (and (listp word+lemma) (not (eq (car word+lemma) :infl))) (cadr word+lemma) grunnform)) norgram-features (unless (and (listp word+lemma) (eq :no-sub (caddr word+lemma))) (get-norgram-inflectional-features boy-tekst :except (when (and (listp word+lemma) (eq (car word+lemma) :infl)) (cdr word+lemma)))) (when genitive-p (list "+Gen"))) :test #'equal))))) (when (null word+lemma) (dolist (norgram-features (ensure-list-of-lists (cadr mapping-list))) (collect (remove-duplicates (append (list clump-id ;; lemma-id grunnform) norgram-features (unless (eq :no-sub (cadar mapping-list)) (get-norgram-inflectional-features boy-tekst)) (when genitive-p (list "+Gen"))) :test #'equal))))))) (unless genitive-p (let* ((length (length word)) (last-char (char word (1- length)))) (when (and (> length 2) (or (and (char= last-char #\s) (char/= (char word (- length 2)) #\s)) (and (find last-char "'") (char= (char word (- length 2)) #\s)))) (collect-append ;; remove dependency on norgram-morphology()! (when sql-p (norgram-morphology-sql (subseq word 0 (1- (length word))) :lemma-numbers-p lemma-numbers-p :clump-numbers-p clump-numbers-p :genitive-p t))))))))) #+test (print *norgram-features*) (defun write-fake-morph-transducer (&optional (stream *standard-output*)) (format stream "clear stack~%read regex 0:(?*) 0:{AB} ~~$[{AB}|%+~{|\"~a\"~}] [" nil #+ignore *norgram-features*) (loop for (f . rest) on *norgram-features* do (if rest (format stream "\"~a\":{~a} | " f f #+ignore (u::subst-substrings f '("/" "%/"))) (format stream "\"~a\":{~a}" f f #+ignore(u::subst-substrings f '("/" "%/"))))) (format stream "]* 0:{AB} 0:(?*);~%save stack unpack.fst~%")) #+test (with-open-file (stream "projects:xle;fst;nets;unpack.regex" :direction :output :if-exists :supersede) (write-fake-morph-transducer stream)) #+test (print (length (ordklasse-examples "subst prop gen" nil))) #+test (print (ordklasse-examples "adj" 1000)) #+test (print-fullforms "fort" t t) (defun get-norgram-inflectional-features (i-features &key except) (when i-features (collecting (let ((i-features (if (listp i-features) i-features (split i-features #\Space)))) (dolist (i-feature i-features) (unless (find i-feature except :test #'string-equal) (collect-append (cdr (find i-feature *inflectional-features-mapping-table* :test #'string-equal :key #'car))))) ;; Fix for +MFN missing on +PastPart and +Adj. Devise mechanism if there are more of that sort! (when (and (or (find "pos" i-features :test #'string-equal) (find "adj" i-features :test #'string-equal)) (null (intersection i-features '("mask" "fem" "nøyt" "m/f") :test #'string-equal))) (collect "+MFN")))))) (defun ensure-list-of-lists (obj) (if (listp (car obj)) obj (list obj))) (defun merge-features-2 (fl1 fl2) #+debug(print (list :fl1 fl1 :fl2 fl2)) (when (and fl1 fl2) (let ((d1-2 (set-difference fl1 fl2 :test #'string=)) (d2-1 (set-difference fl2 fl1 :test #'string=))) (cond ((and (null d1-2) (null d2-1)) fl1) ((and (or (equal d1-2 '("+Sg" "+Def")) (equal d1-2 '("+Def" "+Sg"))) (equal d2-1 '("+Pl"))) (append (remove "+Pl" fl2 :test #'string=) (list "+Def/Pl"))) ((and (or (equal d2-1 '("+Sg" "+Def")) (equal d2-1 '("+Def" "+Sg"))) (equal d1-2 '("+Pl"))) (append (remove "+Pl" fl1 :test #'string=) (list "+Def/Pl"))) ((and (equal d1-2 '("+Masc")) (equal d2-1 '("+Fem"))) (append (remove "+Fem" fl2 :test #'string=) (list "+MF"))) ((and (equal d1-2 '("+Fem")) (equal d2-1 '("+Masc"))) (append (remove "+Fem" fl1 :test #'string=) (list "+MF"))) ((and (equal d1-2 '("+Sg")) (equal d2-1 '("+Pl"))) (append (remove "+Pl" fl2 :test #'string=) (list "+SP"))) ((and (equal d1-2 '("+Pl")) (equal d2-1 '("+Sg"))) (append (remove "+Pl" fl1 :test #'string=) (list "+SP"))) (t nil))))) (defun merge-features (feature-sets) #+debug(print (list :feature-sets feature-sets)) (let ((merged-p nil)) (loop for fs+tail on feature-sets when (cdr fs+tail) do (loop for tail on (cdr fs+tail) when (and (caar fs+tail) (caar tail) (equal (caar fs+tail) (caar tail))) do (let ((merged (merge-features-2 (cdar fs+tail) (cdar tail)))) (when merged (setf merged-p t (cdar fs+tail) merged (car tail) nil))))) (if merged-p (merge-features feature-sets) (delete nil feature-sets)))) #+test (print (compare-analyzers "eller")) (defun norgram-morphology-string (word) (with-output-to-string (stream) (write-char #\( stream) (let ((morph-list (cons (list word "+Token") (remove-duplicates (mapcar #'cdr (norgram-morphology word)) :test #'equal)))) (loop for (morph . rest) on morph-list do (dolist (f morph) (write-string f stream)) (write-char (if rest #\| #\)) stream))))) #+test (string-net::nmap-string-values *bm* "bøker" (lambda (v) (print v)) #\: nil nil nil) #+test (print (norgram-morphology-string "spiste")) (defun norgram-tag-sentence (string) (with-output-to-string (stream) (dolist (word (split string #\Space)) (write-string (norgram-morphology-string word) stream)))) #+test (norgram-tag-sentence "spis !") #+test (print (norgram-morphology ",")) #+test (print (norgram-morphology "somme")) (defun get-norgram-fst-morphology (word) (with-open-file (stream "projects:xle;fst;nets;norgram.fst" :direction :output :if-exists :supersede) (write-line "load stack < bok-ana-final-ron2.fst" stream) (format stream "up ~a~%" word)) (cl-user::run-shell-command "cd ~/lisp/projects/xle/fst/nets; ../fst -f norgram.fst > morph.out") (collecting (with-file-lines (line "projects:xle;fst;nets;morph.out") (unless (or (string= line "bye.") (search "'bok-ana-" line)) (collect line))))) (defun norgram-fst-morphology (word) (collecting (dolist (reading (get-norgram-fst-morphology word)) (collect (split reading #\+))))) ;; compares fst and ordbank analyzers (defun compare-analyzers (word) (when word (let ((fst-ana (norgram-fst-morphology word)) (ordbank-ana (norgram-morphology word))) (collecting-into (ordbank-common fst-common fst-only ordbank-only) (dolist (fst fst-ana) (if (find-if (lambda (ordbank) (and (= (1+ (length fst)) (length ordbank)) (string= (car fst) (cadr ordbank)) (loop for o in (cddr ordbank) always (find o (cdr fst) :test (lambda (o f) (string= o f :start1 1)))) (collect-into ordbank-common (cdr ordbank)))) ordbank-ana) (collect-into fst-common fst) (collect-into fst-only fst))) (dolist (ordbank ordbank-ana) (unless (find (cdr ordbank) ordbank-common) (collect-into ordbank-only (cdr ordbank)))) (print (list :ordbank-common ordbank-common :fst-only fst-only :ordbank-only ordbank-only)) (values ordbank-common fst-only ordbank-only))))) #+test (print (compare-analyzers "fiskes")) #+test (print (norgram-fst-morphology "fisker")) #|| (defmethod xle-compare-morph-analysers ((request http-request) entity) (with-html-response (request entity stream (word lemma-numbers)) (let ((word (utf-8-decode word))) (multiple-value-bind (common fst-only ordbank-only) (compare-analyzers word) #m(html (head (title "Compare morphological analyses") (meta/ :http-equiv "Content-Type" :content "text/html; charset=utf-8")) (body ((form :method "post") ((p :style "color: red") "Analyse:" (input/ :type "text" :name "word" :value #s (or word "")))) ((p :style "color: blue") "Norsk ordbank") (nobr #L(print-fullforms word stream t)) ((p :style "color: blue") "Norsk ordbank -> Norgram") (nobr #L(dolist (list common) (format stream "~{~a~}
~%" (mapcar #'utf-8-encode list)))) ((nobr :style "color: red") #L(dolist (list ordbank-only) (format stream "~{~a~}
~%" (mapcar #'utf-8-encode list)))) ((p :style "color: blue") "Norgram") (nobr #L(dolist (list common) (format stream "~{~a~}
~%" (mapcar #'utf-8-encode list)))) ((nobr :style "color: red") #L(dolist (list fst-only) (format stream "~{~a~^+~}
~%" (mapcar #'utf-8-encode list)))))))))) #+old (defmethod xle-compare-morph-analysers ((request http-request) entity) (with-html-response (request entity stream (word lemma-numbers)) (let ((word (utf-8-decode word))) #m(html (head (title "Compare morphological analyses") (meta/ :http-equiv "Content-Type" :content "text/html; charset=utf-8")) (body ((form :method "post") ((p :style "color: red") "Analyse:" (input/ :type "text" :name "word" :value #s (or word "")))) ((p :style "color: blue") "Norsk ordbank") (nobr #L(print-fullforms word stream t)) ((p :style "color: blue") "Norsk ordbank -> Norgram") #L(if lemma-numbers #m(nobr #L(dolist (list (norgram-morphology word :lemma-numbers-p t)) (format stream "~d ~{~a~}
~%" (car list) (mapcar #'utf-8-encode (cdr list))))) #m(nobr #L(dolist (list (norgram-morphology word :lemma-numbers-p nil)) (format stream "~{~a~}
~%" (mapcar #'utf-8-encode (cdr list)))))) ((p :style "color: blue") "Norgram") (nobr #L(dolist (line (get-norgram-fst-morphology word)) (write-line (utf-8-encode line) stream) #m(br/)))))))) ||# #+disabled (publish :path "/logon/morph.html" :content-type "text/html" :function #'xle-compare-morph-analysers) #+test (net.aserve::start :port 8029) ;;; Storing the morphology as string-net (defparameter *string-net* nil) #+test (defparameter *bm* (string-net::read-net "~/lisp/projects/xle/bm-morph.net" :translate-p nil)) #+test (string-net::write-string-net *string-net* "projects:xle;bm-morph.net") #+test (string-net::nmap-string-values *bm* "bøker" (lambda (v) (print v)) #\: nil nil nil) #+test ;; fix inserts colon before + if missing (let ((string-net (make-instance 'string-net::list-string-net)) (count 0)) (string-net::nmap-strings *bm* (lambda (string) (when (zerop (mod (incf count) 1000)) (print (cons count string))) (let ((colon-pos (position #\: string)) (plus-pos (position #\+ string))) (string-net::add-string string-net (if colon-pos string (concat (subseq string 0 plus-pos) ":" (subseq string plus-pos))))))) (string-net::minimize-net string-net) (setf *string-net* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;bm-morph-new.net")) #+test (defparameter *bm* (string-net::read-net "~/lisp/projects/xle/morph/bm-morph-clump.net" :translate-p nil)) #+test (print (string-net::count-strings *string-net*)) :eof