;;;-*- Mode: Lisp; Package: CGP -*- (in-package :cgp) (defvar *scarrie-net* (string-net::read-net "projects:cgp;nets;scarrie.net")) #+test (with-file-lines (line "projects:cgp;scarrie;scarrie.tsf") (let ((line (map 'string (lambda (c) (code-char (string-net::unix-to-mac-char-code (char-code c)))) line))) (print line))) (defun encode-scarrie-gram-info (gram-info) (let ((*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (collecting (labels ((encode (scf features) (if scf (dolist (f-list (cdr (find (car scf) *scarrie-to-cgp-features* :key #'car :test #'string=))) (encode (cdr scf) (append f-list features))) (collect (apply #'encode-features (mapcar (lambda (f) (intern (string-upcase f) :cgp)) features)))))) (encode (string-parse gram-info :whitespace '(#\_ #\\) :escape-char nil) ()))))) #+test (encode-scarrie-gram-info "N_pl") #+only-once (defun build-scarrie-net () (let ((net (make-instance 'string-net::list-string-net)) (count 0)) (with-file-lines (line "projects:cgp;scarrie;scarrie.tsf") (let ((line (map 'string (lambda (c) (code-char (string-net::unix-to-mac-char-code (char-code c)))) line))) (unless (char= (char line 0) #\#) (destructuring-bind (id base-form word-form style-info replacement gram-info compound-info) (split line #\Tab) (declare (ignore id compound-info)) (when (zerop (mod (incf count) 1000)) (print line)) (dolist (features (encode-scarrie-gram-info gram-info)) (add-string net (concat word-form ":" (string-difference word-form base-form) ":" style-info ":" (if (string= replacement "") "" (string-difference word-form replacement)) ":" (bit-vector-to-string features)))))))) (minimize-net net) net)) #+only-once (defun get-scarrie-features-table () (let ((table (make-hash-table :test #'equal))) (with-file-lines (line "projects:cgp;scarrie;scarrie.tsf") (let ((line (map 'string (lambda (c) (code-char (string-net::unix-to-mac-char-code (char-code c)))) line))) (unless (char= (char line 0) #\#) (destructuring-bind (id base-form word-form style-info replacement gram-info compound-info) (split line #\Tab) (declare (ignore id base-form word-form style-info replacement compound-info)) (dolist (f (string-parse gram-info :whitespace '(#\_ #\\) :escape-char nil)) (incf (gethash f table 0))))))) table)) #+test (defparameter *scarrie-features-table* (get-scarrie-features-table)) #+test (let ((features (collecting (maphash (lambda (f count) (collect (cons f count))) *scarrie-features-table*)))) (dolist (f+c (sort features #'string< :key #'car)) (format t "(\"~a\" (\"-~a\"))~%" (car f+c) (car f+c) ))) #+test (defparameter *scarrie-net* (build-scarrie-net)) #+test (string-net::write-string-net *scarrie-net* "projects:cgp;nets;scarrie.net") (defparameter *scarrie-to-cgp-features* '(("Aa" ("inf-merke")) ("Adj" ("adj")) ("Adv" ("adv")) ("Conj" ("konj")) ("Det" ("det")) ("Interj" ("interj")) ("N" ("subst")) ("P" ("prep")) ("PN" ("subst")) ;; occurs only once in Scarrie ("Pro" ("pron")) ("Subj" ("konj")) ("V" ("verb")) ("acc" ("akk")) ("accinf" ("akk" "inf")) ("acomp" ("-acomp")) ("active" ("-active")) ("aux" ("-aux")) ("comp" ("-comp")) ("def" ("be")) ("defindef" ()) ("dem" ("-dem")) ("ditrans" ("-ditrans")) ("dummysubj" ("-dummysubj")) ("e" ("be" "sg") ("be" "ub" "fl")) ("endepart" ("-endepart")) ("expl" ("-expl")) ("f" ("fem")) ("mf" ("m/f")) ("fn" ("fem") ("nøyt")) ("imp" ("imp")) ("indef" ("ub")) ("indic" ("-indic")) ("inf" ("inf")) ("infcomp" ("-infcomp")) ("infpres" ("-infpres")) ("int" ()) ("intrans" ()) ("m" ("mask")) ("main" ()) ("fm" ("mask")) ("mfn" ()) ("mn" ("mask") ("nøyt")) ("n" ("nøyt")) ("ncomp" ("-ncomp")) ("nogender" ("-nogender")) ("nom" ("-nom")) ("nomacc" ("-nomacc")) ("objacomp" ("-objacomp")) ("objinfcomp" ("-objinfcomp")) ("objncomp" ("-objncomp")) ("objscomp" ("-objscomp")) ("p1" ("1")) ("p123" ("-p123")) ("p2" ("2")) ("p3" ("3")) ("passive" ("-passive")) ("pastpart" ("perf-part")) ("pers" ("pers")) ("pl" ("fl")) ("pos" ()) ("poss" ("poss")) ("pres" ("pres")) ("pret" ("pret")) ("quant" ("kvant")) ("rec" ("-rec")) ("ref" ("-ref")) ("refinfcomp" ("-refinfcomp")) ("refl" ("refl")) ("refscomp" ("-refscomp")) ("scomp" ("-scomp")) ("sg" ("ent")) ("sgpl" ()) ("sup" ("sup")) ("svarord" ("-svarord")) ("trans" ("-trans")) ("transref" ("-transref")))) #|| Aa 1 Adj 67655 Adv 1048 Conj 7 Det 130 Interj 209 N 223178 P 236 PN 2 Pro 29 Subj 38 V 62124 acc 9 accinf 64 acomp 114 active 44419 aux 11 comp 1006 def 127484 defindef 1050 dem 9 ditrans 4922 dummysubj 8724 e 36978 endepart 7003 expl 1 f 40197 fm 12012 fn 132 imp 6999 indef 125480 indic 55125 inf 7052 infcomp 610 infpres 6744 int 4 intrans 34210 m 109816 main 69127 mf 3688 mfn 60534 mn 1034 n 63097 ncomp 113 nogender 482 nom 8 nomacc 12 objacomp 177 objinfcomp 62 objncomp 63 objscomp 88 p1 4 p123 4 p2 6 p3 12 passive 17705 pastpart 22375 pers 20 pl 104039 pos 64634 poss 8 pres 14063 pret 11890 quant 112 rec 2 ref 7501 refinfcomp 37 refl 3 refscomp 53 scomp 1946 sg 137559 sgpl 12418 sup 2015 svarord 3 trans 38462 transref 577 (loop for i from 0 for feature in ;; 71 '("noun" "m" "sg" "def" "indef" "n" "pl" "f" "adj" "pos" "verb" "main" "indic" "active" "e" "pastpart" "pres" "trans" "intrans" "passive" "inf" "pret" "endepart" "imp" "dummysubj" "sup" "ref" "adv" "comp" "ditrans" "prep" "interj" "transref" "det" "quant" "scomp" "subj" "pro" "infcomp" "acc" "nom" "pers" "p3" "acomp" "aux" "ncomp" "p2" "objacomp" "dem" "poss" "p1" "objncomp" "conj" "accinf" "int" "svarord" "refl" "pn" "rec" "aa" "expl" "objscomp" "objinfcomp" "refscomp" "refinfcomp" "gap-s" "gap-e" "hyphen" "punctuation" "comma" "stop") do (setf (gethash feature *feature-table*) i)) (:WORD "avdekka" :LEMMA "avdekke" :WORD "avdekka" :STYLES NIL :FEATURES (ADJ FL TR1)) ||# #+test (setf *scarrie-net* (string-net::read-net "projects:cgp;nets;scarrie.net")) (defun get-scarrie-styles-complement (word lemma features &optional (compress-lemma-p t)) (let ((restored-word (restore-string (copy-seq word))) (restored-lemma (restore-string (copy-seq lemma)))) (collecting-into (styles) (dolist (style+features (string-values *scarrie-net* (concat restored-word ":" (if compress-lemma-p (string-net::compress-string restored-lemma restored-word) restored-lemma)))) (destructuring-bind (style replacement compressed-features) (split style+features #\: 3) (declare (ignore replacement)) #+debug(print (list (code-features (string-to-bit-vector compressed-features)) (code-features features))) ;; (let ((scarrie-bv (string-to-bit-vector compressed-features))) (when (has-features-p scarrie-bv '(ub be)) ;; fix for different coding of disjunction in Scarrie and Norsk ordbank (setf (sbit scarrie-bv (feature-code 'ub)) 0 (sbit scarrie-bv (feature-code 'be)) 0)) (when (subsumes-p scarrie-bv features) (collect-into styles style))))) styles))) #+test (let* ((cgp::*cg* *nbo-cg*) (cgp::*tagger* (cgp::multi-tagger cgp::*cg*))) (print (get-scarrie-styles-complement "avdekka" "avdekke" (encode-features 'ADJ 'FL)))) #+test (print (string-values *scarrie-net* (concat "betinga"))) (defun set-scarrie-style (bv style-complement-list) (cond ((or (null style-complement-list) (find "N" style-complement-list :test #'string-equal)) (set-feature bv 'cn)) (t (let ((styles '(c1 c2 c3 c4))) (dolist (style styles) (set-feature bv style)) (dolist (style style-complement-list) (loop for i across "1234" for st in styles when (find i style) do (reset-feature bv st))) (unless (find-if (lambda (st) (= (sbit bv (feature-code st)) 1)) styles) ;; anything goes (set-feature bv 'c5)))))) (defun get-scarrie-style (style-complement-list) (cond ((null style-complement-list) nil) ((or ;;(null style-complement-list) (find "N" style-complement-list :test #'string-equal)) '(cn)) (t (let ((style-list (list 'c1 'c2 'c3 'c4))) (dolist (style style-complement-list) (loop for i across "1234" for st in '(c1 c2 c3 c4) when (find i style) do (setf style-list (delete st style-list)))) style-list)))) #+obsolete (defun build-scarrie-lexicon-net () (let ((net (make-instance 'string-net::list-string-net)) (count 0) (*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (nmap-strings *scarrie-net* (lambda (string) (destructuring-bind (word comp-lemma style comp-repl feature-vector) (split string #\: 5) (let* ((bv (string-to-bit-vector feature-vector)) (styles (get-scarrie-styles-complement word comp-lemma bv nil))) (set-scarrie-style bv styles) (when (zerop (mod (incf count) 1000)) (print (list count word comp-lemma style comp-repl (code-features bv) styles))) (add-string net (print (concat word ":" comp-lemma ":" (bit-vector-to-string bv)))))))) (minimize-net net) net)) (defmethod add-scarrie-styles ((token token)) (unless (insignificant-token-p token) (let ((*tagger* (multi-tagger (constraint-grammar (token-chain token))))) (dolist (lemma.features (token-features token)) (when lemma.features (destructuring-bind (lemma . features) lemma.features (let ((styles (get-scarrie-styles-complement (token-value token) lemma features))) (set-scarrie-style features styles) #+debug(print (list styles (code-features features)))))))))) (defmethod add-scarrie-styles ((sentence sentence)) (labels ((walk (token concat-token) (cond ((null token) nil) ((cgp::token-expansion token) (do ((ex-token (car (cgp::token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (cgp::token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (unless (insignificant-token-p token) (add-scarrie-styles token)) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil)))))) (walk (first-token sentence) nil))) #|| (let ((*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (lemma-and-features "fl¿yta" :net *scarrie-net*)) (dsc "Han flaug i lufta i v�rjamd¿gra.") (dsc "Han fl¿y i luften.") (defun dsc (string) (disambiguate-from-string string ;; :tagging-niveau :multi-tagging :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &rest rest) (apply #'print-sentence s :token-print-fn (lambda (token &rest rest) (add-scarrie-styles token) (apply #'print-token token rest)) rest)))) (word-lemma "alle" :net *scarrie-net*) (string-net::string-subnet *scarrie-net* "fl¿yta:") (let ((word "flaug") (lemma "fly")) (let ((*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (dolist (style+features (string-values *scarrie-net* (concat word ":" (string-net::compress-string lemma word)))) (destructuring-bind (style replacement compressed-features) (split (print style+features) #\: 3) (print (list style (unless (equal (print replacement) "") (string-net::decompress-string replacement word)) (string-to-bit-vector compressed-features))))))) (let ((word "fl¿yta") (*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (dolist (lemma.features (lemma-and-features word)) (destructuring-bind (lemma . features) lemma.features (print (list word lemma (code-features features))) (dolist (style+features (string-values *scarrie-net* (concat word ":" (string-net::compress-string lemma word)))) (destructuring-bind (style replacement compressed-features) (split style+features #\: 3) (when t #+test(subsumes-p (string-to-bit-vector compressed-features) features) (print (list (intern style :keyword) (string-net::decompress-string replacement word) (code-features (string-to-bit-vector compressed-features)) (subsumes-p (string-to-bit-vector compressed-features) features))))))))) (let ((word "fisk") (*tagger* (multi-tagger (gethash "nbo" *cg-table*)))) (dolist (lemma.features (lemma-and-features word)) (destructuring-bind (lemma . features) lemma.features (print (list word lemma (code-features features))) (dolist (style+features (string-values *scarrie-net* (concat word ":" (string-net::compress-string lemma word)))) (destructuring-bind (style replacement compressed-features) (split style+features #\: 3) (when t #+test(subsumes-p (string-to-bit-vector compressed-features) features) (print (list (intern style :keyword) (string-net::decompress-string replacement word) (code-features (string-to-bit-vector compressed-features)) (subsumes-p (string-to-bit-vector compressed-features) features))))))))) (string-difference "fl¿y" "flaug") (string-net::decompress-string (string-difference "fly" "fl¿y") "fly") ||# :eof