;;; -*- Mode: LISP; Package: MORPH-SERVER; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*- (in-package :morph-server) #+test (cl-user::mk :oci) #+test (use-package :sql) (eval-when (:load-toplevel :compile-toplevel :execute) (enable-sql-reader-syntax)) (eval-when (:load-toplevel :compile-toplevel :execute) (defparameter *usdprod* (connect "usdprod.inger.uio.no" :user-id "paulm" :password "muXluX314" :db-type :oci :if-exists :warn-old))) #+test (eval-when (:load-toplevel :compile-toplevel :execute) (defparameter *decorcl* (connect "decentius.aksis.uio.no" :user-id "system" :password "gvprckvnis" :db-type :oci :if-exists :warn-old))) #+test (disconnect) #+old (defun print-fullforms (word &optional (stream *standard-output*) webp) (do-query ((lemma lemma-id tag) [select [grunnform] [lemma lemma-id] [tag] :from '([usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma]) :distinct t :where [and [= [oppslag] ?word] [= [lemma lemma-id] [fullformsliste lemma-id]]]]) (if webp (format stream "~&~d ~a ~a
" lemma-id (utf-8-encode lemma) (u::subst-substrings (utf-8-encode tag) '("<" "<" ">" ">"))) (format stream "~&~d ~a ~a" lemma-id lemma tag)))) #+does-not-work (defun print-fullforms (word &optional (stream *standard-output*) webp) (do-query ((lemma lemma-id tag) (print [select [grunnform] [lemma lemma-id] [usd-leks-bmordbok.ortoklump klump-id] [tag] :from '([usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-bmordbok ortoklump]) :distinct t :where [and [= [oppslag] ?word] [= [lemma kjelde-element] [usd-leks-bmordbok.ortoklump artikkel-id]] [= [lemma lemma-id] [fullformsliste lemma-id]]]])) (if webp (format stream "~&~d ~a ~a
" lemma-id (utf-8-encode lemma) (u::subst-substrings (utf-8-encode tag) '("<" "<" ">" ">"))) (format stream "~&~d ~a ~a" lemma-id lemma tag)))) (defun print-fullforms (word &optional (stream *standard-output*) webp) (do-query ((lemma lemma-id klump-id tag) [select [lemma grunnform] [lemma lemma-id] [klump-id] [fullformsliste tag] :from '([usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :distinct t :where [and [= [fullformsliste oppslag] ?word] [= [lemma kjelde-element] [v-logon-tagger-fullform kjelde-element]] [= [lemma lemma-id] [fullformsliste lemma-id]]] :order-by '([klump-id])]) (if webp (format stream "~&~d ~a ~a ~a
" lemma-id (utf-8-encode lemma) klump-id (u::subst-substrings (utf-8-encode tag) '("<" "<" ">" ">"))) (format stream "~&~d ~a ~d ~a" lemma-id lemma klump-id tag)))) #|| (print-fullforms "anker") (select [count [*]] :from [usd-leks-bmordbok ortoklump] :distinct t) ;; calculate tag list (defparameter *tag-table* (let ((tag-list (dat::make-string-tree)) (count 0)) (do-query ((tag) [select [tag] :from [usd-leks-tagger-bm fullformsliste] ;; :where [< [rownum] 10] ]) (when (zerop (mod (incf count) 10000)) (print count)) (dolist (tag (split tag #\space)) (incf (dat::string-tree-get tag-list tag 0)) #+test (pushnew tag tag-list :test #'string=))) tag-list)) (print (car *tag-table*)) ;; frequency list (let ((tag-list ())) (dat::do-string-tree (tag count *tag-table*) (push (cons count tag) tag-list)) (setf tag-list (sort tag-list #'< :key #'car)) (dolist (pair tag-list) (format t "~&~d ~a" (car pair) (cdr pair)))) (print-fullforms "katten") ;; examples having given ordklasse (defun ordklasse-examples (ordklasse &optional (count 1)) (select [grunnform] :from [usd-leks-tagger-bm lemma] :flatp t :where [in [lemma-id] [select [lemma-id] :from [usd-leks-tagger-bm lemma-paradigme] :where [and [in [lemma-paradigme paradigme-id] [select [paradigme-id] :from [usd-leks-tagger-bm paradigme] :where [and ;;[<= [rownum] 1] [= [ordklasse] ?ordklasse]]]] (when count [<= [rownum] ?count])]]])) ;; ordklasse (do-query ((ordklasse boy-gruppe) [select [ordklasse] [boy-gruppe] :from [usd-leks-tagger-bm paradigme] :distinct t]) (let ((example (ordklasse-examples ordklasse)) (inflections (select [boy-tekst] :from [usd-leks-tagger-bm boying] :flatp t :where [like [boy-gruppe] ?boy-gruppe]))) (format t "~&~a ~a [~a: ~{(~a)~^ ~}]" ordklasse (car example) boy-gruppe inflections))) (do-query ((&rest rest) [select [boy-tekst] :from [usd-leks-tagger-bm boying] :distinct t]) (format t "~&~{~a ~}" rest)) ;; Inflectional features (do-query ((infl-features) [select [boy-tekst] :from [usd-leks-tagger-bm boying] :distinct t]) (format t "~& (~s (\"+\" \"+\" \"+\" \"+\" \"+\" \"+\"))" infl-features)) (let ((infl-features ())) (dolist (fs (select [boy-tekst] :from [usd-leks-tagger-bm boying] :flatp t :distinct t)) (dolist (f (split fs #\Space)) (unless (find f infl-features :test #'string=) (push f infl-features) (format t "~& (~s \"+\")" f))))) (let ((feature-sets (select [ordklasse] [boy-gruppe] :from [usd-leks-tagger-bm paradigme] :distinct t))) (dolist (features feature-sets) (let ((example (ordklasse-examples (car features))) (inflections (select [boy-tekst] :from [usd-leks-tagger-bm boying] :flatp t :where [= [boy-gruppe] (cadr features)]))) (setf (car features) (split (car features) #\Space)) (push inflections (cddr features)) (push (car example) (cddr features)))) (setf feature-sets (sort feature-sets #'> :key (lambda (set) (length (car set))))) (dolist (features feature-sets) (destructuring-bind (ordklasse boy-gruppe example inflections) features (format t "~& ;; ~a [~a: ~{~a~^ | ~}]~% ((~{~s~^ ~}) (\"+\" \"+\" \"+\" \"+\" \"+\" \"+\"))" example boy-gruppe inflections ordklasse)))) ;; feature mapping from Norsk ordbank to Norgram (print (length (ordklasse-examples "subst mask prop" nil))) (print (length *lemma-features-mapping-table*)) ||# #+orig (defun norgram-morphology-sql (word &key lemma-numbers-p clump-numbers-p genitive-p) (cond ((find word '("." "!" "?") :test #'string=) (list (list 0 word "+Punct" "+Sent"))) ((find word '(",") :test #'string=) (list (list 0 word "+Punct" "+Comma"))) ((find word '(":" ";" "_") :test #'string=) (list (list 0 word "+Punct"))) ((find word '("(" ")" "[" "]" "{" "}") :test #'string=) (list (list 0 word "+Punct" "+Paren"))) ((find word '("-") :test #'string=) (list (list 0 word "+Punct" "+Hyphen"))) ((find word '("#" "&" "=" "+" "*" "@" "§") :test #'string=) (list (list 0 word "+Punct" "+Symbol"))) ((find word '("/") :test #'string=) (list (list 0 word "+Punct" "+Symbol") (list 0 word "+Punct" "+Slash"))) ((find word '("$") :test #'string=) (list (list 0 word "+Punct" "+Money"))) (t (let ((readings (append (select [ordklasse] [boy-tekst] [lemma grunnform] (if lemma-numbers-p [lemma lemma-id] "-") (if clump-numbers-p [klump-id] "-") :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and [= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] ;; outer join syntax [v-logon-tagger-fullform "kjelde-element(+)"]] (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= [boying boy-nummer] [fullformsliste boy-nummer]] [= [boying boy-gruppe] [paradigme boy-gruppe]]]) (select [ordklasse] "-" [lemma grunnform] (if lemma-numbers-p [lemma lemma-id] "-") (if clump-numbers-p [klump-id] "-") :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and [= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] [v-logon-tagger-fullform "kjelde-element(+)"]] (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= 0 [fullformsliste boy-nummer]]])))) #+debug(print (list :readings readings)) (readings-norgram-features word readings :genitive-p genitive-p :lemma-numbers-p lemma-numbers-p :clump-numbers-p clump-numbers-p :sql-p t))))) (defparameter *lemma-id-table* (make-hash-table)) (do-query ((lemma-id klump-id grunnform) [select [lemma lemma-id] [klump-id] [lemma grunnform] :from '([usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :distinct t :where [= [lemma kjelde-element] ;; outer join syntax [v-logon-tagger-fullform "kjelde-element(+)"]] :order-by '(grunnform)]) #+done (when klump-id (setf (gethash klump-id *lemma-id-table*) (cons lemma-id grunnform))) (when (and (null klump-id) (gethash lemma-id *lemma-id-table*)) (print (list :lemma grunnform :id lemma-id :klump (gethash lemma-id *lemma-id-table*)))) ) (print (select [*] :from [usd-leks-tagger-bm v-logon-tagger-fullform] :where [like [grunnform] "rundkjerke%"])) ;; extract tables (let ((excl::*locale* (excl::find-locale "C"))) #-done (with-open-file (stream "projects:xle;morph;norsk-ordbank;lemma.tab" :direction :output :if-exists :supersede) (do-query ((&rest rest) [select [lemma lemma-id] [klump-id] [lemma grunnform] :from '([usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :distinct t :where [= [lemma kjelde-element] ;; outer join syntax [v-logon-tagger-fullform "kjelde-element(+)"]] :order-by '(grunnform)]) (format stream "~{~a~^|~}~%" (mapcar (lambda (val) (or val "")) rest)))) #+done (with-open-file (stream "projects:xle;morph;norsk-ordbank;paradigm.tab" :direction :output :if-exists :supersede) (do-query ((&rest rest) [select [paradigme-id] [boy-gruppe] :from [usd-leks-tagger-bm paradigme] :distinct t]) (format stream "~{~a~^|~}~%" (mapcar (lambda (val) (or val "")) rest)))) #+done (with-open-file (stream "projects:xle;morph;norsk-ordbank;boying.tab" :direction :output :if-exists :supersede) (do-query ((&rest rest) [select [boy-nummer] [boy-gruppe] :from [usd-leks-tagger-bm boying] :distinct t]) (format stream "~{~a~^|~}~%" (mapcar (lambda (val) (or val "")) rest)))) #+done (with-open-file (stream "projects:xle;morph;norsk-ordbank;fullformsliste.tab" :direction :output :if-exists :supersede) (do-query ((&rest rest) [select [lemma-id] [oppslag] [boy-nummer] [paradigme-id] :from [usd-leks-tagger-bm fullformsliste] :distinct t]) (format stream "~{~a~^|~}~%" (mapcar (lambda (val) (or val "")) rest))))) (defun norgram-morphology-sql (word &key lemma-numbers-p clump-numbers-p genitive-p) (cond ((find word '("." "!" "?") :test #'string=) (list (list 0 word "+Punct" "+Sent"))) ((find word '(",") :test #'string=) (list (list 0 word "+Punct" "+Comma"))) ((find word '(":" ";" "_") :test #'string=) (list (list 0 word "+Punct"))) ((find word '("(" ")" "[" "]" "{" "}") :test #'string=) (list (list 0 word "+Punct" "+Paren"))) ((find word '("-") :test #'string=) (list (list 0 word "+Punct" "+Hyphen"))) ((find word '("#" "&" "=" "+" "*" "@" "§") :test #'string=) (list (list 0 word "+Punct" "+Symbol"))) ((find word '("/") :test #'string=) (list (list 0 word "+Punct" "+Symbol") (list 0 word "+Punct" "+Slash"))) ((find word '("$") :test #'string=) (list (list 0 word "+Punct" "+Money"))) (t (let ((readings (select-union [select [ordklasse] [boy-tekst] [lemma grunnform] (if lemma-numbers-p [lemma lemma-id] "-") (if clump-numbers-p [klump-id] "-") :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and [= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] ;; outer join syntax [v-logon-tagger-fullform "kjelde-element(+)"]] (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= [boying boy-nummer] [fullformsliste boy-nummer]] [= [boying boy-gruppe] [paradigme boy-gruppe]]]] [select [ordklasse] "-" [lemma grunnform] (if lemma-numbers-p [lemma lemma-id] "-") (if clump-numbers-p [klump-id] "-") :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and [= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] [v-logon-tagger-fullform "kjelde-element(+)"]] (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= 0 [fullformsliste boy-nummer]]]]))) #+debug(print (list :readings readings)) (readings-norgram-features word readings :genitive-p genitive-p :lemma-numbers-p lemma-numbers-p :clump-numbers-p clump-numbers-p :sql-p t))))) #+test (create-view [usd-leks-tagger-bm logon-klump-fullformliste] :as [union [select [fullformsliste oppslag] [ordklasse] [boy-tekst] [lemma grunnform] [klump-id] :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and ;;[= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] ;; outer join syntax [v-logon-tagger-fullform "kjelde-element(+)"]] #+orig (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= [boying boy-nummer] [fullformsliste boy-nummer]] [= [boying boy-gruppe] [paradigme boy-gruppe]]]] [select [fullformsliste oppslag] [ordklasse] "-" [lemma grunnform] [klump-id] :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] [usd-leks-tagger-bm v-logon-tagger-fullform]) :where [and ;;[= [fullformsliste oppslag] ?word] [= [lemma "kjelde-element"] [v-logon-tagger-fullform "kjelde-element(+)"]] #+orig (when genitive-p [or [like [ordklasse] "subst%"] [like [ordklasse] "adj%"] [like [boy-tekst] "adj%"] #+test[like [boy-tekst] "%%"] #+test[like [boy-tekst] "%%"]]) [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= 0 [fullformsliste boy-nummer]]]]]) #|| (print (select [*] :distinct t :from '([usd-leks-tagger-bm lemma] ) :where [= [grunnform] "flyve"])) (print (select [lemma-id] [paradigme-id] :distinct t :from '([usd-leks-tagger-bm fullformsliste] ) :where [= [oppslag] "flyve"])) (print (select [*] :distinct t :from '([usd-leks-tagger-bm paradigme] ) :where [= [paradigme-id] "000"])) (print (select [*] :from [usd-leks-tagger-bm fullformsliste] :where [= [paradigme-id] "000"])) ||# #+test (print (select [ordklasse] [boy-tekst] [grunnform] [lemma lemma-id] :distinct t :from '([usd-leks-tagger-bm paradigme] [usd-leks-tagger-bm boying] [usd-leks-tagger-bm fullformsliste] [usd-leks-tagger-bm lemma] ) :where [and [= [fullformsliste oppslag] "u-"] [= [paradigme paradigme-id] [fullformsliste paradigme-id]] [= [fullformsliste lemma-id] [lemma lemma-id]] [= [boying boy-nummer] [fullformsliste boy-nummer]] [= [boying boy-gruppe] [paradigme boy-gruppe]]])) ;; ***************************** #+test (print (norgram-morphology-sql "u-")) #+test (morph-server::print-norgram-morphology "spiste") #+test (let* ((cgp::*cg* cgp::*nbo-cg*) (cgp::*tagger* (cgp::multi-tagger cgp::*cg*)) (*package* (find-package :cgp)) (*print-case* :downcase)) (print (cgp::lemma-and-features "spiste" :features-as-list-p t))) #+test (print (select "-" [lemma-id] :flatp nil :from [usd-leks-tagger-bm lemma] :where [< [lemma-id] 3])) #+test (print (compare-analyzers "el")) #+test (print (norgram-morphology-sql "u-")) #+test (do-query ((word) [select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [or [like [oppslag] "-xx%"] [like [oppslag] "%-"]]]) (print word)) #+test (do-query ((word) [select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [or [like [oppslag] "-%"] [like [oppslag] "%flyve%"]]]) (print word)) ;; write morphology to disk #+test (let ((count 0)) (with-open-file (stream "projects:xle;morph;bm-morph.txt" :direction :output :if-exists :supersede) (do-query ((word) [select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [not [like [oppslag] "-%"]]]) (dolist (reading (norgram-morphology-sql word)) (when (zerop (mod (incf count) 1000)) (print (cons count word))) (format stream "~a:~a:~{~a~}~%" word (cadr reading) (cddr reading)) (when (and (intersection (cddr reading) (list "+Noun" "+Adj" "+PastPart" "+PresPart" "+Prop") :test #'string=) (not (find "+Gen" (cddr reading) :test #'string=))) ;; add genitive (format stream "~a~c:~a:~{~a~}+Gen~%" word (if (char= (char word (1- (length word))) #\s) #\' #\s) (cadr reading) (cddr reading))))))) ;; with clumps #+test (let ((count 0) (excl::*locale* (excl::find-locale "C"))) (with-open-file (stream "projects:xle;morph;bm-morph-clumps.txt" :direction :output :if-exists :supersede) (do-query ((word) [select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [not [like [oppslag] "-%"]] ;;[like [oppslag] "A-men%"] :order-by '([oppslag])]) (print word) (let ((readings (norgram-morphology-sql word :lemma-numbers-p nil :clump-numbers-p t))) (if readings (dolist (reading readings) (print (list :reading reading)) (when (zerop (mod (incf count) 100)) (print (list count word reading))) (format stream "~a:~a:~a:~{~a~}~%" word (cadr reading) (car reading) (cddr reading)) (when (and (intersection (cddr reading) (list "+Noun" "+Adj" "+PastPart" "+PresPart" "+Prop") :test #'string=) (not (find "+Gen" (cddr reading) :test #'string=))) ;; add genitive (format stream "~a~c:~a:~a:~{~a~}+Gen~%" word (if (char= (char word (1- (length word))) #\s) #\' #\s) (cadr reading) (car reading) (cddr reading)))) (print (list :no-reading word))))))) #+test (print (norgram-morphology-sql "A-aksje" :lemma-numbers-p nil :clump-numbers-p t)) #+test (print (select [count [oppslag]] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [not [like [oppslag] "-%"]])) #+test (print (select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [like [oppslag] "A-menn%"])) ;; OBS: use compress-morphology() in compress-morphology.lisp! ;; read in and build string-net #+test (let ((string-net (make-instance 'string-net::list-string-net)) (count 0)) (u::with-file-lines (line "projects:xle;morph;bm-morph.txt") (string-net::add-string string-net line) (when (zerop (mod (incf count) 1000)) (print (cons count line)))) ;; missing in lexicon (dolist (line '(".:.:+Punct+Sent" "!:!:+Punct+Sent" "?:?:+Punct+Sent")) (string-net::add-string string-net line)) (string-net::minimize-net string-net) (setf *string-net* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;morph;bm-morph.net")) #+test (let ((string-net (make-instance 'string-net::list-string-net)) (count 0)) (do-query ((word) [select [oppslag] :distinct t :from [usd-leks-tagger-bm fullformsliste] :flatp t :where [not [like [oppslag] "-%"]]]) (dolist (reading (norgram-morphology-sql word)) (when (zerop (mod (incf count) 1000)) (print (cons count word))) (string-net::add-string string-net (format nil "~a:~a:~{~a~}" word (cadr reading) (cddr reading))) (when (and (intersection (cddr reading) (list "+Noun" "+Adj" "+PastPart" "+PresPart") :test #'string=) (not (find "+Gen" (cddr reading) :test #'string=))) (string-net::add-string string-net (format nil "~a~c:~a:~{~a~}+Gen" word (cadr reading) (if (char= (char word (1- (length word))) #\s) #\' #\s) (cddr reading)))))) (string-net::minimize-net string-net) (setf *string-net* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;bm-morph.net")) ;;;;; orthoclumps #|| (defparameter *klump-tree* nil) (let ((count 0) (k-id 0) (l-id nil) (clump ()) (forms-tree (dat::make-string-tree))) (do-query ((klump-id lemma-id grunnform tag) [select [klump-id] [lemma-id] [grunnform] [tag] :from [usd-leks-tagger-bm v-logon-tagger-fullform] ;;:where [in [klump-id] '(1 123 2952 4444 20703 11)] :order-by '([klump-id] [lemma-id])]) (unless (= k-id klump-id) (when (cdr clump) (incf count) (print (cons k-id clump))) (setf clump () k-id klump-id)) (let* ((pos-end (position #\space tag)) (pos (subseq tag 0 pos-end)) (pos (if (and (equal pos "adj") (or (search "" tag) (search "" tag))) "verb" pos))) (pushnew (list grunnform lemma-id pos) clump :test #'equal) #+ignore (when (zerop (mod (incf count) 100)) (print (list klump-id lemma-id grunnform pos))))) (when (cdr clump) (incf count) (print (cons k-id clump))) (print count)) (with-open-file (stream "lisp:projects;xle;morph;orthoclumps.txt" :direction :output :if-exists :supersede) (let ((k-id 0) (l-id 0) #+ignore(stream *standard-output*)) (dat::do-tree-set (string *klump-tree*) (destructuring-bind (clump-id lemma lemma-id pos) (split string #\:) (let ((clump-id (parse-integer clump-id)) (lemma-id (parse-integer lemma-id))) (if (eq k-id clump-id) (format stream " ~a~32t" lemma) (format stream "~6d ~a~32t" clump-id lemma)) (if (eq l-id lemma-id) (format stream " ~a~%" pos) (format stream "~6d ~a~%" lemma-id pos)) (setf k-id clump-id l-id lemma-id)))))) (defun scarrie-style (word &key (net cgp::*scarrie-net*)) (let ((res ()) (cgp::*tagger* (cgp::multi-tagger (gethash "nbo" cgp::*cg-table*))) (restored-word (cgp::restore-string (copy-seq word)))) (dolist (val (string-net::string-values net restored-word)) (destructuring-bind (compressed-lemma style replacement feature-vector) (split val #\: 4) (declare (ignore replacement)) (let ((lemma-list (find-if (lambda (list) (and (string= (car list) compressed-lemma) (string= (cadr list) feature-vector))) res))) (if lemma-list (push style (cddr lemma-list)) (push (list* compressed-lemma feature-vector (list style)) res))))) (mapcar (lambda (lemma-list) (destructuring-bind (compressed-lemma feature-vector . styles-comp) lemma-list ;;(print (cons word lemma-list)) ;; check if this works on Linux (let ((bv (cgp::string-to-bit-vector (map 'array #'string-net::translate-char feature-vector)))) (when styles-comp (cgp::set-feature bv 'cgp::scarrie)) (cgp::set-scarrie-style bv styles-comp) (cons (cgp::translate-string (decompress-string compressed-lemma restored-word)) #-bit-vectors(code-features bv) #+bit-vectors bv)))) res))) (defun find-scarrie-normal-form (clump) (let ((cgp::*tagger* (cgp::multi-tagger (gethash "nbo" cgp::*cg-table*))) (cn nil) (c1 nil) (c2 nil) (all-found-p t)) (dolist (lemma-list clump) (destructuring-bind (lemma id pos) lemma-list (dolist (l+fs (scarrie-style lemma)) (when (and (string= (car l+fs) lemma) (cgp::has-features-p (cdr l+fs) (list (intern (string-upcase pos) :cgp))) ) (cond ((not (cgp::has-feature-p (cdr l+fs) 'cgp::scarrie)) (setf all-found-p nil)) ((cgp::has-feature-p (cdr l+fs) 'cgp::cn) (push lemma-list cn)) ((cgp::has-feature-p (cdr l+fs) 'cgp::c1) (push lemma-list c1)) ((cgp::has-feature-p (cdr l+fs) 'cgp::c2) (push lemma-list c2))))))) #+debug (print (list :cn cn :c1 c1 :c2 c2)) (cond ((not all-found-p) nil) (cn (and (null (cdr cn)) (cons "N" (car cn)))) (c1 (and (null (cdr c1)) (cons "C1" (car c1)))) (c2 (and (null (cdr c2)) (cons "C2" (car c2))))))) (eval-when (:load-toplevel :compile-toplevel :execute) (defparameter *declatin* (connect "declatin" :user-id "system" :password "gvprckvnis" :db-type :oci :if-exists :warn-old))) #+only-once (with-database (*declatin*) ;;(execute :drop-user [logon] :cascade) (create-user [logon] :password "muxluxi") (execute :grant '(connect resource) :to [logon]) (create-table [logon clump-ids] `((id integer :primary-key))) (create-table [logon clumps] `((id integer) (lemma (varchar 32)) (lemma-id integer) (pos (varchar 8)) (norm (varchar 4)) (:foreign-key [id] :references [logon clump-ids] [id]) (:primary-key ([id] [lemma] [lemma-id] [pos]))))) (defun store-clump (id lemmas) (with-database (*declatin*) (with-transaction () (insert-records :into [logon clump-ids] :values (list id)) (dolist (lemma-list lemmas) (if (cdddr lemma-list) (destructuring-bind (lemma norm lemma-id pos) lemma-list (insert-records :into [logon clumps] :values (list id lemma lemma-id pos norm))) (destructuring-bind (lemma lemma-id pos) lemma-list (insert-records :into [logon clumps] :values (list id lemma lemma-id pos nil)))))))) (let ((count 0) (k-id 0) (l-id nil) (clump ()) (forms-tree (dat::make-string-tree))) (with-database (*usdprod*) (do-query ((klump-id lemma-id grunnform tag) [select [klump-id] [lemma-id] [grunnform] [tag] :from [usd-leks-tagger-bm v-logon-tagger-fullform] ;;:where [in [klump-id] '(1 123 2952 4444 20703 11)] :order-by '([klump-id] [lemma-id])]) (unless (= k-id klump-id) #+old (when (cdr clump) (incf count) (print (cons k-id clump))) (when (cdr clump) (let ((normal-form (find-scarrie-normal-form clump))) (when normal-form (push (car normal-form) (cddr normal-form)) (incf count) (when (zerop (mod count 100)) (print (cons k-id clump)))) (store-clump k-id clump))) (setf clump () k-id klump-id)) (let* ((pos-end (position #\space tag)) (pos (subseq tag 0 pos-end)) (pos (if (and (equal pos "adj") (or (search "" tag) (search "" tag))) "verb" pos))) (pushnew (list grunnform lemma-id pos) clump :test #'equal) #+ignore (when (zerop (mod (incf count) 100)) (print (list klump-id lemma-id grunnform pos))))) (when (cdr clump) (let ((normal-form (find-scarrie-normal-form clump))) (when normal-form (push (car normal-form) (cddr normal-form)) (incf count)) (store-clump k-id clump)))) (print count)) (print (select [*] :from [logon clumps] :where [= [id] 3221])) (let ((clump-ids (select [*] :from [logon clump-ids] :flatp t))) (print (length clump-ids))) ||# :eof