;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-

;;; Build script for the CGP string nets.
;;;
;;; The forms below connect to a database through the SQL interface, read
;;; (word, base form, tag) rows from tagger views, add one
;;; "word:compressed-base-form:feature-code" string per row to an
;;; ACTIVE-STRING-NET, then minimize, compress and store each net under
;;; "projects:cgp;nets…".  Later forms dump full forms and lemmata to text
;;; files and build nets from those files.
;;;
;;; NOTE(review): several forms look like interactive scratch work (ad-hoc
;;; SELECTs, PRINT-STRINGS calls).  Presumably the file is evaluated form by
;;; form rather than loaded wholesale -- confirm before loading it.

;; change later
(in-package :cgp)

(cl-user::mk :sql)
;(um:use-module :sql "sql:sql;sql-system")
(use-package :sql)

(initialize-database-type)

;; The bracketed [...] expressions used throughout this file rely on the SQL
;; reader syntax being enabled here.
;(disable-sql-reader-syntax)
(enable-sql-reader-syntax)

; (disconnect)

;; NOTE(review): credentials are hard-coded in source; consider moving them
;; out of the file.
;(connect "Oracle" :user-id "system" :password "gvprckvnis" :if-exists :warn-old)
(connect "OsloTagger" :user-id "paulm" :password "gvprckvnis" :if-exists :warn-old)

#+test
(select [oppslag] [grunnform] [tag] [ordliknande]
        :from [tagger v-tagger-forkortingar]
        ;:where [= [ordliknande] "N"]
        ;:where [< [rownum] 10]
        )

;;; Nets filled by the Bokmål loop below.
(defparameter *abbreviations* (make-instance 'active-string-net))
(defparameter *word-like-abbreviations* (make-instance 'active-string-net))
(defparameter *titles* (make-instance 'active-string-net))
(defparameter *symbols* (make-instance 'active-string-net))
(defparameter *expressions* (make-instance 'active-string-net))

;;; Nets filled by the Nynorsk loop below.
(defparameter *nn-abbreviations* (make-instance 'active-string-net))
(defparameter *nn-word-like-abbreviations* (make-instance 'active-string-net))
(defparameter *nn-titles* (make-instance 'active-string-net))
(defparameter *nn-symbols* (make-instance 'active-string-net))
(defparameter *nn-expressions* (make-instance 'active-string-net))

;(defparameter *lemmata* (make-instance 'active-string-net))

;; Bokmål
;; The three lists are walked in parallel: net, output file name, source view.
;; Both abbreviation nets read the same view and are told apart by the
;; [ordliknande] column ("N" vs. "Y"); the other views are read unfiltered.
(let ((*feature-coding* *bm-coding*))
  (loop for string-net in (list *expressions*
                                *symbols*
                                *titles*
                                *abbreviations*
                                *word-like-abbreviations*)
        for filename in (list "expressions"
                              "symbols"
                              "titles"
                              "abbreviations"
                              "word-like-abbreviations")
        for table in '(:v-tagger-uttrykk
                       :v-tagger-symbol
                       :v-tagger-titlar
                       :v-tagger-forkortingar
                       :v-tagger-forkortingar)
        do (let ((count 0))
             (do-query ((word base-form tag)
                        [select [oppslag] [grunnform] [tag]
                                :from [tagger ?table]
                                :where (cond ((string= filename "abbreviations")
                                              [= [ordliknande] "N"])
                                             ((string= filename "word-like-abbreviations")
                                              [= [ordliknande] "Y"])
                                             (t [= 1 1])) ])
               (add-string string-net
                           (u:concat word
                                     ":"
                                     (compress-string base-form word)
                                     ":"
                                     (bit-vector-to-string
                                      (code-from-features
                                       (u:string-parse tag :whitespace " "))))
                           #+old (u:concat word ":" base-form ":" tag))
               (incf count)
               ;; Progress report every 1000 rows.
               (when (zerop (mod count 1000))
                 (format t "~%~5d ~a" count word)))
             (print count)
             (print (count-strings string-net))
             (print (count-nodes string-net))
             (minimize-tree string-net)
             (calculate-gw-compression-tree string-net)
             (compress-net string-net)
             ;; NOTE(review): this loop writes to "nets1" with extension
             ;; ".net1", while the Nynorsk loop writes "nets"/".net" --
             ;; confirm that the difference is intended.
             (store-net string-net (u:concat "projects:cgp;nets1;" filename ".net1"))
             #+ignore (count-strings string-net))))

;; Nynorsk
;; Same pipeline as the Bokmål loop, reading from the taggeadm-nn schema.
;; The nets are the *nn-…* globals, in the same order as the file names.  The
;; loop previously used `for string-net = (make-instance 'active-string-net)`,
;; which filled a fresh throwaway net on every iteration and left the *nn-…*
;; globals empty, even though *nn-word-like-abbreviations* is printed further
;; down.
(let ((*tagger* *nny-tagger*))
  (loop for string-net in (list *nn-expressions*
                                *nn-symbols*
                                *nn-titles*
                                *nn-abbreviations*
                                *nn-word-like-abbreviations*)
        for filename in (list "nny-expressions"
                              "nny-symbols"
                              "nny-titles"
                              "nny-abbreviations"
                              "nny-word-like-abbreviations")
        for table in '(:v-tagger-uttrykk
                       :v-tagger-symbol
                       :v-tagger-titlar
                       :v-tagger-forkortingar
                       :v-tagger-forkortingar)
        do (let ((count 0))
             (do-query ((word base-form tag)
                        [select [oppslag] [grunnform] [tag]
                                :from [taggeadm-nn ?table]
                                :where (cond ((string= filename "nny-abbreviations")
                                              [= [ordliknande] "N"])
                                             ((string= filename "nny-word-like-abbreviations")
                                              [= [ordliknande] "Y"])
                                             (t [= 1 1]))])
               (add-string string-net
                           (u:concat word
                                     ":"
                                     (compress-string base-form word)
                                     ":"
                                     (bit-vector-to-string
                                      (code-from-features
                                       (u:string-parse tag :whitespace " "))))
                           #+old (u:concat word ":" base-form ":" tag))
               (incf count)
               ;; Progress report every 1000 rows.
               (when (zerop (mod count 1000))
                 (format t "~%~5d ~a" count word)))
             (print count)
             (print (count-strings string-net))
             (print (count-nodes string-net))
             (minimize-tree string-net)
             (calculate-gw-compression-tree string-net)
             (compress-net string-net)
             (store-net string-net (u:concat "projects:cgp;nets;" filename ".net"))
             #+ignore (count-strings string-net))))

(select [count [*]] :from [taggeadm-nn :v-fuge-fullform])

;; Dump the Nynorsk full forms to a text file, one "word:paradigm-id:infl-nr"
;; line per row; infl-nr is truncated to an integer before printing.
(let ((count 0)
      (*tagger* *nny-tagger*))
  (with-open-file (stream "projects:cgp;nets;nny-fullforms.txt"
                          :direction :output
                          :if-exists :supersede
                          :if-does-not-exist :create)
    (do-query ((word paradigm-id infl-nr)
               [select [oppslag] [paradigme-id] [boy-nummer]
                       ;:distinct t
                       :from [taggeadm-nn v-fuge-fullform]
                       ;:where [like [oppslag] "a%"]
                       ])
      (write-line (u:concat word ":" paradigm-id ":" (format nil "~d" (truncate infl-nr)))
                  stream)
      (incf count)
      (when (zerop (mod count 1000))
        (format t "~%~5d ~a" count word)))))

;; Dump the Nynorsk lemmata to a text file in the same line format.
(let ((count 0)
      (*tagger* *nny-tagger*))
  (with-open-file (stream "projects:cgp;nets;nny-lemmata.txt"
                          :direction :output
                          :if-exists :supersede
                          :if-does-not-exist :create)
    (do-query ((word paradigm-id infl-nr)
               [select [oppslag] [paradigme-id] [boy-nummer]
                       ;:distinct t
                       :from [taggeadm-nn v-fuge-lemma]
                       ;:where [like [oppslag] "a%"]
                       ])
      (write-line (u:concat word ":" paradigm-id ":" (format nil "~d" (truncate infl-nr)))
                  stream)
      (incf count)
      (when (zerop (mod count 1000))
        (format t "~%~5d ~a" count word)))))

;; NOTE(review): this form rewrites the same "nny-lemmata.txt" as the form
;; above (differing only in the table designator, :v-fuge-lemma).  Its NET
;; never receives a string because the ADD-STRING call is commented out, so
;; the minimize/compress steps run on a net with nothing added, and the
;; STORE-NET call is commented out too.  It looks superseded by the
;; file-based build that follows -- confirm and consider removing it.
(let ((count 0)
      (*tagger* *nny-tagger*)
      (net (make-instance 'active-string-net)))
  (with-open-file (stream "projects:cgp;nets;nny-lemmata.txt"
                          :direction :output
                          :if-exists :supersede
                          :if-does-not-exist :create)
    (do-query ((word paradigm-id infl-nr)
               [select [oppslag] [paradigme-id] [boy-nummer]
                       ;:distinct t
                       :from [taggeadm-nn :v-fuge-lemma]
                       ;:where [like [oppslag] "a%"]
                       ])
      (let ((line (u:concat word ":" paradigm-id ":" (format nil "~d" (truncate infl-nr)))))
        (write-line line stream)
        ;(add-string net line)
        (incf count)
        (when (zerop (mod count 1000))
          (format t "~%~5d ~a" count word)))))
  (print count)
  (minimize-tree net)
  (calculate-gw-compression-tree net)
  (compress-net net)
  ;(store-net net (u:concat "projects:cgp;nets;nny-lemmata.net"))
  )

;; Build and store the Nynorsk lemma net from the text dump written above.
;; NOTE(review): WITH-FILE-LINES is unqualified here but written
;; U:WITH-FILE-LINES in the #+only-once forms further down -- confirm that
;; the symbol is accessible in the CGP package.
(let ((count 0)
      (*tagger* *nny-tagger*)
      (net (make-instance 'active-string-net)))
  (with-file-lines (line "projects:cgp;nets;nny-lemmata.txt")
    (when (zerop (mod (incf count) 1000))
      (format t "~%~5d ~a" count line))
    (add-string net line))
  (print count)
  (minimize-tree net)
  (calculate-gw-compression-tree net)
  (compress-net net)
  (store-net net "projects:cgp;nets;nny-lemmata.net"))

;; Build and store the Nynorsk full-form net from its text dump.
(let ((count 0)
      (*tagger* *nny-tagger*)
      (net (make-instance 'active-string-net)))
  (with-file-lines (line "projects:cgp;nets;nny-fullforms.txt")
    (when (zerop (mod (incf count) 1000))
      (format t "~%~5d ~a" count line))
    (add-string net line))
  (print count)
  (minimize-tree net)
  (calculate-gw-compression-tree net)
  (compress-net net)
  (store-net net "projects:cgp;nets;nny-fullforms.net"))

(pp #'select [owner] [table-name]
    :from [all-tables]
    :distinct t
    :where [like [owner] "TAGGEADM%"])

;(defparameter *test-net* (make-instance 'huge-active-string-net))

(print-strings *nn-word-like-abbreviations*)

#+test
(let ((count 0)
      (string-net *test-net*))
  (do-query ((word base-form tag)
             [select [oppslag] [grunnform] [tag]
                     :from [tagger v-tagger-uttrykk]])
    (add-string string-net (u:concat word ":" base-form ":" tag))
    (incf count)
    (when (zerop (mod count 1000))
      (format t "~%~5d ~a" count word)))
  (print count)
  #+ignore
  (progn
    (print (count-strings string-net))
    (print (count-nodes string-net))
    (minimize-tree string-net)
    (calculate-compression-mapping string-net)
    (compress-net string-net :iterate t)
    (count-strings string-net)))

;(minimize-tree *test-net*)
;(calculate-compression-mapping *test-net*)
;(compress-net *test-net* :iterate t)
;(print-strings *test-net*)

#|
(defparameter *abbreviations* (load-string-net "projects:cgp;nets;abbreviations.net"))
(defparameter *word-like-abbreviations* (load-string-net "projects:cgp;multitagger;word-like-abbreviations.net"))
(defparameter *titles* (load-string-net "projects:cgp;multitagger;titles.net"))
(defparameter *symbols* (load-string-net "projects:cgp;multitagger;symbols.net"))
(defparameter *expressions* (load-string-net "projects:cgp;multitagger;expressions.net"))
|#

;; Write every string of each Bokmål net to a ".text" file.  The list
;; alternates net and file name and is consumed two elements at a time.
#+only-once
(let ((*feature-coding* *bm-coding*))
  (loop for (net file)
          on (list
              ;*fullforms* "projects:cgp;multitagger;fullforms.text"
              ;*lemmata* "projects:cgp;multitagger;lemmata.text"
              *abbreviations* "projects:cgp;nets;abbreviations.text"
              *word-like-abbreviations* "projects:cgp;nets;word-like-abbreviations.text"
              *titles* "projects:cgp;nets;titles.text"
              *symbols* "projects:cgp;nets;symbols.text"
              *expressions* "projects:cgp;nets;expressions.text")
        by #'cddr
        do (with-open-file (stream file :direction :output :if-exists :supersede)
             (map-strings net (lambda (line)
                                (write-line line stream))))))

;; Rebuild the full-form and lemma nets from their ".text" files; each line
;; is added to the net unchanged.  The list alternates input file and output
;; net file.
#+only-once
(loop for (file net-file)
        on (list "projects:cgp;nets;fullforms.text" "projects:cgp;nets1;fullforms.net"
                 "projects:cgp;nets;lemmata.text" "projects:cgp;nets1;lemmata.net")
      by #'cddr
      do (let ((string-net (make-instance 'active-string-net))
               (count 0))
           (print file)
           (u:with-file-lines (line file)
             (add-string string-net line)
             (when (zerop (mod (incf count) 1000))
               (format t "~%~5d ~a" count line)))
           (print count)
           (minimize-tree string-net)
           (calculate-gw-compression-tree string-net)
           (compress-net string-net :iterate t)
           (print (count-strings string-net))
           (store-net string-net net-file)))

;; Rebuild the remaining Bokmål nets from their ".text" files.  Each line is
;; split on ":" into word, base form and feature string; the entry is then
;; re-encoded the same way as in the database-driven loops above.
#+only-once
(let ((*feature-coding* *bm-coding*))
  (loop for (file net-file)
          on (list "projects:cgp;nets;abbreviations.text"
                   "projects:cgp;nets1;abbreviations.net"
                   "projects:cgp;nets;word-like-abbreviations.text"
                   "projects:cgp;nets1;word-like-abbreviations.net"
                   "projects:cgp;nets;titles.text"
                   "projects:cgp;nets1;titles.net"
                   "projects:cgp;nets;symbols.text"
                   "projects:cgp;nets1;symbols.net"
                   "projects:cgp;nets;expressions.text"
                   "projects:cgp;nets1;expressions.net")
        by #'cddr
        do (let ((string-net (make-instance 'active-string-net))
                 (count 0))
             (print file)
             (u:with-file-lines (line file)
               (destructuring-bind (word base-form feature-string)
                   (u:string-parse line :whitespace ":")
                 (let ((features (u:string-parse feature-string :whitespace " ")))
                   (add-string string-net
                               (u:concat word
                                         ":"
                                         (compress-string base-form word)
                                         ":"
                                         (bit-vector-to-string (code-from-features features)))
                               #+old (u:concat word ":" base-form ":" tag))
                   (when (zerop (mod (incf count) 1000))
                     (format t "~%~5d ~a" count line)))))
             (print count)
             (minimize-tree string-net)
             (calculate-gw-compression-tree string-net)
             (compress-net string-net :iterate t)
             (print (count-strings string-net))
             (store-net string-net net-file))))

(lemma-and-features "i morgen" :net *expressions*)

(print-strings *abbreviations*)