;;;-*- Mode: Lisp; Package: CL-USER -*- (in-package :cl-user) (defparameter *feature-table* (make-hash-table :test #'equal)) ;; copy from feature-coding-latin1.lisp (defun bit-vector-to-string (cv) (declare (optimize (speed 3) (safety 0))) (let ((string (make-string (ceiling (/ (length cv) 8)))) (byte 0)) (loop for bit across cv with i = 0 and pos = 0 do (when (= bit 1) (setf byte (logxor byte (ash 1 i)))) (when (= (incf i) 8) (setf i 0 (char string pos) (code-char byte) byte 0) (incf pos)) finally (unless (zerop i) (setf (char string pos) (code-char byte)))) ;; *** SOMEWHAT BUGGY? (let ((last-non-null (position-if-not (lambda (c) (char= c #\Null)) string :from-end t))) (if last-non-null (subseq string 0 (1+ last-non-null)) "")))) (defparameter *bm-morph-feature-vector* #("Noun" "Prop" "Adj" "Verb" "Aux" "Adv" "Prep" "Interj" "Conj" "Pron" "DetPron" "Ord" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Sent" "Num" "Card" "Pos" "Comp" "Sup" "PastPart" "PresPart" "Masc" "Fem" "Neut" "MF" "MFN" "Def" "Indef" "NoDef" "Def/Pl" "Sg" "Pl" "SP" "Nom" "Acc" "Gen" "Pres" "Past" "Impv" "Infin" "SForm" "Unknown")) (let ((n -1) (count 0)) (clrhash *feature-table*) (loop for i from 0 for f across *bm-morph-feature-vector* do (setf (gethash f *feature-table*) i))) (defun compress-features (feature-string bv) (dotimes (i (length bv)) (setf (sbit bv i) 0)) (let ((features (u::split feature-string #\+ nil nil t))) (dolist (f features) (let ((code (gethash f *feature-table*))) (when (null code) (error "No code for ~s" f)) (setf (sbit bv code) 1))) (bit-vector-to-string bv))) #+test (compress-morphology) #+old (defparameter *bm-morph-feature-vector* #("Noun" "Gen" "Def" "Indef" "Sg" "Masc" "Pl" "Neut" "MF" "Adj" "Verb" "MFN" "SP" "Pos" "PastPart" "Def/Pl" "Sup" "Fem" "NoDef" "Comp" "SForm" "Pres" "Infin" "PresPart" "Past" "Prop" "Impv" "Adv" "Prep" "Interj" "Num" "Card" "Aux" "Conj" "Pron" "DetPron" "Ord" "Acc" "Nom" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf")) #+test (let ((n -1) (count 0)) (clrhash *feature-table*) (u::with-file-lines (line "projects:xle;bm-morph.txt") (destructuring-bind (word lemma features) (u::split line #\: 3) (when (zerop (mod (incf count) 10000)) (print (cons count line))) (let ((fl (u::split features #\+ nil nil t))) (dolist (f fl) (let ((pair (gethash f *feature-table*))) (if pair (incf (car pair)) (setf (gethash f *feature-table*) (cons 0 (incf n))))))))) (let* ((f-list (sort (u::collecting (maphash (lambda (key val) (u::collect (cons key (car val)))) *feature-table*)) #'> :key #'cdr)) (n -1)) (format t "(defparameter *bm-morph-feature-vector*~% #(~{~s~^ ~}))" (mapcar #'car f-list)) (dolist (pair f-list) (setf (cdr (gethash (car pair) *feature-table*)) (incf n))))) (defparameter *bv* nil) ;;#+test (defun compress-morphology () (let ((string-net (make-instance 'string-net::list-string-net)) (count 0) (bv (make-array (let ((size 0)) (maphash (lambda (key val) (declare (ignore key)) (setf size (max size val))) *feature-table*) (1+ size)) :element-type 'bit :initial-element 0))) (block test (u::with-file-lines (line "projects:xle;morph;bm-morph.txt") (destructuring-bind (word lemma features) (u::split line #\: 3) (unless (find word '("I" "Å") :test #'string=) (let ((compressed-string (u::concat word ":" (string-net::compress-string lemma word) ":" (compress-features features bv)))) (string-net::add-string string-net compressed-string) ;;(when (= count 5000) (return-from test)) (when (zerop (mod (incf count) 1000)) (print (cons count compressed-string)))))))) #+prog (dolist (addition '(("Vollavik" ))) (destructuring-bind (word lemma features) addition (let ((compressed-string (u::concat word ":" (string-net::compress-string lemma word) ":" (compress-features features bv)))) (string-net::add-string string-net compressed-string) ;;(when (= count 5000) (return-from test)) (when (zerop (mod (incf count) 1000)) (print (cons count compressed-string)))))) (string-net::minimize-net string-net) (setf *bv* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;morph;bm-morph.net"))) #+test (compress-morphology) #+old (defun compress-morphology-clumps () (let ((string-net (make-instance 'string-net::list-string-net)) (count 0) (bv (make-array (let ((size 0)) (maphash (lambda (key val) (declare (ignore key)) (setf size (max size val))) *feature-table*) (1+ size)) :element-type 'bit :initial-element 0))) (block test (u::with-file-lines (line "projects:xle;morph;bm-morph-clumps.txt" :external-format :iso-8859-1) (destructuring-bind (word lemma clump-id features) (u::split line #\: 4) (unless (find word '("I" "Å") :test #'string=) (let ((compressed-string (u::concat word ":" (string-net::compress-string lemma word) ":" clump-id ":" (compress-features features bv)))) (string-net::add-string string-net compressed-string) ;;(when (= count 5000) (return-from test)) (when (zerop (mod (incf count) 1000)) (print (cons count compressed-string)))))))) (string-net::minimize-net string-net) (setf *bv* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;morph;bm-morph-clumps.net"))) #+old (defun compress-morphology-clumps () (let ((string-net (make-instance 'string-net::list-string-net)) (count 0) (bv (make-array (let ((size 0)) (maphash (lambda (key val) (declare (ignore key)) (setf size (max size val))) *feature-table*) (1+ size)) :element-type 'bit :initial-element 0))) (block test (u::with-file-lines (line "projects:xle;morph;bm-morph-clumps.txt" :external-format :iso-8859-1) (destructuring-bind (word lemma clump-id features) (u::split line #\: 4) (unless (find word '("I" "Å") :test #'string=) (let ((compressed-string (u::concat word ":" (string-net::compress-string lemma word) ":" (compress-features features bv) ":" clump-id))) (string-net::add-string string-net compressed-string) ;;(when (= count 5000) (return-from test)) (when (zerop (mod (incf count) 1000)) (print (cons count compressed-string)))))))) (string-net::minimize-net string-net) (setf *bv* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;morph;bm-morph-clumps-1.net"))) (defparameter *def-table* (make-hash-table)) (u::with-file-lines (line "projects:xle;morph;norsk-ordbank;bm-def.tab" :external-format :iso-8859-1) (destructuring-bind (clump-id def-id main-def-p definition) (u:split line #\| 4) (setf (gethash (parse-integer clump-id) *def-table*) (if (gethash (parse-integer clump-id) *def-table*) (u:concat (gethash (parse-integer clump-id) *def-table*) "; " definition) definition)))) (defparameter *significant-k-numbers* ()) (defparameter *k-numbers-def-table* (make-hash-table)) (dat::with-file-lines (line "projects:xle;morph;bm-morph-k-list.txt") (destructuring-bind (lemma pos genus k def) (dat::split line #\tab 5) (dolist (k (dat::split k #\+)) (let ((k (parse-integer k))) (unless (zerop k) (setf (gethash k *k-numbers-def-table*) def) (pushnew k *significant-k-numbers*)))))) (defun compress-morphology-clumps (&key (store-net-p t)) (let ((string-tree (dat::make-string-tree)) (count 0) (clump-count 0) (duplicate-count 0) (string-net (make-instance 'string-net::list-string-net)) (bv (make-array (let ((size 0)) (maphash (lambda (key val) (declare (ignore key)) (setf size (max size val))) *feature-table*) (1+ size)) :element-type 'bit :initial-element 0))) (u::with-file-lines (line "projects:xle;morph;bm-morph-clumps.txt" :external-format :iso-8859-1) (destructuring-bind (word lemma clump-id features) (u::split line #\: 4) (let ((pos (cadr (u::split features #\+ 3)))) (unless (find word '("I" "Å") :test #'string=) (let ((clump-id (parse-integer clump-id))) (if (find clump-id (dat::string-tree-get string-tree (u:concat lemma ":" pos)) :key #'caar) (push (u:concat word ":" features) (cdr (find clump-id (dat::string-tree-get string-tree (u:concat lemma ":" pos)) :key #'caar))) (push (list (list clump-id) (u:concat word ":" features)) (dat::string-tree-get string-tree (u:concat lemma ":" pos)))) (when (zerop (mod (incf count) 100000)))))))) (print count) (dat::do-string-tree (word clump-list string-tree) (when (cdr clump-list) ;; more than one clump (incf clump-count) (loop for (clump . rest) on clump-list do (loop for clump1 in rest do (when (and (car clump) (= (length clump) (length clump1)) (= (length (cdr clump)) (length (intersection (cdr clump) (cdr clump1) :test #'string=)))) (push (caar clump1) (car clump)) (setf (car clump1) ()) ;;(print (length clump-list)) (incf duplicate-count)))))) (with-open-file (pos-stream "projects:xle;morph;bm-lemma-pos-k-list.txt" :direction :output :if-exists :supersede) (with-open-file (stream "projects:xle;morph;bm-morph-k-list.txt" :direction :output :if-exists :supersede) (dat::do-string-tree (lemma clump-list string-tree) ;;(print (list lemma clump-list)) (let ((effective-clump-count (count-if #'car clump-list))) (dolist (clump clump-list) (when (car clump) (dolist (word+features (cdr clump)) (destructuring-bind (word features) (u::split word+features #\: 2) (let ((compressed-string (format nil "~a:~a:~a:~{~d~^+~}" word (string-net::compress-string (car (u::split lemma #\: 2)) word) (compress-features features bv) (when (or (> effective-clump-count 1) (loop for k in (car clump) thereis (find k *significant-k-numbers*))) (car clump))))) ;;(when (> effective-clump-count 1) (print compressed-string)) ;;(write-line compressed-string stream) (string-net::add-string string-net compressed-string) (when (zerop (mod (incf count) 1000))#+ignore (print (list* count compressed-string clump-list)))))) (destructuring-bind (lemma pos) (u::split lemma #\: 2) (format pos-stream "~a~c~a~c~{~d~^+~}~%" lemma #\tab pos #\tab (car clump))) (when (> effective-clump-count 1) (destructuring-bind (lemma pos) (u::split lemma #\: 2) (format stream "~a~c~a~c~a~c~{~d~^+~}~c~{~d~^ | ~}~%" lemma #\tab pos #\tab (if (string= pos "Noun") (cond ((search "+Masc" (cadr clump)) "+Masc") ((search "+Fem" (cadr clump)) "+Fem") ((search "+Neut" (cadr clump)) "+Neut") ((search "+MF" (cadr clump)) "+MF") (t "")) "") #\tab (car clump) #\tab (mapcar (lambda (clump-id) (or (find clump-id *significant-k-numbers*) (gethash clump-id *def-table*) "")) (car clump))))))))))) (when store-net-p (string-net::minimize-net string-net) (setf *bv* string-net) (print (string-net::count-strings string-net)) (string-net::write-string-net string-net "projects:xle;morph;bm-morph-clumps.net")) (print (list clump-count duplicate-count)))) #+test (compress-morphology-clumps :store-net-p t) #+test (defparameter *bm* (string-net::read-net "~/lisp/projects/xle/morph/bm-morph-clumps.net" :translate-p nil)) :eof