;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- (in-package "CGP") (cl-user::mk :sql) (use-package :sql) (initialize-database-type) ;(disable-sql-reader-syntax) (enable-sql-reader-syntax) (connect "Oracle" :user-id "system" :password "gvprckvnis" :if-exists :warn-old) #+test (select [oppslag] [paradigme_id] [boy_nummer] :from [tagger v-fuge-fullform] :where [like [oppslag] "alfabetis%"] ) ;(defparameter *fullforms* (make-instance 'active-string-net)) ;(defparameter *lemmata* (make-instance 'active-string-net)) ;; bugs: ;; EÿS-avtale ;; < is typo in: triki entrée ;; destructive! (defun translate-string (str) (loop for c across str for i from 0 do (let ((pos (position c "£ÿÄÒÙѽǥ+§"))) (when pos (setf (char str i) (char "ëÖÖïàíçäöôü" pos))))) str) #+test (progn ;time (let ((count 0)) (reset-net *fullforms*) (do-query ((word paradigm-id infl-nr) [select [oppslag] [paradigme-id] [boy-nummer] ;:distinct t :from [tagger v-fuge-fullform] :where [like [oppslag] "a%"]]) (add-string *fullforms* (u:concat word ":" paradigm-id ":" (format nil "~d" (truncate infl-nr)))) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count word))) (print count) (print (count-strings *fullforms*)) (print (count-nodes *fullforms*)) (minimize-tree *fullforms*) (calculate-compression-mapping *fullforms*) (compress-net *fullforms* :iterate t) (count-strings *fullforms*))) #+test (print-strings *fullforms*) #+test (store-net *fullforms* "projects:cgp;multitagger;fullforms.net") #+test (store-net *lemmata* "projects:cgp;multitagger;lemmata.net") #+test (match-string *fullforms* "abbreviert") #+test (defun create-tagger-index (file string-net) (let ((count 0)) (with-file-lines (line file) (add-string string-net line) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count line))) (print count) (print (count-strings string-net)) (print (count-nodes string-net)) (minimize-tree string-net) (calculate-compression-mapping string-net) (compress-net string-net :iterate t) (count-strings string-net))) #+test (time (create-tagger-index "projects:cgp;multitagger;lemmata.text" *lemmata*)) #+test (destructuring-bind (fullform code inflection-nr lemma+features) (u:string-parse "forråtnet:011:6:\"forråtne\" adj nøyt ub ent" :whitespace ":") (declare (ignore fullform code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (print features))) #+test (let ((feature-list ())) (maphash (lambda (feature count) (push (cons feature count) feature-list)) *statistics-table*) (dolist (f+c (sort feature-list #'> :key #'cdr)) (format t "~%~6d ~a" (cdr f+c) (car f+c)))) #+test (let ((feature-list ())) (maphash (lambda (feature count) (push (cons feature count) feature-list)) *statistics-table*) (mapcar #'car (sort feature-list #'> :key #'cdr))) #+scarrie (loop for i from 0 for feature in ;; 71 '("noun" "m" "sg" "def" "indef" "n" "pl" "f" "adj" "pos" "verb" "main" "indic" "active" "e" "pastpart" "pres" "trans" "intrans" "passive" "inf" "pret" "endepart" "imp" "dummysubj" "sup" "ref" "adv" "comp" "ditrans" "prep" "interj" "transref" "det" "quant" "scomp" "subj" "pro" "infcomp" "acc" "nom" "pers" "p3" "acomp" "aux" "ncomp" "p2" "objacomp" "dem" "poss" "p1" "objncomp" "conj" "accinf" "int" "svarord" "refl" "pn" "rec" "aa" "expl" "objscomp" "objinfcomp" "refscomp" "refinfcomp" "gap-s" "gap-e" "hyphen" "punctuation" "comma" "stop") do (setf (gethash feature *feature-table*) i)) #+scarrie (defun translate-scarrie-feature (feature) (loop for (scarrie-f . f) in '(("mfn" "m" "f" "n") ("sgpl" "sg" "pl") ("fm" "m") ("fn" "f" "n") ("mf" "m" "f") ("mn" "m" "n") ("nogender" "m" "f" "n") ; ?? ("nomacc" "nom" "acc") ("defindef" "def" "indef") ("p123" "p1" "p2" "p3") ("infpres" "pres" "inf") ("endepart" "endepart" "adj" "m" "f" "n" "pos" "def" "indef" "sg" "pl")) when (string-equal feature scarrie-f) return f)) ;(translate-scarrie-feature "mfn") ;; translates a list of Scarrie features into a bit vector #+scarrie (defun code-scarrie-features (features) (let ((feature-code (make-array 80 :element-type 'bit :initial-element 0)) (first-p t)) (dolist (feature features) (let* ((f (string-downcase (symbol-name feature))) (f-list (translate-scarrie-feature f))) (cond (f-list (dolist (f f-list) (let ((code (gethash f *feature-table*))) (when code (setf (bit feature-code code) 1))))) (t (when first-p (setf f (case feature (N "noun") (V "verb") (P "prep") (otherwise f)) first-p nil)) (let ((code (gethash f *feature-table*))) (when code (setf (bit feature-code code) 1))))))) feature-code)) #+scarrie (defun scarrie-features-statistics (features code-vector) (let ((first-p t)) (dolist (feature features) (let* ((f (string-downcase (symbol-name feature))) (f-list (translate-scarrie-feature f))) (cond (f-list (dolist (f f-list) (let ((code (gethash f *feature-table*))) (when code (incf (aref code-vector code)))))) (t (when first-p (setf f (case feature (N "noun") (V "verb") (P "prep") (otherwise f)) first-p nil)) (let ((code (gethash f *feature-table*))) (when code (incf (aref code-vector code)))))))))) #+test (defun order-features-by-frequency () (mapcar (lambda (i) (aref *feature-vector* i)) (stable-sort (loop for i from 0 to *code-vector-length* collect i) (lambda (x y) (> (aref *code-vector* x) (aref *code-vector* y)))))) ; calculate feature frequency #+test (do-query ((features) [select [conc "(" [replace [gram-info] "_" " "] ")"] :from [lexicon] ;:where [< [rownum] 1000] ]) (scarrie-features-statistics (read-from-string features) *code-vector*)) #+test (do-query ((id word-form features) [select [id] [word-form] [conc "(" [replace [gram-info] "_" " "] ")"] :from [scarrie lexicon] :where [< [rownum] 1000]]) (print (code-scarrie-features (read-from-string features)))) #+test (string-to-bit-vector (bit-vector-to-string #*00100011110000001110000100000001000000000000000000000000000000000000000000000000 )) ;; base forms #+test (time (let ((count 0)) (reset-net) (do-query ((word) [select [base-form] :distinct t :from [lexicon] ;:where [< [rownum] 100000] ]) (add-word word) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count word))) (print count) (print (count-words)) (print (count-nodes)) (minimize-tree) (print (count-words)) (count-nodes))) #+test (do-query ((features) [select [conc "(" [replace [gram-info] "_" " "] ")"] :from [lexicon] :where [< [rownum] 1000]]) (print (code-scarrie-features (read-from-string features)))) #+test (time (let ((count 0)) (reset-net) (do-query ((word features) [select [conc [word-form] ":" [base-form]] [conc "(" [replace [gram-info] "_" " "] ")"] :from [lexicon] :where [< [rownum] 100000]]) (add-word (u:concat word ":" (bit-vector-to-string (code-scarrie-features (read-from-string features))))) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count word))) (print count) (print (count-words)) (print (count-nodes)) (minimize-tree) (print (count-words)) (count-nodes))) #+test (count-nodes) #+test (select [count [*]] :from [select [word-form] [gram-info] :distinct t :from [lexicon] :where [< [rownum] 200000]]) #+test (match-word "hukommelse") ;; word-form + base-form #+test (time (let ((count 0)) (reset-net) (do-query ((word base) [select [word-form] [base-form] :distinct t :from [lexicon] ;:where [< [rownum] 100000] ]) (add-word (u:concat word ":" (compress-word base word))) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count word))) (print count) (print (count-words)) (print (count-nodes)) (minimize-tree) (print (count-words)) (count-nodes))) ;-> (65012 154520) #+test (string-baseform "bundet") ;; base-form + word-form #+test (time (let ((count 0)) (reset-net) (do-query ((word base) [select [word-form] [base-form] :distinct t :from [lexicon]]) (add-word (u:concat base ":" (compress-word word base))) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count word))) (print count) (print (count-words)) (print (count-nodes)) (minimize-tree) (print (count-words)) (count-nodes))) ;-> (61172 125630) #+test (word-baseform "fint") #| (time (calculate-compression-mapping *scarrie*)) ; 8 sec (time (compress-net *scarrie*)) ; 13 sec (time (compress-net *scarrie* :iterate t)) ; 46 sec (print-words *scarrie*) (count-words *scarrie*) |# ;; scarrie text size: 13,7 MB ;; StuffIt archive: 1,7 MB ;; word-net: 948.552 byte ;; word-net (absolute addresses): 710.630 byte ;; word-net (relative addresses): 599.589 byte (time (count-words *scarrie*)) ; 33,8 sec ; 35,2 ; 32,8 (time (count-words *scarrie*)) (match-word *scarrie* "dativ") (print-words *scarrie*) (defun create-tagger-index (file string-net) (let ((count 0)) (with-file-lines (line file) (add-string string-net line) (incf count) (when (zerop (mod count 1000)) (format t "~%~5d ~a" count line))) (print count) (print (count-strings string-net)) (print (count-nodes string-net)) (minimize-tree string-net) (calculate-compression-mapping string-net) (compress-net string-net :iterate t) (count-strings string-net))) (defparameter *bm-lexicon* (make-instance 'active-string-net)) #+test (store-net *bm-lexicon* "projects:cgp;multitagger;bm-lexicon.net") (defparameter *bm-code-lexicon1* (make-instance 'active-string-net)) (defparameter *bm-code-lexicon2* (make-instance 'active-string-net)) #+test (store-net *bm-code-lexicon1* "projects:cgp;multitagger;bm-code-lexicon1.net") #+test (store-net *bm-code-lexicon2* "projects:cgp;multitagger;bm-code-lexicon2.net") ; 1205212 ; ;sorting ... ;minimizing (1) ... [1142946] ;sorting ... [4855] [4655] [4546] 0 trimmed [4351] (print-strings *nn-code-lexicon*) (string-values *nn-lexicon* "tommelfinger") ; [6944] word code lemma infl-nr features ; [7372] word code infl-nr lemma features ; [8016] word infl-nr code lemma features #+only-once (time (let ((*feature-coding* *nn-coding*)) (setf *nn-lexicon* nil) (defparameter *nn-code-lexicon* (make-instance 'huge-active-string-net)) (let ((count 0) (string-net *nn-code-lexicon*)) (block add (u:with-file-lines (line "projects:cgp;nets;nn-lexicon.txt") ;(when (> count 5000) (return-from add)) (setf line (u:subst-substrings line '("< " "<" " >" ">"))) (when t ; (> count 600001) (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (dynamic-extent word code lemma+features)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) #+ignore (setf word (translate-string word) lemma (translate-string lemma)) (add-string string-net (u:concat word ":" (%alpha-to-decimal (u:concat code ":" inflection-nr)) ":" (compress-string (string-trim "\"" lemma) word) ":" (bit-vector-to-string (code-from-features features))))) (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word)))))) (print count) (minimize-tree string-net) (calculate-gw-compression-tree string-net) (compress-net string-net :iterate t) (store-net string-net "projects:cgp;nets;nn-code-lexicon.net") (count-strings string-net)))) #+only-once (time (progn (setf *bm-lexicon* nil) (defparameter *bm-code-lexicon* (make-instance 'huge-active-string-net)) (let ((count 0) (string-net *bm-code-lexicon*)) (block add (u:with-file-lines (line "projects:cgp;nets;bm-lexicon.txt") ;(when (> count 5000) (return-from add)) (when t ; (> count 600001) (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (setf word (translate-string word) lemma (translate-string lemma)) (add-string string-net (u:concat word ":" (%alpha-to-decimal (u:concat code ":" inflection-nr)) ":" (compress-string (string-trim "\"" lemma) word) ":" (bit-vector-to-string (code-from-features features))))) (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word)))))) (print count) (minimize-tree string-net) (calculate-gw-compression-tree string-net) (compress-net string-net :iterate t) (store-net string-net "projects:cgp;nets1;bm-code-lexicon.net") (count-strings string-net)))) ;(time (count-strings *bm-code-lexicon*)) ; => 1143814; (89.161 seconds) ;(print-strings *nn-code-lexicon*) ;(alpha-to-decimal "0034:4") ; *tagger* ;(%decode-str (%alpha-to-decimal "0034:4")) #+copy (defun %alpha-to-decimal (code) (multiple-value-bind (code form) (alpha-to-decimal code) (let ((int (+ (* code 16) form))) (with-output-to-string (stream) (loop with i = int until (zerop i) do (write-char (code-char (logand i 255)) stream) (setf i (ash i -8))))))) #+copy (defun %decode-str (str) (let ((code 0)) (loop for i from 0 and c across str do (setf code (+ code (ash (char-code c) (* 8 i))))) (floor code 16))) ;(fullform-values "bøkkere") #+only-once (time (progn (defparameter *bm-lexicon* (make-instance 'active-string-net)) (let ((count 0) (string-net *bm-lexicon*)) (block add (u:with-file-lines (line "projects:cgp;nets;bm-lexicon.txt") ;(when (> count 5000) (return-from add)) (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (setf word (translate-string word) lemma (translate-string lemma)) (add-string string-net (u:concat word ":" (compress-string (string-trim "\"" lemma) word) ":" (bit-vector-to-string (code-from-features features))))) (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word))))) (print count) (minimize-tree string-net) (calculate-gw-compression-tree string-net) (compress-net string-net) (store-net string-net "projects:cgp;nets1;bm-lexicon.net") (count-strings string-net)))) #+only-once (time (let ((*feature-coding* *nn-coding*)) (defparameter *nn-lexicon* (make-instance 'active-string-net)) (let ((count 0) (string-net *nn-lexicon*)) (block add (u:with-file-lines (line "projects:cgp;nets;nn-lexicon.txt") ;(when (> count 5000) (return-from add)) (setf line (u:subst-substrings line '("< " "<" " >" ">"))) (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (setf word (translate-string word) lemma (translate-string lemma)) (add-string string-net (u:concat word ":" (compress-string (string-trim "\"" lemma) word) ":" (bit-vector-to-string (code-from-features features))))) (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word))))) (print count) (minimize-tree string-net) (calculate-gw-compression-tree string-net) (compress-net string-net) (store-net string-net "projects:cgp;nets;nn-lexicon.net") (count-strings string-net)))) (count-strings *bm-lexicon*) (let ((*feature-coding* *nn-coding*)) (lemma-and-features "tsjekkisk" :net *nn-lexicon*)) (let ((*feature-coding* *bm-coding*)) (lemma-and-features "tsjekkisk" :net *bm-lexicon*)) #| #+test (let ((count 0)) (u:with-file-lines (line "projects:cgp;bm-lexicon.txt") (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (when (string= word "administrerende") (print line)) #+ignore (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word)))))) ;(bit-vector-to-string (code-from-features (list "pron" "refl" "ent/fl" "akk"))) (code-features (string-to-bit-vector (bit-vector-to-string (code-from-features (list "adj" "" ""))))) ;"seg:459:2:\"seg\" pron refl ent/fl akk" ;"administrerende:020:10:\"administrere\" adj " (let ((line "administrerende:020:10:\"administrere\" adj " #+ignore"seg:459:2:\"seg\" pron refl ent/fl akk")) (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (u:concat word ":" (compress-string (string-trim "\"" lemma) word) ":" (print (bit-vector-to-string (print (code-from-features features)))))))) (defun silly-char-p (char) (not (or (char<= #\a char #\z) (char<= #\A char #\Z) (find char "\"-æøåÆØÅ")))) #+test (let ((chars ())) (u:with-file-lines (line "projects:cgp;bm-lexicon.txt") (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) #+ignore (when (or (find-if #'silly-char-p word) (find-if #'silly-char-p lemma)) (print (list word lemma))) (let ((word-pos (position-if #'silly-char-p word)) (lemma-pos (position-if #'silly-char-p lemma))) (when word-pos (pushnew (char word word-pos) chars)) (when lemma-pos (pushnew (char lemma lemma-pos) chars)) (when (or word-pos lemma-pos) (print (list word lemma)))) #+ignore (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word))))) (coerce chars 'string)) |#