;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; Copyright (C) Paul Meurer 1999, 2000. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.8 ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;; - optimize translation of features into Lisp keywords ;; - make split char a slot in the net ;; - -> etc. ;; ;; QUESTIONS: ;; ;;------------------------------------------------------------------------------------- (in-package "CGP") ;;; compression #-bit-vector-coding (defparameter *bm-feature-table* (make-hash-table :test #'equal)) #+bit-vector-coding (defparameter *bm-feature-table* (make-hash-table)) ;; features statistics (defparameter *statistics-table* (make-hash-table)) (defparameter *cg-statistics-table* (make-hash-table)) #+test (let ((count 0)) (clrhash *statistics-table*) (u:with-file-lines (line "projects:cgp;nets;bm-lexicon.txt") (destructuring-bind (fullform code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (when (zerop (mod (incf count) 1000)) (print (list count fullform))) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (dolist (feature (abbreviate-features features)) (incf (gethash feature *statistics-table* 0))))))) #+test (let ((features-list ())) (maphash (lambda (key value) (push (cons key value) features-list)) *statistics-table*) (setf *bm-ordered-simplified-features* (mapcar #'car (sort features-list #'> :key #'cdr)))) #+test ;; old (let ((count 0)) (clrhash *statistics-table*) (u:with-file-lines (line "projects:cgp;nets;bm-lexicon.txt") (destructuring-bind (fullform code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (when (zerop (mod (incf count) 1000)) (print (list count fullform))) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (dolist (feature features) (incf (gethash feature *statistics-table* 0))))))) ;; frequency ordering ;; DON'T CHANGE! (defparameter *bm-ordered-simplified-features* '(subst appell ent ub be fl mask noeyt adj verb fem tr1 pos m/f pres inf i1 sup i2 tr11 perf-part pret imp pass komp pa1 rl4 d5 rl9 a3 prop pa4 rl5 tr2 pa2 d1 a6 n pa5 tr11/til unorm tr10 a8 tr5 tr12 i4 d5/til rl6 tr9 a7 n1 pa1/til a12 tr3 d4 adv rl9/til a11 fork pa3 tr6 uboey a4 a5 i3 pr8 rl14 rl1 d6/til pa3/til tr15 rl15 rl3/til prep pa2/til tr8 rl3 tr7 tr18 rl12 a9 pa4/til pr9 @adv rl11 rl13 pr7 tr13 n3 d6 pref d8/til pr6 rl2 tr4 rl10 interj det d3 tr19 tr21 d2 pr3 rl8 tr12/til tr23 prep+subst rl7 tr20 a2 d7/til n4/til a15 pr10 pr13 pa5/til tr14 tr16 tr rl14/til pr1 pr2 tr17 a14 d7 rl16 rl10/til kvant tr22 prob d4/til pr12 d8 rl17 pa/til pa pa6 pr4 a13 @tittel pa1refl4 tr13/til pa7 pr4/til n2 d9/til symb rl18 sbu dem pron poss rl17/til rl16/til tr21/til rl12/til @s-pred @interj pers @ pr11 d9 n4 pa11 pa8 adj+subst hum konj+adj \3 forst prep+adj pr5 pr10/til hop> @loes-np @ prep+subst+subst subst+subst det+adj prep+det+subst res clb prep+prep @adv> interj+adv verb+det konj+adv+adj \2 subst+prep+subst pron+verb+verb \1 adj+verb konj+adv+prep @i-obj gen @det> prep+konj+prep + subst+prep adj+det prep+adv eint adv+prep verb+verb sbu+adj adv+adj prep+adj+adj interj+adj subst+konj+subst konj+det+adj adv+subst verb+det+subst prep+perf-part+subst prep+adv+subst v+v @kon @adj> prep+det+subst+kon+det+subst adj+prep+subst verb+subst subst+kvant prep+subst+prep+sbu adv+adv+prep mask/fem/noeyt prep+det+sbu ub/be inf-merke det+adj+det subst+prep+adj+subst pron+prep+adj det+subst+prep+subst mask/fem adj+kon+adj part+prep adv+prep+subst refl adj+adj inf/pres prep+subst+konj+subst adv+adj+prep fl/be subst+adj subst+v+subst subst+perf-part ent/fl ;; refl4 a1 t ;; those are fishy ;; added after calculation below ukjent samset inter bu >>> @sbu <<< @infmerke ;; fra multi-tagger.lisp )) ;; find out which features are added later in the CG parser #+test (let ((cg-features ())) (maphash (lambda (set definition) (declare (ignore set)) (dolist (def definition) (when (atom def) (setf def (list def))) (dolist (f def) (unless (or (stringp f) (find f *bm-ordered-simplified-features*)) (pushnew f cg-features))))) (set-declarations *cg*)) cg-features) ;; frequency ordering ;; DON'T CHANGE! (defparameter *bm-ordered-features* '("subst" "appell" "ent" "ub" "be" "fl" "mask" "noeyt" "adj" "verb" "fem" "" "pos" "" "m/f" "pres" "inf" "" "sup" "" "perf-part" "pret" "imp" "" "pass" "komp" "" "" "" "" "" "prop" "" "" "" "" "" "" "" "" "" "" "unorm" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv" "" "" "" "" "fork" "" "" "uboey" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "det" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "kvant" "prob" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "symb" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "sbu" "dem" "pron" "poss" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "pers" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "hum" "" "konj+adj" "3" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "konj" "" "akk" "" "nom" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "res" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "1" "konj+adv+adj" "2" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "mask/fem" "adj+adj" "adv+adj+prep" "ent/fl" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "refl" "inf/pres")) ;; ordered by frequency #-bit-vector-coding (loop for i from 0 for feature in ;; 71 *bm-ordered-features* do (setf (gethash (string-downcase feature) *bm-feature-table*) i)) #+bit-vector-coding (loop for i from 0 for feature in ;; 71 *bm-ordered-simplified-features* do (setf (gethash feature *bm-feature-table*) i)) #-bit-vector-coding (defun feature-code (feature) (let ((feature-name (string-downcase (if (stringp feature) feature (symbol-name feature))))) (gethash feature-name *bm-feature-table*))) #+bit-vector-coding (defun feature-code (feature) (gethash feature *bm-feature-table*)) ;; inverse of previous (defparameter *bm-feature-vector* (make-array (hash-table-count *bm-feature-table*))) (maphash (lambda (cat code) (setf (aref *feature-vector* code) cat)) *bm-feature-table*) (defparameter *code-vector-length* (length *feature-vector*)) (defparameter *code-vector-sort-array* (make-array *code-vector-length*)) (defun code-feature (code) (svref *feature-vector* code)) ;(code-feature (feature-code 'konj)) (defun has-feature-p (feature-vector feature) (= 1 (sbit feature-vector (the fixnum (feature-code feature))))) (defun has-features-p (feature-vector features) (loop for feature in features always (has-feature-p feature-vector feature))) (defun thereis-feature-p (feature-vector features) "Checks if FEATURE-VECTOR has at least one of features" (loop for feature in features thereis (has-feature-p feature-vector feature))) (defun has-feature-code-p (feature-vector code) (declare (fixnum code)) (= 1 (the fixnum (sbit feature-vector code)))) (defun has-feature-codes-p (feature-vector codes) (loop for code fixnum in codes always (has-feature-code-p feature-vector code))) (defun thereis-feature-code-p (feature-vector codes) (loop for code fixnum in codes thereis (has-feature-code-p feature-vector code))) (defparameter *simplify-subcats-p* t) #+test (length (code-features (make-array 679 :element-type 'bit :initial-element 1))) ; -> 309 (defparameter *string-to-symbol-table* (make-hash-table :test #'equal)) (defun %intern (string) (or (gethash string *string-to-symbol-table*) (setf (gethash string *string-to-symbol-table*) (intern (string-upcase string) :cgp)))) #-bit-vector-coding (defun code-features (code-bv) "Returns a list of features, in order determined by *CODE-VECTOR-SORT-ARRAY*. Abbreviates some features." (let ((used-abbrevs ())) (macrolet ((has-feature-p (features) ; true if has one of the features `(find-if (lambda (f) (= 1 (sbit code-bv (feature-code f)))) ,features)) (starts-with-p (pfx string) ;; pfx has to be a constant! `(let ((pfx-length (length ,pfx))) (and (<= pfx-length (length ,string)) (string= ,pfx ,string :end2 pfx-length)))) (abbreviate-feature (pfx abbr fstr) `(when (and (starts-with-p ,pfx ,fstr) (string/= ,fstr "")) (let* ((pfx-length (length ,pfx)) (slash-pos (position #\/ ,fstr)) (angle-pos (position #\> ,fstr)) (feature-sfx (cond ((null slash-pos) (subseq ,fstr pfx-length angle-pos)) ((string= ,fstr "TIL" :start1 (1+ slash-pos) :end1 angle-pos) (subseq ,fstr pfx-length angle-pos)) (t (subseq ,fstr pfx-length slash-pos)))) (abbreviation (%intern (u:concat ,abbr feature-sfx)))) (unless (find abbreviation used-abbrevs) ; don't collect an abbrev twice (u:collect abbreviation) (push abbreviation used-abbrevs)) (return-from abbreviate))))) (u:collecting (loop for pos across *code-vector-sort-array* for bit = (sbit code-bv pos) do (when (= bit 1) (let* ((feature-string (string-upcase (code-feature pos))) (feature (%intern feature-string))) (declare (dynamic-extent feature-string)) (cond ((not *simplify-subcats-p*) (u:collect feature)) (t (block abbreviate (mapc (lambda (prefix abbr) (abbreviate-feature prefix abbr feature-string)) '("")) (let* ((pfx-length (length ,pfx)) (slash-pos (position #\/ ,fstr)) (angle-pos (position #\> ,fstr)) (feature-sfx (cond ((null slash-pos) (subseq ,fstr pfx-length angle-pos)) ((string= ,fstr "TIL" :start1 (1+ slash-pos) :end1 angle-pos) (subseq ,fstr pfx-length angle-pos)) (t (subseq ,fstr pfx-length slash-pos)))) (abbreviation (%intern (u:concat ,abbr feature-sfx)))) (unless (find abbreviation used-abbrevs) ; don't collect an abbrev twice (u:collect abbreviation) (push abbreviation used-abbrevs)) (return-from abbreviate))))) (u:collecting (loop for feature-string in features do (let* ((feature (%intern (string-trim "." feature-string)))) (cond ((not *simplify-subcats-p*) (u:collect feature)) (t (block abbreviate (mapc (lambda (prefix abbr) (abbreviate-feature prefix abbr (string-upcase feature-string))) '("" "" "" "1" "2" "3" "mask" "fem" "noeyt" "m/f" "mask/fem" "appell" "prop" "ub" "be" "ent" "fl" "ent/fl" "uboey" "inf" "pres" "inf/pres" "perf-part" "pret" "imp" "pass" "hum" "nom" "akk" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "" "konj+adj" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "" "" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "konj+adv+adj" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "adj+adj" "adv+adj+prep" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "unorm") do (setf (aref *code-vector-sort-array* i) (position f *feature-vector* :test #'string-equal))) ;; ordered by hand #+bit-vector-coding (let ((ordered-simplified-features ())) (loop for i from 0 to (1- (length *code-vector-sort-array*)) do (setf (aref *code-vector-sort-array* i) i)) (loop for f across #("fork" "symb" "subst" "adj" "verb" "prep" "adv" "sbu" "konj" "det" "dem" "kvant" "prob" "pron" "pers" "poss" "refl" "pos" "komp" "sup" "res" "" "" "" "1" "2" "3" "mask" "fem" "noeyt" "m/f" "mask/fem" "appell" "prop" "ub" "be" "ent" "fl" "ent/fl" "uboey" "inf" "pres" "inf/pres" "perf-part" "pret" "imp" "pass" "hum" "nom" "akk" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "" "konj+adj" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "" "" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "konj+adv+adj" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "adj+adj" "adv+adj+prep" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "unorm") do (pushnew (car (abbreviate-features (list f))) ordered-simplified-features)) (loop for i from 0 and f in (nreverse ordered-simplified-features) do (setf (aref *code-vector-sort-array* i) (position f *feature-vector* :test #'string-equal)))) (defun encode-features (&rest features) (let ((feature-code (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (dolist (feature features) (let* ((code (feature-code feature))) (when code (setf (sbit feature-code code) 1)))) feature-code)) (defun translate-feature (feature) (declare (ignore feature))) #| (time (dotimes (i 10000) (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (time (let* ((bit-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0)) (codes '(1 6 9 30))) (dolist (i '(6 9 30 50 40)) (setf (sbit bit-vector i) 1)) (dotimes (i 10000) (loop for i fixnum in codes always (= (sbit bit-vector i) 1))))) (time (let ((features '(zuzu gaga fifi roro loeloe))) (dotimes (i 10000) (loop for c in '(gaga fifi roro loeloe) always (find c features))))) |# #-bit-vector-coding (defun code-from-features (features) "Translates a list of features into a bit vector." (let ((feature-code (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (dolist (feature features) (let* ((f (string-downcase feature)) (code (gethash f *bm-feature-table*))) (if code (setf (sbit feature-code code) 1) (warn "Could not find code for ~s" f)))) feature-code)) #+bit-vector-coding (defun code-from-features (features) "Translates a list of features into a bit vector." (setf features (mapcar #'%convert-string features)) (let ((feature-code (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (dolist (feature (abbreviate-features features)) (let ((code (gethash feature *bm-feature-table*))) (if code (setf (sbit feature-code code) 1) (warn "Could not find code for ~s" feature)))) feature-code)) (defun set-feature (bit-vector feature) (setf (sbit bit-vector (feature-code feature)) 1) bit-vector) ;(abbreviate-features '("mask" "fem" "noeyt")) ;(code-from-features '("mask" "fem" "noeyt")) ;(code-from-features '("mask" "fem")) (defun features-statistics (features code-vector) (dolist (feature features) (let* ((f (string-downcase (symbol-name feature))) (f-list (translate-feature f))) (cond (f-list (dolist (f f-list) (let ((code (gethash f *bm-feature-table*))) (when code (incf (aref code-vector code)))))) (t (let ((code (gethash f *bm-feature-table*))) (when code (incf (aref code-vector code))))))))) (defparameter *code-vector* (make-array *code-vector-length* :element-type 'fixnum :initial-element 0)) #+old (defun bit-vector-to-string (cv) (let ((string (make-string (ceiling (/ (length cv) 8)))) (byte 0)) (loop for bit across cv with i = 0 and pos = 0 do (when (= bit 1) (setf byte (logxor byte (ash 1 i)))) (when (= (incf i) 8) (setf i 0 (char string pos) (code-char byte) byte 0) (incf pos)) ;finally (setf (char string pos) (code-char byte)) ) ;; *** SOMEWHAT BUGGY (let ((last-non-null (position-if-not (lambda (c) (char= c #\Null)) string :from-end t))) (if last-non-null (subseq string 0 (1+ last-non-null)) "")))) (defun bit-vector-to-string (cv) (let ((string (make-string (ceiling (/ (length cv) 8)))) (byte 0)) (loop for bit across cv with i = 0 and pos = 0 do (when (= bit 1) (setf byte (logxor byte (ash 1 i)))) (when (= (incf i) 8) (setf i 0 (char string pos) (code-char byte) byte 0) (incf pos)) finally (unless (zerop i) (setf (char string pos) (code-char byte)))) ;; *** SOMEWHAT BUGGY? (let ((last-non-null (position-if-not (lambda (c) (char= c #\Null)) string :from-end t))) (if last-non-null (subseq string 0 (1+ last-non-null)) "")))) (defun string-to-bit-vector (string) (declare (optimize (speed 3) (safety 0))) (let ((bv (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (loop for c across string for i fixnum from 0 do (let ((byte (the fixnum (char-code c)))) (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit bv (the fixnum (+ (the fixnum (* i 8)) j))) 1))))) bv)) (defun array-to-bit-vector (array) (declare (optimize (speed 3) (safety 0))) (let ((bv (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (loop for byte fixnum across array for i fixnum from 0 do (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit bv (the fixnum (+ (the fixnum (* i 8)) j))) 1)))) bv)) #+unoptimized (defun string-to-bit-vector (string) (let ((bv (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (loop for c across string for i from 0 do (let ((byte (char-code c))) (dotimes (j 8) (unless (zerop (logand byte (ash 1 j))) (setf (sbit bv (+ (* i 8) j)) 1))))) bv)) (defun split (string char &optional count) "COUNT is max number of returned chunks." (labels ((walk (pos) (let ((next-pos (position char string :start pos))) (if (and next-pos (or (null count) (not (zerop (decf count))))) (cons (subseq string pos next-pos) (walk (1+ next-pos))) (list (subseq string pos)))))) (walk 0))) ;(split "asdf:qwer:tyui:xcvb" #\: 3) ;(pushnew :bit-vectors *features*) (defun fullform-values (word) (let ((values (string-values *bm-code-lexicon* word))) (mapcar (lambda (value) (destructuring-bind (code+inflection-nr compressed-lemma feature-vector) (split value #\: 3) ;; OBS: feature vector might contain split char! (list* (multiple-value-call #'decimal-to-alpha (%decode-str code+inflection-nr)) (decompress-string compressed-lemma word) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))) values))) #+test (time (dotimes (i 1000) (fullform-values "kommet"))) ; #-bit-vectors (2.590 seconds, 6,424,056 bytes) ; #+bit-vectors (2.352 seconds, 5,712,056 bytes) ;; returns a list of feature lists (defun fullform-features (word &optional (net *bm-lexicon*)) (let ((values (string-values net word))) (mapcar (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (declare (ignore compressed-lemma)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector))) values))) #+test (time (dotimes (i 1000) (fullform-features "kommet"))) ;#-bit-vectors (0.939 seconds, 2,456,016 bytes) ;#+bit-vectors (0.796 seconds, 2,032,016 bytes) (0.948 seconds without optimizing of string-to-bit-vector) ;(lemma-and-features "se") #+test (time (dotimes (i 1000) (lemma-and-features "kommet"))) ; 0.921 seconds, 2,256,016 bytes #+old+old (defun lemma-and-features (word &key decompress-base (net *bm-lexicon*)) (declare (optimize (speed 3) (safety 0)) (string word)) (let ((values (string-values net word))) (mapcar (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))) values))) #+old (defun lemma-and-features (word &key decompress-base (net *bm-lexicon*)) (declare (optimize (speed 3) (safety 0)) (string word)) (u:collecting (nmap-string-values net word (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (u:collect (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))))))) (defvar *%strings* (make-array 0 :adjustable t :fill-pointer t)) (defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t)) (defun %vector-pop (vector) (let* ((fill (fill-pointer vector))) (declare (fixnum fill)) (unless (zerop fill) (decf fill) (setf (fill-pointer vector) fill) (aref vector fill)))) (defmacro %with-string ((string) &body body) `(let ((,string (or (%vector-pop *%strings*) (make-array 0 :element-type #+mcl 'base-character #-mcl 'character :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,string) 0) (vector-push-extend ,string *%strings*)))) (defmacro %with-byte-array ((array) &body body) `(let ((,array (or (%vector-pop *%byte-arrays*) (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,array) 0) (vector-push-extend ,array *%byte-arrays*)))) (defun lemma-and-features (word &key decompress-base (net *bm-lexicon*)) (declare (optimize (speed 3) (safety 0)) (string word)) (%with-string (%string) (%with-byte-array (%array) (u:collecting (nmap-string+array-values net word (lambda (compressed-lemma feature-vector) (u:collect (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (array-to-bit-vector feature-vector)) #+bit-vectors(array-to-bit-vector feature-vector)))) %string %array))))) (defun %alpha-to-decimal (code) (multiple-value-bind (code form) (alpha-to-decimal code) (let ((int (+ (* code 16) form))) (with-output-to-string (stream) (loop with i = int until (zerop i) do (write-char (code-char (logand i 255)) stream) (setf i (ash i -8))))))) (defun %decode-str (str) (declare (optimize (speed 3) (debug 0))) (let ((code 0)) (declare (fixnum code)) (loop for i fixnum from 0 and c across str do (setf code (the fixnum (+ code (the fixnum (ash (the fixnum (char-code c)) (the fixnum (* 8 i)))))))) (floor code 16))) #+old (defun %decode-str (str) (let ((code 0)) (loop for i from 0 and c across str do (setf code (+ code (ash (char-code c) (* 8 i))))) (floor code 16))) ;(time (dotimes (i 10000) (%decode-str (%alpha-to-decimal "200:3")))) ;(alpha-to-decimal "200:3") ;(multiple-value-call #'decimal-to-alpha (alpha-to-decimal "200:3")) #| ; features frequency 826310 subst 820674 appell 540346 ent 515119 ub 471557 be 453587 fl 379957 mask 363895 noeyt 250145 adj 125571 verb 120919 fem 112882 91021 pos 83339 43586 m/f 41796 pres 41784 inf 41037 36545 sup 27363 20949 perf-part 20945 pret 20928 imp 20843 20841 pass 18279 komp 12155 8767 7439 7252 5863 5103 prop 4950 4764 4678 3890 3622 3158 3155 2982 2948 2926 2734 unorm 2662 2662 2518 2377 2321 2271 2250 2222 2181 2069 1958 1848 1826 1793 1771 1710 1639 1595 1584 1553 1529 1496 1485 1485 1320 adv 1309 1298 1265 1233 1200 fork 1188 1166 1154 uboey 1134 1133 1099 1085 1078 1034 1023 1012 1001 979 968 963 891 869 858 847 759 737 726 715 706 prep 704 704 704 703 693 685 682 682 674 649 638 595 594 572 550 550 550 539 531 517 509 @ADV 506 495 463 456 440 429 418 418 397 396 390 385 385 385 385 369 pref 333 330 330 330 330 330 319 319 308 308 308 308 308 308 302 297 297 286 286 286 286 283 interj 279 275 275 275 264 264 264 253 253 243 243 242 242 det 242 231 231 225 220 220 220 210 209 209 209 198 198 198 198 190 187 187 187 184 prep+subst 177 176 176 176 176 176 176 165 165 165 165 165 165 165 165 154 154 154 154 143 143 143 143 143 143 143 143 132 132 132 132 132 132 132 132 132 132 132 131 127 125 121 121 121 121 121 121 121 121 114 111 110 110 110 110 110 110 104 kvant 104 prob 99 99 99 99 99 99 99 99 91 88 88 88 88 88 88 88 88 88 88 88 88 81 77 77 77 77 77 77 77 77 77 77 77 77 77 77 74 @TITTEL 67 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 58 57 55 55 55 55 55 55 symb 55 55 55 55 55 55 55 55 55 55 55 55 55 55 52 sbu 50 dem 47 pron 46 poss 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 41 @S-PRED 38 37 @INTERJ 37 pers 34 @ 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 vei> 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 30 adj+subst 29 hum 28 26 konj+adj 26 3 25 forst 25 25 prep+adj 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 hop> 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 21 @ 21 @LoeS-NP 20 20 @OBJ 20 17 @SUBJ 17 det+subst 16 prep+subst+prep 15 konj 14 14 akk 14 14 nom 14 14 prep+adj+subst 13 subst+kon+subst 12 hoeflig 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 10 suff 10 10 9 prep+subst+kon+subst 9 sp 9 @FV 8 @IV 8 subst+verb 8 prep+prop 7 prep+subst+subst 7 6 res 6 prep+det+subst 6 det+adj 6 subst+subst 5 CLB 5 5 prep+prep 4 verb+det 4 interj+adv 4 subst+prep+subst 4 1 4 konj+adv+adj 4 2 4 pron+verb+verb 4 konj+adv+prep 4 @ADV> 4 adj+verb 3 @I-OBJ 3 + 3 @DET> 3 prep+konj+prep 3 adj+det 3 prep+adv 3 subst+prep 3 gen 2 v+v 2 interj+adj 2 sbu+adj 2 prep+adv+subst 2 @KON 2 eint 2 subst+konj+subst 2 prep+adj+adj 2 @ADJ> 2 prep+perf-part+subst 2 adv+subst 2 perf-part. 2 adv+prep 2 verb+verb 2 verb+det+subst 2 konj+det+adj 2 adv+adj 1 pron+prep+adj 1 prep+det+sbu 1 ub/be 1 det+subst+prep+subst 1 det+adj+det 1 adv+prep+subst 1 adj+prep+subst 1 mask/fem/noeyt 1 adv+adv+prep 1 prep+subst+konj+subst 1 part+prep 1 fl/be 1 subst+v+subst 1 prep+subst+prep+sbu 1 inf-merke 1 verb+subst 1 1 adj+kon+adj 1 prep+det+subst+kon+det+subst 1 subst+adj 1 mask/fem 1 adj+adj 1 adv+adj+prep 1 ent/fl 1 subst+perf-part 1 subst+prep+adj+subst 1 subst+kvant 1 refl 1 inf/pres |#