;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-
;; Copyright (C) Paul Meurer 1999 - 2007. All rights reserved.
;; paul.meurer@hit.uib.no
;; Aksis, UNIFOB, University of Bergen
;;
;;-------------------------------------------------------------------------------------
;; TO DO:
;;
;; +- optimize translation of features into Lisp keywords
;; - make split char a slot in the net
;; - -> etc.
;;
;;-------------------------------------------------------------------------------------
;; QUESTIONS:
;;
;;-------------------------------------------------------------------------------------
(in-package "CGP")
'("*avtale" "*berg" "*blad" "*bok" "*bolig" "*bre" "*bukt" "*by" "*dal" "*elv" "*film" "*fjell" "*fjord" "*foss" "*fred" "*fylke" "*gate" "*hall" "*hav" "*hjem" "*hotell" "*hus" "*kirke" "*kommune" "*krig" "*kyst" "*land" "*lov" "*løkke" "*minister" "*myr" "*nes" "*pakt" "*park" "*plass" "*president" "*prinsipp" "*pris" "*program" "*protokoll" "*roman" "*sang" "*sen" "*senter" "*serie" "*seter" "*sjø" "*skog" "*skole" "*smug" "*son" "*stad" "*strand" "*sund" "*syndrom" "*teorem" "*torg" "*torv" "*vann" "*veg" "*vei" "*verk" "*vidde" "*vik" "*ø" "*ørken" "*øy" "*ås")
(defvar *%strings* (make-array 0 :adjustable t :fill-pointer t))
(defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t))
(defun %vector-pop (vector)
(let* ((fill (fill-pointer vector)))
(declare (fixnum fill))
(unless (zerop fill)
(decf fill)
(setf (fill-pointer vector) fill)
(aref vector fill))))
(defmacro %with-string ((string) &body body)
`(let ((,string (or (%vector-pop *%strings*)
(make-array 0 :element-type #+mcl 'base-character #-mcl 'character
:adjustable t :fill-pointer t))))
(unwind-protect (progn ,@body)
(setf (fill-pointer ,string) 0)
(vector-push-extend ,string *%strings*))))
(defmacro %with-byte-array ((array) &body body)
`(let ((,array (or (%vector-pop *%byte-arrays*)
(make-array 0 :element-type '(unsigned-byte 8) :adjustable t
:fill-pointer t))))
(unwind-protect (progn ,@body)
(setf (fill-pointer ,array) 0)
(vector-push-extend ,array *%byte-arrays*))))
;;; compression
#-bit-vector-coding
(defparameter *bm-feature-table* (make-hash-table :test #'equal))
#+bit-vector-coding-xx
(defparameter *bm-feature-table* (make-hash-table))
;; features statistics
#+no-class
(defparameter *statistics-table* (make-hash-table))
#+no-class
(defparameter *cg-statistics-table* (make-hash-table))
#+only-once
(let* ((count 0)
(*tagger* *nny-tagger*)
(statistics-table (statistics-table *tagger*)))
(clrhash statistics-table)
(with-file-lines (line "projects:cgp;nets;nny-lexicon.txt")
(setf line (subst-substrings line '("< " "<" " >" ">")))
(destructuring-bind (fullform code inflection-nr lemma+features)
(string-parse line :whitespace ":")
(declare (ignore code inflection-nr))
(when (zerop (mod (incf count) 1000)) (print (list count fullform)))
(destructuring-bind (lemma &rest features)
(string-parse lemma+features :whitespace " "
:delimiter-pairs '((#\" . #\")))
(declare (ignore lemma))
(dolist (feature (abbreviate-features features))
(incf (gethash feature statistics-table 0)))))))
;; collect unabbreviated features
#+only-once
(let* ((count 0)
(*tagger* *nny-tagger*)
(features-list ()))
(u:with-file-lines (line "projects:cgp;nets;nny-lexicon.txt")
(setf line (u:subst-substrings line '("< " "<" " >" ">")))
(destructuring-bind (fullform code inflection-nr lemma+features)
(u:string-parse line :whitespace ":")
(declare (ignore code inflection-nr))
(when (zerop (mod (incf count) 1000)) (print (list count fullform)))
(destructuring-bind (lemma &rest features)
(u:string-parse lemma+features :whitespace " "
:delimiter-pairs '((#\" . #\")))
(declare (ignore lemma))
(dolist (feature features)
(let ((fixed-features (fix-feature-string feature)))
(unless (eq fixed-features :ignore)
(dolist (feature fixed-features)
(pushnew feature features-list :test #'string=))))))))
features-list)
#+testxx
(let* ((count 0)
(*tagger* *nny-tagger*)
(statistics-table (make-hash-table :test #'equal)))
(clrhash statistics-table)
(with-file-lines (line "projects:cgp;nets;nny-lexicon.txt")
(destructuring-bind (fullform code inflection-nr lemma+features)
(u:string-parse line :whitespace ":")
(declare (ignore code inflection-nr))
(when (zerop (mod (incf count) 1000)) (print (list count fullform)))
(destructuring-bind (lemma &rest features)
(u:string-parse lemma+features :whitespace " "
:delimiter-pairs '((#\" . #\")))
(declare (ignore lemma))
(dolist (feature features)
(incf (gethash feature statistics-table 0))))))
(let ((features-list ()))
(maphash (lambda (key value)
(push (cons key value) features-list))
statistics-table)
(print (sort features-list #'> :key #'cdr))))
#+test
(let ((features-list ()))
(maphash (lambda (key value)
(push (cons key value) features-list))
(statistics-table *nny-tagger*))
;(print (setf features-list (sort features-list #'> :key #'cdr)))
(setf (ordered-simplified-features *nny-tagger*)
(mapcar #'car (sort features-list #'> :key #'cdr))))
;; frequency ordering
;; DON'T CHANGE!
(setf (ordered-simplified-features *nbo-tagger*)
'(subst appell ent ub be fl mask nøyt adj verb fem tr1 pos m/f pres inf i1
sup i2 tr11 perf-part pret imp pass komp pa1 rl4 d5 rl9 a3 prop pa4
rl5 tr2 pa2 d1 a6 n pa5 tr11/til unorm tr10 a8 tr5 tr12 i4 d5/til rl6 tr9 a7 n1
pa1/til a12 tr3 d4 adv rl9/til a11 fork pa3 tr6 ubøy a4 a5 i3 pr8 rl14 rl1 d6/til
pa3/til tr15 rl15 rl3/til prep pa2/til tr8 rl3 tr7 tr18 rl12 a9 pa4/til pr9 @adv
rl11 rl13 pr7 tr13 n3 d6 pref d8/til pr6 rl2 tr4 rl10 interj det
d3
tr19 tr21 d2 pr3 rl8 tr12/til tr23 prep+subst rl7 tr20 a2 d7/til n4/til
a15 pr10 pr13 pa5/til tr14 tr16 tr rl14/til pr1 pr2 tr17 a14 d7 rl16 rl10/til kvant
tr22 prob d4/til pr12 d8 rl17 pa/til pa pa6 pr4 a13 @tittel pa1refl4
tr13/til pa7 pr4/til n2 d9/til symb rl18 sbu dem pron poss rl17/til rl16/til
tr21/til rl12/til @s-pred @interj pers @
pr11 d9
n4 pa11 pa8 adj+subst hum konj+adj \3 forst prep+adj
pr5 pr10/til hop> @løs-np @
prep+subst+subst subst+subst det+adj prep+det+subst res clb prep+prep @adv> interj+adv
verb+det konj+adv+adj \2 subst+prep+subst pron+verb+verb \1 adj+verb konj+adv+prep @i-obj
gen @det> prep+konj+prep + subst+prep adj+det prep+adv nynorsk adv+prep verb+verb sbu+adj
adv+adj prep+adj+adj interj+adj subst+konj+subst konj+det+adj adv+subst verb+det+subst
prep+perf-part+subst prep+adv+subst v+v @kon @adj> prep+det+subst+kon+det+subst
adj+prep+subst verb+subst subst+kvant prep+subst+prep+sbu adv+adv+prep mask/fem/nøyt
prep+det+sbu ub/be inf-merke det+adj+det
subst+prep+adj+subst pron+prep+adj
det+subst+prep+subst mask/fem adj+kon+adj part+prep adv+prep+subst refl adj+adj inf/pres
prep+subst+konj+subst adv+adj+prep fl/be subst+adj subst+v+subst subst+perf-part ent/fl
;; refl4 a1 t ;; those are fishy
;; added after calculation below
ukjent samset inter bu >>> @sbu <<<
@infmerke
;; fra multi-tagger.lisp
@ @s-gr @
;; named entity features
foreign
&person &sted &org &verk &hend &annet
<*avtale> <*berg> <*blad> <*bok> <*bolig> <*bre> <*bukt> <*by> <*dal> <*elv> <*film> <*fjell> <*fjord> <*foss> <*fred> <*fylke> <*gate> <*hall> <*hav> <*hjem> <*hotell> <*hus> <*kirke> <*kommune> <*krig> <*kyst> <*land> <*lov> <*løkke> <*minister> <*myr> <*nes> <*pakt> <*park> <*plass> <*president> <*prinsipp> <*pris> <*program> <*protokoll> <*roman> <*sang> <*sen> <*senter> <*serie> <*seter> <*sjø> <*skog> <*skole> <*smug> <*son> <*stad> <*strand> <*sund> <*syndrom> <*teorem> <*torg> <*vann> <*vei> <*verk> <*vidde> <*vik> <*ørken> <*øy> <*ås>
<*aksjon>
<*bevegelse>
<*departement>
<*direktorat>
<*forbund>
<*forening>
<*forum>
<*institutt>
;; <*kirke>
;; <*kommune>
<*kontor>
<*lag>
<*monopol>
<*møte>
<*nemnd>
<*organisasjon>
<*parti>
<*rett>
<*revisjon>
<*råd>
;; <*senter>
<*stand>
<*tilsyn>
<*utvalg>
<*as>
<*avis>
<*bygning>
<*dir.>
<*direktør>
<*fond>
<*formann>
<*forsker>
<*gjeng>
<*gruppe>
<*gård>
<*hytte>
<*ingeniør>
<*ist>
<*kamp>
<*klubb>
<*koordinator>
<*leder>
<*leilighet>
<*list>
<*log>
<*lokale>
<*misjon>
<*museum>
<*område>
<*produsent>
<*sal>
<*selskap>
<*sjef>
<*spesialist>
<*styre>
<*bedrift>
<*foretak>
<**institutt>
rx
c1 c2 c3 c4 c5 cn ;; Scarrie styles
scarrie
))
#+:nny-parser
(setf (ordered-simplified-features *nny-tagger*)
'(subst appell eint ub bu fl adj nøyt fem mask pos verb tr1 sup i1 inf m/f pres
komp i2 imp tr11 pa1 perf-part pret rl4 st-form d5 a3 rl9 prop pa4 rl5 pa2 pa5
tr2 a6 tr12 tr5 tr11/til d1 n i4 n1 a8 a12 unorm pa3 a7 d5/til tr9 tr10 d4 adv tr3
rl14 pa1/til tr6 fork rl15 rl6 i3 rl9/til a5 a11 pr8 tr18 tr15 d6/til rl13 rl1 prep a4 pa3/til
d6 pa4/til pa2/til tr8 rl3 @adv pr9 a9 pr6 rl12 rl3/til pref tr12/til d7 a15 tr7 det tr19 pr7
d3 tr13 d2 tr21 interj tr22 rl10 n2 rl8 tr4 n4 rl2
pa5/til prep+subst a2 tr n3 kvant rl11 pr13 tr16 tr20 rl7 pa6 tr17 rl17 pr2
d8/til pr1 pa8 tr23 tr14 sideform a14 pr11 pa7 pr3 k1 rl14/til k2 a13 dem rl17/til tr13/til
rl16 rl10/til symb pr12 poss sbu i pron d7/til rl21 adj+subst tr20/til rl12/til
rl16/til n4/til d4/til uttr pers pr4 rl18 d9 @interj prob prep+adj @ i12 d9/til pa11
prep+subst+prep pr10 tr21/til rl19 nom subst+kon+subst @subj tr24 forst pr5 prep+adj+subst
prep+subst+subst @fv prep+subst+kon+subst be konj+adv+prep prep+prop \1 @iv sp subst+verb
interj+adv suff prep+konj+prep res subst+subst clb høflig bokmål \2 adj+verb
prep+adv
adv+adj subst+prep+subst det+adj prep+det+subst verb+det subst+prep @adv> konj+adv+adj
prep+adv+subst pron+verb+verb @det> verb+verb adj+det adv+adv gen @i-obj adv+verb @adj>
prep+perf-part+subst adv+adv+prep adv+subst prep+prep prep+adj+adj konj+det+adj adj+kon+adj
@kon verb+det+subst prep+det+sbu subst+perf-part subst+kvant subst+verb+subst ikke-hum
prep+kon+subst @sbu interj+adj subst+prep+adj+subst adj+adj sbu+prep inf-merke
prep+det+subst+kon+subst part+prep prep+subst+konj+adv verb+subst refl1 det+subst+prep+subst
prep+subst+prep+sbu prep+det+subst+kon+det+subst prep+subst+konj+subst
;; added after calculation below
ukjent samset @infmerke test ubøy >>> <<<
;; fra multi-tagger.lisp
;; fra norsk-map.lisp; evaluate form below
@ @app @
foreign
rx))
; @iv-hj @fv-ho @fv-hj @iv-ho @f-subj
#+test
(maphash (lambda (key val)
(declare (ignore val))
(let ((*cg* *nny-cg*)
(*tagger* *nny-tagger*))
(print (list key (feature-code key)))))
(syntactic-functions *nny-cg*))
#+test
(let ((*cg* *nbo-cg*)
(syn-features ()))
(maphash (lambda (key rule-list)
(declare (ignore key))
;(Print rule-list)
(dolist (rule rule-list)
(dolist (label (rule-labels (cdr rule)))
(pushnew label syn-features))))
(morphosyntactic-mappings *cg*))
(set-difference syn-features (ordered-simplified-features *nbo-tagger*)))
#+test
(let ((*cg* *nbo-cg*)
(syn-features ()))
(maphash (lambda (key rule-list)
(declare (ignore key))
;(Print rule-list)
(dolist (rule rule-list)
(dolist (label (rule-labels (cdr rule)))
(pushnew label syn-features))))
(morphosyntactic-mappings *cg*))
(set-difference (collecting
(maphash (lambda (f val)
(declare (ignore val))
(collect f))
(syntactic-functions *cg*)))
syn-features
#+ignore
(ordered-simplified-features *nbo-tagger*)))
;; find out which features are added later in the CG parser
#+test
(let ((cg-features ()))
(maphash (lambda (set definition)
(declare (ignore set))
(dolist (def definition)
(when (atom def)
(setf def (list def)))
(dolist (f def)
(unless (or (stringp f)
(find f *bm-ordered-simplified-features*))
(pushnew f cg-features)))))
(set-declarations *cg*))
cg-features)
#+test
(let ((cg-features ()))
(maphash (lambda (set definition)
(declare (ignore set))
(dolist (def definition)
(when (atom def)
(setf def (list def)))
(dolist (f def)
(unless (or (stringp f)
(find f (ordered-simplified-features *nny-tagger*)))
(pushnew f cg-features)))))
(set-declarations *nn-cg*))
cg-features)
;; frequency ordering
;; DON'T CHANGE!
(setf (ordered-features *nbo-tagger*)
'("subst" "appell" "ent" "ub" "be" "fl" "mask" "nøyt" "adj" "verb" "fem"
"" "pos" "" "m/f" "pres" "inf" "" "sup"
"" "perf-part" "pret" "imp" "" "pass" "komp" ""
"" "" "" "" "prop" ""
"" "" "" "" "" ""
"" "" "" "unorm" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" "" ""
"" "adv" "" "" "" "" "fork"
"" "" "ubøy" "" "" "" ""
"" "" "" "" "" ""
"" "" "" "" ""
"" "" "" "" "" "prep"
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" "" "@adv" ""
"" "" "" "" "" ""
"" "" "" "" ""
"" "" "" "pref" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" "" ""
"" "" "" "interj" ""
"" "" "" "" ""
"" "" "" "" ""
"" "det" "" "" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" "" ""
"prep+subst" "" "" "" ""
"" "" "" "" "" ""
"" "" "" ""
"" "" "" "" ""
"" "" "" "" ""
"" "" "" ""
"" "" "" "" ""
"