;;; -*- Mode: LISP; Package: MORPH-SERVER; BASE: 10; Syntax: ANSI-Common-Lisp -*- #|| MacOSX: do in xledir/lib: ln -s ../bin/sp-3.11.0/libsprt311.dylib . dynamic lib: this cannot be loaded with the RTLD_LOCAL option in dlopen(). gcc -dynamiclib -Wall -o morph-sockets-lib.dylib morph-sockets-lib.c \ -L/usr/local/xledir/lib/ -lxle -lsprt311 -install_name ./morph-sockets-lib.dylib Instead use the -bundle option: (cl-user::run-shell-command (concatenate 'string "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/token-sockets-lib.dylib ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore; " "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.dylib ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore")) OBS: extension should be .bundle Linux: gcc -Bsymbolic -shared morph-sockets-lib.c -fPIC -L/usr/local/xledir/lib/ -lxlecore -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.so; gcc -Bsymbolic -shared token-sockets-lib.c -fPIC -L/usr/local/xledir/lib/ -lxlecore -o $PARGRAM/norwegian/bokmal/token-sockets-lib.so --------------------- Main functions and objects: norgram-tokenize-sentence(), calls cgp::named-entity-tokenize() norgram-tag-word(), calls norgram-morphology-regexp() *segments* : (misnomer?) a table which stores tokens and compound segments together with their analyses; acts as a link between the tokenizer and morphology modules *segments* is reinitialized in norgram-tokenize-sentence(). This is not thread-safe (fix!). ||# ;; generate Linux libs #+test (cl-user::run-shell-command "gcc -m32 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -o $XLE/pargram/norwegian/bokmal/morph-sockets-lib-linux2.3.so; gcc -m32 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -o $XLE/pargram/norwegian/bokmal/token-sockets-lib-linux2.3.so") ;; 64 bit #+test (cl-user::run-shell-command "gcc -m64 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/linux.x86.64/ -lxlecore -o $XLE/pargram/norwegian/bokmal/morph-sockets-lib-linux2.3_64bit.so; gcc -m64 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/linux.x86.64/ -lxlecore -o $XLE/pargram/norwegian/bokmal/token-sockets-lib-linux2.3_64bit.so") ;; generate MacOSX libs #+test (cl-user::run-shell-command (concatenate 'string "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/token-sockets-lib.dylib ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore; " "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.dylib ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -flat_namespace")) ;; Bug: laksefiskeboller (??) ;; Problems: ;; Make sure that NE components get same analysis as the isolated words ;; unknown proper nouns get always +Masc ;; Per Hansen doesn't work sentence-initially ;; TO DO: ;; add final -e+- to compound analyser (for cases like bjørne- og ulvestammen) (in-package :morph-server) (eval-when (:compile-toplevel :load-toplevel :execute) (defun getenv (var) #+allegro(system:getenv var) #+sbcl(sb-sys::posix-getenv var))) (defvar *bm* nil) #-orig (setf *bm* (string-net::read-net "projects:xle;morph;bm-morph.net" :translate-p nil)) #+new (setf *bm* (string-net::read-net "projects:xle;morph;bm-morph-clumps.net" :translate-p nil)) #+(or allegro clisp) (defun get-norgram-fst-morphology (word) (with-open-file (stream "/home/paul/lisp/projects/xle/fst/nets/norgram.fst" :direction :output :if-exists :supersede) (write-line "load stack < bok-ana-final-ron2.fst" stream) (format stream "up ~a~%" word)) (cl-user::run-shell-command "cd /home/paul/lisp/projects/xle/fst/nets; ../fst -f norgram.fst > morph.out") (collecting (with-file-lines (line "/home/paul/lisp/projects/xle/fst/nets/morph.out") (unless (or (string= line "bye.") (search "'bok-ana-" line)) (collect line))))) #+(or allegro clisp) (defun get-norgram-fst-tokens (sentence) (with-open-file (stream "/home/paul/lisp/projects/xle/fst/nets/norgram.fst" :direction :output :if-exists :supersede) (write-line "load stack < parse.tok.fst" stream) (format stream "up ~a~%" sentence)) (cl-user::run-shell-command "cd /home/paul/lisp/projects/xle/fst/nets; ../fst -f norgram.fst > morph.out") (collecting (with-file-lines (line "/home/paul/lisp/projects/xle/fst/nets/morph.out") (unless (or (string= line "bye.") (search "'bok-ana-" line)) (collect line))))) #+(or allegro clisp) (defun fst-tokenize (word) (with-open-file (stream "/home/paul/lisp/projects/xle/morph/norgram.fst" :direction :output :if-exists :supersede) (write-line "load stack < cpsplit.fst" ;; "load stack < parse.tok.fst" stream) (format stream "up ~a~%" word)) (cl-user::run-shell-command "cd /home/paul/lisp/projects/xle/morph; ../fst//fst -f norgram.fst > morph.out") (collecting (with-file-lines (line "/home/paul/lisp/projects/xle/morph/morph.out") (unless (or (string= line "bye.") (search "'cpsplit" line)) (collect line))))) #+test (print (fst-tokenize "[Turen|turen][er][nok][lettest][Ã¥][gjennomføre][pÃ¥][ettersommeren][,|, ,][siden][det][til][andre][tider][av][Ã¥ret][kan][være][en del|en del|en`del][snø][i][den][bratte][renna][mellom][Fjedevatnet][og][skaret][under][BlÃ¥bretinden.]")) #+test (print (fst-tokenize "(Per|^per)(spiser)(og)(dyrker)(bananer , .|bananer .|bananer. .|bananer.)")) #+test (print (get-norgram-fst-tokens "Fra Molladalen kan du gÃ¥ sørover Molladalsskardet; til 'Ytre Standal'.")) #+test (print (get-norgram-fst-tokens "Mannen gikk opp ned og har sett seg selv i speilet.")) #+test (print (get-norgram-fst-morphology "seg selv")) #+test (print (print-norgram-morphology "regnet")) #+test (print (print-norgram-morphology "seg selv")) #+test (string-net::nmap-string-values *bm* "spis" (lambda (v) (print v)) #\: nil nil nil) (defparameter *bm-morph-feature-vector* #("Noun" "Prop" "Adj" "Verb" "Aux" "Adv" "Prep" "Interj" "Conj" "Pron" "DetPron" "Ord" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Sent" "Num" "Card" "Pos" "Comp" "Sup" "PastPart" "PresPart" "Masc" "Fem" "Neut" "MF" "MFN" "Def" "Indef" "NoDef" "Def/Pl" "Sg" "Pl" "SP" "Nom" "Acc" "Gen" "Pres" "Past" "Impv" "Infin" "SForm" "Digit" "Unknown")) (defun string-to-feature-string (string) (declare (optimize (speed 3) (safety 0))) (let ((feature-string "")) (loop for c across string for i fixnum from 0 do (let ((byte (the fixnum (char-code c)))) (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf feature-string (concat feature-string "+" (aref *bm-morph-feature-vector* (the fixnum (+ (the fixnum (* i 8)) j))))))))) feature-string)) #-orig ;; disable for clumps! (defun decode-reading (word reading) (destructuring-bind (compressed-lemma coded-features) (split reading #\: 2) (list (decompress-string compressed-lemma word) (string-to-feature-string coded-features)))) ;; enable for clumps! #+test (defun decode-reading (word reading) (let* ((colon-pos (position #\: reading :from-end t)) ;; have to do it like this bc coded-features might contain #\: (clump-id (subseq reading (1+ colon-pos))) (clump-id (if (string= clump-id "") nil clump-id))) (destructuring-bind (compressed-lemma coded-features) (split (subseq reading 0 colon-pos) #\: 2) (list (if clump-id (concat (decompress-string compressed-lemma word) "+" clump-id) (decompress-string compressed-lemma word)) (string-to-feature-string coded-features))))) (defparameter *segments-table* (make-hash-table :test #'equal)) (defun print-norgram-morphology (word &key (stream *standard-output*) (divider #\newline) (encoding :none)) (string-net::nmap-string-values *bm* word (lambda (reading) (format stream "~{~a~}~c" (ecase encoding (:none (decode-reading word reading)) (:utf-8 (mapcar #'utf-8-encode (decode-reading word reading)))) divider)) #\: nil nil nil)) (defparameter +quote-chars+ (mapcar #'code-char (list 34 39 #x00ab #x00bb #x2018 #x2019 #x201a #x201b #x201c #x201d #x201e #x201f #x2033 #x2037 #x2039 #x203a))) (defun strip-emphase-diacritics (word) (let ((emphase-diacritics (mapcar #'code-char '(225 233 237 243 250 253 226 234 238 244 251 224 232 236 242 249))) (emphase-diacritics-stripped "aeiouyaeiouyaeiouy")) (when (find-if (lambda (c) (find c emphase-diacritics)) word) (let ((stripped (copy-seq word))) (loop for c across stripped for i from 0 do (let ((pos (position c emphase-diacritics))) (when pos (setf (char stripped i) (char emphase-diacritics-stripped pos))))) stripped)))) (defun norgram-morphology-regexp (word segments &key segment-type pos heap regexp add-+token-p) #+debug(print (list :word word :segment-type segment-type)) #+debug(dat::do-string-tree (str val segments) (print (list :word word :str str :val val))) (let* ((segment (cond ((zerop (length word)) (warn "word has length zero.") nil) ((or (char= (char word 0) #\+) (char= (char word (1- (length word))) #\+)) (string-trim "+-" word)) ((char= (char word (1- (length word))) #\-) (string-right-trim "-" word)) ((find #\_ word) word) (segment-type word) ((find :named-entity (car (dat:string-tree-get segments word))) word) ((find :named-entity-gen (car (dat:string-tree-get segments word))) word) (t nil))) (morph-list (cond (segment (destructuring-bind (str &optional id) (split segment #\\) (let* ((segment-analyses (dat:string-tree-get segments str)) (segment-analysis (if id (nth (- (length segment-analyses) (parse-integer id) 1) segment-analyses) (car segment-analyses)))) #+debug(print (list :segment segment :segments (dat:string-tree-get segments str))) #+debug(print (list :segment-analysis segment-analysis)) (list (list (car segment-analysis) (cond ((find :e-juncture segment-analysis) "+EJuncture") ((find :s-juncture segment-analysis) "+SJuncture") ((find :hyphen segment-analysis) "+Punct+Hyphen") ((find :named-entity segment-analysis) "+Prop+Indef") ((find :named-entity-gen segment-analysis) "+Prop+Indef+Gen") ((find #\_ (car segment-analysis)) (format nil "~{~a~}" (cdr segment-analysis))) (t (format nil "~a~{+~a~}" (if (or (char= (char word (1- (length word))) #\+) (char= (char word (1- (length word))) #\-) (find segment-type '(:first :first-hyphen :middle :middle-hyphen :last-hyphen))) "+Cmpel1" "+Cmpel2") (cdr segment-analysis))))))))) ;; multiword expression (MWE) ((find #\` word) (list (list (substitute #\Space #\` word) "+MWToken+Prefer"))) ;; missing in the lexicon ((find word '("." "!" "?") :test #'string=) (list (list word "+Punct+Sent"))) ((find word '(",") :test #'string=) (list (list word "+Punct+Comma"))) ((find word '("(" ")" "[" "]" "{" "}") :test #'string=) (list (list word "+Punct+Paren"))) ((find word '("-" "/") :test #'string=) (list (list word "+Punct+Hyphen"))) ((and (= (length word) 1) (find (char word 0) +quote-chars+)) (list (list "'" "+Punct+Quote"))) ((string= word "km") (list (list "kilometer" "+Meas+Masc+Abbr"))) ;; different in lexicon!! ((find word '("oppover" "herfra" "ned") :test #'string=) (list (list word "+Adv"))) ((find word '("til") :test #'string=) (list (list word "+Prep") (list word "+Adv"))) ((find word '("det") :test #'string=) (list (list "den" "+DetPron+Neut+Sg") (list word "+Pron+Neut+Acc") (list word "+Pron+Neut+Nom"))) ((find word '("mange") :test #'string=) (list (list word "+DetPron+MFN+Pl") (list word "+DetPron+Masc+Sg") (list "mang" "+Adj+Pos+MFN+Def/Pl"))) ((find word '("jeg") :test #'string=) (list (list word "+Pron+MFN+Nom"))) ;; numbers ((and (not (find-if-not (lambda (c) (find c "1234567890-.")) word)) (char= (last-char word) #\.)) (list (list word "+Dig+Ord"))) ((not (find-if-not (lambda (c) (find c "1234567890-")) word)) (list (list word "+Dig"))) ((and (not (find-if-not (lambda (c) (find c "1234567890.-")) word)) (not (char= (last-char word) #\.)) (not (char= (char word 0) #\.))) (list (list (remove #\. word) "+Dig"))) ((and (> (length word) 1) (not (find-if-not (lambda (c) (find c "1234567890")) word :end (1- (length word))))) (list (list word "+Prop+Indef+Numlit"))) ;; fallback for fragment analysis ((let ((segment (or (dat:string-tree-get segments word) (dat:string-tree-get segments (concat word "\\0"))))) #+debug(print (list :word word :analysis segment)) (and segment (car (split word #\\)))) ;; sic! car() was cdr() (destructuring-bind (str &optional id) (split word #\\) #+debug(print (list :str str :id id)) (let ((analyses (or (dat:string-tree-get segments word) (dat:string-tree-get segments (concat word "\\0"))))) #+debug(print (list :analysis analyses)) ;; :full-analysis full-analysis)) (collecting (dolist (analysis analyses) (collect (list (car analysis) (format nil (if (and (stringp (cadr analysis)) (char= (char (cadr analysis) 0) #\+)) ;; hack!! "~{~a~}" "~{+~a~}") (cdr analysis))))))))) (t (let ((readings (or (collecting (string-net::nmap-string-values *bm* word (lambda (reading) (collect (decode-reading word reading))) #\: nil nil nil)) (let ((stripped (strip-emphase-diacritics word))) (and stripped (collecting (string-net::nmap-string-values *bm* stripped (lambda (reading) (collect (decode-reading word reading))) #\: nil nil nil))))))) ;; fix for alle+Noun (when (string= word "alle") (setf readings (delete "alle" readings :key #'car :test #'string=))) #+debug(print (list :word word :readings readings)) (or readings (list #+ignore(list word "+Unk") (list word "+Token")))))))) (when (and add-+token-p (not (find (list word "+Token") morph-list :test #'equal))) (setf morph-list (cons (list word "+Token") morph-list))) #+debug(print (list* :word word :morph-list morph-list)) (labels ((build-regexp (morph-list) (if (cdr morph-list) (if heap (build-morph-or-re heap (morph-regexp (car morph-list)) (build-regexp (cdr morph-list)) regexp) (list 'or (morph-regexp (car morph-list)) (build-regexp (cdr morph-list)))) (morph-regexp (car morph-list) regexp))) (morph-regexp (morph &optional regexp) (collecting-into (lower upper positions) (destructuring-bind (lemma features) morph (let* ((fl (u::string-parse features :left-separating-chars "+")) #+kbn (nounp (find "+Noun" fl :test #'equal)) (wl (length word)) (ll (length lemma))) (when (and heap (find segment-type '(:middle :middle-hyphen :last :last-hyphen))) (collect-into lower "") (collect-into upper "+") (collect-into positions 0)) (loop for i from 0 until (and (>= i (if heap (max wl ll) ll)) (null fl)) do (collect-into lower (cond ((< i ll) (string (char lemma i))) (fl (pop fl)) (t ""))) (collect-into upper (if (< i wl) (string (char word i)) "")) (collect-into positions (if (and pos (< i wl)) (+ pos i) 0))) #+kbn (when nounp (collect-into lower "+Econ") (collect-into upper "") (collect-into positions 0)) (when (and heap (find segment-type '(:first :first-hyphen :middle :middle-hyphen))) (collect-into lower "") (collect-into upper "+") (collect-into positions 0)))) #+debug(print (list upper lower)) (if heap (build-morph-seq-re heap lower upper positions regexp) (build-morph-seq lower upper positions)))) (escape (str) (subst-substrings str '("(" "{" ")" "}"))) (build-morph-seq (lower upper positions) (if (cdr lower) (list 'seq (list 'pair (escape (car lower)) (escape (car lower)) (car positions)) (build-morph-seq (cdr lower) (cdr upper) (cdr positions))) (list 'pair (escape (car lower)) (escape (car lower)) (car positions))))) #+debug(print (list :morph-list morph-list)) (if heap (build-regexp morph-list) (values (if (and (null (cdr morph-list)) (equal (cadar morph-list) "")) "" (build-regexp morph-list)) morph-list))))) (defun norgram-tag-word (word &key client) #+debug(print (list :tag-word word :client client)) (let* ((segments (gethash client *segments-table*)) (colon-pos (position #\: word)) (plusp (char= (char word 0) #\+)) (pos (when colon-pos (parse-integer word :start (if plusp 1 0) :end colon-pos :junk-allowed t))) (word (if (and colon-pos (> (length word) 1)) (subseq word (1+ colon-pos)) word)) (word (if (and colon-pos plusp) (concat "+" word) word)) (*package* (find-package :morph-server))) #+debug(print (list :word word :pos pos)) (with-output-to-string (stream) (cond ((char= (char word 0) #\^) (write (norgram-morphology-regexp (subseq word 2) segments :pos pos) :stream stream :pretty nil :level nil)) (t (write (norgram-morphology-regexp word segments :pos pos) :stream stream :pretty nil :level nil)))))) (defparameter *aux-lex-net* (let ((string-net (make-instance 'string-net::list-string-net)) (*print-circle* t)) (setf (list-net string-net) (cadr '(x #1=((#\1 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\2 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\3 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\4 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\5 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\6 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\7 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\8 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\9 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#) (#\0 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#))))) (list-net string-net))) (progn (defparameter *bm-analyser-lexicon* (let ((mfv *bm-morph-feature-vector*)) (make-instance 'compound-analyser-lexicon :morph-feature-vector mfv :morph-feature-table (let ((mft (dat::make-string-tree))) (loop for i from 0 for f across mfv do (setf (dat:string-tree-get mft (string-downcase f)) i)) mft) :fullform-net *bm* :aux-net *aux-lex-net* :unknown-feature 'Unknown :special-readings-fn (lambda (chunk) (cond ((not (find-if-not (lambda (c) (find c "1234567890")) chunk)) (list (list chunk chunk (morph::bv-encode-features *analyser-lexicon* "Dig")))))) :package :morph-server :compound-regexp (make-instance 'fsa::feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or ;; Unknown (:and Adj Pos) (:and Prop (:not Gen)) (:and Noun Indef (:or Sg SP) (:not Gen)) (:and Verb Infin (:not SForm)) (:and Verb Impv (:not SForm)) #+not-yet(:and det kvant) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::immediate-pre-s-juncture) :s-juncture (:? :hyphen)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture) :e-juncture (:? :hyphen)) (:seq (:and Prop (:not Gen) ,morph::immediate-pre-s-juncture) :s-juncture (:? :hyphen)) (:seq (:and Prop (:not Gen) ,morph::pre-e-juncture) :e-juncture (:? :hyphen)) (:seq (:or (:and Noun Indef (:or Sg SP) (:not Gen)) ;; Digit ) :hyphen)) ;; inner chunks (:* (:or (:and Noun Indef (:or Sg SP) (:not Gen)) (:and Verb Infin (:not SForm)) (:and Verb Impv (:not SForm)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture) :s-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture) :e-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen)) :hyphen))) ;; last chunk (:or Noun Adj PastPart :hyphen)) ;; why hyphen? (:seq ;; first chunk (:or (:seq Digit :hyphen)) ;; inner chunks (:* (:or (:and Noun Indef (:or Sg SP) (:not Gen)) (:and Verb Infin (:not SForm)) (:and Verb Impv (:not SForm)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture) :s-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture) :e-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen)) :hyphen))) ;; last chunk (:or Noun)))) :hyphenated-compound-regexp (make-instance 'fsa::feature-regexp :name "Regexp for hyphenated compounds" :source-regexp `(:seq ;; first chunk (:or (:and Adj Pos #+ignore(:or "lav" "høy" "høg" (:not Neut))) (:seq (:and Noun Indef (:or Sg SP) (:not Gen)) (:? :hyphen)) (:and Verb Infin (:not SForm)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::immediate-pre-s-juncture) :s-juncture (:? :hyphen)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture) :e-juncture (:? :hyphen))) ;; inner chunks (:* (:or (:and Noun Indef (:or Sg SP) (:not Gen)) (:and Verb Infin (:not SForm)) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture) :s-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture) :e-juncture) (:seq (:and Noun Indef (:or Sg SP) (:not Gen)) :hyphen))) ;; last chunk is hyphen; not handled here ;; number ;; Problem: det kvant has no counterpart in norgram #+not-yet (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" adv) (:and det kvant))))))))) (setf *analyser-lexicon* *bm-analyser-lexicon*)) #|| (print-norgram-morphology "lavt") (print (norgram-tokenize-sentence "Hvis pasienten ikke har puls må du i tillegg sette i gang med hjertekompresjon." :preparse-p nil)) (print (norgram-tokenize-sentence "Preposisjonsfrasetilknytningen er ikke normalisert." :preparse-p nil)) (print (norgram-tokenize-sentence "nasjonalparkgrensen" :preparse-p nil)) (print (norgram-tokenize-sentence "pensjons-" :preparse-p nil)) (print (norgram-tokenize-sentence "pensjons- og lønnsressursene" :preparse-p nil)) (print (norgram-tokenize-sentence "ulvebjørne-" :preparse-p nil)) (print (norgram-tokenize-sentence "kattekjøtt-" :preparse-p nil)) (print (norgram-tokenize-sentence "bjørne- og ulvestammen" :preparse-p nil)) (print (norgram-tokenize-sentence "bjørnekjøtt- og fleskebÃ¥ten" :preparse-p nil)) (print (norgram-tokenize-sentence "flyvehjelp" :preparse-p nil)) (print (norgram-tokenize-sentence "usiktbart" :preparse-p nil)) (print (norgram-tokenize-sentence "bjørne-" :preparse-p nil)) (print (norgram-tokenize-sentence "Fisker fisker appelsingravet eller gravet laks." :preparse-p t)) (print (norgram-tokenize-sentence "Per spiser bøker i Kvarvenveien i sentrum." :preparse-p nil)) (print (norgram-tokenize-sentence "Nesten uansett vær- og føreforhold er det mulig Ã¥ gÃ¥ pÃ¥ tur i disse traktene." :preparse-p nil)) (print (norgram-tokenize-sentence "Jeg sÃ¥ en jente og en jente i skogen." :preparse-p nil)) (print (norgram-tokenize-sentence "Jeg var pÃ¥ Universitetet i Ljubljana i gÃ¥r." :preparse-p nil)) (print (norgram-tokenize-sentence "Men Bjørnefloden er her." :preparse-p nil)) (print (norgram-tokenize-sentence "kjøp heste-kjerrer." :preparse-p nil)) (print (norgram-tokenize-sentence "a(b)d[c]r {q}r" :preparse-p nil)) ||# ;; generate list of MWEs (defparameter *mwe-list* nil) (defun build-mwe-list () (setf *mwe-list* nil) (let* ((mwe-dir (getenv "MWEDIR")) (mwe-dir (cond ((and mwe-dir (not (equal mwe-dir ""))) mwe-dir) ((let ((logonroot (getenv "LOGONROOT"))) (and logonroot (not (equal logonroot "")))) (concat (getenv "LOGONROOT") "/parc/pargram/norwegian/bokmal/")) ((let ((pargram (getenv "PARGRAM"))) (and pargram (not (equal pargram "")))) (concat (getenv "PARGRAM") "/norwegian/bokmal/"))))) (setf *mwe-list* (collecting (dolist (file '("bokmal-lex-mrs.lfg" "bokmal-mwe.lfg" "bokmal-nkllex.lfg" "bokmal-nklvrblex.lfg")) (with-file-lines (line (concat mwe-dir file) :external-format :iso-8859-1) (when (and (find #\` line) (not (find (char line 0) '(#\Space #\Tab #\")))) (let ((mwe-end (loop for i from 0 for c across line with prev-c until (and (find c '(#\Space #\Tab)) (char/= prev-c #\`)) do (setf prev-c c) finally (return i)))) (collect (mapcar (lambda (w) (string-trim "`" w)) (split (subseq line 0 mwe-end) #\Space)))))))))) #+debug(print *mwe-list*)) ;;#+test (build-mwe-list) (defun mwe-list (token tokens) ;;(print (list :token token :tokens tokens)) (let ((match ())) (dolist (mwe-list *mwe-list*) ;;(print (list (car mwe-list) token)) (when (string-equal (car mwe-list) token) (labels ((match-p (tokens tail) (cond ((null tail) t) ((null tokens) nil) ((string-equal (cadar tokens) (car tail)) (match-p (cdr tokens) (cdr tail))) (t nil)))) (when (and (match-p tokens (cdr mwe-list)) (> (length mwe-list) (length match))) (setf match (loop for token in tokens for mwe in (cdr mwe-list) collect (list (cadr token) (car token))) #+old (cdr mwe-list)))))) #+debug(print (list :token token :tokens tokens :match match)) match)) #+orig (defun mwe-list (token tokens) ;;(print (list :token token :tokens tokens)) (let ((match ())) (dolist (mwe-list *mwe-list*) (when (string-equal (car mwe-list) token) (labels ((match-p (tokens tail) (cond ((null tail) t) ((null tokens) nil) ((string-equal (cadar tokens) (car tail)) (match-p (cdr tokens) (cdr tail))) (t nil)))) (when (and (match-p tokens (cdr mwe-list)) (> (length mwe-list) (length match))) (setf match (cdr mwe-list)))))) #+debug(print (list :token token :tokens tokens :match match)) match)) #+test (print (norgram-tokenize-sentence "Turen er nok lettest å gjennomføre på ettersommeren, siden det til andre tider av Ã¥ret kan være en del snø i den bratte renna mellom Fjedevatnet og skaret under BlÃ¥bretinden.")) #+test (print (cgp::named-entity-tokenize "Det er et utmerket sted å tilbringe både påske- og sommerferie.")) #+test (print (cgp::named-entity-tokenize "Det er en vannbringebærfrukt.")) #+test (print (cgp::named-entity-tokenize "Hunden bjeffer.")) #+test (print (cgp::named-entity-tokenize "Vi jobber i Innovasjon Norge.")) #+test (print (norgram-tokenize-sentence "Vi på Universitetet i Bergens institutter.")) #+test (print (norgram-tokenize-sentence "M/B Bitihorn" :preparse-p nil)) #+test (print (norgram-tokenize-sentence "Grønolen fjellgård, Beito, 50 senger, 8 leiligheter, tlf.: 61 35 29 90, e-post: gronolen@gronolen.no" :preparse-p nil :heap t)) (defun norgram-tokenize-sentence (sentence &key preparse-p client heap segments) #+debug(print (list :sentence sentence :client client :preparse-p preparse-p)) (if (equal sentence "") (unless heap "[]") (let* ((segments (or segments (dat::make-string-tree))) (cgp::*separate-quotes-p* t) (tokens (cgp::named-entity-tokenize sentence :preparse-p preparse-p))) (unless heap (setf (gethash client *segments-table*) segments)) #+debug(print (list :sentence sentence :tokens tokens)) (labels ((analyzed-token-regexp (disjunction-list) (with-output-to-string (stream) (loop for (dis . rest) on disjunction-list do (format stream (cond ((cadddr (cddr (caar dis))) ;; unanalyzed hyphenated compound (ugly cddrerrarardar!) ;;"~{~a~^ ~}~{ ~a~}" "~{~a~^ ~}") ((cadddr (cdaar dis)) ;;"~{~a~^+~}-~{ ~a~}" "~{~a~^+~}-") (t ;;"~{~a~^+~}~{ ~a~}" "~{~a~^+~}")) (mapcar (lambda (segment) (destructuring-bind (str pos id dcp &optional next-hyphen-p unanalized-hyphen-p) segment (declare (ignore unanalized-hyphen-p)) (let ((length (when id (length (dat::string-tree-get segments str))))) (if (and id (> length 1)) (format nil "~a\\~d" str id) str)))) (car dis))) (dolist (pair (cdr dis)) (format stream " ~a" (car pair))) when rest do (write-char #\| stream)))) (get-unambiguous-token (token) (if (dat::string-tree-get segments token) (loop for id from 1 while (dat::string-tree-get segments (format nil "~a\\~d" token id)) finally (return (progn ;;(print (list :uat (format nil "~a\\~d" token id))) (format nil "~a\\~d" token id)))) token))) (let* ((firstp nil) (first-found-p nil) ;; in case there is more than one token with the same value record the number of them seen here (token-count ()) (prev-token ()) (full-token-list nil) (analyzed-sentence-list (collecting #+debug(print tokens) (loop while tokens do (setf full-token-list (pop tokens)) #+debug(print (list :full-token-list full-token-list)) (when (cdr full-token-list) (destructuring-bind (pos full-token &optional ne-marker norgram-features) full-token-list (when (and (null first-found-p) (find-if-not (lambda (c) (or (find c +quote-chars+) (find c "-_()[].?+*/"))) full-token)) (setf firstp t first-found-p t)) (let* ((upper-case-p (upper-case-p (char full-token 0))) (prev-is-colon-p (equal ":" (cadr prev-token))) (first-uppercase-p (and (or firstp prev-is-colon-p) upper-case-p)) (ne-list (unless (or (null ne-marker) (find ne-marker '(:match-end :whole-match))) (loop for ne-token.ne-marker = (pop tokens) collect (list (cadr ne-token.ne-marker) (car ne-token.ne-marker)) until (find (caddr ne-token.ne-marker) '(:match-end :dig-match-end))))) (next-is-.-p (find "." (cdar tokens) :test #'equal)) (next-is-hyphen-p (find "-" (cdar tokens) :test #'equal)) (last-token (if ne-list (caar (last ne-list)) full-token)) (last-char-is-.-p (or (and (> (length last-token) 1) (char= (char last-token (1- (length last-token))) #\.)) next-is-.-p)) (last-dot-pos (when last-char-is-.-p (caar tokens))) ;; ?? (token (if (and last-char-is-.-p (not next-is-.-p)) (subseq full-token 0 (1- (length full-token))) full-token)) (unambiguous-token (if preparse-p (get-unambiguous-token token) token))) #+debug(print (list :last-char-is-.-p last-char-is-.-p :next-is-.-p next-is-.-p :next-is-hyphen-p next-is-hyphen-p :prev-is-colon-p prev-is-colon-p :prev-token prev-token :full-token full-token :tokens tokens :last-dot-pos last-dot-pos :pos pos)) (collect (collecting (cond ((find ne-marker '(:match-start :dig-match-start :whole-match)) #+debug(print (list :ne-token token full-token)) (when (and next-is-.-p (eq ne-marker :whole-match)) (setf full-token (concat full-token ".")) (setf first-found-p nil) ;; new 9.10.2006 (pop tokens)) (let ((next-is-.-p (find "." (car tokens) :test #'equal))) #+debug(print (list :token token :full-token full-token :ne-list ne-list :tokens tokens :next-is-.-p next-is-.-p)) (when next-is-.-p (pop tokens)) (cond ((eq ne-marker :whole-match) nil) (next-is-.-p (collect-append (segment-compound full-token :punctuation (list (append ne-list (list (list "." pos)))) ;; pos is still wrong! :analyze-p nil :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap))) (t #+debug(print (list :ft full-token)) (collect-append (segment-compound full-token :punctuation (list ne-list) ;; fix pos! :end-hyphen-p next-is-hyphen-p :analyze-p nil :pos pos :heap heap)))) (let ((ne (format nil (if (eq ne-marker :dig-match-start) "~a~{~a~}" "~a~{_~a~}") token (mapcar #'car ne-list)))) #+debug(print (list :ne ne :ne-list ne-list :norgram-features norgram-features :last-char-is-.-p last-char-is-.-p :next-is-.-p next-is-.-p)) (dolist (reading norgram-features) (push (if (find "+Gen" (cdr reading) :test #'string=) (let ((no-genitive (cond ((string= ne "'s" :start1 (- (length ne) 2)) (subseq ne 0 (- (length ne) 2))) ((string= ne "s" :start1 (- (length ne) 1)) (subseq ne 0 (- (length ne) 1))) ((string= ne "'" :start1 (- (length ne) 1)) (subseq ne 0 (- (length ne) 1))) (t ne)))) (list no-genitive #+ignore (car reading) (cadr reading) :named-entity-gen #+ignore "+Noun" "+Indef" "+Gen")) (list ne #+ignore(car reading) (cadr reading) :named-entity #+ignore "+Noun" "+Indef")) (dat::string-tree-get segments ne))) (collect-append (segment-compound ne :punctuation (when (or last-char-is-.-p next-is-.-p) (list (list (list "." pos)))) ;; pos is still wrong! :end-hyphen-p next-is-hyphen-p :analyze-p (and (eq ne-marker :whole-match) first-uppercase-p) ;;nil ;; new 27.10.2006 :analyze-compound-name-p (eq ne-marker :whole-match) :segments segments :pos pos :heap heap))))) (t (when (and next-is-.-p #+test(not last-char-is-.-p)) (setf full-token (concat full-token ".")) (setf first-found-p nil) ;; new 9.10.2006 (pop tokens)) (let ((mwe-list (mwe-list token tokens))) #+debug(print (list :mwe-list mwe-list :token token :tokens tokens :nf norgram-features)) (cond (mwe-list (dotimes (i (length mwe-list)) (pop tokens)) (let* ((next-is-.-p (find "." (car tokens) :test #'equal)) (dot-pos (when next-is-.-p (caar tokens)))) (cond (next-is-.-p (collect-append (segment-compound full-token :punctuation (list (append mwe-list (list (list "." dot-pos)))) :end-hyphen-p next-is-hyphen-p :analyze-p nil :segments segments :pos pos :heap heap)) (let ((downcased (string-downcase full-token))) (unless (string= downcased full-token) (collect-append (segment-compound downcased :punctuation (list (append mwe-list (list (list "." dot-pos)))) :end-hyphen-p next-is-hyphen-p :analyze-p nil :segments segments :pos pos :heap heap)))) (let* ((mwe-list-copy (copy-tree mwe-list)) ;; sic! (last (last mwe-list-copy))) (setf (caar last) (concat (caar last) ".")) (collect-append (segment-compound full-token :punctuation (list (append mwe-list-copy (list (list "." dot-pos))) mwe-list-copy) :end-hyphen-p next-is-hyphen-p :analyze-p nil :segments segments :pos pos :heap heap)) (let ((downcased (string-downcase full-token))) (unless (string= downcased full-token) (collect-append (segment-compound downcased :punctuation (list (append mwe-list-copy (list (list "." dot-pos))) mwe-list-copy) :end-hyphen-p next-is-hyphen-p :analyze-p nil :segments segments :pos pos :heap heap))))) (pop tokens)) (t (collect-append (segment-compound full-token :punctuation (list mwe-list) :segments segments :pos pos :heap heap)) (collect-append ;; add test on downcase (segment-compound (string-downcase full-token) :punctuation (list mwe-list) :downcased-p t :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap)))) (let ((mwe (format nil "~a~{`~a~}" (string-downcase full-token) (mapcar #'car mwe-list)))) ;; make more efficient, no consing! #+debug(print (cons :norgram-features norgram-features)) (push (list mwe "+MWToken" "+Prefer") (dat::string-tree-get segments mwe)) (if next-is-.-p (collect `(((,mwe ,pos NIL NIL)) ("." ,dot-pos))) (collect `(((,mwe ,pos NIL NIL)))))))) (last-char-is-.-p #+debug(print (list :last-char-is-.-p last-char-is-.-p :token token :full-token full-token :unambiguous-token unambiguous-token)) (when preparse-p ;; fallback (dolist (reading norgram-features) ;; what about full-token? (when (or (not first-uppercase-p) (upper-case-p (char (car reading) 0))) (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0")))))) (let ((analysis-found-p nil)) (collect-append (multiple-value-bind (analysis found-p) (segment-compound unambiguous-token :end-hyphen-p next-is-hyphen-p :punctuation `((("." ,last-dot-pos))) :segments segments :pos pos :heap heap) (setf analysis-found-p found-p) analysis)) (collect-append (multiple-value-bind (analysis found-p) (segment-compound full-token :punctuation (unless tokens `((("." ,last-dot-pos)) #+avoid-*TOP*())) :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap) (setf analysis-found-p (or analysis-found-p found-p)) analysis)) ;; try name+noun analysis 12.10.2006 (when (and (not analysis-found-p) (not first-uppercase-p) (string/= unambiguous-token (string-capitalize unambiguous-token))) (collect-append (segment-compound unambiguous-token :end-hyphen-p next-is-hyphen-p :punctuation `((("." ,last-dot-pos))) :segments segments :capitalize-p t :pos pos :heap heap))) (when first-uppercase-p (when preparse-p ;; fallback (dolist (reading norgram-features) (unless (upper-case-p (char (car reading) 0)) (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0")))))) (collect-append ;; what about unambiguous? (segment-compound (string-downcase token) :punctuation `((("." ,last-dot-pos))) :downcased-p t :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap)) (collect-append (segment-compound (string-downcase full-token) :punctuation `((("." ,last-dot-pos)) ()) :downcased-p t :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap))))) ((find token '("," ":" ";" "!" "?") :test #'string=) (collect-append (segment-compound token :punctuation nil :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap))) (t (let ((analysis-found-p nil)) #+debug(print (list next-is-hyphen-p token)) (collect-append (multiple-value-bind (analysis found-p) (segment-compound unambiguous-token :punctuation nil :segments segments :end-hyphen-p next-is-hyphen-p :pos pos :heap heap) (setf analysis-found-p found-p) analysis)) ;; try name+noun analysis 12.10.2006 (when (and (not analysis-found-p) ;; ?? (not first-uppercase-p) (string/= unambiguous-token (string-capitalize unambiguous-token))) (collect-append (multiple-value-bind (analysis found-p) (segment-compound unambiguous-token :punctuation nil :segments segments :capitalize-p t :end-hyphen-p next-is-hyphen-p :pos pos :heap heap) (setf analysis-found-p found-p) analysis))) ;; fallback (when (or preparse-p (not analysis-found-p) (and (eq analysis-found-p :compound) firstp)) #+debug(print (list :norgram-features norgram-features)) (dolist (reading norgram-features) (when (or ;;(not first-uppercase-p) *** what was that for?? (upper-case-p (char (car reading) 0))) #+debug(print (list :reading1 reading)) (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0")))))) #+debug(print (list :token token :unambiguous-token unambiguous-token :analysis-found-p analysis-found-p :firstp firstp :fup first-uppercase-p :nf norgram-features)) (when first-uppercase-p ;; fallback (when (or preparse-p #+ignore(not analysis-found-p)) (dolist (reading norgram-features) (unless (and #+ignore analysis-found-p (upper-case-p (char (car reading) 0))) (push reading (dat::string-tree-get segments (concat (string-downcase token) "\\0")))))) (collect-append (segment-compound (string-downcase token) :punctuation nil :downcased-p t :end-hyphen-p next-is-hyphen-p :segments segments :pos pos :heap heap))))))))))) (setf firstp nil)))) (setf prev-token full-token-list))))) #+debug(format t "~&analyzed-sentence-list: ~s" analyzed-sentence-list) #+debug(print segments) #+debug (dat:do-string-tree (string val segments) (print (list string val))) #+debug(format t "~%~{[~a]~}~%" (mapcar #'analyzed-token-regexp analyzed-sentence-list)) (if heap analyzed-sentence-list (values (format nil "~{[~a]~}" (mapcar #'analyzed-token-regexp analyzed-sentence-list)) analyzed-sentence-list))))))) #+test (print (norgram-tokenize-sentence "Jeg ser Bjørnefloden." :preparse-p nil)) #+test (print (norgram-tokenize-sentence "pensjons-" :preparse-p nil)) #+test (print (get-norgram-fst-morphology "urd")) #+test (print (norgram-tokenize-sentence "Han ser seg selv." :preparse-p t)) #+test (print (norgram-tokenize-sentence "motorsykkelhjulet" :preparse-p nil)) #+test (print (norgram-tag-word "alle")) #+test (print (norgram-tokenize-sentence "1700-tallet")) #+test (print *segments*) (defun fst-escape-string (string &optional dont-escape) (if dont-escape string (subst-substrings string '("[" "(" "]" ")")))) ;; preliminary! (defparameter *compound-table* nil) (defun build-compound-table () (let* ((logonroot (getenv "LOGONROOT")) (logonroot (if (or (null logonroot) (equal logonroot "")) "~/logon" logonroot)) (file (concat logonroot "/uio/nw/compounds/compounds.logon"))) (when (not (probe-file file)) (warn "file ~s could not be found." file) (return-from build-compound-table)) (setf *compound-table* (dat::make-string-tree)) (with-file-lines (line file) (let ((line (string-trim '(#\space #\tab) line))) (unless (or (zerop (length line)) (char= (char line 0) #\;)) (destructuring-bind (id compound segments &optional translation) (split line #\tab) (declare (ignore id translation)) (setf (dat:string-tree-get *compound-table* compound) (loop for (seg pos) on (split segments #\space) by #'cddr ;; when (< (length seg) 3) do (format t "~%short segment: ~a~%" seg) collect (cons seg (cond ((string= pos "subst") '("Noun")) ((string= pos "verb") '("Verb")) ((string= pos "adj") '("Adj")) ((string= pos "det") ;; ?? '("Num")) (t (error "pos ~s not recognized in line: ~s" pos line)))))))))) ;; recursively replace segments which are compounds by their segmentation (let ((changedp nil)) (labels ((replace-segments () (dat:do-string-tree (compound segments *compound-table*) (loop for segment-tail on segments do (let ((replacement (dat:string-tree-get *compound-table* (caar segment-tail)))) (when (and replacement (equal (cadar segment-tail) (cadar (last replacement)))) (setf changedp t (car segment-tail) (car replacement) (cdr segment-tail) (append (copy-seq (cdr replacement)) (cdr segment-tail))))))))) (loop do (replace-segments) while changedp do (setf changedp nil)))))) #+test (dat:do-string-tree (compound segments *compound-table*) (print (cons compound segments))) ;;#+test (build-compound-table) #+test (let ((*analyser-lexicon* *bm-analyser-lexicon*)) (print (segment-compound "tegningsrettsemisjon" :segments (dat::make-string-tree)))) #+test (print (segment-compound "fiskegiftstoff" :segments (dat::make-string-tree))) #+test (print (segment-compound "landskapsvernområde" :segments (dat::make-string-tree))) #+test (print (segment-compound "kildebergart" :segments (dat::make-string-tree))) #+test (let ((segments (dat::make-string-tree))) (print (segment-compound "ewwikk" :segments segments)) (print segments)) #+test (let ((*analyser-lexicon* *bm-analyser-lexicon*)) (let ((word "landskapsvernfisk")) (multiple-value-bind (c-analyses lexical-compound-p) (compound-optimal-analyses word :max 4 :restore-char-p nil :min-length-only-p t ;;:listed-compounds *compound-table* ) #-debug(print (list :word word :c-anal c-analyses :lexical-compound-p lexical-compound-p)) ))) #+test (print (dat::string-tree-get *compound-table* "villreinjakt")) ;; To do: exclude token + "."-reading for unknown hyphenated words? (defun segment-compound (word &key punctuation downcased-p (analyze-p t) analyze-compound-name-p capitalize-p segments end-hyphen-p pos heap) #+debug(print (list :analyze-p analyze-p :analyze-compound-name-p analyze-compound-name-p)) (multiple-value-bind (c-analyses lexical-compound-p) (when (or analyze-p analyze-compound-name-p) (compound-optimal-analyses (if capitalize-p (string-capitalize word) word) :max 4 :restore-char-p nil :min-length-only-p t :listed-compounds *compound-table* :end-hyphen-p end-hyphen-p :try-downcase-p (not analyze-compound-name-p) ;; new 27.10.2006 )) #+debug(print (list :word word :c-anal c-analyses :analyze-p analyze-p :analyze-compound-name-p analyze-compound-name-p :capitalize-p capitalize-p :downcased-p downcased-p)) (let ((found-p t)) #+debug(print segments) #+debug(print (list :word word :downcased-p downcased-p :c-analyses c-analyses)) (values (collecting (cond ((and (not lexical-compound-p) (not end-hyphen-p) (block find (or (string-net::nmap-string-values *bm* word (lambda (reading) (declare (ignore reading)) (return-from find t)) #\: nil nil nil) (let ((stripped (strip-emphase-diacritics word))) (and stripped (string-net::nmap-string-values *bm* stripped (lambda (reading) (return-from find t)) #\: nil nil nil))) (and (upper-case-p (char word 0)) (string-net::nmap-string-values *bm* (string-downcase word) (lambda (reading) (declare (ignore reading)) (return-from find t)) #\: nil nil nil))))) (if punctuation (dolist (punct punctuation) (collect (cons (list (list (fst-escape-string word heap) pos nil downcased-p)) punct))) (collect (list (list (list (fst-escape-string word heap) pos nil downcased-p)))))) #-without-compounds (c-analyses #+debug(print (list :c-analyses c-analyses)) (dolist (punct (or punctuation (list nil))) (loop for c-analysis in c-analyses do (collect (collecting (collect (collecting (let ((chunk-pos 0)) (dolist (segment c-analysis) (let* (;; verb stems are analysed as Impv, but NorGram needs them as Infin (features (substitute "Infin" "Impv" (cdr segment) :test #'string=)) (features (if (and capitalize-p (eq segment (car c-analysis))) (append features (list "Cap")) features)) (chunk (if (and capitalize-p (eq segment (car c-analysis))) (string-downcase (car segment)) (car segment))) (position (position features (dat::string-tree-get segments chunk) :test #'equal))) (unless position (setf position 0) (push features (dat::string-tree-get segments chunk))) (collect (list chunk (+ pos chunk-pos) (- (length (dat::string-tree-get segments chunk)) position 1) (when (eq segment (car c-analysis)) downcased-p) end-hyphen-p)) (incf chunk-pos (length chunk))))))) (collect-append punct)))) (setf found-p :compound) (when (and (not end-hyphen-p) (or #-ignore downcased-p (not lexical-compound-p))) (collect (cons (list (list word pos nil downcased-p)) punct))))) ;; unrecognized hyphenated word ((and (find #\- word)) (setf found-p nil) (let ((segs (split word #\-))) (dolist (punct (or punctuation (list nil))) (collect (collecting (collect (collecting (let ((p pos)) (loop for (seg . rest) on segs do (cond (rest (unless (zerop (length seg)) (collect (list (fst-escape-string seg heap) p nil (when (eq (car segs) seg) downcased-p) nil t)) (incf p (length seg))) (collect (list "-" p nil nil nil t)) (incf p)) ((zerop (length seg)) nil) (t (collect (list (fst-escape-string seg heap) p nil nil nil t)))))))) (collect-append punct)))))) (t (setf found-p nil) (if punctuation (dolist (punct punctuation) (collect (cons (list (list (fst-escape-string word heap) pos nil downcased-p)) punct))) (collect (list (list (list (fst-escape-string word heap) pos nil downcased-p))))))) ) found-p)))) #+test (print (segment-compound "bananhelikopter" :punctuation '(("." ".") (".")))) #+test (print (segment-compound "kildebergart")) #+test (print-ranked-analyses-regexp "fiskehansker") #+test (let ((*segments* nil)) (print (norgram-tokenize-sentence "sensommerdag")) (print *segments*) (print (norgram-morphology-regexp "sen+" *segments*)) (print (norgram-morphology-regexp "+sommer+" *segments*)) (print (norgram-morphology-regexp "+dag" *segments*))) #+test (let ((*segments* nil)) (print (norgram-tokenize-sentence "villreinjakt")) (print *segments*) (print (norgram-morphology-regexp "vill+" *segments*)) (print (norgram-morphology-regexp "+rein+" *segments*)) (print (norgram-morphology-regexp "+jakt" *segments*))) #+test (print (norgram-tokenize-sentence "kildebergart")) #+test (let ((*segments* nil)) (print (norgram-tokenize-sentence "Kari spiser fiskeboller.")) (print *segments*) (print (norgram-morphology-regexp "fisk+")) (print (norgram-morphology-regexp "+hatt"))) #+test (let ((*segments* nil)) (print (norgram-tokenize-sentence "Per" :preparse-p nil)) (print *segments*) (print (norgram-morphology-regexp "per"))) #+test (let ((client 1)) (print (norgram-tokenize-sentence "pÃ¥ Universitetet i Ljubljana" :client 1)) (let* ((segments (gethash client *segments-table*))) (print segments) (print (norgram-morphology-regexp "Universitetet_i_Ljubljana" segments)))) #+test (let ((client 1)) (print (norgram-tokenize-sentence "pÃ¥ ewwikk" :client 1)) (let* ((segments (gethash client *segments-table*))) (print segments) (print (norgram-morphology-regexp "ewwikk" segments)))) #+test (let ((client 1)) (print (norgram-tokenize-sentence "jeg")) (let ((segments (gethash client *segments-table*))) (print (nth-value 1 (norgram-morphology-regexp "jeg" segments))))) #+test (print (nth-value 1 (norgram-morphology-regexp "fisk" nil))) #+test (print (list (norgram-tokenize-sentence "På Byfjellene kan du raste, slå leir og overnatte.") *segments*)) #+test (setf *bm* (string-net::read-net "~/lisp/projects/xle/morph/bm-morph.net" :translate-p nil)) ;; gcc morph-sockets.c -o morph-sockets ;;(defparameter *socket-server* (socket:socket-server 2001)) ;;(defparameter *socket-stream* (socket:socket-accept *socket-server*)) ;;(socket:socket-status *socket-server*) #+clisp (defun start-server (&optional (stream-count 2)) (let* ((socket-server (socket:socket-server 2001)) ;;(socket-stream (socket:socket-accept socket-server)) (streams (make-array stream-count :initial-element nil)) (stream-nr 0) (sentence nil)) (print "server started.") (unwind-protect (loop do (unless (aref streams stream-nr) (setf (aref streams stream-nr) (ignore-errors (socket:socket-accept socket-server :timeout '(0 . 50)))) #+debug (if (aref streams stream-nr) (format *standard-output* "~s opened.~%" (aref streams stream-nr)) (format *standard-output* "Socket stream no. ~d still waiting.~%" stream-nr))) (when (aref streams stream-nr) (let ((socket-stream (aref streams stream-nr))) (when (find (socket:socket-status socket-stream) '(:input :io)) (setf sentence (read-line socket-stream)) (print sentence *standard-output*) (cond ((zerop (length sentence)) (write-string "" socket-stream)) ((eql 0 (search "tokenize" sentence)) (write-string (princ (norgram-tokenize-sentence (utf-8-decode (subseq sentence 9))) *standard-output*) socket-stream)) (t (write-string ;;(utf-8-encode (norgram-tag-word (utf-8-decode sentence))) (norgram-tag-word (utf-8-decode sentence)) socket-stream)))))) (setf stream-nr (mod (1+ stream-nr) stream-count)) while (or (null sentence) (> (length sentence) 0))) (princ "Closing socket connection.") (socket:socket-server-close socket-server) (cl-user::quit)))) #+test (ext:saveinitmem "/home/paul/lisp/projects/xle/morph/morph-server.mem" :quiet t :init-function #'start-server) (defparameter *socket-server* nil) (defparameter *server-connections* 0) (defparameter *client-processes* (make-hash-table)) #+allegro (ff:def-foreign-call getpid (:void)) ;; shutdown morph-server when parent process goes down #+allegro (mp::process-run-function "check-ppid" (lambda () (loop while (getppid) do (sleep 10) finally (print :exiting) (cl-user::exit)))) #+allegro (ff:def-foreign-call getppid (:void)) #+(or openmcl allegro) (defun start-server (&optional socket-server (proc-id 0)) (let ((socket-server (or socket-server (socket:make-socket :connect :passive ;;:local-host "decentius.aksis.uib.no" ;; client sets port in morph-sockets-lib.c :local-port 2001 #+allegro :reuse-address #+allegro t)))) #+debug(print (list :socket-server socket-server :pid (getpid))) (let* ((socket-stream (socket:accept-connection socket-server)) ;; only returns when new connection request comes in (client-pid (parse-integer (read-line socket-stream nil nil))) (sentence nil)) ;; write pid to /tmp. This will be used to kill the process from the Allegro that loaded the XLE lib #+debug(print (list :client-pid client-pid)) (with-open-file (stream (format nil "/tmp/morph-server-pid.~d" client-pid) :direction :output :if-exists :supersede) (write (getpid) :stream stream)) #+debug(print (list :remote-host (socket:remote-host socket-stream) :pid client-pid)) (setf (gethash socket-stream *client-processes*) (cons 0 ;; (socket:remote-host socket-stream) ;; fixme: sometimes returns 127.0.0.1 client-pid)) #+debug(print "connection accepted.") (incf *server-connections*) ;; spawn a new connection thread (print "Spawning new thread") #+openmcl (cl-user::process-run-function (format nil "morph-server-~d" (1+ proc-id) #+ignore *server-connections*) (lambda () (start-server socket-server (1+ proc-id)))) #+allegro (mp::process-run-function (format nil "morph-server-~d" (1+ proc-id)) (lambda () (start-server socket-server (1+ proc-id)))) (print "... new thread spawned") (unwind-protect (loop do (setf sentence (read-line socket-stream nil nil)) (cond ((null sentence) nil) ((zerop (length sentence)) (write-string "" socket-stream)) ((eql 0 (search "tokenize" sentence)) (write-string (handler-case (norgram-tokenize-sentence (utf-8-decode (subseq sentence 9 #+obsolete 10)) :preparse-p nil #+obsolete(char= (char sentence 9) #\y) :client (gethash socket-stream *client-processes*)) (storage-condition (condition) (declare (ignore condition)) #+:allegro (excl:exit 1 :no-unwind t)) (error (condition) (format t "error in norgram-tokenize-sentence(): ~a" condition) "")) socket-stream)) (t (write-string (handler-case (norgram-tag-word (utf-8-decode sentence) :client (gethash socket-stream *client-processes*)) (storage-condition (condition) (declare (ignore condition)) #+:allegro (excl:exit 1 :no-unwind t)) (error (condition) (format t "error in norgram-tag-word(): ~a" condition) "")) socket-stream))) #+openmcl (ccl::stream-force-output socket-stream) #+allegro (socket::force-output socket-stream) while (and sentence (> (length sentence) 0))) (close socket-stream))))) (defun find-free-port-and-make-socket (&optional (start 2002) (end 7000)) (loop for port from start to end for socket = (ignore-errors (socket:make-socket :connect :passive ;;:local-host "localhost" :local-port port #+allegro :reuse-address #+allegro t)) when socket do (return socket))) #+(or openmcl allegro) (defun cl-user::start-server-processes (&optional port) ;;(print :build-mwe-list) (build-mwe-list) (build-compound-table) (let ((socket-server (if port (socket:make-socket :connect :passive ;; :local-host "localhost" :local-port port #+allegro :reuse-address #+allegro t) (find-free-port-and-make-socket)))) (setf *socket-server* socket-server) #+debug(print (list :socket-server socket-server :port port)) #+openmcl (cl-user::process-run-function "morph-server" (lambda () (start-server socket-server))) #+allegro (mp::process-run-function "morph-server" (lambda () (start-server socket-server))) ;; shutdown morph-server when parent process goes down #+allegro (mp::process-run-function "check-ppid" (lambda () (loop with ppid = (getppid) while (= ppid (getppid)) do (sleep 10) finally (print :exiting) (cl-user::exit)))) #+debug(print (list :opening-pid-file (excl::getuid) (getppid) (print (socket:local-port socket-server)))) (with-open-file (stream (format nil "/tmp/cgp.socket.~s.~s" (excl::getuid) (getppid)) ;; not sure why parent pid is necessary :direction :output :if-exists :supersede) (write (socket:local-port socket-server) :stream stream)) socket-server)) #+test (cl-user::start-server-processes) #+dumplisp (setf cl-user::*restart-app-function* 'cl-user::start-server-processes) #+test (print (socket:local-port (find-free-port-and-make-socket))) #+test ;; openmcl (socket:shutdown *socket-server*) #+test ;; allegro (close *socket-server*) #+test (start-server (socket:make-socket :connect :passive ;;:local-host "localhost" :local-port 2001)) #+test (socket:socket-server-close *socket-server*) ;; (open-shared-library "/home/paul/lisp/projects/xle/morph/morph-sockets-lib.dylib") ;; (external-call "_initialize" :void) ;; build clisp image #+test (ext:saveinitmem "/home/paul/lisp/projects/xle/morph/morph-server.mem" :quiet t :init-function #'morph-server::start-server) ;; load as:> clisp -K full -M /home/paul/lisp/projects/xle/morph/morph-server.mem ;; count lemmas in bm-morph #+test (let ((tree (dat::make-string-tree))) (with-file-lines (line "projects:xle;morph;bm-morph.txt") (destructuring-bind (form lemma code) (split line #\:) (incf (dat:string-tree-get tree lemma 0)))) (print (car tree))) :eof