;;;   -*- Mode: LISP; Package: MORPH-SERVER; BASE: 10; Syntax: ANSI-Common-Lisp -*-

#||
MacOSX:

do in xledir/lib: ln -s ../bin/sp-3.11.0/libsprt311.dylib .

dynamic lib: this cannot be loaded with the RTLD_LOCAL option in dlopen().

gcc -dynamiclib -Wall -o morph-sockets-lib.dylib morph-sockets-lib.c \
    -L/usr/local/xledir/lib/ -lxle -lsprt311 -install_name ./morph-sockets-lib.dylib

Instead use the -bundle option:

(cl-user::run-shell-command
 (concatenate 'string "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/token-sockets-lib.dylib ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore; "
	 "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.dylib ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore"))

OBS: extension should be .bundle

Linux:

gcc -Bsymbolic -shared morph-sockets-lib.c -fPIC -L/usr/local/xledir/lib/ -lxlecore -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.so;
gcc -Bsymbolic -shared token-sockets-lib.c -fPIC -L/usr/local/xledir/lib/ -lxlecore -o $PARGRAM/norwegian/bokmal/token-sockets-lib.so


---------------------

Main functions and objects:

norgram-tokenize-sentence(), calls cgp::named-entity-tokenize()
norgram-tag-word(), calls norgram-morphology-regexp()

*segments* : (misnomer?) a table which stores tokens and compound segments together with their analyses; acts as a link between the tokenizer and morphology modules

*segments* is reinitialized in norgram-tokenize-sentence(). This is not thread-safe (fix!).

||#

;; generate Linux libs
#+test
(cl-user::run-shell-command
   "gcc -m32 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -o $XLE/pargram/norwegian/bokmal/morph-sockets-lib-linux2.3.so;
gcc -m32 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -o $XLE/pargram/norwegian/bokmal/token-sockets-lib-linux2.3.so")

;; 64 bit
#+test
(cl-user::run-shell-command
   "gcc -m64 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/linux.x86.64/ -lxlecore -o $XLE/pargram/norwegian/bokmal/morph-sockets-lib-linux2.3_64bit.so;
gcc -m64 -fPIC -Bsymbolic -shared ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/linux.x86.64/ -lxlecore -o $XLE/pargram/norwegian/bokmal/token-sockets-lib-linux2.3_64bit.so")

;; generate MacOSX libs
#+test
(cl-user::run-shell-command
 (concatenate 'string "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/token-sockets-lib.dylib ~/lisp/projects/xle/morph/token-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore; "
	 "gcc -Wall -bundle -o $PARGRAM/norwegian/bokmal/morph-sockets-lib.dylib ~/lisp/projects/xle/morph/morph-sockets-lib.c -L/usr/local/xledir/lib/ -lxlecore -flat_namespace"))

;; Bug: laksefiskeboller (??)


;; Problems:

;; Make sure that NE components get same analysis as the isolated words
;; unknown proper nouns get always +Masc 
;; Per Hansen doesn't work sentence-initially

;; TO DO:

;; add final -e+- to compound analyser (for cases like bjørne- og ulvestammen)


(in-package :morph-server)

(eval-when (:compile-toplevel :load-toplevel :execute)
  (defun getenv (var)
    #+allegro(system:getenv var)
    #+sbcl(sb-sys::posix-getenv var)))

(defvar *bm* nil)

#-orig
(setf *bm* (string-net::read-net "projects:xle;morph;bm-morph.net" :translate-p nil))
#+new
(setf *bm* (string-net::read-net "projects:xle;morph;bm-morph-clumps.net" :translate-p nil))

#+(or allegro clisp)
(defun get-norgram-fst-morphology (word)
  (with-open-file (stream "/home/paul/lisp/projects/xle/fst/nets/norgram.fst"
			  :direction :output :if-exists :supersede)
    (write-line "load stack < bok-ana-final-ron2.fst" stream)
    (format stream "up ~a~%" word))
  (cl-user::run-shell-command
   "cd /home/paul/lisp/projects/xle/fst/nets; ../fst -f norgram.fst > morph.out")
  (collecting
   (with-file-lines (line "/home/paul/lisp/projects/xle/fst/nets/morph.out")
     (unless (or (string= line "bye.") (search "'bok-ana-" line))
       (collect line)))))

#+(or allegro clisp)
(defun get-norgram-fst-tokens (sentence)
  (with-open-file (stream "/home/paul/lisp/projects/xle/fst/nets/norgram.fst"
			  :direction :output :if-exists :supersede)
    (write-line "load stack < parse.tok.fst" stream)
    (format stream "up ~a~%" sentence))
  (cl-user::run-shell-command
   "cd /home/paul/lisp/projects/xle/fst/nets; ../fst -f norgram.fst > morph.out")
  (collecting
   (with-file-lines (line "/home/paul/lisp/projects/xle/fst/nets/morph.out")
     (unless (or (string= line "bye.") (search "'bok-ana-" line))
       (collect line)))))

#+(or allegro clisp)
(defun fst-tokenize (word)
  (with-open-file (stream "/home/paul/lisp/projects/xle/morph/norgram.fst"
			  :direction :output :if-exists :supersede)
    (write-line "load stack < cpsplit.fst" ;; "load stack < parse.tok.fst"
		stream)
    (format stream "up ~a~%" word))
  (cl-user::run-shell-command
   "cd /home/paul/lisp/projects/xle/morph; ../fst//fst -f norgram.fst > morph.out")
  (collecting
   (with-file-lines (line "/home/paul/lisp/projects/xle/morph/morph.out")
     (unless (or (string= line "bye.") (search "'cpsplit" line))
       (collect line)))))

#+test
(print (fst-tokenize "[Turen|turen][er][nok][lettest][Ã¥][gjennomfÃ¸re][pÃ¥][ettersommeren][,|, ,][siden][det][til][andre][tider][av][Ã¥ret][kan][vÃ¦re][en del|en del|en`del][snÃ¸][i][den][bratte][renna][mellom][Fjedevatnet][og][skaret][under][BlÃ¥bretinden.]"))
#+test
(print (fst-tokenize "(Per|^per)(spiser)(og)(dyrker)(bananer , .|bananer .|bananer. .|bananer.)"))
#+test
(print (get-norgram-fst-tokens "Fra Molladalen kan du gÃ¥ sÃ¸rover Molladalsskardet; til 'Ytre Standal'."))
#+test
(print (get-norgram-fst-tokens "Mannen gikk opp ned og har sett seg selv i speilet."))

#+test
(print (get-norgram-fst-morphology "seg selv"))
#+test
(print (print-norgram-morphology "regnet"))
#+test
(print (print-norgram-morphology "seg selv"))

#+test
(string-net::nmap-string-values *bm* "spis" (lambda (v) (print v)) #\: nil nil nil)

(defparameter *bm-morph-feature-vector*
  #("Noun" "Prop" "Adj" "Verb" "Aux" "Adv" "Prep" "Interj" "Conj" "Pron" "DetPron" "Ord" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Sent" "Num" "Card"
    "Pos" "Comp" "Sup"
    "PastPart" "PresPart"
    "Masc" "Fem" "Neut" "MF" "MFN"
    "Def" "Indef" "NoDef" "Def/Pl"
    "Sg" "Pl" "SP"
    "Nom" "Acc" "Gen"
    "Pres" "Past" "Impv" "Infin" "SForm"
    "Digit" "Unknown"))

(defun string-to-feature-string (string)
  (declare (optimize (speed 3) (safety 0)))
  (let ((feature-string ""))
    (loop for c across string
          for i fixnum from 0
          do (let ((byte (the fixnum (char-code c))))
               (dotimes (j 8)
                 (declare (fixnum j))
                 (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j)))))
		   (setf feature-string
			 (concat feature-string "+"
				 (aref *bm-morph-feature-vector*
				       (the fixnum (+ (the fixnum (* i 8)) j)))))))))
    feature-string))

#-orig ;; disable for clumps!
(defun decode-reading (word reading)
  (destructuring-bind (compressed-lemma coded-features)
      (split reading #\: 2)
    (list (decompress-string compressed-lemma word)
	  (string-to-feature-string coded-features))))

;; enable for clumps!
#+test
(defun decode-reading (word reading)
  (let* ((colon-pos (position #\: reading :from-end t))	;; have to do it like this bc coded-features might contain #\:
	 (clump-id (subseq reading (1+ colon-pos)))
	 (clump-id (if (string= clump-id "") nil clump-id)))
    (destructuring-bind (compressed-lemma coded-features)
	(split (subseq reading 0 colon-pos) #\: 2)
      (list (if clump-id
		(concat (decompress-string compressed-lemma word) "+" clump-id)
		(decompress-string compressed-lemma word))
	    (string-to-feature-string coded-features)))))

(defparameter *segments-table* (make-hash-table :test #'equal))

(defun print-norgram-morphology (word &key (stream *standard-output*) (divider #\newline) (encoding :none))
  (string-net::nmap-string-values
   *bm* word (lambda (reading)
	       (format stream "~{~a~}~c"
		       (ecase encoding
			 (:none
			  (decode-reading word reading))
			 (:utf-8
			  (mapcar #'utf-8-encode (decode-reading word reading))))
		       divider))
   #\: nil nil nil))

(defparameter +quote-chars+
  (mapcar #'code-char (list 34 39
			    #x00ab #x00bb
			    #x2018 #x2019 #x201a #x201b #x201c #x201d #x201e #x201f
			    #x2033 #x2037 #x2039 #x203a)))

(defun strip-emphase-diacritics (word)
  (let ((emphase-diacritics (mapcar #'code-char '(225 233 237 243 250 253 226 234 238 244 251 224 232 236 242 249)))
	(emphase-diacritics-stripped "aeiouyaeiouyaeiouy"))
    (when (find-if (lambda (c) (find c emphase-diacritics)) word)
      (let ((stripped (copy-seq word)))
	(loop for c across stripped for i from 0
	      do (let ((pos (position c emphase-diacritics)))
		   (when pos (setf (char stripped i) (char emphase-diacritics-stripped pos)))))
	stripped))))

(defun norgram-morphology-regexp (word segments &key segment-type pos heap regexp add-+token-p)
  #+debug(print (list :word word :segment-type segment-type))
  #+debug(dat::do-string-tree (str val segments)
	   (print (list :word word :str str :val val)))
  (let* ((segment (cond ((zerop (length word))
			 (warn "word has length zero.")
			 nil)
			((or (char= (char word 0) #\+)
			     (char= (char word (1- (length word))) #\+))
			 (string-trim "+-" word))
			((char= (char word (1- (length word))) #\-)
			 (string-right-trim "-" word))
			((find #\_ word)
			 word)
			(segment-type
			 word)
			((find :named-entity (car (dat:string-tree-get segments word)))
			 word)
			((find :named-entity-gen (car (dat:string-tree-get segments word)))
			 word)
			(t
			 nil)))
	 (morph-list
	  (cond (segment
		 (destructuring-bind (str &optional id) (split segment #\\)
		   (let* ((segment-analyses (dat:string-tree-get segments str))
			  (segment-analysis (if id
						(nth (- (length segment-analyses) (parse-integer id) 1)
						     segment-analyses)
						(car segment-analyses))))
		     #+debug(print (list :segment segment :segments (dat:string-tree-get segments str)))
		     #+debug(print (list :segment-analysis segment-analysis))
		     (list (list (car segment-analysis)
				 (cond ((find :e-juncture segment-analysis)
					"+EJuncture")
				       ((find :s-juncture segment-analysis)
					"+SJuncture")
				       ((find :hyphen segment-analysis)
					"+Punct+Hyphen")
				       ((find :named-entity segment-analysis)
					"+Prop+Indef")
				       ((find :named-entity-gen segment-analysis)
					"+Prop+Indef+Gen")
				       ((find #\_ (car segment-analysis))
					(format nil "~{~a~}"
						(cdr segment-analysis)))
				       (t
					(format nil "~a~{+~a~}"
						(if (or (char= (char word (1- (length word))) #\+)
							(char= (char word (1- (length word))) #\-)
							(find segment-type
							      '(:first :first-hyphen :middle :middle-hyphen :last-hyphen)))
						    "+Cmpel1" "+Cmpel2")
						(cdr segment-analysis)))))))))
		;; multiword expression (MWE)
		((find #\` word)
		 (list (list (substitute #\Space #\` word) "+MWToken+Prefer")))
		;; missing in the lexicon
		((find word '("." "!" "?") :test #'string=)
		 (list (list word "+Punct+Sent")))
		((find word '(",") :test #'string=)
		 (list (list word "+Punct+Comma")))
		((find word '("(" ")" "[" "]" "{" "}") :test #'string=)
		 (list (list word "+Punct+Paren")))
		((find word '("-" "/") :test #'string=)
		 (list (list word "+Punct+Hyphen")))
		((and (= (length word) 1) (find (char word 0) +quote-chars+))
		 (list (list "'" "+Punct+Quote")))
		((string= word "km")
		 (list (list "kilometer" "+Meas+Masc+Abbr")))
		;; different in lexicon!!
		((find word '("oppover" "herfra" "ned") :test #'string=)
		 (list (list word "+Adv")))
		((find word '("til") :test #'string=)
		 (list (list word "+Prep") (list word "+Adv")))
		((find word '("det") :test #'string=)
		 (list (list "den" "+DetPron+Neut+Sg") (list word "+Pron+Neut+Acc") (list word "+Pron+Neut+Nom")))
		
		((find word '("mange") :test #'string=)
		 (list (list word "+DetPron+MFN+Pl") (list word "+DetPron+Masc+Sg") (list "mang" "+Adj+Pos+MFN+Def/Pl")))
		((find word '("jeg") :test #'string=)
		 (list (list word "+Pron+MFN+Nom")))
		;; numbers
		((and (not (find-if-not (lambda (c) (find c "1234567890-.")) word))
		      (char= (last-char word) #\.)) 
		 (list (list word "+Dig+Ord")))
		((not (find-if-not (lambda (c) (find c "1234567890-")) word))
		 (list (list word "+Dig")))
		((and (not (find-if-not (lambda (c) (find c "1234567890.-")) word))
		      (not (char= (last-char word) #\.))
		      (not (char= (char word 0) #\.)))
		 (list (list (remove #\. word) "+Dig")))
		((and (> (length word) 1)
		      (not (find-if-not (lambda (c) (find c "1234567890")) word :end (1- (length word)))))
		 (list (list word "+Prop+Indef+Numlit")))
		;; fallback for fragment analysis
		((let ((segment (or (dat:string-tree-get segments word)
				    (dat:string-tree-get segments (concat word "\\0")))))
		   #+debug(print (list :word word :analysis segment))
		   (and segment (car (split word #\\)))) ;; sic! car() was cdr()
		 (destructuring-bind (str &optional id) (split word #\\)
		   #+debug(print (list :str str :id id))
		   (let ((analyses (or (dat:string-tree-get segments word)
				       (dat:string-tree-get segments (concat word "\\0")))))
		     #+debug(print (list :analysis analyses)) ;; :full-analysis full-analysis)) 
		     (collecting
		       (dolist (analysis analyses)
			 (collect (list (car analysis)
					(format nil
						(if (and (stringp (cadr analysis))
							 (char= (char (cadr analysis) 0) #\+)) ;; hack!!
						    "~{~a~}"
						    "~{+~a~}")
						(cdr analysis)))))))))
		(t
		 (let ((readings (or (collecting
				       (string-net::nmap-string-values
					*bm* word (lambda (reading)
						    (collect
						     (decode-reading word reading)))
					#\: nil nil nil))
				     (let ((stripped (strip-emphase-diacritics word)))
				       (and stripped
					    (collecting
					      (string-net::nmap-string-values
					       *bm* stripped (lambda (reading)
							       (collect
								(decode-reading word reading)))
					       #\: nil nil nil)))))))
		   ;; fix for alle+Noun
		   (when (string= word "alle")
		     (setf readings (delete "alle" readings :key #'car :test #'string=)))
		   #+debug(print (list :word word :readings readings))
		   (or readings
		       (list #+ignore(list word "+Unk")
			     (list word "+Token"))))))))
    (when (and add-+token-p
	       (not (find (list word "+Token") morph-list :test #'equal)))
      (setf morph-list (cons (list word "+Token") morph-list)))
    #+debug(print (list* :word word :morph-list morph-list))
    (labels ((build-regexp (morph-list)
	       (if (cdr morph-list)
		   (if heap
		       (build-morph-or-re heap
					  (morph-regexp (car morph-list))
					  (build-regexp (cdr morph-list))
					  regexp)
		       (list 'or (morph-regexp (car morph-list))
			     (build-regexp (cdr morph-list))))
		   (morph-regexp (car morph-list) regexp)))
	     (morph-regexp (morph &optional regexp)
	       (collecting-into (lower upper positions)
		 (destructuring-bind (lemma features) morph
		   (let* ((fl (u::string-parse features :left-separating-chars "+"))
			  #+kbn
			  (nounp (find "+Noun" fl :test #'equal))
			  (wl (length word))
			  (ll (length lemma)))
		     (when (and heap (find segment-type '(:middle :middle-hyphen :last :last-hyphen)))
		       (collect-into lower "")
		       (collect-into upper "+")
		       (collect-into positions 0))
		     (loop for i from 0
			until (and (>= i (if heap (max wl ll) ll)) (null fl))
			do
			(collect-into lower
				      (cond ((< i ll)
					     (string (char lemma i)))
					    (fl
					     (pop fl))
					    (t
					     "")))
			(collect-into upper
				      (if (< i wl) (string (char word i)) ""))
			(collect-into positions
				      (if (and pos (< i wl)) (+ pos i) 0)))
		     
		     #+kbn
		     (when nounp
		       (collect-into lower "+Econ")
		       (collect-into upper "")
		       (collect-into positions 0))
		     (when (and heap (find segment-type '(:first :first-hyphen :middle :middle-hyphen)))
		       (collect-into lower "")
		       (collect-into upper "+")
		       (collect-into positions 0))))
		 #+debug(print (list upper lower))
		 (if heap
		     (build-morph-seq-re heap lower upper positions regexp)
		     (build-morph-seq lower upper positions))))
	     (escape (str)
	       (subst-substrings str '("(" "{" ")" "}")))
	     (build-morph-seq (lower upper positions)
	       (if (cdr lower)
		   (list 'seq (list 'pair (escape (car lower)) (escape (car lower)) (car positions))
			 (build-morph-seq (cdr lower) (cdr upper) (cdr positions)))
		   (list 'pair (escape (car lower)) (escape (car lower)) (car positions)))))
      #+debug(print (list :morph-list morph-list))
      (if heap
	  (build-regexp morph-list)
	  (values (if (and (null (cdr morph-list))
			   (equal (cadar morph-list) ""))
		      ""
		      (build-regexp morph-list))
		  morph-list)))))

(defun norgram-tag-word (word &key client)
  #+debug(print (list :tag-word word :client client))
  (let* ((segments (gethash client *segments-table*))
	 (colon-pos (position #\: word))
	 (plusp (char= (char word 0) #\+))
	 (pos (when colon-pos (parse-integer word :start (if plusp 1 0) :end colon-pos :junk-allowed t)))
	 (word (if (and colon-pos (> (length word) 1)) (subseq word (1+ colon-pos)) word))
	 (word (if (and colon-pos plusp) (concat "+" word) word))
	 (*package* (find-package :morph-server)))
    #+debug(print (list :word word :pos pos))
    (with-output-to-string (stream)
      (cond ((char= (char word 0) #\^)
	     (write (norgram-morphology-regexp (subseq word 2) segments :pos pos)
		    :stream stream :pretty nil :level nil))
	    (t
	     (write (norgram-morphology-regexp word segments :pos pos)
		    :stream stream :pretty nil :level nil))))))

(defparameter *aux-lex-net*
  (let ((string-net (make-instance 'string-net::list-string-net))
	(*print-circle* t))
    (setf (list-net string-net)
	  (cadr '(x #1=((#\1 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\2 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\3 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\4 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\5 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\6 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\7 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\8 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\9 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)
			(#\0 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#.(code-char 128) NIL)))))))) . #1#)))))
    (list-net string-net)))

(progn
  (defparameter *bm-analyser-lexicon*
    (let ((mfv *bm-morph-feature-vector*))
      (make-instance
       'compound-analyser-lexicon
       :morph-feature-vector mfv
       :morph-feature-table (let ((mft (dat::make-string-tree)))
			      (loop for i from 0 for f across mfv
				    do (setf (dat:string-tree-get mft (string-downcase f)) i))
			      mft)
       :fullform-net *bm*
       :aux-net *aux-lex-net*
       :unknown-feature 'Unknown
       :special-readings-fn
       (lambda (chunk)
	 (cond ((not (find-if-not (lambda (c) (find c "1234567890")) chunk))
		(list (list chunk chunk (morph::bv-encode-features *analyser-lexicon* "Dig")))))) 
       :package :morph-server
       :compound-regexp
       (make-instance
	'fsa::feature-regexp
	:name "Regexp for compound analyser"
	:source-regexp
	`(:or
	  (:seq
	   ;; first chunk
	   (:or
	    ;; Unknown
	    (:and Adj Pos)
	    (:and Prop (:not Gen))
	    (:and Noun Indef (:or Sg SP) (:not Gen))
	    (:and Verb Infin (:not SForm))
	    (:and Verb Impv (:not SForm))
	    #+not-yet(:and det kvant)
	    (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::immediate-pre-s-juncture)
		  :s-juncture (:? :hyphen))
	    (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture)
		  :e-juncture (:? :hyphen))
	    (:seq (:and Prop (:not Gen) ,morph::immediate-pre-s-juncture)
		  :s-juncture (:? :hyphen))
	    (:seq (:and Prop (:not Gen) ,morph::pre-e-juncture)
		  :e-juncture (:? :hyphen))
	    (:seq (:or (:and Noun Indef (:or Sg SP) (:not Gen))	;; Digit
		       )
		  :hyphen))
	   ;; inner chunks
	   (:* (:or
		(:and Noun Indef (:or Sg SP) (:not Gen))
		(:and Verb Infin (:not SForm))
		(:and Verb Impv (:not SForm))
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture)
		      :s-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture)
		      :e-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen))
		      :hyphen)))
	   ;; last chunk
	   (:or Noun Adj PastPart :hyphen)) ;; why hyphen?
	  
	  (:seq
	   ;; first chunk
	   (:or
	    (:seq Digit :hyphen))
	   ;; inner chunks
	   (:* (:or
		(:and Noun Indef (:or Sg SP) (:not Gen))
		(:and Verb Infin (:not SForm))
		(:and Verb Impv (:not SForm))
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture)
		      :s-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture)
		      :e-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen))
		      :hyphen)))
	   ;; last chunk
	   (:or Noun))))
	:hyphenated-compound-regexp
	(make-instance
	 'fsa::feature-regexp
	 :name "Regexp for hyphenated compounds"
	 :source-regexp
	 `(:seq
	   ;; first chunk
	   (:or
	    (:and Adj Pos #+ignore(:or "lav" "høy" "høg" (:not Neut)))
	    (:seq (:and Noun Indef (:or Sg SP) (:not Gen)) (:? :hyphen))
	    (:and Verb Infin (:not SForm))
	    (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::immediate-pre-s-juncture)
	     :s-juncture (:? :hyphen))
	    (:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture)
	     :e-juncture (:? :hyphen)))
	   ;; inner chunks
	   (:* (:or
		(:and Noun Indef (:or Sg SP) (:not Gen))
		(:and Verb Infin (:not SForm))
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-s-juncture)
		      :s-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen) ,morph::pre-e-juncture)
		      :e-juncture)
		(:seq (:and Noun Indef (:or Sg SP) (:not Gen))
		      :hyphen)))
	   ;; last chunk is hyphen; not handled here
	   
	   ;; number ;; Problem: det kvant has no counterpart in norgram
	   #+not-yet
	   (:seq
	    (:+ (:and det kvant))
	    (:? (:seq (:and "og" adv)
		      (:and det kvant)))))))))
  (setf *analyser-lexicon* *bm-analyser-lexicon*))

#||

(print-norgram-morphology "lavt") 

(print (norgram-tokenize-sentence "Hvis pasienten ikke har puls må du i tillegg sette i gang med hjertekompresjon." :preparse-p nil))

(print (norgram-tokenize-sentence "Preposisjonsfrasetilknytningen er ikke normalisert." :preparse-p nil))

(print (norgram-tokenize-sentence "nasjonalparkgrensen" :preparse-p nil))
(print (norgram-tokenize-sentence "pensjons-" :preparse-p nil))
(print (norgram-tokenize-sentence "pensjons- og lÃ¸nnsressursene" :preparse-p nil))
(print (norgram-tokenize-sentence "ulvebjÃ¸rne-" :preparse-p nil))
(print (norgram-tokenize-sentence "kattekjÃ¸tt-" :preparse-p nil))
(print (norgram-tokenize-sentence "bjÃ¸rne- og ulvestammen" :preparse-p nil))
(print (norgram-tokenize-sentence "bjÃ¸rnekjÃ¸tt- og fleskebÃ¥ten" :preparse-p nil))
(print (norgram-tokenize-sentence "flyvehjelp" :preparse-p nil))
(print (norgram-tokenize-sentence "usiktbart" :preparse-p nil))
(print (norgram-tokenize-sentence "bjÃ¸rne-" :preparse-p nil))

(print (norgram-tokenize-sentence "Fisker fisker appelsingravet eller gravet laks." :preparse-p t))

(print (norgram-tokenize-sentence "Per spiser bÃ¸ker i Kvarvenveien i sentrum." :preparse-p nil))

(print (norgram-tokenize-sentence "Nesten uansett vÃ¦r- og fÃ¸reforhold er det mulig Ã¥ gÃ¥ pÃ¥ tur i disse traktene." :preparse-p nil))
(print (norgram-tokenize-sentence "Jeg sÃ¥ en jente og en jente i skogen." :preparse-p nil))
(print (norgram-tokenize-sentence "Jeg var pÃ¥ Universitetet i Ljubljana i gÃ¥r." :preparse-p nil))
(print (norgram-tokenize-sentence "Men BjÃ¸rnefloden er her." :preparse-p nil))
(print (norgram-tokenize-sentence "kjøp heste-kjerrer." :preparse-p nil))

(print (norgram-tokenize-sentence "a(b)d[c]r {q}r" :preparse-p nil))

||#

;; generate list of MWEs
(defparameter *mwe-list* nil)

(defun build-mwe-list ()
  (setf *mwe-list* nil)
  (let* ((mwe-dir (getenv "MWEDIR"))
	 (mwe-dir (cond ((and mwe-dir (not (equal mwe-dir "")))
			 mwe-dir)
			((let ((logonroot (getenv "LOGONROOT")))
			   (and logonroot (not (equal logonroot ""))))
			 (concat (getenv "LOGONROOT") "/parc/pargram/norwegian/bokmal/"))
			((let ((pargram (getenv "PARGRAM")))
			   (and pargram (not (equal pargram ""))))
			 (concat (getenv "PARGRAM") "/norwegian/bokmal/")))))
    (setf *mwe-list*
	  (collecting
	    (dolist (file '("bokmal-lex-mrs.lfg" "bokmal-mwe.lfg" "bokmal-nkllex.lfg" "bokmal-nklvrblex.lfg"))
	      (with-file-lines (line (concat mwe-dir file) :external-format :iso-8859-1)
		(when (and (find #\` line)
			   (not (find (char line 0) '(#\Space #\Tab #\"))))
		  (let ((mwe-end (loop for i from 0 for c across line with prev-c
				       until (and (find c '(#\Space #\Tab))
						  (char/= prev-c #\`))
				       do (setf prev-c c)
				       finally (return i))))
		    (collect (mapcar (lambda (w) (string-trim "`" w)) (split (subseq line 0 mwe-end) #\Space))))))))))
  #+debug(print *mwe-list*))

;;#+test
(build-mwe-list)

(defun mwe-list (token tokens)
  ;;(print (list :token token :tokens tokens))
  (let ((match ()))
    (dolist (mwe-list *mwe-list*)
      ;;(print (list (car mwe-list) token))
      (when (string-equal (car mwe-list) token)
	(labels ((match-p (tokens tail)
		   (cond ((null tail)
			  t)
			 ((null tokens)
			  nil)
			 ((string-equal (cadar tokens) (car tail))
			  (match-p (cdr tokens) (cdr tail)))
			 (t
			  nil))))
	  (when (and (match-p tokens (cdr mwe-list))
		     (> (length mwe-list) (length match)))
	    (setf match
		  (loop for token in tokens for mwe in (cdr mwe-list)
		       collect (list (cadr token) (car token)))
		  #+old
		  (cdr mwe-list))))))
    #+debug(print (list :token token :tokens tokens :match match))
    match))

#+orig
(defun mwe-list (token tokens)
  ;;(print (list :token token :tokens tokens))
  (let ((match ()))
    (dolist (mwe-list *mwe-list*)
      (when (string-equal (car mwe-list) token)
	(labels ((match-p (tokens tail)
		   (cond ((null tail)
			  t)
			 ((null tokens)
			  nil)
			 ((string-equal (cadar tokens) (car tail))
			  (match-p (cdr tokens) (cdr tail)))
			 (t
			  nil))))
	  (when (and (match-p tokens (cdr mwe-list))
		     (> (length mwe-list) (length match)))
	    (setf match (cdr mwe-list))))))
    #+debug(print (list :token token :tokens tokens :match match))
    match))

#+test
(print (norgram-tokenize-sentence "Turen er nok lettest å gjennomføre på ettersommeren, siden det til andre tider av Ã¥ret kan vÃ¦re en del snÃ¸ i den bratte renna mellom Fjedevatnet og skaret under BlÃ¥bretinden."))

#+test
(print (cgp::named-entity-tokenize "Det er et utmerket sted å tilbringe både påske- og sommerferie."))
#+test
(print (cgp::named-entity-tokenize "Det er en vannbringebærfrukt."))
#+test
(print (cgp::named-entity-tokenize "Hunden bjeffer."))
#+test
(print (cgp::named-entity-tokenize "Vi jobber i Innovasjon Norge."))
#+test
(print (norgram-tokenize-sentence "Vi på Universitetet i Bergens institutter."))

#+test
(print (norgram-tokenize-sentence "M/B Bitihorn" :preparse-p nil))

#+test
(print (norgram-tokenize-sentence "Grønolen fjellgård, Beito, 50 senger, 8 leiligheter, tlf.: 61 35 29 90, e-post: gronolen@gronolen.no"
				  :preparse-p nil :heap t))

(defun norgram-tokenize-sentence (sentence &key preparse-p client heap segments)
  #+debug(print (list :sentence sentence :client client :preparse-p preparse-p))
  (if (equal sentence "")
      (unless heap "[]")
      (let* ((segments (or segments (dat::make-string-tree)))
	     (cgp::*separate-quotes-p* t)
	     (tokens (cgp::named-entity-tokenize sentence :preparse-p preparse-p)))
	(unless heap (setf (gethash client *segments-table*) segments))
	#+debug(print (list :sentence sentence :tokens tokens))
	(labels ((analyzed-token-regexp (disjunction-list)
		   (with-output-to-string (stream)
		     (loop for (dis . rest) on disjunction-list
			do (format stream (cond ((cadddr (cddr (caar dis))) ;; unanalyzed hyphenated compound (ugly cddrerrarardar!)
						 ;;"~{~a~^ ~}~{ ~a~}"
						 "~{~a~^ ~}")
						((cadddr (cdaar dis))
						 ;;"~{~a~^+~}-~{ ~a~}"
						 "~{~a~^+~}-")
						(t
						 ;;"~{~a~^+~}~{ ~a~}"
						 "~{~a~^+~}"))
				   (mapcar (lambda (segment)
					     (destructuring-bind (str pos id dcp &optional next-hyphen-p unanalized-hyphen-p) segment
					       (declare (ignore unanalized-hyphen-p))
					       (let ((length (when id (length (dat::string-tree-get segments str)))))
						 (if (and id (> length 1))
						     (format nil "~a\\~d" str id)
						     str))))
					   (car dis)))
			  (dolist (pair (cdr dis))
			    (format stream " ~a" (car pair)))
			when rest do (write-char #\| stream))))
		 (get-unambiguous-token (token)
		   (if (dat::string-tree-get segments token)
		       (loop for id from 1
			     while (dat::string-tree-get segments (format nil "~a\\~d" token id))
			     finally (return (progn ;;(print (list :uat (format nil "~a\\~d" token id)))
					       (format nil "~a\\~d" token id))))
		       token)))
	  (let* ((firstp nil)
		 (first-found-p nil)
		 ;; in case there is more than one token with the same value record the number of them seen here
		 (token-count ())
		 (prev-token ())
		 (full-token-list nil)
		 (analyzed-sentence-list
		  (collecting
		    #+debug(print tokens)
		    (loop while tokens
			  do (setf full-token-list (pop tokens))
			  #+debug(print (list :full-token-list full-token-list))
			  (when (cdr full-token-list)
			    (destructuring-bind (pos full-token &optional ne-marker norgram-features) full-token-list
			      (when (and (null first-found-p)
					 (find-if-not (lambda (c) (or (find c +quote-chars+)
								      (find c "-_()[].?+*/")))
						      full-token))
				(setf firstp t first-found-p t))
			      (let* ((upper-case-p (upper-case-p (char full-token 0)))
				     (prev-is-colon-p (equal ":" (cadr prev-token)))
				     (first-uppercase-p (and (or firstp prev-is-colon-p) upper-case-p))
				     (ne-list (unless (or (null ne-marker)
							  (find ne-marker '(:match-end :whole-match)))
						(loop for ne-token.ne-marker = (pop tokens)
						      collect (list (cadr ne-token.ne-marker) (car ne-token.ne-marker))
						      until (find (caddr ne-token.ne-marker)
								  '(:match-end :dig-match-end)))))
				     (next-is-.-p (find "." (cdar tokens) :test #'equal))
				     (next-is-hyphen-p (find "-" (cdar tokens) :test #'equal))
				     (last-token (if ne-list (caar (last ne-list)) full-token))
				     (last-char-is-.-p
				      (or (and (> (length last-token) 1)
					       (char= (char last-token (1- (length last-token))) #\.))
					  next-is-.-p))
				     (last-dot-pos (when last-char-is-.-p (caar tokens))) ;; ??
				     (token (if (and last-char-is-.-p (not next-is-.-p))
						(subseq full-token 0 (1- (length full-token)))
						full-token))
				     (unambiguous-token (if preparse-p (get-unambiguous-token token) token)))
				#+debug(print (list :last-char-is-.-p last-char-is-.-p
						    :next-is-.-p next-is-.-p 
						    :next-is-hyphen-p next-is-hyphen-p
						    :prev-is-colon-p prev-is-colon-p
						    :prev-token prev-token
						    :full-token full-token
						    :tokens tokens
						    :last-dot-pos last-dot-pos
						    :pos pos))
				(collect
				 (collecting
				   (cond ((find ne-marker '(:match-start :dig-match-start :whole-match))
					  #+debug(print (list :ne-token token full-token))
					  (when (and next-is-.-p (eq ne-marker :whole-match))
					    (setf full-token (concat full-token "."))
					    (setf first-found-p nil) ;; new 9.10.2006
					    (pop tokens))
					  (let ((next-is-.-p (find "." (car tokens) :test #'equal)))
					    #+debug(print (list :token token :full-token full-token :ne-list ne-list 
								:tokens tokens :next-is-.-p next-is-.-p))
					    (when next-is-.-p (pop tokens))
					    (cond ((eq ne-marker :whole-match)
						   nil)
						  (next-is-.-p
						   (collect-append
						    (segment-compound full-token
								      :punctuation
								      (list (append ne-list (list (list "." pos)))) ;; pos is still wrong!
								      :analyze-p nil
								      :end-hyphen-p next-is-hyphen-p
								      :segments segments
								      :pos pos :heap heap)))
						  (t
						   #+debug(print (list :ft full-token))
						   (collect-append
						    (segment-compound full-token
								      :punctuation (list ne-list) ;; fix pos!
								      :end-hyphen-p next-is-hyphen-p
								      :analyze-p nil
								      :pos pos :heap heap))))
					    (let ((ne (format nil (if (eq ne-marker :dig-match-start) "~a~{~a~}" "~a~{_~a~}")
							      token (mapcar #'car ne-list))))
					      #+debug(print (list :ne ne :ne-list ne-list :norgram-features norgram-features
								  :last-char-is-.-p last-char-is-.-p :next-is-.-p next-is-.-p))
					      (dolist (reading norgram-features)
						(push (if (find "+Gen" (cdr reading) :test #'string=)
							  (let ((no-genitive
								 (cond ((string= ne "'s" :start1 (- (length ne) 2))
									(subseq ne 0 (- (length ne) 2)))
								       ((string= ne "s" :start1 (- (length ne) 1))
									(subseq ne 0 (- (length ne) 1)))
								       ((string= ne "'" :start1 (- (length ne) 1))
									(subseq ne 0 (- (length ne) 1)))
								       (t
									ne))))
							    (list no-genitive #+ignore (car reading) (cadr reading) :named-entity-gen #+ignore "+Noun" "+Indef" "+Gen"))
							  (list ne #+ignore(car reading) (cadr reading) :named-entity #+ignore "+Noun" "+Indef"))
						      (dat::string-tree-get segments ne)))
					      (collect-append
					       (segment-compound ne
								 :punctuation
								 (when (or last-char-is-.-p next-is-.-p)
								   (list (list (list "." pos)))) ;; pos is still wrong!
								 :end-hyphen-p next-is-hyphen-p
								 :analyze-p (and (eq ne-marker :whole-match)
										 first-uppercase-p) ;;nil ;; new 27.10.2006
								 :analyze-compound-name-p (eq ne-marker :whole-match)
								 :segments segments
								 :pos pos :heap heap)))))
					 (t
					  (when (and next-is-.-p #+test(not last-char-is-.-p))
					    (setf full-token (concat full-token "."))
					    (setf first-found-p nil) ;; new 9.10.2006
					    (pop tokens))
					  (let ((mwe-list (mwe-list token tokens)))
					    #+debug(print (list :mwe-list mwe-list :token token :tokens tokens :nf norgram-features))
					    (cond (mwe-list
						   (dotimes (i (length mwe-list)) (pop tokens))
						   (let* ((next-is-.-p (find "." (car tokens) :test #'equal))
							  (dot-pos (when next-is-.-p (caar tokens))))
						     (cond (next-is-.-p
							    (collect-append
							     (segment-compound full-token
									       :punctuation
									       (list (append mwe-list (list (list "." dot-pos))))
									       :end-hyphen-p next-is-hyphen-p
									       :analyze-p nil
									       :segments segments
									       :pos pos :heap heap))
							    (let ((downcased (string-downcase full-token)))
							      (unless (string= downcased full-token)
								(collect-append
								 (segment-compound downcased
										   :punctuation
										   (list (append mwe-list (list (list "." dot-pos))))
										   :end-hyphen-p next-is-hyphen-p
										   :analyze-p nil
										   :segments segments
										   :pos pos :heap heap))))
							    (let* ((mwe-list-copy (copy-tree mwe-list))	;; sic!
								   (last (last mwe-list-copy)))	
							      (setf (caar last) (concat (caar last) "."))
							      (collect-append
							       (segment-compound full-token
										 :punctuation
										 (list (append mwe-list-copy (list (list "." dot-pos)))
										       mwe-list-copy)
										 :end-hyphen-p next-is-hyphen-p
										 :analyze-p nil
										 :segments segments
										 :pos pos :heap heap))
							      (let ((downcased (string-downcase full-token)))
								(unless (string= downcased full-token)
								  (collect-append
								   (segment-compound downcased
										     :punctuation
										     (list (append mwe-list-copy (list (list "." dot-pos)))
											   mwe-list-copy)
										     :end-hyphen-p next-is-hyphen-p
										     :analyze-p nil
										     :segments segments
										     :pos pos :heap heap)))))
							    (pop tokens))
							   (t
							    (collect-append
							     (segment-compound full-token :punctuation (list mwe-list)
									       :segments segments
									       :pos pos :heap heap))
							    (collect-append ;; add test on downcase
							     (segment-compound (string-downcase full-token)
									       :punctuation (list mwe-list)
									       :downcased-p t
									       :end-hyphen-p next-is-hyphen-p
									       :segments segments
									       :pos pos :heap heap))))
						     (let ((mwe (format nil "~a~{`~a~}" (string-downcase full-token)
									(mapcar #'car mwe-list)))) ;; make more efficient, no consing!
						       #+debug(print (cons :norgram-features norgram-features))
						       (push (list mwe "+MWToken" "+Prefer")
							     (dat::string-tree-get segments mwe))
						       (if next-is-.-p
							   (collect `(((,mwe ,pos NIL NIL)) ("." ,dot-pos)))
							   (collect `(((,mwe ,pos NIL NIL))))))))
						  (last-char-is-.-p
						   #+debug(print (list :last-char-is-.-p last-char-is-.-p
								       :token token :full-token full-token
								       :unambiguous-token unambiguous-token))
						   (when preparse-p
						     ;; fallback
						     (dolist (reading norgram-features)	;; what about full-token?
						       (when (or (not first-uppercase-p)
								 (upper-case-p (char (car reading) 0)))
							 (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0"))))))
						
						   (let ((analysis-found-p nil))
						     (collect-append
						      (multiple-value-bind (analysis found-p)
							  (segment-compound unambiguous-token
									    :end-hyphen-p next-is-hyphen-p
									    :punctuation `((("." ,last-dot-pos)))
									    :segments segments
									    :pos pos :heap heap)
							(setf analysis-found-p found-p)
							analysis))
						     (collect-append
						      (multiple-value-bind (analysis found-p)
							  (segment-compound full-token
									    :punctuation (unless tokens
											   `((("." ,last-dot-pos)) #+avoid-*TOP*()))
									    :end-hyphen-p next-is-hyphen-p
									    :segments segments
									    :pos pos :heap heap)
							(setf analysis-found-p (or analysis-found-p found-p))
							analysis))
						     ;; try name+noun analysis 12.10.2006
						     (when (and (not analysis-found-p)
								(not first-uppercase-p)
								(string/= unambiguous-token (string-capitalize unambiguous-token)))
						       (collect-append
							(segment-compound unambiguous-token
									  :end-hyphen-p next-is-hyphen-p
									  :punctuation `((("." ,last-dot-pos)))
									  :segments segments :capitalize-p t
									  :pos pos :heap heap)))
						     (when first-uppercase-p
						       (when preparse-p
							 ;; fallback
							 (dolist (reading norgram-features)
							   (unless (upper-case-p (char (car reading) 0)) 
							     (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0"))))))
						       (collect-append ;; what about unambiguous?
							(segment-compound (string-downcase token)
									  :punctuation `((("." ,last-dot-pos)))
									  :downcased-p t
									  :end-hyphen-p next-is-hyphen-p
									  :segments segments
									  :pos pos :heap heap))
						       (collect-append
							(segment-compound (string-downcase full-token)
									  :punctuation `((("." ,last-dot-pos)) ())
									  :downcased-p t
									  :end-hyphen-p next-is-hyphen-p
									  :segments segments
									  :pos pos :heap heap)))))
						  ((find token '("," ":" ";" "!" "?") :test #'string=)
						   (collect-append
						    (segment-compound token :punctuation nil
								      :end-hyphen-p next-is-hyphen-p
								      :segments segments
								      :pos pos :heap heap)))
						  (t
						   (let ((analysis-found-p nil))
						     #+debug(print (list next-is-hyphen-p token))
						     (collect-append
						      (multiple-value-bind (analysis found-p)
							  (segment-compound unambiguous-token :punctuation nil
									    :segments segments
									    :end-hyphen-p next-is-hyphen-p
									    :pos pos :heap heap)
							(setf analysis-found-p found-p)
							analysis))
						     ;; try name+noun analysis 12.10.2006
						     (when (and (not analysis-found-p) ;; ??
								(not first-uppercase-p)
								(string/= unambiguous-token (string-capitalize unambiguous-token)))
						       (collect-append
							(multiple-value-bind (analysis found-p)
							    (segment-compound unambiguous-token :punctuation nil
									      :segments segments :capitalize-p t
									      :end-hyphen-p next-is-hyphen-p
									      :pos pos :heap heap)
							  (setf analysis-found-p found-p)
							  analysis)))
						     ;; fallback
						     (when (or preparse-p
							       (not analysis-found-p)
							       (and (eq analysis-found-p :compound)
								    firstp))
						       #+debug(print (list :norgram-features norgram-features))
						       (dolist (reading norgram-features)
							 (when (or ;;(not first-uppercase-p) *** what was that for??
								(upper-case-p (char (car reading) 0)))
							   #+debug(print (list :reading1 reading))
							   (push reading (dat::string-tree-get segments (concat unambiguous-token "\\0"))))))
						     #+debug(print (list :token token :unambiguous-token unambiguous-token
									 :analysis-found-p analysis-found-p
									 :firstp firstp :fup first-uppercase-p
									 :nf norgram-features))
						     (when first-uppercase-p
						       ;; fallback
						       (when (or preparse-p #+ignore(not analysis-found-p))
							 (dolist (reading norgram-features)
							   (unless (and #+ignore analysis-found-p (upper-case-p (char (car reading) 0))) 
							     (push reading (dat::string-tree-get segments (concat (string-downcase token) "\\0"))))))
						       (collect-append
							(segment-compound (string-downcase token) :punctuation nil :downcased-p t
									  :end-hyphen-p next-is-hyphen-p
									  :segments segments
									  :pos pos :heap heap)))))))))))
				(setf firstp nil))))
			  (setf prev-token full-token-list)))))
	    #+debug(format t "~&analyzed-sentence-list: ~s" analyzed-sentence-list)
	    #+debug(print segments)
	    #+debug
	    (dat:do-string-tree (string val segments)
	      (print (list string val)))
	    #+debug(format t "~%~{[~a]~}~%" (mapcar #'analyzed-token-regexp analyzed-sentence-list))
	    (if heap
		analyzed-sentence-list
		(values (format nil "~{[~a]~}" (mapcar #'analyzed-token-regexp analyzed-sentence-list))
			analyzed-sentence-list)))))))

#+test
(print (norgram-tokenize-sentence "Jeg ser Bjørnefloden." :preparse-p nil))


#+test
(print (norgram-tokenize-sentence "pensjons-" :preparse-p nil))


#+test
(print (get-norgram-fst-morphology "urd"))

#+test
(print (norgram-tokenize-sentence "Han ser seg selv." :preparse-p t))

#+test
(print (norgram-tokenize-sentence "motorsykkelhjulet" :preparse-p nil))
#+test
(print (norgram-tag-word "alle"))

#+test
(print (norgram-tokenize-sentence "1700-tallet"))

#+test
(print *segments*)

(defun fst-escape-string (string &optional dont-escape)
  (if dont-escape
      string
      (subst-substrings string '("[" "(" "]" ")")))) ;; preliminary!

(defparameter *compound-table* nil)

(defun build-compound-table ()
  (let* ((logonroot (getenv "LOGONROOT"))
	 (logonroot (if (or (null logonroot) (equal logonroot "")) "~/logon" logonroot))
	 (file (concat logonroot "/uio/nw/compounds/compounds.logon")))
    (when (not (probe-file file))
      (warn "file ~s could not be found." file)
      (return-from build-compound-table))
    (setf *compound-table* (dat::make-string-tree))
    (with-file-lines (line file)
      (let ((line (string-trim '(#\space #\tab) line)))
	(unless (or (zerop (length line))
		    (char= (char line 0) #\;))
	  (destructuring-bind (id compound segments &optional translation) (split line #\tab)
	    (declare (ignore id translation))
	    (setf (dat:string-tree-get *compound-table* compound)
		  (loop for (seg pos) on (split segments #\space) by #'cddr
		     ;; when (< (length seg) 3) do (format t "~%short segment: ~a~%" seg)
		     collect (cons seg (cond ((string= pos "subst")
					      '("Noun"))
					     ((string= pos "verb")
					      '("Verb"))
					     ((string= pos "adj")
					      '("Adj"))
					     ((string= pos "det") ;; ??
					      '("Num"))
					     (t
					      (error "pos ~s not recognized in line: ~s" pos line))))))))))
    ;; recursively replace segments which are compounds by their segmentation
    (let ((changedp nil))
      (labels ((replace-segments ()
		 (dat:do-string-tree (compound segments *compound-table*)
		   (loop for segment-tail on segments
			 do (let ((replacement (dat:string-tree-get *compound-table* (caar segment-tail))))
			      (when (and replacement
					 (equal (cadar segment-tail) (cadar (last replacement))))
				(setf changedp t
				      (car segment-tail) (car replacement)
				      (cdr segment-tail) (append (copy-seq (cdr replacement)) (cdr segment-tail)))))))))
	(loop do (replace-segments)
	      while changedp
	      do (setf changedp nil))))))

#+test
(dat:do-string-tree (compound segments *compound-table*)
  (print (cons compound segments)))

;;#+test
(build-compound-table)

#+test
(let ((*analyser-lexicon* *bm-analyser-lexicon*))
  (print (segment-compound "tegningsrettsemisjon" :segments (dat::make-string-tree))))


#+test
(print (segment-compound "fiskegiftstoff" :segments (dat::make-string-tree)))
#+test
(print (segment-compound "landskapsvernområde" :segments (dat::make-string-tree)))
#+test
(print (segment-compound "kildebergart" :segments (dat::make-string-tree)))
#+test
(let ((segments (dat::make-string-tree)))
  (print (segment-compound "ewwikk" :segments segments))
  (print segments))

#+test
(let ((*analyser-lexicon* *bm-analyser-lexicon*))
  (let ((word "landskapsvernfisk"))
    (multiple-value-bind (c-analyses lexical-compound-p)
	(compound-optimal-analyses
	 word
	 :max 4
	 :restore-char-p nil
	 :min-length-only-p t
	 ;;:listed-compounds *compound-table*
	 )
      #-debug(print (list :word word :c-anal c-analyses :lexical-compound-p lexical-compound-p))
      )))

#+test
(print (dat::string-tree-get *compound-table* "villreinjakt"))

;; To do: exclude token + "."-reading for unknown hyphenated words?
(defun segment-compound (word &key punctuation downcased-p (analyze-p t) analyze-compound-name-p
			 capitalize-p segments end-hyphen-p pos heap)
  #+debug(print (list :analyze-p analyze-p :analyze-compound-name-p analyze-compound-name-p))
  (multiple-value-bind (c-analyses lexical-compound-p)
      (when (or analyze-p analyze-compound-name-p)
	(compound-optimal-analyses
	 (if capitalize-p (string-capitalize word) word)
	 :max 4
	 :restore-char-p nil
	 :min-length-only-p t
	 :listed-compounds *compound-table*
	 :end-hyphen-p end-hyphen-p
	 :try-downcase-p (not analyze-compound-name-p) ;; new 27.10.2006
	 ))
    #+debug(print (list :word word :c-anal c-analyses :analyze-p analyze-p
			:analyze-compound-name-p analyze-compound-name-p :capitalize-p capitalize-p :downcased-p downcased-p))
    (let ((found-p t))
      #+debug(print segments)
      #+debug(print (list :word word :downcased-p downcased-p :c-analyses c-analyses))
      (values (collecting
		(cond ((and (not lexical-compound-p)
			    (not end-hyphen-p)
			    (block find
			      (or (string-net::nmap-string-values
				   *bm* word
				   (lambda (reading)
				     (declare (ignore reading))
				     (return-from find t))
				   #\: nil nil nil)
				  (let ((stripped (strip-emphase-diacritics word)))
				    (and stripped
					 (string-net::nmap-string-values
					  *bm* stripped (lambda (reading) (return-from find t))
					  #\: nil nil nil)))
				  (and (upper-case-p (char word 0))
				       (string-net::nmap-string-values
					*bm* (string-downcase word)
					(lambda (reading)
					  (declare (ignore reading))
					  (return-from find t))
					#\: nil nil nil)))))
		       (if punctuation
			   (dolist (punct punctuation)
			     (collect (cons (list (list (fst-escape-string word heap) pos nil downcased-p)) punct)))
			   (collect (list (list (list (fst-escape-string word heap) pos nil downcased-p))))))
		      #-without-compounds
		      (c-analyses
		       #+debug(print (list :c-analyses c-analyses))
		       (dolist (punct (or punctuation (list nil)))
			 (loop for c-analysis in c-analyses
			       do
			       (collect
				(collecting
				  (collect
				   (collecting
				     (let ((chunk-pos 0))
				       (dolist (segment c-analysis)
					 (let* (;; verb stems are analysed as Impv, but NorGram needs them as Infin
						(features (substitute "Infin" "Impv" (cdr segment) :test #'string=))
						(features (if (and capitalize-p (eq segment (car c-analysis)))
 							      (append features (list "Cap"))
							      features))
						(chunk (if (and capitalize-p (eq segment (car c-analysis)))
							   (string-downcase (car segment))
							   (car segment)))
						(position (position features
								    (dat::string-tree-get segments chunk)
								    :test #'equal)))
					   (unless position
					     (setf position 0)
					     (push features (dat::string-tree-get segments chunk)))
					   (collect (list chunk
							  (+ pos chunk-pos)
							  (- (length (dat::string-tree-get
								      segments chunk)) position 1)
							  (when (eq segment (car c-analysis))
							    downcased-p)
							  end-hyphen-p))
					   (incf chunk-pos (length chunk)))))))
				  (collect-append punct))))
			 (setf found-p :compound)
			 (when (and (not end-hyphen-p) (or #-ignore downcased-p (not lexical-compound-p)))
			   (collect (cons (list (list word pos nil downcased-p)) punct)))))
		      ;; unrecognized hyphenated word
		      ((and (find #\- word))
		       (setf found-p nil)
		       (let ((segs (split word #\-)))
			 (dolist (punct (or punctuation (list nil)))
			   (collect
			    (collecting
			      (collect
			       (collecting
				 (let ((p pos))
				   (loop for (seg . rest) on segs
				      do
				      (cond (rest
					     (unless (zerop (length seg))
					       (collect (list (fst-escape-string seg heap) p nil
							      (when (eq (car segs) seg) downcased-p) nil t))
					       (incf p (length seg)))
					     (collect (list "-" p nil nil nil t))
					     (incf p))
					    ((zerop (length seg))
					     nil)
					    (t
					     (collect (list (fst-escape-string seg heap) p nil nil nil t))))))))
			      (collect-append punct))))))
		      (t
		       (setf found-p nil)
		       (if punctuation
			   (dolist (punct punctuation)
			     (collect (cons (list (list (fst-escape-string word heap) pos nil downcased-p)) punct)))
			   (collect (list (list (list (fst-escape-string word heap) pos nil downcased-p))))))) )
	      found-p))))

#+test
(print (segment-compound "bananhelikopter" :punctuation '(("." ".") ("."))))

#+test
(print (segment-compound "kildebergart"))

#+test
(print-ranked-analyses-regexp "fiskehansker")

#+test
(let ((*segments* nil))
  (print (norgram-tokenize-sentence "sensommerdag"))
  (print *segments*)
  (print (norgram-morphology-regexp "sen+" *segments*))
  (print (norgram-morphology-regexp "+sommer+" *segments*))
  (print (norgram-morphology-regexp "+dag" *segments*)))

#+test
(let ((*segments* nil))
  (print (norgram-tokenize-sentence "villreinjakt"))
  (print *segments*)
  (print (norgram-morphology-regexp "vill+" *segments*))
  (print (norgram-morphology-regexp "+rein+" *segments*))
  (print (norgram-morphology-regexp "+jakt" *segments*)))

#+test
(print (norgram-tokenize-sentence "kildebergart"))

#+test
(let ((*segments* nil))
  (print (norgram-tokenize-sentence "Kari spiser fiskeboller."))
  (print *segments*)
  (print (norgram-morphology-regexp "fisk+"))
  (print (norgram-morphology-regexp "+hatt")))

#+test
(let ((*segments* nil))
  (print (norgram-tokenize-sentence "Per" :preparse-p nil))
  (print *segments*)
  (print (norgram-morphology-regexp "per")))

#+test
(let ((client 1))
  (print (norgram-tokenize-sentence "pÃ¥ Universitetet i Ljubljana" :client 1))
  (let* ((segments (gethash client *segments-table*)))
    (print segments)
    (print (norgram-morphology-regexp "Universitetet_i_Ljubljana" segments))))

#+test
(let ((client 1))
  (print (norgram-tokenize-sentence "pÃ¥ ewwikk" :client 1))
  (let* ((segments (gethash client *segments-table*)))
    (print segments)
    (print (norgram-morphology-regexp "ewwikk" segments))))

#+test
(let ((client 1))
  (print (norgram-tokenize-sentence "jeg"))
  (let ((segments (gethash client *segments-table*)))
    (print (nth-value 1 (norgram-morphology-regexp "jeg" segments)))))

#+test
(print (nth-value 1 (norgram-morphology-regexp "fisk" nil)))

#+test
(print (list (norgram-tokenize-sentence "På Byfjellene kan du raste, slå leir og overnatte.")
	     *segments*))

#+test
(setf *bm* (string-net::read-net "~/lisp/projects/xle/morph/bm-morph.net" :translate-p nil))


;; gcc morph-sockets.c -o morph-sockets

;;(defparameter *socket-server* (socket:socket-server 2001))

;;(defparameter *socket-stream* (socket:socket-accept *socket-server*))

;;(socket:socket-status *socket-server*)

#+clisp
(defun start-server (&optional (stream-count 2))
  (let* ((socket-server (socket:socket-server 2001))
	 ;;(socket-stream (socket:socket-accept socket-server))
	 (streams (make-array stream-count :initial-element nil))
	 (stream-nr 0)
	 (sentence nil))
    (print "server started.")
    (unwind-protect
	 (loop do
	       (unless (aref streams stream-nr)
		 (setf (aref streams stream-nr)
		       (ignore-errors (socket:socket-accept socket-server :timeout '(0 . 50))))
		 #+debug
		 (if (aref streams stream-nr)
		     (format *standard-output* "~s opened.~%" (aref streams stream-nr))
		     (format *standard-output* "Socket stream no. ~d still waiting.~%" stream-nr)))
	       (when (aref streams stream-nr)
		 (let ((socket-stream (aref streams stream-nr)))
		   (when (find (socket:socket-status socket-stream) '(:input :io))
		     (setf sentence (read-line socket-stream))
		     (print sentence *standard-output*)
		     (cond ((zerop (length sentence))
			    (write-string "" socket-stream))
			   ((eql 0 (search "tokenize" sentence))
			    (write-string (princ (norgram-tokenize-sentence (utf-8-decode (subseq sentence 9)))
						 *standard-output*)
					  socket-stream))
			   (t
			    (write-string ;;(utf-8-encode (norgram-tag-word (utf-8-decode sentence)))
			     (norgram-tag-word (utf-8-decode sentence))
			     socket-stream))))))
	       (setf stream-nr (mod (1+ stream-nr) stream-count))
	       while (or (null sentence) (> (length sentence) 0)))
      (princ "Closing socket connection.")
      (socket:socket-server-close socket-server)
      (cl-user::quit))))

#+test
(ext:saveinitmem "/home/paul/lisp/projects/xle/morph/morph-server.mem"
		 :quiet t
		 :init-function #'start-server)

(defparameter *socket-server* nil)

(defparameter *server-connections* 0)

(defparameter *client-processes* (make-hash-table))

#+allegro
(ff:def-foreign-call getpid (:void))

;; shutdown morph-server when parent process goes down
#+allegro
(mp::process-run-function "check-ppid"
			  (lambda ()
			    (loop while (getppid)
				  do (sleep 10)
				  finally
				  (print :exiting)
				  (cl-user::exit))))

#+allegro
(ff:def-foreign-call getppid (:void))

#+(or openmcl allegro)
(defun start-server (&optional socket-server (proc-id 0))
  (let ((socket-server (or socket-server
			   (socket:make-socket :connect :passive
					       ;;:local-host "decentius.aksis.uib.no"
					       ;; client sets port in morph-sockets-lib.c
					       :local-port 2001
					       #+allegro :reuse-address #+allegro t))))
    #+debug(print (list :socket-server socket-server :pid (getpid)))
    (let* ((socket-stream (socket:accept-connection socket-server)) ;; only returns when new connection request comes in
	   (client-pid (parse-integer (read-line socket-stream nil nil)))
	   (sentence nil))
      ;; write pid to /tmp. This will be used to kill the process from the Allegro that loaded the XLE lib
      #+debug(print (list :client-pid client-pid))
      (with-open-file (stream (format nil "/tmp/morph-server-pid.~d" client-pid) :direction :output :if-exists :supersede)
	(write (getpid) :stream stream))
      #+debug(print (list :remote-host (socket:remote-host socket-stream)
		   :pid client-pid))
      (setf (gethash socket-stream *client-processes*)
	    (cons 0 ;; (socket:remote-host socket-stream) ;; fixme: sometimes returns 127.0.0.1
		  client-pid))
      #+debug(print "connection accepted.")
      (incf *server-connections*)
      ;; spawn a new connection thread
      (print "Spawning new thread")
      #+openmcl
      (cl-user::process-run-function (format nil "morph-server-~d" (1+ proc-id) #+ignore *server-connections*)
				     (lambda () (start-server socket-server (1+ proc-id))))
      #+allegro
      (mp::process-run-function (format nil "morph-server-~d" (1+ proc-id))
				(lambda () (start-server socket-server (1+ proc-id))))
      (print "... new thread spawned")
      (unwind-protect
	   (loop do
		 (setf sentence (read-line socket-stream nil nil))
		 (cond ((null sentence)
			nil)
		       ((zerop (length sentence))
			(write-string "" socket-stream))
		       ((eql 0 (search "tokenize" sentence))
			(write-string (handler-case
					  (norgram-tokenize-sentence
					   (utf-8-decode (subseq sentence 9 #+obsolete 10))
					   :preparse-p nil #+obsolete(char= (char sentence 9) #\y)
					   :client (gethash socket-stream *client-processes*))
					(storage-condition (condition)
					  (declare (ignore condition))
					  #+:allegro
					  (excl:exit 1 :no-unwind t))
					(error (condition)
					  (format t "error in norgram-tokenize-sentence(): ~a" condition)
					  ""))
				      socket-stream))
		       (t
			(write-string
			 (handler-case
			     (norgram-tag-word (utf-8-decode sentence)
					       :client (gethash socket-stream *client-processes*))
			   (storage-condition (condition)
			     (declare (ignore condition))
			     #+:allegro
			     (excl:exit 1 :no-unwind t))
			   (error (condition)
			     (format t  "error in norgram-tag-word(): ~a" condition)
			     ""))
			 socket-stream)))
		 #+openmcl
		 (ccl::stream-force-output socket-stream)
		 #+allegro
		 (socket::force-output socket-stream)
		 while (and sentence (> (length sentence) 0)))
	(close socket-stream)))))

(defun find-free-port-and-make-socket (&optional (start 2002) (end 7000))
  (loop for port from start to end
	for socket = (ignore-errors
		       (socket:make-socket :connect :passive
					   ;;:local-host "localhost"
					   :local-port port
					   #+allegro :reuse-address #+allegro t))
	when socket do (return socket)))

#+(or openmcl allegro)
(defun cl-user::start-server-processes (&optional port)
  ;;(print :build-mwe-list)
  (build-mwe-list)
  (build-compound-table)
  (let ((socket-server
	 (if port
	     (socket:make-socket :connect :passive
				 ;; :local-host "localhost"
				 :local-port port
				 #+allegro :reuse-address #+allegro t)
	     (find-free-port-and-make-socket))))
    (setf *socket-server* socket-server)
    #+debug(print (list :socket-server socket-server :port port))
    #+openmcl
    (cl-user::process-run-function "morph-server" (lambda () (start-server socket-server)))
    #+allegro
    (mp::process-run-function "morph-server" (lambda () (start-server socket-server)))
    ;; shutdown morph-server when parent process goes down
    #+allegro
    (mp::process-run-function "check-ppid"
			  (lambda ()
			    (loop with ppid = (getppid)
				  while (= ppid (getppid))
				  do (sleep 10)
				  finally
				  (print :exiting)
				  (cl-user::exit))))
    #+debug(print (list :opening-pid-file (excl::getuid) (getppid) (print (socket:local-port socket-server))))
    (with-open-file (stream (format nil "/tmp/cgp.socket.~s.~s" (excl::getuid) (getppid)) ;; not sure why parent pid is necessary
			    :direction :output :if-exists :supersede)
      (write (socket:local-port socket-server) :stream stream))
    socket-server))

#+test
(cl-user::start-server-processes)

#+dumplisp
(setf cl-user::*restart-app-function* 'cl-user::start-server-processes)

#+test
(print (socket:local-port (find-free-port-and-make-socket)))

#+test ;; openmcl
(socket:shutdown *socket-server*)
#+test ;; allegro
(close *socket-server*)

#+test
(start-server (socket:make-socket :connect :passive
				  ;;:local-host "localhost"
				  :local-port 2001))

#+test
(socket:socket-server-close *socket-server*)

;; (open-shared-library "/home/paul/lisp/projects/xle/morph/morph-sockets-lib.dylib")
;; (external-call "_initialize" :void)


;; build clisp image
#+test
(ext:saveinitmem "/home/paul/lisp/projects/xle/morph/morph-server.mem"
		 :quiet t
		 :init-function #'morph-server::start-server)

;; load as:> clisp -K full -M /home/paul/lisp/projects/xle/morph/morph-server.mem

;; count lemmas in bm-morph
#+test
(let ((tree (dat::make-string-tree)))
  (with-file-lines (line "projects:xle;morph;bm-morph.txt")
    (destructuring-bind (form lemma code) (split line #\:)
      (incf (dat:string-tree-get tree lemma 0))))
  (print (car tree)))

:eof