;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; Readtable: augmented-readtable -*- ;; Copyright (C) Paul Meurer 2002. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.1 ;;------------------------------------------------------------------------------------- ;; TO DO: ;; -- Fix charset conversion (extract from META element or guess) ;; -- Test more pages ;; -- Fix base-url redirection ;;------------------------------------------------------------------------------------- (in-package :cgp) (eval-when (:compile-toplevel :load-toplevel :execute) (setf lxml::*encoding* :utf-8) (setf lxml::*newline-after-endtag-p* nil)) ;;; -------------------------------- HTML-tokenizer ------------------------------------- (defclass html-tokenizer (xml-tokenizer) ((whitespace :initform nil :accessor html-whitespace) (encoding :initform :iso-8859-1 :accessor html-encoding)) (:default-initargs :zebu-grammar-name "html-parser")) #+test (let ((lxml::*serialize* :xml-file-pos) (*token-function* (lambda (token) (print token))) (zebu::*xml-file-pos-queue* (u:make-queue))) (read-parser "xøxåx " :grammar (find-grammar "html-parser"))) #+test (let ((lxml::*serialize* :xml-file-pos) (*token-function* (lambda (token val) (print (list token val))))) (read-parser (format nil "con~c" #\Return) ;;"fifi " :grammar (find-grammar "plain-tokenizer"))) #+test (print (xml::resolve-entity-ref "nbsp")) #+test (compile-tokenizer (make-instance 'html-tokenizer) :force-p t) (defmethod compile-tokenizer ((tokenizer html-tokenizer) &key force-p) (when (or force-p (not (find-grammar "html-parser"))) (lxml::compile-html-parser)) (when (or force-p (not (find-grammar "plain-tokenizer"))) (let ((*package* (find-package :lxml)) (*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb") (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file)))) ;; this is preliminary!! #+copy (defun token-included-p (stack include-path exclude-path) (and (or (null include-path) (loop for path in include-path thereis (find path stack))) (not (loop for path in exclude-path thereis (find path stack))))) ;; (print (xml::resolve-entity-ref "nbsp")) (defmethod sentence-add-token ((sentence sentence) (tokenizer html-tokenizer)) "appends token(s) from stream and returns the last token added" (with-slots (stream) sentence (with-slots (insignificant-token-fn sentence-delimiter-elements paragraph-delimiter-elements headline-delimiter-elements stack include-path exclude-path word-tokens) tokenizer (let ((word+pos (pop word-tokens)) (pos nil)) (cond (word+pos (add-token sentence (car word+pos) :position (cadr word+pos) :whitespace (cddr word+pos) :language :nbo)) (t ;; wait until the html parsing process has a new token #+sbcl (acl-compat.mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+allegro (mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) (let ((xml-token (parsed-token tokenizer))) (setf (parsed-token tokenizer) nil) (unless (eq xml-token :eof) ;;(print (list :xml-token xml-token)) (destructuring-bind (tag-type xml-file-pos content . attributes) xml-token ;;(declare (ignore attributes)) (when (and (eq content :meta) (getf attributes :http-equiv) (string-equal (getf attributes :http-equiv) "Content-Type")) (let* ((content-att (getf attributes :content)) (charset-pos (search "charset" content-att)) (charset-str (when charset-pos (subseq content-att (+ charset-pos 8) (position #\Space content-att :start (+ charset-pos 8))))) (charset (when charset-str (intern (string-upcase charset-str) :keyword)))) (when charset (setf (html-encoding tokenizer) (print charset))))) (cond ((eq tag-type :%string) ;;; --- new for html --->>> ;; concatenate consecutive string tokens (loop do #+allegro (mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+sbcl (acl-compat.mp:process-wait "waiting-for-token" #'parsed-token tokenizer) #+mcl (ccl::process-wait "waiting-for-token" #'parsed-token tokenizer) while (and (not (eq (parsed-token tokenizer) :eof)) (eq (car (parsed-token tokenizer)) :%string)) do (setf content (concat content (caddr (parsed-token tokenizer))) (caddr xml-token) content (parsed-token tokenizer) nil)) ;;(print content) ;;; <<< ---- (cond ((token-included-p stack include-path exclude-path) (setf pos xml-file-pos) (setf word-tokens (collecting (let ((*token-function* (lambda (token type) (case type ((:word :brace) (collect (list* token pos (html-whitespace tokenizer))) (setf (html-whitespace tokenizer) nil) (incf pos (length token))) (:punctuation ;; group equal chars together (let ((start 0)) (loop for i from 0 for c across token do (when (char/= c (char token start)) (collect (list* (subseq token start i) pos (html-whitespace tokenizer))) (setf (html-whitespace tokenizer) nil) (incf pos (- i start)) (setf start i))) (if (zerop start) (collect (list* token pos (html-whitespace tokenizer))) (collect (list* (subseq token start) pos (html-whitespace tokenizer)))) (setf (html-whitespace tokenizer) nil) (incf pos (- (length token) start)))) (:whitespace (setf (html-whitespace tokenizer) token) (incf pos (length token))) (otherwise (setf (html-whitespace tokenizer) nil) (incf pos (length token))))))) (read-parser content :grammar (find-grammar "plain-tokenizer"))))) (let ((word+pos (pop word-tokens))) ;; return (cond (word+pos (add-token sentence (car word+pos) :position (cadr word+pos) :whitespace (cddr word+pos) :language :nbo)) (t (add-token sentence :newline))))) (t (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil) (sentence-add-token sentence tokenizer)))) ((eq tag-type :%end) ;; adjust stack (when (find content stack) (loop until (eq content (car stack)) do (pop stack)) (pop stack)) ;; return also when a paragraph or headline delimiter is reached (cond ((and (token-included-p stack include-path exclude-path) (or (find content paragraph-delimiter-elements) (find content headline-delimiter-elements))) (prog1 (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil))) (t (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil) (sentence-add-token sentence tokenizer)))) ((find content sentence-delimiter-elements) (push content stack) (prog1 (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil))) ((find tag-type '(:%empty :%comment)) (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil) (sentence-add-token sentence tokenizer)) (t ;; is %start (push content stack) (add-token sentence :insignificant :attributes xml-token :whitespace (html-whitespace tokenizer)) (setf (html-whitespace tokenizer) nil) (sentence-add-token sentence tokenizer)))))))))))) (let ((*package* (find-package :lxml)) (*warn-conflicts* t) (*allow-conflicts* t) (zb-file "projects:cgp;multi-tagger;plain-tokenizer.zb") (tab-file "projects:cgp;multi-tagger;plain-tokenizer.tab")) (zebu-compile-file zb-file :output-file tab-file) (zebu-load-file tab-file)) (defmethod format-sentence-html ((sentence sentence) (mode (eql :pos-colorize)) &key stream (print-features t) (print-sentence-elts-p t) (expand-tokens-p t) (word-elt :|word|) base-url &allow-other-keys) "outputs a tagged sentence as xml" (declare (ignore word-elt)) (let ((*package* (find-package :cgp)) (start-token nil) (end-token nil)) (when print-sentence-elts-p (multiple-value-setq (start-token end-token) (find-sentence-boundary-tokens sentence))) (labels ((walk (token concat-token) (when (eq token start-token) (setf start-token nil) (unless end-token (setf end-token t))) (when (eq token end-token) (setf end-token nil)) (cond ((null token) nil) ((and expand-tokens-p (token-expansion token)) (do ((ex-token (car (token-expansion token)) (token-next ex-token))) ((eq ex-token (cdr (token-expansion token))) (walk ex-token token)) (walk ex-token token)) (walk (token-next token) nil)) (t (let ((value (token-value token)) (features (token-features (or concat-token token))) (attributes (token-attributes token)) (whitespace (token-whitespace token))) (when whitespace (write-string whitespace stream)) (cond ((stringp value) ;; wrap words into FONT elements with color according to their POS #m((font :color #L(if (and features print-features) (case (intern (string-upcase (car (token-part-of-speech (or concat-token token)))) :keyword) (:subst "blue") (:verb "red") (:adj "green") (:det "magenta") (:prep "orange") (:pron "cyan") (:sbu "brown") (:konj "yellow") (t "black")) "black")) ((span :title #L (car (token-part-of-speech (or concat-token token)))) #L(write-string value stream)))) ((eq value :newline) (fresh-line stream)) ((eq value :insignificant) (write-xml-tag attributes stream) (when (and base-url (eq (car attributes) :%start) (eq (caddr attributes) :head)) #m(base/ :href #L base-url))) ((null value) (terpri stream)) (t nil)) (unless (or (eq token (last-token sentence)) concat-token) (walk (token-next token) nil))))))) (walk (first-token sentence) nil)))) (defmethod token-part-of-speech ((token token) &key substitutions &allow-other-keys) (with-slots (features) token (collecting (dolist (fl features) (when (car fl) (collect (subst-substrings (string-downcase (car (code-features (cdr fl)))) substitutions))))))) (defmethod pos-colorize-page ((request http-request) entity) (with-html-response (request entity stream (url)) (if url (multiple-value-bind (body response headers uri) (net.aserve.client::do-http-request url) (declare (ignore response headers uri)) #+debug (print body) (with-input-from-string (in-stream body) (disambiguate-stream 'html-tokenizer in-stream :tokenizer-initargs '(:paragraph-delimiter-elements (:table :p :td) ;;:headline-delimiter-elements (:|head| :|byline|) :in-sentence-elements (:b :i) :include-path (:p :span :table :ul :h1 :h2 :h3 :h4) :exclude-path () ;;:sentence-delimiter-elements (:table :p :td) ) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key &allow-other-keys) (format-sentence-html s :pos-colorize :print-rules nil :expand-tokens-p t :base-url url :stream stream))))) #m(html (head (title "Vis farge-kodet webside")) (body (p "Skriv en URI:") ((form:method "post") (input/ :type "string" :id "url" :name "url" :size 80))))))) (publish :path "/cl/cgp/pos-colorize.html" :content-type "text/html" :function #'pos-colorize-page) #|| ;;; Ragnhild: eksempel som du kan ha i en egen fil og modifisere (defmethod example-page ((request http-request) entity) (with-html-response (request entity stream (url)) (if url ;; her leses nettsiden som skal manipuleres (multiple-value-bind (body response headers uri) (net.aserve.client::do-http-request url) (declare (ignore response headers uri)) #+debug (print body) (with-input-from-string (in-stream body) ;; her tagges teksten (disambiguate-stream 'html-tokenizer in-stream :tokenizer-initargs '(:paragraph-delimiter-elements (:table :p :td) :in-sentence-elements (:b :i) :include-path (:p :span :table :ul :h1 :h2 :h3 :h4) :exclude-path ()) :tagging-niveau :morphological-disambiguation :cg (gethash "nbo" *cg-table*) :print-function (lambda (s &key &allow-other-keys) ;; her skrives setningene som html i server-strømmen (format-sentence-html s :mine-egne-ting :print-rules nil :expand-tokens-p t :base-url url :stream stream))))) ;; dette er siden man kan skrive en URL i. #m(html (head (title "Eksempelside")) (body (p "Skriv en URI:") ((form:method "post") (input/ :type "string" :id "url" :name "url" :size 80))))))) (defmethod format-sentence-html ((sentence sentence) (mode (eql :mine-egne-ting)) ;; gjerne bruk et annet nøkkelord &key stream (print-features t) (print-sentence-elts-p t) (expand-tokens-p t) (word-elt :|word|) base-url &allow-other-keys) ;; Her kommer din kode som gjør noe med den taggete setningen og formaterer den ;; som HTML. Prøv å forstå hva som min kode (FORMAT-SENTENCE-HTML lenger oppe i filen) ;; gjør og spør hvis ikke alt er selvforklarende (noe jeg tror er ganske usannsynlig ;-). #+test (masse kode) ) ;; og så en side som genererer URLen. (publish :path "/cl/cgp/test-side.html" ;; eller hva som helst :content-type "text/html" :function #'example-page) ;; slik setter du web-serveren i gang. Gjerne bruk en annen port, men du må velge noe >= 8000. (net.aserve::start :port 8005) ||# ;;; EOF