;;; Hey, emacs(1), this is -*- Mode: Common-Lisp; -*-, got it? ;;; ;;; PAL --- PCFG Approximation and Parsing Library for DELPH-IN ;;; ;;; Copyright (c) 2009 -- 2012 Johan Benum Evensberget (johan.benum@gmail.com) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (in-package :pcfg) (defstruct (cfr) id type lhs rhs (count 1) probability cfg) (defun cfr-hash (cfr) (with-slots (id lhs rhs) cfr (list lhs rhs))) (defun cfr-equal (foo bar) (or (eq foo bar) (and (= (cfr-lhs foo) (cfr-lhs bar)) (eq (cfr-type foo) (cfr-type bar)) (equal (cfr-rhs foo) (cfr-rhs bar))))) (defun print-cfr (cfr cfg &key (format :sussex) (stream t) (prefix "") (suffix "~%")) (let* ((table (cfg-table cfg)) (count (cfr-count cfr)) (probability (cfr-probability cfr)) (type (case (cfr-type cfr) (:root "S") (:rule "R") (:irule "I") (:word "W"))) (lhs (cfr-lhs cfr)) (rhs (cfr-rhs cfr)) (*print-case* :downcase)) (when prefix (format stream prefix)) (case format (:sussex (format stream "{~a}~@[ <~,4f>~] ~a: ~a[~a] -->" count probability type (code-to-symbol lhs table) lhs) (if (stringp rhs) (format stream " ~s" rhs) (loop for code in rhs do (format stream " ~a[~a]" (code-to-symbol code table) code)))) (:bitpar (format stream "~a ~a ~{~a~^ ~}" count lhs rhs)) (:export (let ((rhs (if (atom rhs) (list (code-to-symbol rhs table)) (loop for code in rhs collect (code-to-symbol code table))))) (format stream "(~d) [1 (0) ~s ~{~s~^ ~}] ~,4e {~d ~d}" (cfr-id cfr) (code-to-symbol lhs table) rhs (log (cfr-probability cfr)) (gethash lhs (cfg-counts cfg)) (cfr-count cfr))))) (when suffix (format stream suffix)))) ;;; Core CFG structs and handling: (defstruct (cfg) (table (make-symbol-table)) (rules (make-hash-table :size 4096)) (counts (make-hash-table :size 4096)) (epsilon 1.0) (samples 0) (size 4096) (count 0) rule-table settings) (defmethod print-object ((object cfg) stream) (format stream "#[CFG (~d sample~p, ~d rule~p) <~,4e>]" (cfg-samples object) (cfg-samples object) (cfg-count object) (cfg-count object) (log (cfg-epsilon object)))) (defun print-cfg (cfg &key (stream t) (format :sussex)) (if (stringp stream) (with-open-file (stream stream :direction :output :if-exists :supersede) (print-cfg cfg :stream stream :format format)) (loop with *package* = (find-package :lkb) with table = (cfg-table cfg) initially (case format (:export (format stream ";;;~%;;; ~a~%;;; (~a@~a; ~a)~%;;;~%" cfg (tsdb::current-user) (tsdb::current-host) (tsdb::current-time :long :pretty)) (format stream "~%:begin :pcfg ~d.~%~%" (cfg-samples cfg)) (format stream "*pcfg-use-preterminal-types-p* := ~:[no~;yes~].~%~%~ *pcfg-include-leafs-p* := ~:[no~;yes~].~%~%~ *pcfg-laplace-smoothing-p* := ~d.~%~%" *pcfg-use-preterminal-types-p* *pcfg-include-leafs-p* (if (numberp *pcfg-laplace-smoothing-p*) *pcfg-laplace-smoothing-p* (if *pcfg-laplace-smoothing-p* 1 0))) (format stream ":begin :rules ~d.~%~%" (cfg-count cfg)))) for i from 0 to (- (cfg-size cfg) 1) for bucket = (gethash i (cfg-rules cfg)) for count = (gethash i (cfg-counts cfg)) when bucket do (when (eq format :sussex) (let ((code (cfr-lhs (first bucket)))) (format stream "~a[~a] {~a}~%" (code-to-symbol code table) code count))) (loop for rule in bucket do (print-cfr rule cfg :stream stream :format format :prefix (and (eq format :sussex) " ") :suffix "~%")) finally (case format (:export (format stream "~%:end :rules.~%~%:end :pcfg.~%")))))) (defun record-cfr (rule cfg) (let ((i (cfr-lhs rule))) (if (cfr-cfg rule) (incf (cfr-count rule)) (loop ;;; this loop is a bottleneck when merging really big grammars. for foo in (gethash i (cfg-rules cfg)) when (cfr-equal foo rule) do (incf (cfr-count foo)) (return) finally (push rule (gethash i (cfg-rules cfg))) (incf (cfg-count cfg)))) (incf (gethash i (cfg-counts cfg) 0))) (when (cfg-rule-table cfg) (setf (gethash (cfr-hash rule) (cfg-rule-table cfg)) rule)) (setf (cfr-cfg rule) cfg)) (defun estimate-probabilities (cfg) (loop with laplace = (if (numberp *pcfg-laplace-smoothing-p*) *pcfg-laplace-smoothing-p* (and *pcfg-laplace-smoothing-p* 1)) with n = -1 ;;for i from 0 to (- (cfg-size cfg) 1) for key being the hash-keys in (cfg-rules cfg) for bucket = (gethash key (cfg-rules cfg)) for total = (gethash key (cfg-counts cfg)) when bucket do (when laplace (incf total (* (+ (length bucket) 1) laplace))) (loop for rule in bucket for count = (cfr-count rule) when laplace do (incf count laplace) do (setf (cfr-id rule) (incf n)) (let ((probability (/ count total))) (setf (cfr-probability rule) probability) (setf (cfg-epsilon cfg) (min (cfg-epsilon cfg) probability))))))