;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*-
;;
;; Copyright (C) Paul Meurer 2000 - 2005. All rights reserved.
;; paul.meurer@aksis.uib.no
;; Aksis, University of Bergen
;;
;; Reimplementation in ANSI CommonLisp of the multi-tagger module
;; (Perl program) written by Lars Jørgen Tvedt, UiO 1999
;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO
;; (Dokumentasjonsprosjektet; Tekstlaboratoriet)
;;

(in-package "CGP")

(defvar *tagger* t)

;; Base class for all multi-taggers: holds the language designator and the
;; feature-coding tables shared by the concrete tagger classes below.
(defclass multi-tagger ()
  ((language :initarg :language :reader language)
   ;;; feature coding
   (feature-table :initform (make-hash-table) :accessor feature-table) ;; feature -> pos in feature vector
   (feature-vector :accessor feature-vector) ;; vector of feature symbols in cgp package
   (lemmata-forms :initform nil :accessor lemmata-forms)
   (new-wordforms :initform nil :accessor new-wordforms)
   (feature-print-fn :initform nil :initarg :feature-print-fn :reader feature-print-fn)
   ))

;; Default for taggers that have no COMPOUND-ANALYSER slot: always NIL.
(defmethod compound-analyser ((tagger multi-tagger))
  nil)

;; Default for taggers that have no NAMES slot: always NIL.
(defmethod names ((tagger multi-tagger))
  nil)

;; Adds no slots of its own; everything is inherited from MULTI-TAGGER.
(defclass menota-multi-tagger (multi-tagger)
  ())

;; Tagger backed by a set of nets that are read from files by the
;; INITIALIZE-INSTANCE :AFTER method below. The NAMES and COMPOUND-ANALYSER
;; accessors defined here are more specific than the MULTI-TAGGER defaults
;; above and therefore take precedence for instances of this class.
(defclass cg-multi-tagger (multi-tagger)
  ((statistics-table :initform (make-hash-table) :accessor statistics-table)
   (ordered-features :initform () :accessor ordered-features)
   (ordered-simplified-features :initform () :accessor ordered-simplified-features)
   (code-vector-sort-array :accessor code-vector-sort-array)
   ;;; nets
   (code-lexicon :accessor code-lexicon) ;; fullform + lemma + features
   (lexicon :accessor lexicon)
   (fullforms :accessor fullforms)
   (lemmata :accessor lemmata)
   (abbreviations :accessor abbreviations)
   (word-like-abbreviations :accessor word-like-abbreviations)
   (titles :accessor titles)
   (symbols :accessor symbols)
   (expressions :accessor expressions)
   (names :initform nil :accessor names)
   (suffixes :accessor suffixes)
   (suffix-table :initform (make-hash-table) :accessor suffix-table)
   ;; NOTE(review): the initform is the keyword :COMPOUND-ANALYSER itself,
   ;; presumably a placeholder that is replaced later -- confirm.
   (compound-analyser :initform :compound-analyser :accessor compound-analyser)))

;; Populates the net slots, the suffix list and the suffix table of a freshly
;; created CG-MULTI-TAGGER.
;;
;; Keyword arguments:
;;   PATH            prefix of every net file name; defaults to
;;                   "projects:cgp;nets;" + downcased language name + "-".
;;   FILE-EXTENSION  suffix of every net file name (default ".net").
;;   LEXICON, LEMMATA-FORMS, ABBREVIATIONS, WORD-LIKE-ABBREVIATIONS, TITLES,
;;   SYMBOLS, EXPRESSIONS
;;                   base name of the file read into the slot of the same name.
;;   CODE-LEXICON, LEMMATA, FULLFORMS
;;                   accepted, but the forms that would read them are
;;                   currently commented out, so they have no effect.
;;
;; Each file name is built as (concat PATH <base-name> FILE-EXTENSION) and
;; handed to READ-NET, which is defined elsewhere.
(defmethod initialize-instance :after ((tagger cg-multi-tagger)
                                       &key
                                       (path (concat "projects:cgp;nets;"
                                                     (string-downcase (symbol-name (language tagger)))
                                                     "-"))
                                       (file-extension ".net")
                                       (lexicon "lexicon")
                                       (code-lexicon "code-lexicon")
                                       (lemmata "lemmata")
                                       (fullforms "fullforms")
                                       (lemmata-forms "lemmata-forms")
                                       (abbreviations "abbreviations")
                                       (word-like-abbreviations "word-like-abbreviations")
                                       (titles "titles")
                                       (symbols "symbols")
                                       (expressions "expressions")
                                       &allow-other-keys)
  ;; These keys are only referenced from the commented-out forms below;
  ;; IGNORABLE silences the unused-variable warnings and stays correct if
  ;; those forms are re-enabled.
  (declare (ignorable code-lexicon lemmata fullforms))
  (setf (lexicon tagger) (read-net (concat path lexicon file-extension))
        ;;(code-lexicon tagger) (read-net (concat path code-lexicon file-extension))
        ;;(lemmata tagger) (read-net (concat path lemmata file-extension))
        (lemmata-forms tagger) (read-net (concat path lemmata-forms file-extension))
        ;;(fullforms tagger) (read-net (concat path fullforms file-extension))
        (abbreviations tagger) (read-net (concat path abbreviations file-extension))
        (word-like-abbreviations tagger) (read-net (concat path word-like-abbreviations file-extension))
        (titles tagger) (read-net (concat path titles file-extension))
        (symbols tagger) (read-net (concat path symbols file-extension))
        (expressions tagger) (read-net (concat path expressions file-extension))
        ;;(new-wordforms tagger) (load-wordforms tagger (concat path "new-wordforms.txt"))
        ;;(names tagger) (load-wordforms tagger (concat path "new-names.txt"))
        ;; *** preliminary
        ;; Entries are either a plain suffix string or a two-element list;
        ;; presumably (variant canonical-form) -- confirm against the code
        ;; that consumes SUFFIXES.
        (suffixes tagger)
        '("avtale" "berg" "blad" "bok" "bolig" "bre" "bukt" "by" "dal" "elv"
          "film" "fjell" "fjord" "foss" "fred" "fylke" "gate" "hall" "hav" "hjem"
          "hotell" "hus" "kirke" "kommune" "krig" "kyst" "land" "lov" "løkke" "minister"
          "myr" "nes" "pakt" "park" "plass" "president" "prinsipp" "pris" "program" "protokoll"
          "roman" "sang" "sen" "senter" "serie" "seter" "sjø" "skog" "skole" "smug"
          "son" "stad" "strand" "sund" "syndrom" "teorem" "torg" ("torv" "torg") "vann" ("veg" "vei")
          "vei" "verk" "vidde" "vik" "ørken" "øy" "ås"
          "aksjon" "bevegelse" "direktorat" "forbund" "forening" "forum" "institutt" "kontor" "lag" "monopol"
          "møte" "nemnd" "organisasjon" "parti" "rett" "revisjon" "råd" "stand" "tilsyn" "utvalg"
          ("A/S" "as") "as" ("AS" "as") ("As." "as")
          "avis" "bygning" "departement" "dir." "direktør" "fond" "formann" "forsker" "gjeng" "gruppe"
          "gård" "hytte" "ingeniør" "ist" "kamp" "klubb" "koordinator" "leder" "leilighet" "list"
          "log" "lokale" "misjon" "museum" "område" "produsent" "sal" "selskap" "sjef" "spesialist"
          "styre" "bedrift" "foretak" "*institutt"))
  ;; Register one symbol named "<*SUFFIX>" (upcased, interned in CGP) per
  ;; plain string suffix. The list-valued entries are skipped here.
  (dolist (sfx (suffixes tagger))
    (when (stringp sfx)
      (setf (gethash (intern (concat "<*" (string-upcase sfx) ">") :cgp)
                     (suffix-table tagger))
            t)))
  ;; NOTE(review): all six strings are empty in this copy of the file, so
  ;; every iteration interns the same empty-named symbol. Presumably they
  ;; were angle-bracketed name tags that got lost -- confirm against the
  ;; upstream source before relying on this.
  (dolist (name-tag '("" "" "" "" "" ""))
    (setf (gethash (intern (string-upcase name-tag) :cgp)
                   (suffix-table tagger))
          t)))

:eof