;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; first shot at a finite-state language for preprocessing, normalization, and
;;; tokenization in LKB grammars. requires LKB version after 1-feb-03. note
;;; that the syntax is rigid: everything starting in column 2 (i.e. right after
;;; the rule type marker) is used as the match pattern until the first `\t'
;;; (tabulator sign); one or more tabulator signs are considered the separator
;;; between the matching pattern and the replacement, but other whitespace will
;;; be considered part of the patterns. empty lines or lines with a semicolon
;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will
;;; be ignored.
;;;
;;; rules are applied in order and, in the case of substitution rules, each sees
;;; the output of the previous iteration. token-level augmentation rules (the
;;; `+' type, for now) are different in that they add an alternative for the
;;; token but the original form remains in the input buffer for subsequent rule
;;; applications (i.e. the alternative is _not_ visible to further rules).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; preprocessor rules versioning: the `@' line below carries the CVS `$Date$'
;;; keyword, which is auto-maintained upon CVS check-in (do not edit by hand).
;;;
@$Date: 2008/01/29 18:41:12 $
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; tokenization pattern: after normalization, the string will be broken up at
;;; each occurrence of this pattern; the pattern match itself is deleted.  as
;;; written, the pattern is a run of one or more blanks or `\t' (tabulator)
;;; characters.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
:[ \t]+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; string rewrite rules: all matches, over the entire string, are replaced by
;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group
;;; references (`\1' for the first group, et al.) carry over part of the match.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Delete space following string-initial double quote: the quote is captured
;;; as group 1 and is the only thing re-emitted.
;;;
!^(\") \1
;;;
;;; pad the full string with trailing and leading whitespace; makes matches for
;;; word boundaries a little easier down the road.  the whole (non-empty)
;;; string is captured as group 1.
;;; NOTE(review): the padding blanks described above are not visible around
;;; `\1' in the replacement here --- confirm that the replacement still carries
;;; a leading and a trailing blank, since later rules match on them.
;;;
!^(.+)$ \1
;;;
;;; Cope with standard XML punctuation marks: typographic dashes, quotes, and
;;; similar characters, as well as their entity names (mdash, ndash, rsquo,
;;; rdquo, ldquo, amp, quot, apos, percnt, bull, sol, deg), are rewritten to
;;; plain equivalents.  since rules apply in order, the possessive variants
;;; (right quote followed by |s|) come before the bare right-quote rules.
;;;
!— -
!& mdash; -
!– -
!& ndash; -
!’ s 's
!& rsquo; s 's
!’ '
!& rsquo; '
;;; NOTE(review): the next rule repeats the |’ s| rule above; the bare |’| rule
;;; has already rewritten every |’| by this point, so it looks unreachable ---
;;; confirm whether a different character was intended.
!’ s 's
!” "
!& rdquo; "
!“ "
!& ldquo; "
!& &
!& amp; &
!" "
!& quot; "
!' '
!& apos; '
!% %
!& percnt; %
!•
!& bull;
!/ /
!& sol; /
!& deg; °
;;; two backquotes are treated as an opening double quote
!`` "
;;; escape sequences, stray characters, and markup tags left over from source
;;; formats.
;;; NOTE(review): several rules in this group show no replacement after the
;;; pattern; confirm that the tabulator separator and the replacement are
;;; intact in the checked-in file.
!\u0020
;;; |\rquote| escapes become an apostrophe, possessive variant first.
;;; NOTE(review): in most regular expression dialects `\r' denotes a carriage
;;; return, so the single-backslash patterns may not match a literal backslash;
;;; the double-backslash variant further down looks like the intended form ---
;;; confirm.
!\rquote s 's
!\rquote '
!\222 '
!¦
!\\rquote s 's
!� :
!� -
!� -
!∼ ~
!“ "
!” "
;;; markup tags (emph, linebrk, dotfill); |dotfill| maps to the `threedots'
;;; token.
!
!< emph >
!
!< /emph >
!
!< linebrk/ >
!< emph etype= " boldital " >
!< emph etype= " bold " >
!< dotfill/ > threedots
!Þ
!¤
;;; Correct spurious formatting character sequences: asterisk pairs followed by
;;; |h| or |f|, a double backquote before a period, and a doubled comma (which
;;; is reduced to a single one).
!\* \* h .
!\* \* f
!``. .
!, , ,
;;;
;;; separate hash sign from right-adjacent number(s): a blank is inserted
;;; between |#| and the first digit.
;;;
!([#])([0-9]) \1 \2
;;;
;;; Replace three or more dots with token 'threedots '
;;;
!\.{3,} threedots
;;;
;;; Replace separator line of equal signs (three or more) with a period.
;;;
!={3,} .
;;;
;;; replace multiple sentence-final punctuation marks with only the first one
;;; (rule currently disabled by the semicolon in column 1)
;;;
;!([?!.])[?!.]+ \1
;;;
;;; replace two dots with the token `twodots' - used for ranges as in
;;; |26 .. 62|
;;;
! \.{2} twodots
;;;
;;; Only for robust EC, VM where some items have final comma rather than period
;;; (rule currently disabled by the semicolon in column 1)
;;;
;!([,])
;;;
;;; _fix_me_
;;; at least for hyphens, we should introduce a notion of `bound' tokens,
;;; e.g. |^.| for a hyphen that was stripped off from one or more tokens:
;;; |US-led| --> |US| |^-| |led|. this way, separating hyphens as
;;; individual tokens need not create ambiguity with the parenthetical hyphen.
;;;
;;; Collapse triple-hyphen with double-hyphen (for now), and pad on both
;;; sides with whitespace.  this rule has to precede the double-hyphen rule,
;;; which would otherwise consume the first two hyphens of a triple.
!-{3} __
;;; Replace "--" with "__" to keep double-hyphen separate from single hyphen
!-{2} __
;;; Replace |«| and |»| with |"|
!(?:«|») "
;;; Replace – with -
!– -
;;; DPF 18-oct-06 - Temporarily replace |+| with |plus| for PET tokenizer bug
;;; (only a |+| that is preceded by a blank is matched)
! ([+]) plus
;;;
;;; Separate funny punctuation with whitespace on either side from the
;;; preceding and following word(s).  the first rule handles a symbol directly
;;; after a letter or digit, the second a symbol directly before one.  note
;;; that the first rule repeats its second group (`+' outside the parentheses),
;;; so for a run of symbols only the last one is available as `\2'.
;;;
!([a-zA-Z0-9])([#$%~+*\)§§])+ \1 \2
! ([#$%~+*§§])([a-zA-Z0-9]) \1 \2
;;;
;;; Eliminate spurious space preceding ordinary punctuation: when a letter,
;;; digit, or right paren is followed by a blank and then one of |;,.!)?|, the
;;; blank is dropped.  the letter class covers the full upper-case range A-Z;
;;; it used to read `A-X', so a stray blank after a word ending in |Y| or |Z|
;;; was left in place.
;;;
!([a-zA-Z0-9\)]) ([;,.!\)\?]) \1\2
;;; sample call for trying out the rules from Lisp:
;;;(lkb::preprocess "the image (Fig. 6a*) arrives." :verbose t)
;;;
;;; Eliminate any string-initial stranded punctuation (from faulty
;;; segmentation)
;;;
! ([;,.!\)\?\*]+)
;;; Eliminate space preceding double quote when space on both sides
;;; (admittedly arbitrary, and not correct for S-initial stranded quote,
;;; but this is just a patch-up for odd punctuation convention).
! (") \1
;;; Add one more hack for sentence-final double-quote, where we've already
;;; swallowed the padded final space
! (")([.?!]) \1\2
;;;
;;; Add white space to the right of squished commas and colons
;;; except for numbers on both sides (but separate e.g. |2-day| and |V-neck|)
;;; Same for periods between capital letters: "D.B. Smith", "2.Kim"
;;; Add white space on both sides for forward slash
;;;
;;; comma or colon between letters, or between a digit and a letter: blank
;;; added to the right of the punctuation mark.
!([a-zA-ZÅåØø])([,:])([a-zA-ZÅåØø]) \1\2 \3
!([0-9])([,:])([a-zA-Z]) \1\2 \3
;;; digit, hyphen, then a word of at least two letters: hyphen set off on both
;;; sides.
!([0-9])(-)([a-zA-Z][a-zA-Z]+) \1 \2 \3
! ([a-zA-ZαβΔÅåØø])([,:-])([a-zA-Z]) \1\2 \3
!([a-zA-Z])([,:])([0-9]) \1\2 \3
;;; periods between capitals; the two-period rule comes first so that both
;;; periods of e.g. |D.B.S| receive a following blank.
!([A-Z0-9])(\.)([A-Z])(\.)([A-Z]) \1\2 \3\4 \5
!([A-Z0-9])(\.)([A-Z]) \1\2 \3
;;; a single capital, slash or hyphen, and digits are replaced by the
;;; placeholder `IdentifierErsatz'; none of the three groups is carried over.
! ([A-Z])([/-])([0-9]+) IdentifierErsatz
;;; forward slash (and right arrow) between alphanumerics: blank on both sides.
!([a-zA-ZÅåØø-])(/)([a-zA-ZÅåØø]) \1 \2 \3
!([a-zA-ZÅåØø])(→)([a-zA-ZÅåØø]) \1 \2 \3
!([a-zA-ZÅåØø])(/) ([a-zA-ZÅåØø]) \1 \2 \3
!([0-9])(/)([a-zA-ZÅåØø]) \1 \2 \3
!([a-zA-ZÅåØø])(/)([0-9]) \1 \2 \3
;;; hyphenated words: the hyphen stays attached to the left-hand part and a
;;; blank follows it; the |-a-| rule treats both hyphens of such a sequence in
;;; one go.
!([a-zA-Z])(-)(a)(-)([a-zA-Z0-9ÅåØø]) \1\2 \3\4 \5
!([a-zA-ZÅåØø])(-)([a-zA-Z0-9ÅåØø]) \1\2 \3
;;; an integer, a blank, and a fraction (one or two digits, slash, one to three
;;; digits) become `FractionErsatz'; leading left parens and trailing
;;; punctuation are preserved through groups 1 and 2.
!([\(]*)[0-9]+ [0-9]{1,2}\/[0-9]{1,3}([.?!,;":\)]*) \1FractionErsatz\2
;;;
;;; Eliminate spurious space preceding right paren
;;;
! ([\)]) \1
;;;
;;; Eliminate spurious space(s) following left paren; the rule is listed twice,
;;; presumably so that a second blank is removed on the second pass.
;;;
!([\(]) \1
!([\(]) \1
;;;
;;; Parenthetical plurals - remove parens, i.e. a (possibly empty) run of
;;; letters followed by |(s)| keeps the letters and a plain |s|.
!([A-Za-z]*)\(s\) \1s
;;;
;;; Add white space to left of ( and [ when directly preceded by a letter,
;;; digit, or period
;;;
!([a-zA-Z0-9.])([\(\[]) \1 \2
;;;
;;; Add white space around colon if sandwiched with following alphanumeric
;;; (but not e.g. |http://...| and not ratios with numbers on both sides):
;;; at least one side of the colon has to be a letter (or, on the left, a
;;; period), so digit-colon-digit is left alone.
;;;
!([a-zA-Z0-9.])([:])([a-zA-Z]) \1 \2 \3
!([a-zA-Z.])([:])([a-zA-Z0-9]) \1 \2 \3
;;; And add white space to the left of colon when followed by white space:
!([a-zA-Z0-9.åø])([:]) \1 \2
;;; And the variants with surrounding punctuation: comma or slash between
;;; single alphanumerics that are directly followed by one of |.?!,;|
!([a-zA-Z])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4
!([0-9])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4
!([a-zA-Z])([,/])([0-9])([.?!,;]) \1\2 \3\4
;;;
;;; For now, simplify punctuation clusters found in subordinate quoted Ss, as
;;; in "Who arrived?, she asked" since current suffixing machinery doesn't
;;; produce result. Also for "... two hrs., ..." FIX
;;; (the |.|, |?|, or |!| before the comma is dropped; the comma is kept)
!([a-zA-Z0-9])[.?!](,) \1\2
;;; Also simplify awkward clause-final clusters: a colon before |.| or |?| is
;;; dropped, as is a colon right after a double quote or right paren.
!([:])([.?]) \2
!(["\)])([:]) \1
;;;
;;; apostrophes are a bit tricky: generally, we want to separate leading and
;;; trailing single quotes from adjacent word material, so that they become a
;;; separate token (e.g. |abrams'| --> |abrams '|); the possessive |'s|, on the
;;; other hand, we want to separate but then consider a single token.
;;;
;;; note that the possessive rules emit a lower-case |'s| even when the input
;;; has |'S|, since the |s| is matched by a character class outside any group.
;;;
!([sS])' \1 '
!([^ ])'[sS] \1 's
!([^ ])'[sS]([.?!,;"]) \1 's\2
;;;
;;; contracted auxiliaries: separate contracted part from preceding word.
;;; lower-case and upper-case forms are listed separately so that the case of
;;; the contracted part is preserved in the output.
;;;
!([^ ])'ll \1 'll
!([^ ])'d \1 'd
!([^ ])'ve \1 've
!([^ ])'m \1 'm
!([^ ])'re \1 're
!([^ ])'LL \1 'LL
!([^ ])'D \1 'D
!([^ ])'VE \1 'VE
!([^ ])'M \1 'M
!([^ ])'RE \1 'RE
;;; Remove space after initial "O'" and "L'" (the latter in either case)
! O' O'
! ([lL])' \1'
;;;
;;; Experimental: mark capitalization with preceding special character |_|
;;; but right now only for single letters used as proper names.
;;; Add special case for sequence of two capitals separated by space, since
;;; the space after the first one gets consumed by the simple rule.
;;; Exclude "I" since it's so frequent as pronoun (hence the class |A-HJ-Z|).
;;; an optional punctuation mark right after the letter is carried over; the
;;; single-letter rule also allows an optional left paren or double quote
;;; before, and double quote or right paren after, the letter.
;;;
! ([A-HJ-Z])([.?!,;:-]?) ([A-HJ-Z])([.?!,;:-]?) _\1\2 _\3\4
! ([\("]?)([A-HJ-Z])(["\)]?)([.?!,;:-]?) \1_\2\3\4
;;;
;;; Try correcting squished compounds, here just by listing
;;;
!backcountry back country
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; file inclusion: there is an ad hoc set of `spell correction' rules for the
;;; static ecommerce data sets which we want to keep in a separate file.
;;; NOTE(review): no inclusion directive is visible below this banner; the
;;; rules that follow substitute `WebErsatz' and `EmailErsatz' placeholders.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;?([.?!,;":\)]*) \1WebErsatz\2
;;; in each rule, group 1 keeps leading left parens and group 2 keeps trailing
;;; punctuation around the placeholder.
;;; NOTE(review): `^' in column 1 is not one of the rule type markers used
;;; elsewhere in this file (`@', `:', `!', `+'); confirm that the intended type
;;; marker was not lost.
;;; NOTE(review): the first rule has nothing between its two groups, i.e. no
;;; part that would actually match a URL, and both groups may match the empty
;;; string --- confirm against the intended pattern before relying on it.
^([\(]*)?([.?!,;":\)]*) \1WebErsatz\2
;;; two or three dot-separated alphabetic parts of two or more letters each,
;;; optionally followed by |>|
^([\(]*)[a-zA-Z]{2,}\.[a-zA-Z]{2,}>?([.?!,;":\)]*) \1WebErsatz\2
^([\(]*)[a-zA-Z]{2,}\.[a-zA-Z]{2,}\.[a-zA-Z]{2,}>?([.?!,;":\)]*) \1WebErsatz\2
;;; local part, |@|, and domain part, optionally followed by |>|
^([\(]*)[a-zA-Z0-9_\.]{2,}@[a-zA-Z0-9._-]{2,}>?([.?!,;":\)]*) \1EmailErsatz\2
;;;
;;; reduced year names; possibly another case where, in full generality, we
;;; would have to be able to strip off the leading apostrophe first and later,
;;; in the token-level part, introduce a tokenization alternative, re-uniting
;;; the apostrophe and two-digit year.
;;; (for now, an apostrophe plus two digits is replaced by `YearErsatz', with
;;; leading left parens and trailing punctuation preserved)
;;;
^([\(]*)'[0-9][0-9]([.?!,;":\)]*) \1YearErsatz\2
;;; Range of years, as in |1970-75|: three or four digits, hyphen, two to four
;;; digits
^([\(]*)[0-9]{3,4}-[0-9]{2,4}([.?!,;":\)]*) \1YearErsatz\2
;;; Also add special treatment for two-letter abbreviations like
;;; OR (Oregon), IN (Indiana), CO (Colorado), US, and IT
;;; these are `+' (augmentation) rules: the |_|-prefixed form is added as an
;;; alternative while the original token stays in place.
;;; NOTE(review): the replacements do not reference `\1' or `\2', so left
;;; parens and trailing punctuation matched by the groups are not carried into
;;; the alternative --- confirm that this is intended.
+([\(]*)OR([.?!,;":\)]*) _OR
+([\(]*)IN([.?!,;":\)]*) _IN
+([\(]*)CO([.?!,;":\)]*) _CO
+([\(]*)US([.?!,;":\)]*) _US
+([\(]*)IT([.?!,;":\)]*) _IT
;;; Similarly for ON, OFF, as in "the ON switch"
+([\(]*)ON([.?!,;":\)]*) _ON
+([\(]*)OFF([.?!,;":\)]*) _OFF
;;; (the following two rules are disabled by the semicolon in column 1)
;+([^:]+): \1
;| _:
;;; Replace left and right angle marks with variant, to avoid communication
;;; troubles among preprocessor, [incr tsdb()], and PET.  only an angle mark
;;; that is preceded by a blank is matched; it is replaced by the word
;;; `leftangle' or `rightangle', respectively.
;;;
! [<] leftangle
! [>] rightangle