;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; first shot at a finite-state language for preprocessing, normalization, and ;;; tokenization in LKB grammars. requires LKB version after 1-feb-03. note ;;; that the syntax is rigid: everything starting in column 2 (i.e. right after ;;; the rule type marker) is used as the match pattern until the first `\t' ;;; (tabulator sign); one or more tabulator signs are considered the separator ;;; between the matching pattern and the replacement, but other whitespace will ;;; be considered part of the patterns. empty lines or lines with a semicolon ;;; in column 1 (i.e. in place of the rule type marker, this is not Lisp) will ;;; be ignored. ;;; ;;; rules are applied in order and, in the case of substitution rules, each sees ;;; the output of the previous iteration. token-level augmentation rules (the ;;; `+' type, for now) are different in that they add an alternative for the ;;; token but the original form remains in the input buffer for subsequent rule ;;; applications (i.e. the alternative is _not_ visible to further rules). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; preprocessor rules versioning; auto-maintained upon CVS check-in. ;;; @$Date: 2008/01/29 18:41:12 $ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; tokenization pattern: after normalization, the string will be broken up at ;;; each occurrence of this pattern; the pattern match itself is deleted. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; :[ \t]+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; string rewrite rules: all matches, over the entire string, are replaced by ;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group ;;; references (`\1' for the first group, et al.) 
carry over part of the match. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Delete space following string-initial double quote ;;; !^(\") \1 ;;; ;;; pad the full string with trailing and leading whitespace; makes matches for ;;; word boundaries a little easier down the road. ;;; !^(.+)$ \1 ;;; ;;; Cope with standard XML punctuation marks ;;; !— - !& mdash; - !– - !& ndash; - !’ s 's !& rsquo; s 's !’ ' !& rsquo; ' !’ s 's !” " !& rdquo; " !“ " !& ldquo; " !& & !& amp; & !" " !& quot; " !' ' !& apos; ' !% % !& percnt; % !• !& bull; !/ / !& sol; / !& deg; ° !`` " !\u0020 !\rquote s 's !\rquote ' !\222 ' !¦ !\\rquote s 's !� : !� - !� - !∼ ~ !“ " !” " ! !< emph > ! !< /emph > ! !< linebrk/ > !< emph etype= " boldital " > !< emph etype= " bold " > !< dotfill/ > threedots !Þ !¤ ;;; Correct spurious formatting character sequences !\* \* h . !\* \* f !``. . !, , , ;;; ;;; separate hash sign from right-adjacent number(s) ;;; !([#])([0-9]) \1 \2 ;;; ;;; Replace three or more dots with token 'threedots ' ;;; !\.{3,} threedots ;;; Replace separator line of equal signs with a period. ;;; !={3,} . ;;; ;;; replace multiple sentence-final punctuation marks with only the first one ;;; ;!([?!.])[?!.]+ \1 ;;; ;;; replace two dots with token 'twodots' - used for ranges as in |26 .. 62| ;;; ! \.{2} twodots ;;; ;;; Only for robust EC, VM where some items have final comma rather than period ;;; ;!([,]) ;;; ;;; _fix_me_ ;;; at least for hyphens, we should introduce a notion of `bound' tokens, ;;; e.g. |^.| for a hyphen that was stripped off from one or more tokens: ;;; |US-led| --> |US| |^-| |led|. 
this way, separating hyphens as ;;; individual tokens need not create ambiguity with the parenthetical hyphen. ;;; Collapse triple-hyphen with double-hyphen (for now), and pad on both ;;; sides with whitespace !-{3} __ ;;; Replace "--" with "__" to keep double-hyphen separate from single hyphen !-{2} __ ;;; Replace |«| and |»| with |"| !(?:«|») " ;;; Replace – with - !– - ;;; DPF 18-oct-06 - Temporarily replace |+| with |plus| for PET tokenizer bug ! ([+]) plus ;;; ;;; Separate funny punctuation with whitespace on either side from the ;;; preceding and following word(s). ;;; !([a-zA-Z0-9])([#$%~+*\)§§])+ \1 \2 ! ([#$%~+*§§])([a-zA-Z0-9]) \1 \2 ;;; ;;; Eliminate spurious space preceding ordinary punctuation ;;; NOTE(review): the |A-X| range in the rule below omits |Y| and |Z|; presumably |A-Z| was intended - confirm ;;; !([a-zA-X0-9\)]) ([;,.!\)\?]) \1\2 ;;;(lkb::preprocess "the image (Fig. 6a*) arrives." :verbose t) ;;; Eliminate any string-initial stranded punctuation (from faulty segmentation) ;;; ! ([;,.!\)\?\*]+) ;;; Eliminate space preceding double quote when space on both sides ;;; (admittedly arbitrary, and not correct for S-initial stranded quote, ;;; but this is just a patch-up for odd punctuation convention). ! (") \1 ;;; Add one more hack for sentence-final double-quote, where we've already ;;; swallowed the padded final space ! (")([.?!]) \1\2 ;;; ;;; Add white space to the right of squished commas and colons ;;; except for numbers on both sides (but separate e.g. |2-day| and |V-neck|) ;;; Same for periods between capital letters: "D.B. Smith", "2.Kim" ;;; Add white space on both sides for forward slash ;;; !([a-zA-ZÅåØø])([,:])([a-zA-ZÅåØø]) \1\2 \3 !([0-9])([,:])([a-zA-Z]) \1\2 \3 !([0-9])(-)([a-zA-Z][a-zA-Z]+) \1 \2 \3 ! ([a-zA-ZαβΔÅåØø])([,:-])([a-zA-Z]) \1\2 \3 !([a-zA-Z])([,:])([0-9]) \1\2 \3 !([A-Z0-9])(\.)([A-Z])(\.)([A-Z]) \1\2 \3\4 \5 !([A-Z0-9])(\.)([A-Z]) \1\2 \3 ! 
([A-Z])([/-])([0-9]+) IdentifierErsatz !([a-zA-ZÅåØø-])(/)([a-zA-ZÅåØø]) \1 \2 \3 !([a-zA-ZÅåØø])(→)([a-zA-ZÅåØø]) \1 \2 \3 !([a-zA-ZÅåØø])(/) ([a-zA-ZÅåØø]) \1 \2 \3 !([0-9])(/)([a-zA-ZÅåØø]) \1 \2 \3 !([a-zA-ZÅåØø])(/)([0-9]) \1 \2 \3 !([a-zA-Z])(-)(a)(-)([a-zA-Z0-9ÅåØø]) \1\2 \3\4 \5 !([a-zA-ZÅåØø])(-)([a-zA-Z0-9ÅåØø]) \1\2 \3 !([$]*)[0-9]+ [0-9]{1,2}\/[0-9]{1,3}([.?!,;":$]*) \1FractionErsatz\2 ;;; ;;; Eliminate spurious space preceding right paren ;;; ! ([\)]) \1 ;;; ;;; Eliminate spurious space(s) following left paren ;;; !([$]) \1 !([\(]) \1 ;;; ;;; Parenthetical plurals - remove parens !([A-Za-z]*)\(s$ \1s ;;; ;;; Add white space to left of ( and [ ;;; !([a-zA-Z0-9.])([$\[]) \1 \2 ;;; ;;; Add white space around colon if sandwiched with following alphanumeric ;;; (but not e.g. |http://...| and not ratios with numbers on both sides) ;;; !([a-zA-Z0-9.])([:])([a-zA-Z]) \1 \2 \3 !([a-zA-Z.])([:])([a-zA-Z0-9]) \1 \2 \3 ;;; And add white space to the left of colon when followed by white space: !([a-zA-Z0-9.åø])([:]) \1 \2 ;;; And the variants with surrounding punctuation !([a-zA-Z])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([0-9])([,/])([a-zA-Z])([.?!,;]) \1\2 \3\4 !([a-zA-Z])([,/])([0-9])([.?!,;]) \1\2 \3\4 ;;; ;;; For now, simplify punctuation clusters found in subordinate quoted Ss, as ;;; in "Who arrived?, she asked" since current suffixing machinery doesn't ;;; produce a result. Also for "... two hrs., ..." FIX !([a-zA-Z0-9])[.?!](,) \1\2 ;;; Also simplify awkward clause-final clusters !([:])([.?]) \2 !(["$])([:]) \1 ;;; ;;; apostrophes are a bit tricky: generally, we want to separate leading and ;;; trailing single quotes from adjacent word material, so that they become a ;;; separate token (e.g. |abrams'| --> |abrams '|); the possessive |'s|, on the ;;; other hand, we want to separate but then consider a single token. ;;; !([sS])' \1 ' !([^ ])'[sS] \1 's !([^ ])'[sS]([.?!,;"]) \1 's\2 ;;; ;;; contracted auxiliaries: separate contracted part from preceding word. 
;;; !([^ ])'ll \1 'll !([^ ])'d \1 'd !([^ ])'ve \1 've !([^ ])'m \1 'm !([^ ])'re \1 're !([^ ])'LL \1 'LL !([^ ])'D \1 'D !([^ ])'VE \1 'VE !([^ ])'M \1 'M !([^ ])'RE \1 'RE ;;; Remove space after initial "O'" and "L'" ! O' O' ! ([lL])' \1' ;;; ;;; Experimental: mark capitalization with preceding special character |_| ;;; but right now only for single letters used as proper names. ;;; Add special case for sequence of two capitals separated by space, since ;;; the space after the first one gets consumed by the simple rule. ;;; Exclude "I" since it's so frequent as a pronoun ;;; ! ([A-HJ-Z])([.?!,;:-]?) ([A-HJ-Z])([.?!,;:-]?) _\1\2 _\3\4 ! ([$"]?)([A-HJ-Z])(["$]?)([.?!,;:-]?) \1_\2\3\4 ;;; ;;; Try correcting squished compounds, here just by listing ;;; !backcountry back country ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; file inclusion: there is an ad hoc set of `spell correction' rules for the ;;; static ecommerce data sets which we want to keep in a separate file. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;?([.?!,;":\)]*) \1WebErsatz\2 ^([$]*)?([.?!,;":$]*) \1WebErsatz\2 ^([$]*)?([.?!,;":$]*) \1WebErsatz\2 ^([$]*)?([.?!,;":$]*) \1WebErsatz\2 ^([$]*)?([.?!,;":$]*) \1EmailErsatz\2 ;;; ;;; reduced year names; possibly another case where, in full generality, we ;;; would have to be able to strip off the leading apostrophe first and later, ;;; in the token-level part, introduce a tokenization alternative, re-uniting ;;; the apostrophe and two-digit year. 
;;; ^([$]*)'[0-9][0-9]([.?!,;":$]*) \1YearErsatz\2 ;;; Range of years, as in |1970-75| ^([$]*)[0-9]{3,4}-[0-9]{2,4}([.?!,;":$]*) \1YearErsatz\2 ;;; Also add special treatment for two-letter abbreviations like ;;; OR (Oregon), IN (Indiana), CO (Colorado), US, and IT +([$]*)OR([.?!,;":$]*) _OR +([$]*)IN([.?!,;":$]*) _IN +([$]*)CO([.?!,;":$]*) _CO +([$]*)US([.?!,;":$]*) _US +([$]*)IT([.?!,;":$]*) _IT ;;; Similarly for ON, OFF, as in "the ON switch" +([$]*)ON([.?!,;":$]*) _ON +([$]*)OFF([.?!,;":$]*) _OFF ;+([^:]+): \1 ;| _: ;;; Replace left and right angle marks with the tokens |leftangle| and ;;; |rightangle|, to avoid communication troubles among the preprocessor, ;;; [incr tsdb()], and PET. ;;; ! [<] leftangle ! [>] rightangle