/*
converter for HTML-entities
HTML-entities from the input stream are converted
to characters in UTF-8 or ISO-8859-1 (see options)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA.
Written 2004
by Sebastian Nagel, CIS Uni München */
%option caseful 8bit batch noyywrap
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Home-grown boolean type together with its two truth values. */
typedef unsigned int BOOL;
enum { FALSE = 0, TRUE = 1 };

/* Output encodings that can be stored in option.ENCODING. */
enum { ISO_LATIN_1 = 1, UTF8 };

/* Run-time settings of the converter; the defaults are given below. */
struct option {
  BOOL QUIET;              /* TRUE: no diagnostics on stderr */
  unsigned char ENCODING;  /* ISO_LATIN_1 or UTF8 */
  BOOL FORCE;              /* TRUE: drop the '&' of an unrecognized entity */
  BOOL Replace;            /* TRUE: emit rChar for an unrecognized entity */
  unsigned int rChar;      /* replacement character (code point) */
  BOOL replace;            /* TRUE: emit rString for an unrecognized entity */
  char* rString;           /* replacement string */
};

struct option option = {
  .QUIET    = FALSE,   /* report errors */
  .ENCODING = UTF8,    /* default output encoding */
  .FORCE    = FALSE,   /* keep the '&' of unrecognized entities */
  .Replace  = FALSE,   /* no replacement by character */
  .rChar    = 0x3f,    /* question mark */
  .replace  = FALSE,   /* no replacement by string */
  .rString  = "???"
};

/* Buffer for the value of a numerical entity. */
unsigned int num;
/*
 * Encode the Unicode code point `num` as UTF-8.
 *
 * num: code point to encode.  Callers hand over unsigned values, so a
 *      negative `num` stands for a value above INT_MAX and is treated as
 *      out of range instead of being emitted as a single raw byte.
 * out: destination buffer; receives at most four bytes plus the
 *      terminating NUL, i.e. it must provide room for at least 5 bytes.
 *
 * Returns `out`.  For values that have no UTF-8 encoding (the surrogate
 * range 0xd800-0xdfff and everything above 0x10ffff) `out` is set to the
 * empty string and, unless option.QUIET is set, a diagnostic naming the
 * current token (yytext) is written to stderr.  Code point 0 also yields
 * the empty string, because its encoding is the NUL byte itself.
 */
inline static char*
num_to_utf8 (int num, char* out)
{
  /* Reinterpret as unsigned so that "negative" inputs become huge values
     and fall into the out-of-range branch. */
  const unsigned int cp = (unsigned int) num;
  int i = 0;
  if (cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff)) {
    /* beyond the Unicode range, or a surrogate that UTF-8 must not encode */
    if (option.QUIET != TRUE)
      fprintf(stderr, "Char 0x%x (\"%s\") not defined in unicode!\n", cp, yytext);
  } else if (cp <= 0x7f) {
    out[i++] = (char)(cp);
  } else if (cp <= 0x7ff) {
    /* 00000000 00000000 00000bbb bbaaaaaa ==> 110bbbbb 10aaaaaa */
    out[i++] = (char)(0xc0 | (cp >> 6));
    out[i++] = (char)(0x80 | (cp & 0x3f));
  } else if (cp <= 0xffff) {
    /* 00000000 00000000 ccccbbbb bbaaaaaa ==> 1110cccc 10bbbbbb 10aaaaaa */
    out[i++] = (char)(0xe0 | (cp >> 12));
    out[i++] = (char)(0x80 | ((cp >> 6) & 0x3f));
    out[i++] = (char)(0x80 | (cp & 0x3f));
  } else {
    /* 00000000 000dddcc ccccbbbb bbaaaaaa ==> 11110ddd 10cccccc 10bbbbbb 10aaaaaa */
    out[i++] = (char)(0xf0 | (cp >> 18));
    out[i++] = (char)(0x80 | ((cp >> 12) & 0x3f));
    out[i++] = (char)(0x80 | ((cp >> 6) & 0x3f));
    out[i++] = (char)(0x80 | (cp & 0x3f));
  }
  out[i] = '\0';
  return out;
}
/*
 * Encode the code point `num` as a single ISO-8859-1 byte.
 *
 * num: code point to encode.  Callers hand over unsigned values, so a
 *      negative `num` stands for a value above INT_MAX and is rejected
 *      rather than truncated to its low byte.
 * out: destination buffer with room for at least 2 bytes.
 *
 * Returns `out`.  If the value is outside 0x00-0xff, `out` is set to the
 * empty string and, unless option.QUIET is set, a diagnostic naming the
 * current token (yytext) is written to stderr.  Code point 0 yields the
 * empty string as well.
 */
inline static char*
num_to_iso_latin_1 (int num, char* out)
{
  const unsigned int cp = (unsigned int) num;
  int i = 0;
  if (cp <= 0xff) {
    out[i++] = (char)(cp);
  } else if (option.QUIET != TRUE) {
    fprintf(stderr, "Char 0x%x (\"%s\") doesn't exist in ISO-8859-1: use UTF-8!\n", cp, yytext);
  }
  out[i] = '\0';
  return out;
}
/*
 * Write the character with code point `num` to yyout, encoded according
 * to option.ENCODING.
 *
 * Nothing is written when the conversion yields the empty string; the
 * conversion functions then have already reported the problem (unless
 * option.QUIET is set).
 */
inline static void
out (unsigned int num)
{
  /* Start with an empty string so that an unexpected ENCODING value
     writes nothing instead of reading an uninitialized buffer. */
  char buf[8] = "";
  if ( option.ENCODING == UTF8 )
    num_to_utf8(num, buf);
  else if ( option.ENCODING == ISO_LATIN_1 )
    num_to_iso_latin_1(num, buf);
  fputs(buf, yyout);
}
%}
%%
  /* Named character references of HTML 4: every rule matches one entity
     literally (case-sensitive) and writes the character with the given
     code point through out(). */
"&quot;"      out(0x22); /* quotation mark = APL quote */
"&amp;"       out(0x26); /* ampersand */
"&lt;"        out(0x3c); /* less-than sign */
"&gt;"        out(0x3e); /* greater-than sign */
"&nbsp;"      out(0xa0); /* no-break space = non-breaking space */
"&iexcl;"     out(0xa1); /* inverted exclamation mark */
"&cent;"      out(0xa2); /* cent sign */
"&pound;"     out(0xa3); /* pound sign */
"&curren;"    out(0xa4); /* currency sign */
"&yen;"       out(0xa5); /* yen sign = yuan sign */
"&brvbar;"    out(0xa6); /* broken bar = broken vertical bar */
"&sect;"      out(0xa7); /* section sign */
"&uml;"       out(0xa8); /* diaeresis = spacing diaeresis */
"&copy;"      out(0xa9); /* copyright sign */
"&ordf;"      out(0xaa); /* feminine ordinal indicator */
"&laquo;"     out(0xab); /* left-pointing double angle quotation mark */
"&not;"       out(0xac); /* not sign */
"&shy;"       out(0xad); /* soft hyphen = discretionary hyphen */
"&reg;"       out(0xae); /* registered sign = registered trade mark sign */
"&macr;"      out(0xaf); /* macron = spacing macron = overline */
"&deg;"       out(0xb0); /* degree sign */
"&plusmn;"    out(0xb1); /* plus-minus sign = plus-or-minus sign */
"&sup2;"      out(0xb2); /* superscript two = superscript digit two */
"&sup3;"      out(0xb3); /* superscript three = superscript digit three */
"&acute;"     out(0xb4); /* acute accent = spacing acute */
"&micro;"     out(0xb5); /* micro sign */
"&para;"      out(0xb6); /* pilcrow sign = paragraph sign */
"&middot;"    out(0xb7); /* middle dot = Georgian comma */
"&cedil;"     out(0xb8); /* cedilla = spacing cedilla */
"&sup1;"      out(0xb9); /* superscript one = superscript digit one */
"&ordm;"      out(0xba); /* masculine ordinal indicator */
"&raquo;"     out(0xbb); /* right-pointing double angle quotation mark */
"&frac14;"    out(0xbc); /* vulgar fraction one quarter */
"&frac12;"    out(0xbd); /* vulgar fraction one half */
"&frac34;"    out(0xbe); /* vulgar fraction three quarters */
"&iquest;"    out(0xbf); /* inverted question mark */
"&Agrave;"    out(0xc0); /* latin capital letter A with grave */
"&Aacute;"    out(0xc1); /* latin capital letter A with acute */
"&Acirc;"     out(0xc2); /* latin capital letter A with circumflex */
"&Atilde;"    out(0xc3); /* latin capital letter A with tilde */
"&Auml;"      out(0xc4); /* latin capital letter A with diaeresis */
"&Aring;"     out(0xc5); /* latin capital letter A with ring above */
"&AElig;"     out(0xc6); /* latin capital letter AE */
"&Ccedil;"    out(0xc7); /* latin capital letter C with cedilla */
"&Egrave;"    out(0xc8); /* latin capital letter E with grave */
"&Eacute;"    out(0xc9); /* latin capital letter E with acute */
"&Ecirc;"     out(0xca); /* latin capital letter E with circumflex */
"&Euml;"      out(0xcb); /* latin capital letter E with diaeresis */
"&Igrave;"    out(0xcc); /* latin capital letter I with grave */
"&Iacute;"    out(0xcd); /* latin capital letter I with acute */
"&Icirc;"     out(0xce); /* latin capital letter I with circumflex */
"&Iuml;"      out(0xcf); /* latin capital letter I with diaeresis */
"&ETH;"       out(0xd0); /* latin capital letter ETH */
"&Ntilde;"    out(0xd1); /* latin capital letter N with tilde */
"&Ograve;"    out(0xd2); /* latin capital letter O with grave */
"&Oacute;"    out(0xd3); /* latin capital letter O with acute */
"&Ocirc;"     out(0xd4); /* latin capital letter O with circumflex */
"&Otilde;"    out(0xd5); /* latin capital letter O with tilde */
"&Ouml;"      out(0xd6); /* latin capital letter O with diaeresis */
"&times;"     out(0xd7); /* multiplication sign */
"&Oslash;"    out(0xd8); /* latin capital letter O with stroke */
"&Ugrave;"    out(0xd9); /* latin capital letter U with grave */
"&Uacute;"    out(0xda); /* latin capital letter U with acute */
"&Ucirc;"     out(0xdb); /* latin capital letter U with circumflex */
"&Uuml;"      out(0xdc); /* latin capital letter U with diaeresis */
"&Yacute;"    out(0xdd); /* latin capital letter Y with acute */
"&THORN;"     out(0xde); /* latin capital letter THORN */
"&szlig;"     out(0xdf); /* latin small letter sharp s = ess-zed */
"&agrave;"    out(0xe0); /* latin small letter a with grave */
"&aacute;"    out(0xe1); /* latin small letter a with acute */
"&acirc;"     out(0xe2); /* latin small letter a with circumflex */
"&atilde;"    out(0xe3); /* latin small letter a with tilde */
"&auml;"      out(0xe4); /* latin small letter a with diaeresis */
"&aring;"     out(0xe5); /* latin small letter a with ring above */
"&aelig;"     out(0xe6); /* latin small letter ae */
"&ccedil;"    out(0xe7); /* latin small letter c with cedilla */
"&egrave;"    out(0xe8); /* latin small letter e with grave */
"&eacute;"    out(0xe9); /* latin small letter e with acute */
"&ecirc;"     out(0xea); /* latin small letter e with circumflex */
"&euml;"      out(0xeb); /* latin small letter e with diaeresis */
"&igrave;"    out(0xec); /* latin small letter i with grave */
"&iacute;"    out(0xed); /* latin small letter i with acute */
"&icirc;"     out(0xee); /* latin small letter i with circumflex */
"&iuml;"      out(0xef); /* latin small letter i with diaeresis */
"&eth;"       out(0xf0); /* latin small letter eth */
"&ntilde;"    out(0xf1); /* latin small letter n with tilde */
"&ograve;"    out(0xf2); /* latin small letter o with grave */
"&oacute;"    out(0xf3); /* latin small letter o with acute */
"&ocirc;"     out(0xf4); /* latin small letter o with circumflex */
"&otilde;"    out(0xf5); /* latin small letter o with tilde */
"&ouml;"      out(0xf6); /* latin small letter o with diaeresis */
"&divide;"    out(0xf7); /* division sign */
"&oslash;"    out(0xf8); /* latin small letter o with stroke */
"&ugrave;"    out(0xf9); /* latin small letter u with grave */
"&uacute;"    out(0xfa); /* latin small letter u with acute */
"&ucirc;"     out(0xfb); /* latin small letter u with circumflex */
"&uuml;"      out(0xfc); /* latin small letter u with diaeresis */
"&yacute;"    out(0xfd); /* latin small letter y with acute */
"&thorn;"     out(0xfe); /* latin small letter thorn */
"&yuml;"      out(0xff); /* latin small letter y with diaeresis */
"&fnof;"      out(0x192); /* latin small f with hook = function */
"&Alpha;"     out(0x391); /* greek capital letter alpha */
"&Beta;"      out(0x392); /* greek capital letter beta */
"&Gamma;"     out(0x393); /* greek capital letter gamma */
"&Delta;"     out(0x394); /* greek capital letter delta */
"&Epsilon;"   out(0x395); /* greek capital letter epsilon */
"&Zeta;"      out(0x396); /* greek capital letter zeta */
"&Eta;"       out(0x397); /* greek capital letter eta */
"&Theta;"     out(0x398); /* greek capital letter theta */
"&Iota;"      out(0x399); /* greek capital letter iota */
"&Kappa;"     out(0x39a); /* greek capital letter kappa */
"&Lambda;"    out(0x39b); /* greek capital letter lambda */
"&Mu;"        out(0x39c); /* greek capital letter mu */
"&Nu;"        out(0x39d); /* greek capital letter nu */
"&Xi;"        out(0x39e); /* greek capital letter xi */
"&Omicron;"   out(0x39f); /* greek capital letter omicron */
"&Pi;"        out(0x3a0); /* greek capital letter pi */
"&Rho;"       out(0x3a1); /* greek capital letter rho */
"&Sigma;"     out(0x3a3); /* greek capital letter sigma */
"&Tau;"       out(0x3a4); /* greek capital letter tau */
"&Upsilon;"   out(0x3a5); /* greek capital letter upsilon */
"&Phi;"       out(0x3a6); /* greek capital letter phi */
"&Chi;"       out(0x3a7); /* greek capital letter chi */
"&Psi;"       out(0x3a8); /* greek capital letter psi */
"&Omega;"     out(0x3a9); /* greek capital letter omega */
"&alpha;"     out(0x3b1); /* greek small letter alpha */
"&beta;"      out(0x3b2); /* greek small letter beta */
"&gamma;"     out(0x3b3); /* greek small letter gamma */
"&delta;"     out(0x3b4); /* greek small letter delta */
"&epsilon;"   out(0x3b5); /* greek small letter epsilon */
"&zeta;"      out(0x3b6); /* greek small letter zeta */
"&eta;"       out(0x3b7); /* greek small letter eta */
"&theta;"     out(0x3b8); /* greek small letter theta */
"&iota;"      out(0x3b9); /* greek small letter iota */
"&kappa;"     out(0x3ba); /* greek small letter kappa */
"&lambda;"    out(0x3bb); /* greek small letter lambda */
"&mu;"        out(0x3bc); /* greek small letter mu */
"&nu;"        out(0x3bd); /* greek small letter nu */
"&xi;"        out(0x3be); /* greek small letter xi */
"&omicron;"   out(0x3bf); /* greek small letter omicron */
"&pi;"        out(0x3c0); /* greek small letter pi */
"&rho;"       out(0x3c1); /* greek small letter rho */
"&sigmaf;"    out(0x3c2); /* greek small letter final sigma */
"&sigma;"     out(0x3c3); /* greek small letter sigma */
"&tau;"       out(0x3c4); /* greek small letter tau */
"&upsilon;"   out(0x3c5); /* greek small letter upsilon */
"&phi;"       out(0x3c6); /* greek small letter phi */
"&chi;"       out(0x3c7); /* greek small letter chi */
"&psi;"       out(0x3c8); /* greek small letter psi */
"&omega;"     out(0x3c9); /* greek small letter omega */
"&thetasym;"  out(0x3d1); /* greek small letter theta symbol */
"&upsih;"     out(0x3d2); /* greek upsilon with hook symbol */
"&piv;"       out(0x3d6); /* greek pi symbol */
"&bull;"      out(0x2022); /* bullet = black small circle */
"&hellip;"    out(0x2026); /* horizontal ellipsis = three dot leader */
"&prime;"     out(0x2032); /* prime = minutes = feet */
"&Prime;"     out(0x2033); /* double prime = seconds = inches */
"&oline;"     out(0x203e); /* overline = spacing overscore */
"&frasl;"     out(0x2044); /* fraction slash */
"&weierp;"    out(0x2118); /* script capital P = power set */
"&image;"     out(0x2111); /* blackletter capital I = imaginary part */
"&real;"      out(0x211c); /* blackletter capital R = real part symbol */
"&trade;"     out(0x2122); /* trade mark sign */
"&alefsym;"   out(0x2135); /* alef symbol = first transfinite cardinal */
"&larr;"      out(0x2190); /* leftwards arrow */
"&uarr;"      out(0x2191); /* upwards arrow */
"&rarr;"      out(0x2192); /* rightwards arrow */
"&darr;"      out(0x2193); /* downwards arrow */
"&harr;"      out(0x2194); /* left right arrow */
"&crarr;"     out(0x21b5); /* downwards arrow with corner leftwards */
"&lArr;"      out(0x21d0); /* leftwards double arrow */
"&uArr;"      out(0x21d1); /* upwards double arrow */
"&rArr;"      out(0x21d2); /* rightwards double arrow */
"&dArr;"      out(0x21d3); /* downwards double arrow */
"&hArr;"      out(0x21d4); /* left right double arrow */
"&forall;"    out(0x2200); /* for all */
"&part;"      out(0x2202); /* partial differential */
"&exist;"     out(0x2203); /* there exists */
"&empty;"     out(0x2205); /* empty set = null set = diameter */
"&nabla;"     out(0x2207); /* nabla = backward difference */
"&isin;"      out(0x2208); /* element of */
"&notin;"     out(0x2209); /* not an element of */
"&ni;"        out(0x220b); /* contains as member */
"&prod;"      out(0x220f); /* n-ary product = product sign */
"&sum;"       out(0x2211); /* n-ary summation */
"&minus;"     out(0x2212); /* minus sign */
"&lowast;"    out(0x2217); /* asterisk operator */
"&radic;"     out(0x221a); /* square root = radical sign */
"&prop;"      out(0x221d); /* proportional to */
"&infin;"     out(0x221e); /* infinity */
"&ang;"       out(0x2220); /* angle */
"&and;"       out(0x2227); /* logical and = wedge */
"&or;"        out(0x2228); /* logical or = vee */
"&cap;"       out(0x2229); /* intersection = cap */
"&cup;"       out(0x222a); /* union = cup */
"&int;"       out(0x222b); /* integral */
"&there4;"    out(0x2234); /* therefore */
"&sim;"       out(0x223c); /* tilde operator = varies with = similar to */
"&cong;"      out(0x2245); /* approximately equal to */
"&asymp;"     out(0x2248); /* almost equal to = asymptotic to */
"&ne;"        out(0x2260); /* not equal to */
"&equiv;"     out(0x2261); /* identical to */
"&le;"        out(0x2264); /* less-than or equal to */
"&ge;"        out(0x2265); /* greater-than or equal to */
"&sub;"       out(0x2282); /* subset of */
"&sup;"       out(0x2283); /* superset of */
"&nsub;"      out(0x2284); /* not a subset of */
"&sube;"      out(0x2286); /* subset of or equal to */
"&supe;"      out(0x2287); /* superset of or equal to */
"&oplus;"     out(0x2295); /* circled plus = direct sum */
"&otimes;"    out(0x2297); /* circled times = vector product */
"&perp;"      out(0x22a5); /* up tack = orthogonal to = perpendicular */
"&sdot;"      out(0x22c5); /* dot operator */
"&lceil;"     out(0x2308); /* left ceiling = apl upstile */
"&rceil;"     out(0x2309); /* right ceiling */
"&lfloor;"    out(0x230a); /* left floor = apl downstile */
"&rfloor;"    out(0x230b); /* right floor */
"&lang;"      out(0x2329); /* left-pointing angle bracket = bra */
"&rang;"      out(0x232a); /* right-pointing angle bracket = ket */
"&loz;"       out(0x25ca); /* lozenge */
"&spades;"    out(0x2660); /* black spade suit */
"&clubs;"     out(0x2663); /* black club suit = shamrock */
"&hearts;"    out(0x2665); /* black heart suit = valentine */
"&diams;"     out(0x2666); /* black diamond suit */
"&OElig;"     out(0x152); /* latin capital ligature OE */
"&oelig;"     out(0x153); /* latin small ligature oe */
"&Scaron;"    out(0x160); /* latin capital letter S with caron */
"&scaron;"    out(0x161); /* latin small letter s with caron */
"&Yuml;"      out(0x178); /* latin capital letter Y with diaeresis */
"&circ;"      out(0x2c6); /* modifier letter circumflex accent */
"&tilde;"     out(0x2dc); /* small tilde */
"&ensp;"      out(0x2002); /* en space */
"&emsp;"      out(0x2003); /* em space */
"&thinsp;"    out(0x2009); /* thin space */
"&zwnj;"      out(0x200c); /* zero width non-joiner */
"&zwj;"       out(0x200d); /* zero width joiner */
"&lrm;"       out(0x200e); /* left-to-right mark */
"&rlm;"       out(0x200f); /* right-to-left mark */
"&ndash;"     out(0x2013); /* en dash */
"&mdash;"     out(0x2014); /* em dash */
"&lsquo;"     out(0x2018); /* left single quotation mark */
"&rsquo;"     out(0x2019); /* right single quotation mark */
"&sbquo;"     out(0x201a); /* single low-9 quotation mark */
"&ldquo;"     out(0x201c); /* left double quotation mark */
"&rdquo;"     out(0x201d); /* right double quotation mark */
"&bdquo;"     out(0x201e); /* double low-9 quotation mark */
"&dagger;"    out(0x2020); /* dagger */
"&Dagger;"    out(0x2021); /* double dagger */
"&permil;"    out(0x2030); /* per mille sign */
"&lsaquo;"    out(0x2039); /* single left-pointing angle quotation mark */
"&rsaquo;"    out(0x203a); /* single right-pointing angle quotation mark */
"&euro;"      out(0x20ac); /* euro sign */
"&#"[xX][0-9A-Fa-f]+";" {
          /* Hexadecimal numeric character reference: the digits start
             behind the three characters "&#x" (or "&#X").  strtoul()
             saturates on overflow; anything that would not fit into a
             non-negative int is clamped so that the conversion functions
             see a positive, clearly out-of-range value and report it. */
          unsigned long value = strtoul(yytext + 3, NULL, 16);
          num = (value > 0x7fffffffUL) ? 0x7fffffffU : (unsigned int) value;
          out(num); }
"&#"[0-9]+";" {
          /* Decimal numeric character reference: the digits start behind
             the two characters "&#".  Same clamping as for the
             hexadecimal form. */
          unsigned long value = strtoul(yytext + 2, NULL, 10);
          num = (value > 0x7fffffffUL) ? 0x7fffffffU : (unsigned int) value;
          out(num); }
"&"[^;]{1,8}";" {
          /* An ampersand followed by 1 to 8 non-semicolon characters and a
             semicolon that none of the rules above accepted.  Only the
             ampersand itself is consumed here: yyless(1) hands everything
             after it back to the scanner, so the rest is scanned again. */
          if (option.QUIET != TRUE)
            fprintf(stderr, "invalid entity or unescaped ampersand: %s\n", yytext);
          if (option.Replace == TRUE)
            out(option.rChar);                /* -R: replacement character */
          else if (option.replace == TRUE)
            fputs(option.rString, yyout);     /* -r: replacement string */
          else if (option.FORCE != TRUE)
            fputc('&', yyout);                /* keep the ampersand; it has to go
                                                 to yyout, not stdout, so that
                                                 -o is honoured */
          yyless(1); }
[^&]+     { ECHO; }  /* text without any ampersand is copied unchanged */
.         { ECHO; }  /* any other single character; the rule above already
                        takes every character except '&' */
%%
/*
 * Print the usage summary and the author line to stdout, then terminate
 * the program with exit status 1.  This function does not return.
 */
void
help ()
{
  /* The <...> placeholders name the option arguments that the
     description lines refer to. */
  fputs("\n"
        "htmlEnt2Char -- replaces HTML entities\n"
        " options:\n"
        "  -C <encoding>  output encoding, actually supported:\n"
        "                   l1 lat1 latin1 iso-8859-1\n"
        "                   u8 utf-8 (default)\n"
        "  -o <file>      output filename\n"
        "  -f             force: skip misspelled entities or\n"
        "                 entities not printable in given charset\n"
        "                 (see also -r or -R)\n"
        "  -r <string>    replace unrecognized/unprintable entities\n"
        "                 by <string>\n"
        "  -R <num>       replace unrecognized/unprintable entities\n"
        "                 by a character given as <num>, a Unicode code point\n"
        "                 Interpretation of <num> follows the C convention:\n"
        "                   0x.... for hexadecimal numbers\n"
        "                   0.... for octal numbers\n"
        "                   .... for decimal numbers\n"
        "  -q             quiet: don't report errors, misspelled\n"
        "                 entities etc.\n"
        "  -h | -?        print this help and exit\n"
        " Other arguments will be read as input filenames.\n"
        " If no input files are given, input is read from stdin.\n"
        " If no output file is given, the text with replacements\n"
        " is written to stdout.\n\n", stdout);
  printf("htmlEnt2Char, Sebastian Nagel (wastl@cis.uni-muenchen.de)\n");
  exit(1);
}
int
main (int argc, char **argv)
{
int c;
while (1) {
c = getopt(argc, argv, ":C:o:fqr:R:h?");
if (c == -1)
break;
switch (c) {
case 'h':
case '?':
help();
case 'q':
option.QUIET = TRUE;
break;
case 'C':
if ((! strcmp(optarg, "l1")) ||
(! strcmp(optarg, "ISO-8859-1")) ||
(! strcmp(optarg, "iso-8859-1")) ||
(! strcmp(optarg, "lat1")) ||
(! strcmp(optarg, "latin1")) ||
(! strcmp(optarg, "latin-1")) ||
(! strcmp(optarg, "Latin-1")))
option.ENCODING = ISO_LATIN_1;
else if ((! strcmp(optarg, "u8")) ||
(! strcmp(optarg, "utf-8")) ||
(! strcmp(optarg, "UTF-8")))
option.ENCODING = UTF8;
break;
case 'o':
if (optarg != NULL && (yyout = fopen(optarg, "w")) == NULL) {
fprintf(stderr, "Can't open %s for writing!\n", optarg);
exit(1);
}
break;
case 'f':
option.FORCE = TRUE;
break;
case 'r':
option.replace = TRUE;
option.rString = optarg;
break;
case 'R':
option.Replace = TRUE;
sscanf(optarg, "0x%x", &option.rChar)
|| sscanf(optarg, "0%o", &option.rChar)
|| sscanf(optarg, "%u", &option.rChar);
break;
}
}
if (option.ENCODING == ISO_LATIN_1
&& option.rChar >= 0xff)
{
fprintf(stderr, "Replacement character not in iso-8859-1!\n");
exit(1);
}
if (optind < argc) /* remaing ARGVs are filenames */
{
while (optind < argc)
{
if (argv[optind] != NULL && (yyin = fopen(argv[optind], "r")) == NULL)
{
fprintf(stderr, "Can't read from %s!\n", argv[optind]);
perror(argv[optind]);
exit(1);
}
yylex();
optind++;
}
}
else /* default: read yyin/stdin, when no input-files are given */
{
yylex();
}
return 0;
}