/**
* HtmlManipulator.java
* Copyright 2007 - 2008 Zach Scrivena
* zachscrivena@gmail.com
* http://zs.freeshell.org/
*
* TERMS AND CONDITIONS:
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.freeshell.zs.common;
import java.util.HashMap;
import java.util.Map;
/**
* Perform HTML-related operations.
*/
public class HtmlManipulator
{
/**
* Table of HTML entities obtained from http://www.w3.org/TR/html401/sgml/entities.html
* - Formatted as a series of space-delimited triplets (entity_name,entity_value,Unicode_value),
* e.g. (nbsp,#160,00A0).
* - entity_name and entity_value have been stripped of the surrounding "&" and ";",
* e.g. "nbsp" instead of " ", "#160" instead of " ".
*/
private static final String RAW_HTML_ENTITY_TABLE =
"nbsp #160 00A0 iexcl #161 00A1 cent #162 00A2 pound #163 00A3 curren #164 00A4 yen #165 00A5 " +
"brvbar #166 00A6 sect #167 00A7 uml #168 00A8 copy #169 00A9 ordf #170 00AA laquo #171 00AB " +
"not #172 00AC shy #173 00AD reg #174 00AE macr #175 00AF deg #176 00B0 plusmn #177 00B1 " +
"sup2 #178 00B2 sup3 #179 00B3 acute #180 00B4 micro #181 00B5 para #182 00B6 middot #183 00B7 " +
"cedil #184 00B8 sup1 #185 00B9 ordm #186 00BA raquo #187 00BB frac14 #188 00BC frac12 #189 00BD " +
"frac34 #190 00BE iquest #191 00BF Agrave #192 00C0 Aacute #193 00C1 Acirc #194 00C2 Atilde #195 00C3 " +
"Auml #196 00C4 Aring #197 00C5 AElig #198 00C6 Ccedil #199 00C7 Egrave #200 00C8 Eacute #201 00C9 " +
"Ecirc #202 00CA Euml #203 00CB Igrave #204 00CC Iacute #205 00CD Icirc #206 00CE Iuml #207 00CF " +
"ETH #208 00D0 Ntilde #209 00D1 Ograve #210 00D2 Oacute #211 00D3 Ocirc #212 00D4 Otilde #213 00D5 " +
"Ouml #214 00D6 times #215 00D7 Oslash #216 00D8 Ugrave #217 00D9 Uacute #218 00DA Ucirc #219 00DB " +
"Uuml #220 00DC Yacute #221 00DD THORN #222 00DE szlig #223 00DF agrave #224 00E0 aacute #225 00E1 " +
"acirc #226 00E2 atilde #227 00E3 auml #228 00E4 aring #229 00E5 aelig #230 00E6 ccedil #231 00E7 " +
"egrave #232 00E8 eacute #233 00E9 ecirc #234 00EA euml #235 00EB igrave #236 00EC iacute #237 00ED " +
"icirc #238 00EE iuml #239 00EF eth #240 00F0 ntilde #241 00F1 ograve #242 00F2 oacute #243 00F3 " +
"ocirc #244 00F4 otilde #245 00F5 ouml #246 00F6 divide #247 00F7 oslash #248 00F8 ugrave #249 00F9 " +
"uacute #250 00FA ucirc #251 00FB uuml #252 00FC yacute #253 00FD thorn #254 00FE yuml #255 00FF " +
"fnof #402 0192 Alpha #913 0391 Beta #914 0392 Gamma #915 0393 Delta #916 0394 Epsilon #917 0395 " +
"Zeta #918 0396 Eta #919 0397 Theta #920 0398 Iota #921 0399 Kappa #922 039A Lambda #923 039B " +
"Mu #924 039C Nu #925 039D Xi #926 039E Omicron #927 039F Pi #928 03A0 Rho #929 03A1 " +
"Sigma #931 03A3 Tau #932 03A4 Upsilon #933 03A5 Phi #934 03A6 Chi #935 03A7 Psi #936 03A8 " +
"Omega #937 03A9 alpha #945 03B1 beta #946 03B2 gamma #947 03B3 delta #948 03B4 epsilon #949 03B5 " +
"zeta #950 03B6 eta #951 03B7 theta #952 03B8 iota #953 03B9 kappa #954 03BA lambda #955 03BB " +
"mu #956 03BC nu #957 03BD xi #958 03BE omicron #959 03BF pi #960 03C0 rho #961 03C1 " +
"sigmaf #962 03C2 sigma #963 03C3 tau #964 03C4 upsilon #965 03C5 phi #966 03C6 chi #967 03C7 " +
"psi #968 03C8 omega #969 03C9 thetasym #977 03D1 upsih #978 03D2 piv #982 03D6 bull #8226 2022 " +
"hellip #8230 2026 prime #8242 2032 Prime #8243 2033 oline #8254 203E frasl #8260 2044 weierp #8472 2118 " +
"image #8465 2111 real #8476 211C trade #8482 2122 alefsym #8501 2135 larr #8592 2190 uarr #8593 2191 " +
"rarr #8594 2192 darr #8595 2193 harr #8596 2194 crarr #8629 21B5 lArr #8656 21D0 uArr #8657 21D1 " +
"rArr #8658 21D2 dArr #8659 21D3 hArr #8660 21D4 forall #8704 2200 part #8706 2202 exist #8707 2203 " +
"empty #8709 2205 nabla #8711 2207 isin #8712 2208 notin #8713 2209 ni #8715 220B prod #8719 220F " +
"sum #8721 2211 minus #8722 2212 lowast #8727 2217 radic #8730 221A prop #8733 221D infin #8734 221E " +
"ang #8736 2220 and #8743 2227 or #8744 2228 cap #8745 2229 cup #8746 222A int #8747 222B " +
"there4 #8756 2234 sim #8764 223C cong #8773 2245 asymp #8776 2248 ne #8800 2260 equiv #8801 2261 " +
"le #8804 2264 ge #8805 2265 sub #8834 2282 sup #8835 2283 nsub #8836 2284 sube #8838 2286 " +
"supe #8839 2287 oplus #8853 2295 otimes #8855 2297 perp #8869 22A5 sdot #8901 22C5 lceil #8968 2308 " +
"rceil #8969 2309 lfloor #8970 230A rfloor #8971 230B lang #9001 2329 rang #9002 232A loz #9674 25CA " +
"spades #9824 2660 clubs #9827 2663 hearts #9829 2665 diams #9830 2666 " +
"quot #34 0022 amp #38 0026 lt #60 003C gt #62 003E OElig #338 0152 oelig #339 0153 " +
"Scaron #352 0160 Yuml #376 0178 circ #710 02C6 tilde #732 02DC ensp #8194 2002 emsp #8195 2003 " +
"thinsp #8201 2009 zwnj #8204 200C zwj #8205 200D lrm #8206 200E rlm #8207 200F ndash #8211 2013 " +
"mdash #8212 2014 lsquo #8216 2018 rsquo #8217 2019 sbquo #8218 201A ldquo #8220 201C rdquo #8221 201D " +
"bdquo #8222 201E dagger #8224 2020 Dagger #8225 2021 permil #8240 2030 lsaquo #8249 2039 rsaquo #8250 203A " +
"euro #8364 20AC";
/** value given by RAW_HTML_ENTITY_TABLE.hashCode(), used to guard against accidental modification */
private static final int RAW_HTML_ENTITY_TABLE_HASHCODE = -301953893;
/** mapping: HTML entity ---> Unicode character */
private static final Map<String,Character> HTML_ENTITY_TO_UNICODE_MAP = new HashMap<String,Character>();
/** mapping: Unicode character ---> HTML entity */
private static final Map<Character,String> UNICODE_TO_HTML_ENTITY_MAP = new HashMap<Character,String>();
/**
* Static initialization block.
* Populates HTML_ENTITY_TO_UNICODE_MAP and UNICODE_TO_HTML_ENTITY_MAP.
*/
static
{
/* check hash code of RAW_HTML_ENTITY_TABLE */
if (RAW_HTML_ENTITY_TABLE.hashCode() != RAW_HTML_ENTITY_TABLE_HASHCODE)
{
throw new RuntimeException("(INTERNAL) Malformed HtmlManipulator.RAW_HTML_ENTITY_TABLE.");
}
/* populate HTML entity <---> Unicode character maps */
final String[] elements = RAW_HTML_ENTITY_TABLE.split("[\\s]++");
for (int i = 0; i < elements.length; i += 3)
{
final char unicode = (char) Integer.parseInt(elements[i + 2], 16);
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i], unicode);
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i + 1], unicode);
UNICODE_TO_HTML_ENTITY_MAP.put(unicode, elements[i]);
}
}
/**
* Replace HTML entities in a given string with their Unicode character representations.
*
* @param s
* input string
* @return
* string with HTML entities replaced
*/
public static String replaceHtmlEntities(
final String s)
{
final StringBuilder t = new StringBuilder();
for (int i = 0, n = s.length(); i < n; i++)
{
final char c = s.charAt(i);
if (c == '&')
{
/* candidate HTML entity */
final int j = s.indexOf(';', i);
if (j >= 0)
{
final Character unicode = HTML_ENTITY_TO_UNICODE_MAP.get(s.substring(i + 1, j));
if (unicode != null)
{
/* insert Unicode representation */
t.append((char) unicode);
i = j; /* advance index */
continue;
}
}
}
/* treat as a literal character */
t.append(c);
}
return t.toString();
}
/**
* Quote a specified string as HTML, by replacing all special characters with their
* equivalent HTML entities.
*
* @param s
* input string
* @return
* string with special characters replaced
*/
public static String quoteHtml(
final String s)
{
final StringBuilder t = new StringBuilder();
for (char c : s.toCharArray())
{
final String entity = UNICODE_TO_HTML_ENTITY_MAP.get(c);
if (entity == null)
{
t.append(c);
}
else
{
t.append('&');
t.append(entity);
t.append(';');
}
}
return t.toString();
}
}