Skip to contentMethod: charFromName(String)
1: /*
2: * Copyright 1999-2002,2004 The Apache Software Foundation.
3: *
4: * Licensed under the Apache License, Version 2.0 (the "License");
5: * you may not use this file except in compliance with the License.
6: * You may obtain a copy of the License at
7: *
8: * http://www.apache.org/licenses/LICENSE-2.0
9: *
10: * Unless required by applicable law or agreed to in writing, software
11: * distributed under the License is distributed on an "AS IS" BASIS,
12: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13: * See the License for the specific language governing permissions and
14: * limitations under the License.
15: */
16:
17:
18: // Aug 21, 2000:
19: // Fixed bug in isElement and made HTMLdtd public.
20: // Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com>
21:
22:
23: package it.tidalwave.northernwind.core.impl.patches;
24:
25: import java.util.Hashtable;
26: import java.util.Locale;
27: import java.io.BufferedReader;
28: import java.io.InputStream;
29: import java.io.InputStreamReader;
30: import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
31:
32:
33: /**
34: * Utility class for accessing information specific to HTML documents.
35: * The HTML DTD is expressed as three utility function groups. Two methods
36: * allow for checking whether an element requires an open tag on printing
37: * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
38: * <P>
39: * Two other methods translate character references from name to value and
40: * from value to name. A small entities resource is loaded into memory the
41: * first time any of these methods is called for fast and efficient access.
42: *
43: *
44: * @version $Revision$ $Date$
45: * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
46: */
47: @SuppressWarnings("all")
48: public final class HTMLdtd
49: {
50:
51: /**
52: * Public identifier for HTML 4.01 (Strict) document type.
53: */
54: public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
55:
56: /**
57: * System identifier for HTML 4.01 (Strict) document type.
58: */
59: public static final String HTMLSystemId =
60: "http://www.w3.org/TR/html4/strict.dtd";
61:
62: /**
63: * Public identifier for XHTML 1.0 (Strict) document type.
64: */
65: public static final String XHTMLPublicId =
66: "-//W3C//DTD XHTML 1.0 Strict//EN";
67:
68: /**
69: * System identifier for XHTML 1.0 (Strict) document type.
70: */
71: public static final String XHTMLSystemId =
72: "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
73:
74: /**
75: * Table of reverse character reference mapping. Character codes are held
76: * as single-character strings, mapped to their reference name.
77: */
78: private static Hashtable _byChar;
79:
80:
81: /**
82: * Table of entity name to value mapping. Entities are held as strings,
83: * character references as <TT>Character</TT> objects.
84: */
85: private static Hashtable _byName;
86:
87:
88: private static Hashtable _boolAttrs;
89:
90:
91: /**
92: * Holds element definitions.
93: */
94: private static Hashtable _elemDefs;
95:
96:
97: /**
98: * Locates the HTML entities file that is loaded upon initialization.
99: * This file is a resource loaded with the default class loader.
100: */
101: private static final String ENTITIES_RESOURCE = "HTMLEntities.res";
102:
103:
104: /**
105: * Only opening tag should be printed.
106: */
107: private static final int ONLY_OPENING = 0x0001;
108:
109: /**
110: * Element contains element content only.
111: */
112: private static final int ELEM_CONTENT = 0x0002;
113:
114:
115: /**
116: * Element preserve spaces.
117: */
118: private static final int PRESERVE = 0x0004;
119:
120:
121: /**
122: * Optional closing tag.
123: */
124: private static final int OPT_CLOSING = 0x0008;
125:
126:
127: /**
128: * Element is empty (also means only opening tag)
129: */
130: private static final int EMPTY = 0x0010 | ONLY_OPENING;
131:
132:
133: /**
134: * Allowed to appear in head.
135: */
136: private static final int ALLOWED_HEAD = 0x0020;
137:
138:
139: /**
140: * When opened, closes P.
141: */
142: private static final int CLOSE_P = 0x0040;
143:
144:
145: /**
146: * When opened, closes DD or DT.
147: */
148: private static final int CLOSE_DD_DT = 0x0080;
149:
150:
151: /**
152: * When opened, closes itself.
153: */
154: private static final int CLOSE_SELF = 0x0100;
155:
156:
157: /**
158: * When opened, closes another table section.
159: */
160: private static final int CLOSE_TABLE = 0x0200;
161:
162:
163: /**
164: * When opened, closes TH or TD.
165: */
166: private static final int CLOSE_TH_TD = 0x04000;
167:
168:
169: /**
170: * Returns true if element is declared to be empty. HTML elements are
171: * defines as empty in the DTD, not by the document syntax.
172: *
173: * @param tagName The element tag name (upper case)
174: * @return True if element is empty
175: */
176: public static boolean isEmptyTag( String tagName )
177: {
178: return isElement( tagName, EMPTY );
179: }
180:
181:
182: /**
183: * Returns true if element is declared to have element content.
184: * Whitespaces appearing inside element content will be ignored,
185: * other text will simply report an error.
186: *
187: * @param tagName The element tag name (upper case)
188: * @return True if element content
189: */
190: public static boolean isElementContent( String tagName )
191: {
192: return isElement( tagName, ELEM_CONTENT );
193: }
194:
195:
196: /**
197: * Returns true if element's textual contents preserves spaces.
198: * This only applies to PRE and TEXTAREA, all other HTML elements
199: * do not preserve space.
200: *
201: * @param tagName The element tag name (upper case)
202: * @return True if element's text content preserves spaces
203: */
204: public static boolean isPreserveSpace( String tagName )
205: {
206: return isElement( tagName, PRESERVE );
207: }
208:
209:
210: /**
211: * Returns true if element's closing tag is optional and need not
212: * exist. An error will not be reported for such elements if they
213: * are not closed. For example, <tt>LI</tt> is most often not closed.
214: *
215: * @param tagName The element tag name (upper case)
216: * @return True if closing tag implied
217: */
218: public static boolean isOptionalClosing( String tagName )
219: {
220: return isElement( tagName, OPT_CLOSING );
221: }
222:
223:
224: /**
225: * Returns true if element's closing tag is generally not printed.
226: * For example, <tt>LI</tt> should not print the closing tag.
227: *
228: * @param tagName The element tag name (upper case)
229: * @return True if only opening tag should be printed
230: */
231: public static boolean isOnlyOpening( String tagName )
232: {
233: return isElement( tagName, ONLY_OPENING );
234: }
235:
236:
237: /**
238: * Returns true if the opening of one element (<tt>tagName</tt>) implies
239: * the closing of another open element (<tt>openTag</tt>). For example,
240: * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
241: * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
242: *
243: * @param tagName The newly opened element
244: * @param openTag The already opened element
245: * @return True if closing tag closes opening tag
246: */
247: public static boolean isClosing( String tagName, String openTag )
248: {
249: // Several elements are defined as closing the HEAD
250: if ( openTag.equalsIgnoreCase( "HEAD" ) )
251: return ! isElement( tagName, ALLOWED_HEAD );
252: // P closes iteself
253: if ( openTag.equalsIgnoreCase( "P" ) )
254: return isElement( tagName, CLOSE_P );
255: // DT closes DD, DD closes DT
256: if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
257: return isElement( tagName, CLOSE_DD_DT );
258: // LI and OPTION close themselves
259: if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
260: return isElement( tagName, CLOSE_SELF );
261: // Each of these table sections closes all the others
262: if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
263: openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
264: openTag.equalsIgnoreCase( "COLGROUP" ) )
265: return isElement( tagName, CLOSE_TABLE );
266: // TD closes TH and TH closes TD
267: if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
268: return isElement( tagName, CLOSE_TH_TD );
269: return false;
270: }
271:
272:
273: /**
274: * Returns true if the specified attribute it a URI and should be
275: * escaped appropriately. In HTML URIs are escaped differently
276: * than normal attributes.
277: *
278: * @param tagName The element's tag name
279: * @param attrName The attribute's name
280: */
281: public static boolean isURI( String tagName, String attrName )
282: {
283: // Stupid checks.
284: return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
285: }
286:
287:
288: /**
289: * Returns true if the specified attribute is a boolean and should be
290: * printed without the value. This applies to attributes that are true
291: * if they exist, such as selected (OPTION/INPUT).
292: *
293: * @param tagName The element's tag name
294: * @param attrName The attribute's name
295: */
296: public static boolean isBoolean( String tagName, String attrName )
297: {
298: String[] attrNames;
299:
300: attrNames = (String[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
301: if ( attrNames == null )
302: return false;
303: for ( int i = 0 ; i < attrNames.length ; ++i )
304: if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
305: return true;
306: return false;
307: }
308:
309:
310: /**
311: * Returns the value of an HTML character reference by its name. If the
312: * reference is not found or was not defined as a character reference,
313: * returns EOF (-1).
314: *
315: * @param name Name of character reference
316: * @return Character code or EOF (-1)
317: */
318: public static int charFromName( String name )
319: {
320: Object value;
321:
322: initialize();
323: value = _byName.get( name );
324:• if ( value != null && value instanceof Integer )
325: return ( (Integer) value ).intValue();
326: else
327: return -1;
328: }
329:
330:
331: /**
332: * Returns the name of an HTML character reference based on its character
333: * value. Only valid for entities defined from character references. If no
334: * such character value was defined, return null.
335: *
336: * @param value Character value of entity
337: * @return Entity's name or null
338: */
339: public static String fromChar(int value )
340: {
341: if (value > 0xffff)
342: return null;
343:
344: String name;
345:
346: initialize();
347: name = (String) _byChar.get( new Integer( value ) );
348: return name;
349: }
350:
351:
352: /**
353: * Initialize upon first access. Will load all the HTML character references
354: * into a list that is accessible by name or character value and is optimized
355: * for character substitution. This method may be called any number of times
356: * but will execute only once.
357: */
358: private static void initialize()
359: {
360: InputStream is = null;
361: BufferedReader reader = null;
362: int index;
363: String name;
364: String value;
365: int code;
366: String line;
367:
368: // Make sure not to initialize twice.
369: if ( _byName != null )
370: return;
371: try {
372: _byName = new Hashtable();
373: _byChar = new Hashtable();
374: is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
375: if ( is == null ) {
376:         throw new RuntimeException(
377:                                  DOMMessageFormatter.formatMessage(
378:                                  DOMMessageFormatter.SERIALIZER_DOMAIN,
379: "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
380: }
381: reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
382: line = reader.readLine();
383: while ( line != null ) {
384: if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
385: line = reader.readLine();
386: continue;
387: }
388: index = line.indexOf( ' ' );
389: if ( index > 1 ) {
390: name = line.substring( 0, index );
391: ++index;
392: if ( index < line.length() ) {
393: value = line.substring( index );
394: index = value.indexOf( ' ' );
395: if ( index > 0 )
396: value = value.substring( 0, index );
397: code = Integer.parseInt( value );
398: defineEntity( name, (char) code );
399: }
400: }
401: line = reader.readLine();
402: }
403: is.close();
404: } catch ( Exception except ) {
405:                         throw new RuntimeException(
406:                                 DOMMessageFormatter.formatMessage(
407:                                 DOMMessageFormatter.SERIALIZER_DOMAIN,
408: "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
409: } finally {
410: if ( is != null ) {
411: try {
412: is.close();
413: } catch ( Exception except ) { }
414: }
415: }
416: }
417:
418:
419: /**
420: * Defines a new character reference. The reference's name and value are
421: * supplied. Nothing happens if the character reference is already defined.
422: * <P>
423: * Unlike internal entities, character references are a string to single
424: * character mapping. They are used to map non-ASCII characters both on
425: * parsing and printing, primarily for HTML documents. '<amp;' is an
426: * example of a character reference.
427: *
428: * @param name The entity's name
429: * @param value The entity's value
430: */
431: private static void defineEntity( String name, char value )
432: {
433: if ( _byName.get( name ) == null ) {
434: _byName.put( name, new Integer( value ) );
435: _byChar.put( new Integer( value ), name );
436: }
437: }
438:
439:
440: private static void defineElement( String name, int flags )
441: {
442: _elemDefs.put( name, new Integer( flags ) );
443: }
444:
445:
446: private static void defineBoolean( String tagName, String attrName )
447: {
448: defineBoolean( tagName, new String[] { attrName } );
449: }
450:
451:
452: private static void defineBoolean( String tagName, String[] attrNames )
453: {
454: _boolAttrs.put( tagName, attrNames );
455: }
456:
457:
458: private static boolean isElement( String name, int flag )
459: {
460: Integer flags;
461:
462: flags = (Integer) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
463: if ( flags == null )
464: return false;
465: else
466: return ( ( flags.intValue() & flag ) == flag );
467: }
468:
469:
470: static
471: {
472: _elemDefs = new Hashtable();
473: defineElement( "A", PRESERVE );
474: defineElement( "ADDRESS", CLOSE_P );
475: defineElement( "AREA", EMPTY );
476: defineElement( "BASE", EMPTY | ALLOWED_HEAD );
477: defineElement( "BASEFONT", EMPTY );
478: defineElement( "BLOCKQUOTE", CLOSE_P );
479: defineElement( "BODY", OPT_CLOSING );
480: defineElement( "BR", EMPTY );
481: defineElement( "COL", EMPTY );
482: defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
483: defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
484: defineElement( "DIV", CLOSE_P );
485: defineElement( "DL", ELEM_CONTENT | CLOSE_P );
486: defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
487: defineElement( "FIELDSET", CLOSE_P );
488: defineElement( "FORM", CLOSE_P );
489: defineElement( "FRAME", EMPTY | OPT_CLOSING );
490: defineElement( "H1", CLOSE_P );
491: defineElement( "H2", CLOSE_P );
492: defineElement( "H3", CLOSE_P );
493: defineElement( "H4", CLOSE_P );
494: defineElement( "H5", CLOSE_P );
495: defineElement( "H6", CLOSE_P );
496: defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
497: defineElement( "HR", EMPTY | CLOSE_P );
498: defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
499: defineElement( "I", PRESERVE );
500: defineElement( "IMG", EMPTY );
501: defineElement( "INPUT", EMPTY );
502: defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
503: defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
504: defineElement( "LINK", EMPTY | ALLOWED_HEAD );
505: defineElement( "MAP", ALLOWED_HEAD );
506: defineElement( "META", EMPTY | ALLOWED_HEAD );
507: defineElement( "OL", ELEM_CONTENT | CLOSE_P );
508: defineElement( "OPTGROUP", ELEM_CONTENT );
509: defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
510: defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
511: defineElement( "PARAM", EMPTY );
512: defineElement( "PRE", PRESERVE | CLOSE_P );
513: defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
514: defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
515: defineElement( "SELECT", ELEM_CONTENT );
516: defineElement( "SPAN", PRESERVE );
517: defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
518: defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
519: defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
520: defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD | PRESERVE );
521: defineElement( "TEXTAREA", PRESERVE );
522: defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
523: defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
524: defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
525: defineElement( "TITLE", ALLOWED_HEAD );
526: defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
527: defineElement( "UL", ELEM_CONTENT | CLOSE_P );
528:
529: _boolAttrs = new Hashtable();
530: defineBoolean( "AREA", "href" );
531: defineBoolean( "BUTTON", "disabled" );
532: defineBoolean( "DIR", "compact" );
533: defineBoolean( "DL", "compact" );
534: defineBoolean( "FRAME", "noresize" );
535: defineBoolean( "HR", "noshade" );
536: defineBoolean( "IMAGE", "ismap" );
537: defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
538: defineBoolean( "LINK", "link" );
539: defineBoolean( "MENU", "compact" );
540: defineBoolean( "OBJECT", "declare" );
541: defineBoolean( "OL", "compact" );
542: defineBoolean( "OPTGROUP", "disabled" );
543: defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
544: defineBoolean( "SCRIPT", "defer" );
545: defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
546: defineBoolean( "STYLE", "disabled" );
547: defineBoolean( "TD", "nowrap" );
548: defineBoolean( "TH", "nowrap" );
549: defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
550: defineBoolean( "UL", "compact" );
551:
552: initialize();
553: }
554:
555:
556:
557: }
558: