2009/05/20 - Apache Shale has been retired.

For more information, please explore the Attic.


1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to you under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * $Id: Parser.java 464373 2006-10-16 04:21:54Z rahul $
20   */
21  package org.apache.shale.clay.parser;
22  
23  import java.util.Iterator;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.TreeMap;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.shale.util.Messages;
31  
32  /***
33   * <p>Parses the document into a tree of nodes using the
34   * {@link NodeTokenizer}. Nodes are defined by a token or
35   * offset range in the document, {@link Token}.  Attributes in beginning
36   * nodes are also parsed into token offsets by the {@link AttributeTokenizer}.
37   * <br><br>
38   * A document tree is built representing nodes in the target document.  The
39   * document can be a HTML fragment that is not well-formed or an XML
40   * fragment of a XHTML document.
41   * </p>
42   *
43   */
44  public class Parser {
45  
46      /***
47       * <p>Common logging utility instance.</p>
48       */
49      private static Log log;
50      static {
51          log = LogFactory.getLog(Parser.class);
52      }
53  
54      /***
55       * <p>
56       * Message resources for this class.
57       * </p>
58       */
59      private static Messages messages = new Messages(
60              "org.apache.shale.clay.Bundle", NodeTokenizer.class
61              .getClassLoader());
62  
63  
64      /***
65       * <p>This inner class is a subclass of a <code>TreeMap</code>.
66       * This wrapper handles the attribute key and value parts represented
67       * as a {@link Token} offsets.  The value part of the attribute will be
68       * represented by a offset until is accessed using the <code>get</code>
69       * method to avoid creating a bunch of <code>String</code> instances.
70       * </p>
71       *
72       */
73      private class Attributes extends TreeMap implements Map {
74  
75          /***
76           * <p>Unique serial id.</p>
77           */
78          private static final long serialVersionUID = 3906654111096190000L;
79  
80          /***
81           * <p> Returns the value of the attribute using a offset
82           * range within the parsed document.  The parameter <code>key</code>
83           * value is converted into a case neutral value.
84           * </p>
85           *
86           * @param key attribute name
87           * @return attribute value
88           */
89          public Object get(Object key) {
90              String tmp = (String) key;
91              if (tmp != null) {
92                  tmp = tmp.toLowerCase();
93              }
94  
95              Token e = (Token) super.get(tmp);
96              return (e != null) ? e.getRawText() : null;
97          }
98  
99          /***
100          *  <p>This method is overridden and not implemented.  The
101          *  <code>add</code> method should be used for this specific
102          *  implementation.  The <code>value</code> attribute's internal
103          *  type realizes {@link Token}, but the <code>get</code> method
104          *  will return a <code>String</code> object.
105          *  </p>
106          *
107          *   @deprecated
108          *   @param key not supported
109          *   @param value not supported
110          *   @return not supported
111          */
112         public Object put(Object key, Object value) {
113             // NA
114             return null;
115         }
116 
117         /***
118          * <p>Adds a attribute to the collection.  The attribute is
119          * represented by two {@link Token} object for its key and
120          * value parts.
121          * </p>
122          *
123          * @param e token to be added
124          *
125          */
126         public void add(Map.Entry e) {
127 
128             String key = ((Token) e.getKey()).getRawText();
129             if (key != null) {
130                 key = key.toLowerCase();
131             }
132 
133             super.put(key, e.getValue());
134         }
135 
136         /***
137          * <p>This method is overridden to convert the key into a neutral
138          * case so that the <code>Map</code> access method will be case
139          * insensitive.</p>
140          *
141          * @param key attribute name
142          * @return <code>true</code> if attribute exists
143          */
144         public boolean containsKey(Object key) {
145             String tmp = (String) key;
146             if (tmp != null) {
147                 tmp = tmp.toLowerCase();
148             }
149 
150             return super.containsKey(tmp);
151         }
152 
153 
154 
155     }
156 
157     /***
158      * <p>This array of HTML tags can have optional ending tags.</p>
159      */
160     private static final String[] OPTIONAL_ENDING_TAG  = {"TR", "TH", "TD", "LI", "DT", "DD", "LH", "OPTION"};
161     /***
162      * <p>This array of parent tags is cross referenced by the
163      * <code>OPTIONAL_ENDING_TAG</code> array.</p>
164      */
165     private static final String[][] TAG_PARENTS = {{"TABLE", "TBODY"}, {"TR"}, {"TR"},
166         {"OL", "UL"}, {"DL"}, {"DL"}, {"DL"}, {"SELECT"}};
167 
168     /***
169      * <p>
170      * Determines if a HTML nodeName is a type of tag that can optionally have a
171      * ending tag.
172      * </p>
173      *
174      * @param nodeName the name of the html node
175      * @return <code>true</code> if the nodeName is in the
176      *   <code>OPTIONAL-ENDING_TAG</code> array; otherwise, <code>false</code> is returned
177      */
178     protected boolean isOptionalEndingTag(String nodeName) {
179         if (nodeName != null) {
180             for (int i = 0; i < OPTIONAL_ENDING_TAG.length; i++) {
181                 if (OPTIONAL_ENDING_TAG[i].equalsIgnoreCase(nodeName)) {
182                     return true;
183                 }
184             }
185         }
186 
187         return false;
188     }
189 
190     /***
191      * <p>
192      * Checks to see if a optional ending tag has a valid parent. This is use to
193      * detect a implicit ending tag
194      * </p>
195      *
196      * @param nodeName of the optional ending tag
197      * @param parentNodeName name of the parent
198      * @return <code>true</code> if the parentNodeName is a valid parent for
199      *   the nodeName; otherwise, a <code>false</code> value is returned
200      */
201     protected boolean isValidOptionalEndingTagParent(String nodeName,
202             String parentNodeName) {
203         if (nodeName != null && parentNodeName != null) {
204             for (int i = 0; i < OPTIONAL_ENDING_TAG.length; i++) {
205                 if (OPTIONAL_ENDING_TAG[i].equalsIgnoreCase(nodeName)) {
206                     for (int j = 0; j < TAG_PARENTS[i].length; j++) {
207                         if (TAG_PARENTS[i][j].equalsIgnoreCase(parentNodeName)) {
208                             return true;
209                         }
210                     }
211                     break;
212                 }
213             }
214         }
215         return false;
216     }
217 
218     /***
219      * @param current top of the stack
220      * @param node ending node
221      * @return begining node
222      */
223     protected Node findBeginingNode(Node current, Node node) {
224 
225         pop: while (true) {
226             if (current == null) {
227                 break pop;
228             }
229 
230             if (isNodeNameEqual(current, node)) {
231 
232                 // isWellFormed indicates a beginning tag and ending tag
233                 // was found
234                 current.setWellFormed(true);
235 
236                 // nodes are equal, make the parent of the
237                 // begin tag the current node
238                 current = current.getParent();
239 
240                 break pop;
241             }
242 
243             if (isOptionalEndingTag(current.getName())) {
244                 current.setWellFormed(true);
245             }
246 
247             if (current.getParent() == null) {
248                 throw new RuntimeException(
249                         messages.getMessage("parser.unmatched.endtoken",
250                                 new Object[] {node.getToken(), node.getToken().getRawText()}));
251             }
252 
253             current = current.getParent();
254 
255         }
256 
257     return current;
258 
259     }
260 
261     /***
262      * <p>Starting remove block delimiter.  It must be a self contained commment.<p>
263      */
264     private static final String BEGIN_REMOVE_TOKEN = "<!-- ### clay:remove ### -->";
265 
266     /***
267      * <p>Ending remove block delimiter.</p>
268      */
269     private static final String END_REMOVE_TOKEN = "<!-- ### /clay:remove ### -->";
270 
271     /***
272      * <p>The start of the comment token used to override the template
273      * encoding type.</p>
274      */
275     public static final String START_CHARSET_TOKEN = "<!-- ### clay:page ";
276 
277     /***
278      * <p>The end of the comment token used to override the template
279      * encoding type.</p>
280      */
281     public static final String END_CHARSET_TOKEN = "/### -->";
282 
283     /***
284      * <p>
285      * Parse a document fragment into graphs of {@link Node}. The resulting
286      * type is a list because the fragment might not be well-formed.
287      * </p>
288      *
289      * @param document input source
290      * @return collection of {@link Node}
291      */
292     public List parse(StringBuffer document) {
293 
294         boolean isWithinRemoveBlock = false;
295         Node root = new Node(null);
296         Node current = root;
297         current.setName("namingContainer");
298         root.setWellFormed(true);
299 
300         NodeTokenizer t = new NodeTokenizer(document);
301         Iterator i = t.iterator();
302         next: while (i.hasNext()) {
303             Token token = (Token) i.next();
304             Node node = buildNode(token);
305 
306             // self contained comment matching the begin/end remove token delimiter
307             // skip all tokens within the remove block
308             if (node.isComment() && node.isStart() && node.isEnd()) {
309 
310                 //ignore start page charset token if not in a remove comment block
311                 if (!isWithinRemoveBlock && node.getToken().getRawText().startsWith(START_CHARSET_TOKEN)) {
312                    continue next;
313                 }
314 
315                 if (isWithinRemoveBlock && node.getToken().getRawText().equals(END_REMOVE_TOKEN)) {
316 
317                    isWithinRemoveBlock = false;
318                    continue next;
319 
320                 } else if (node.getToken().getRawText().equals(BEGIN_REMOVE_TOKEN)) {
321 
322                   isWithinRemoveBlock = true;
323                   continue next;
324 
325                } else if (isWithinRemoveBlock) {
326                   continue next;
327                }
328             } else if (isWithinRemoveBlock) {
329                 continue next;
330             }
331 
332 
333             //play forward on comments making all nodes child nodes until a
334             //ending comment is hit
335             if ((node.isComment() || node.isCdata()) && node.isStart()) {
336 
337                 // capture the type of block since you can have comments in a cdata block
338                 boolean isCommentBlock = node.isComment();
339                 boolean isCdataBlock = node.isCdata();
340 
341                 //not self contained comment
342                 if (!node.isEnd()) {
343 
344                     trash: while (i.hasNext()) {
345                         token = (Token) i.next();
346                         Node bodyNode = buildNode(token);
347                         //if a ending node and the block matches
348                         if (((bodyNode.isComment() && isCommentBlock)
349                                 || (bodyNode.isCdata() && isCdataBlock)) && bodyNode.isEnd()) {
350                             node.addChild(bodyNode);
351                             node.setEnd(true);
352                             node.setWellFormed(true);
353                             break trash;
354                         } else {
355                             //force all nodes to be comment or cdata within a block
356                             node.setComment(isCommentBlock);
357                             node.setCdata(isCdataBlock);
358                             node.setWellFormed(true);
359                             node.addChild(bodyNode);
360                         }
361                     } // end while
362 
363                 }
364 
365                 current.addChild(node);
366                 continue next;
367 
368             }
369 
370 
371             if (!node.isStart() && node.isEnd()) {
372 
373                 current = findBeginingNode(current, node);
374 
375             } else if (node.isStart() && !node.isEnd()) {
376 
377                 // this is to handle an option tag without an ending option tag.
378                 // they just liked to write conditional code back then, what the
379                 // heck!
380                 // <select>
381                 // <option value=why>old school html
382                 // <option value=whyo>crazy man
383                 // </select>
384                 //
385 
386 
387                 if (isOptionalEndingTag(current.getName())
388                         && current.isStart() && !current.isEnd()
389                         && current.getParent() != null
390                         && isValidOptionalEndingTagParent(node.getName(), current.getParent().getName())) {
391 
392                     current.setWellFormed(true);
393                     current.getParent().addChild(node);
394                     current = node;
395 
396                 } else {
397                     // the current node is a optional and the new node is it's parent
398                     // simulate having ending nodes
399 
400                     if (isOptionalEndingTag(node.getName())
401                             && isValidOptionalEndingTagParent(current.getName(), node.getName())) {
402 
403                         current = this.findBeginingNode(current, node);
404                         current.addChild(node);
405                         current = node;
406 
407                     } else {
408 
409                         // adding a new node to the current making it current
410                         current.addChild(node);
411                         current = node;
412                     }
413                 }
414             } else {
415                 if (current != null) {
416                     current.addChild(node);
417                 } else {
418                     current = node;
419                 }
420             }
421 
422         }
423 
424         t = null;
425         i = null;
426 
427         simpleWellFormedCheck(root);
428 
429         return root.getChildren();
430 
431     }
432 
433     /***
434      * <p>A simple check to make sure that all nodes have been terminated including
435      * tags with optional ending tags.</p>
436      *
437      * @param node root markup
438      */
439     private void simpleWellFormedCheck(Node node) {
440         if (node.getName() != null && !node.isWellFormed()) {
441             throw new RuntimeException(
442                     messages.getMessage("parser.unmatched.begintoken",
443                             new Object[] {node.getToken(), node.getToken().getRawText()}));
444         }
445 
446         if (!node.isComment() && !node.isCdata()) {
447             Iterator ci = node.getChildren().iterator();
448             while (ci.hasNext()) {
449                 simpleWellFormedCheck((Node) (ci.next()));
450             }
451         }
452     }
453 
454     /***
455      * <p>Compares two {@link Node} instances by <code>name</code>.
456      * This method is used to match a beginning tag with an ending tag
457      * while building the document stack.  Returns <code>true</code> if
458      * the node <code>name</code> properties are the same.
459      * </p>
460      *
461      * @param node1 first node
462      * @param node2 secnod node
463      * @return <code>true</code> if they are the same
464      *
465      */
466     protected boolean isNodeNameEqual(Node node1, Node node2) {
467         boolean f = false;
468 
469         if (node1 != null && node2 != null) {
470             if (node1.getName() != null && node2.getName() != null) {
471                 if (node1.getName().equalsIgnoreCase(node2.getName())) {
472                     if (node1.getQname() == null && node2.getQname() == null) {
473                         f = true;
474                     } else if (node1.getQname() != null
475                             && node2.getQname() != null
476                             && node1.getQname().equalsIgnoreCase(node2.getQname())) {
477                         f = true;
478                     }
479                 }
480             }
481         }
482 
483         if (log.isDebugEnabled()) {
484             StringBuffer msg = new StringBuffer();
485             msg.append("matching nodes (").append(node1.getName()).append(
486                     (f ? "==" : "!=")).append(node2.getName()).append(")");
487             log.debug(msg.toString());
488         }
489         return f;
490     }
491 
492     /***
493      * <p>Table of self terminating Html tags.</p>
494      */
495     private static final String[] SELF_TERMINATING = {"META", "LINK", "HR",
496         "BASEFONT", "IMG", "PARAM", "BR", "AREA", "INPUT", "ISINDEX",
497     "BASE"};
498 
499     /***
500      * <p>
501      * Checks to see if the nodeName is within the <code>SELF_TERMINATING</code>
502      * table of values.
503      * </p>
504      *
505      * @param nodeName to check for self termination
506      * @return <code>true</code> if is self terminating otherwise
507      *   <code>false</code>
508      */
509     protected boolean isSelfTerminating(String nodeName) {
510 
511         if (nodeName != null) {
512             for (int i = 0; i < SELF_TERMINATING.length; i++) {
513                 if (SELF_TERMINATING[i].equalsIgnoreCase(nodeName)) {
514                     return true;
515                 }
516             }
517         }
518 
519         return false;
520     }
521 
522     /***
523      * <p>This is a factory method that builds a {@link Node} from a
524      * {@link Token}.
525      * </p>
526      *
527      * @param token node offset in the document
528      * @return node that describes the structure of the token
529      */
530     protected Node buildNode(Token token) {
531 
532         Node node = new Node(token);
533 
534         discoverNodeShape(node);
535         discoverNodeName(node);
536         discoverNodeAttributes(node);
537         discoverNodeOverrides(node);
538 
539         return node;
540     }
541 
542 
543     /***
544      * <p>Declare an array of {@link Parser.Rule}s that validate an ending {@link Token}.</p>
545      */
546     private static final Rule[] END_TAG_RULES = {new Rule('<', true, 0, true),
547         new Rule('/', true, 1, true),
548         new Rule('>', false, -1, true)};
549 
550     /***
551      * <p>Declare an array of {@link Parser.Rule}s that validate self terminating {@link Token}.</p>
552      */
553     private static final Rule[] SELF_TERM_TAG_RULES = {new Rule('<', true, 0, true),
554         new Rule('/', false, -2, true),
555         new Rule('>', false, -1, true)};
556     /***
557      * <p>Declare an array of {@link Parser.Rule}s that validate self contained comment {@link Token}.</p>
558      */
559     private static final Rule[] SELF_CONTAINED_COMMENT_RULES = {new Rule('<', true, 0, true),
560         new Rule('!', true, 1, true),
561         new Rule('-', true, 2, true),
562         new Rule('-', true, 3, true),
563         new Rule('>', false, -1, true),
564         new Rule('-', false, -2, true),
565         new Rule('-', false, -3, true)};
566 
567 
568     /***
569      * <p>Declare an array of {@link Parser.Rule}s that validate self contained CDATA {@link Token}.</p>
570      */
571     private static final Rule[] SELF_CONTAINED_CDATA_RULES = {new Rule('<', true, 0, true),
572         new Rule('!', true, 1, true),
573         new Rule('[', true, 2, true),
574         new Rule('C', true, 3, true),
575         new Rule('D', true, 4, true),
576         new Rule('A', true, 5, true),
577         new Rule('T', true, 6, true),
578         new Rule('A', true, 7, true),
579         new Rule('[', true, 8, true),
580         new Rule('>', false, -1, true),
581         new Rule(']', false, -2, true),
582         new Rule(']', false, -3, true)};
583 
584     /***
585      * <p>Declare an array of {@link Parser.Rule}s that validate a begin CDATA {@link Token}.</p>
586      */
587     public static final Rule[] BEGIN_CDATA_RULES = {new Rule('<', true, 0, true),
588         new Rule('!', true, 1, true),
589         new Rule('[', true, 2, true),
590         new Rule('C', true, 3, true),
591         new Rule('D', true, 4, true),
592         new Rule('A', true, 5, true),
593         new Rule('T', true, 6, true),
594         new Rule('A', true, 7, true),
595         new Rule('[', true, 8, true)};
596 
597     /***
598      * <p>Declare an array of {@link Parser.Rule}s that validate an end CDATA {@link Token}.</p>
599      */
600     public static final Rule[] END_CDATA_RULES = {new Rule('>', false, -1, true),
601         new Rule(']', false, -2, true),
602         new Rule(']', false, -3, true)};
603 
604 
605     /***
606      * <p>Declare an array of {@link Parser.Rule}s that validate a begin comment {@link Token}.</p>
607      */
608     public static final Rule[] BEGIN_COMMENT_TAG_RULES = {new Rule('<', true, 0, true),
609         new Rule('!', true, 1, true),
610         new Rule('-', true, 2, true),
611         new Rule('-', true, 3, true)};
612 
613     /***
614      * <p>Declare an array of {@link Parser.Rule}s that validate an end comment {@link Token}.</p>
615      */
616     public static final Rule[] END_COMMENT_TAG_RULES = {new Rule('>', false, -1, true),
617         new Rule('-', false, -2, true),
618         new Rule('-', false, -3, true)};
619 
620     /***
621      * <p>Declare an array of {@link Parser.Rule}s that validate document type {@link Token}.</p>
622      */
623     public static final Rule[] DOCTYPE_TAG_RULES = {new Rule('<', true, 0, true),
624         new Rule('!', true, 1, true),
625         new Rule('>', false, -1, true)};
626 
627     /***
628      * <p>Declare an array of {@link Parser.Rule}s that validate a begining {@link Token}.</p>
629      */
630     public static final Rule[] BEGIN_TAG_RULES = {new Rule('<', true, 0, true),
631         new Rule('-', true, 1, false),
632         new Rule('/', true, 1, false),
633         new Rule('?', true, 1, false),
634         new Rule('%', true, 1, false),
635         new Rule('>', false, -1, true)};
636 
637 
638     /***
639      * <p>Declare an array of {@link Parser.Rule}s that validate JSP block {@link Token}.</p>
640      */
641     private static final Rule[] JSP_RULES = {new Rule('<', true, 0, true),
642         new Rule('%', true, 1, true),
643         new Rule('>', false, -1, true),
644         new Rule('%', false, -2, true)};
645 
646 
647     /***
648      * <p>Declare an array of {@link Parser.Shape}s further defined by {@link Parser.Rule}s
649      *  that are used to determine the type of {@link Node} the {@link Token} defines.</p>
650      */
651     private static final Shape[] NODE_SHAPES = {
652         new Shape(true, true, false, true, SELF_CONTAINED_CDATA_RULES),
653         new Shape(true, false, false, true, BEGIN_CDATA_RULES),
654         new Shape(false, true, false, true, END_CDATA_RULES),
655         new Shape(false, true, false, false, END_TAG_RULES),
656         new Shape(true, true, false, false, SELF_TERM_TAG_RULES),
657         new Shape(true, true, true, false, SELF_CONTAINED_COMMENT_RULES),
658         new Shape(true, false, true, false, BEGIN_COMMENT_TAG_RULES),
659         new Shape(false, true, true, false, END_COMMENT_TAG_RULES),
660         new Shape(true, true, true, false, DOCTYPE_TAG_RULES),
661         new Shape(true, false, false, false, BEGIN_TAG_RULES),
662         new Shape(true, true, true, false, JSP_RULES)};
663 
664 
665     /***
666      * <p>Determine if the {@link Node} is a starting, ending, or body text
667      * tag. The array of {@link Parser.Shape}s are used to determine the type of
668      * {@link Node} the {@link Token} representes.</p>
669      *
670      * @param node target node
671      */
672     protected void discoverNodeShape(Node node) {
673         Token token = node.getToken();
674 
675         nextShape: for (int i = 0; i < NODE_SHAPES.length; i++) {
676 
677             int maxBeginOffset = 0;
678             int minEndOffset = Integer.MAX_VALUE;
679 
680             Shape shape = NODE_SHAPES[i];
681 
682             Rule[] rules = shape.getRules();
683             for (int j = 0; j < rules.length; j++) {
684 
685                 // use the begin or end token offset
686                 int n = (rules[j].isBegin ? token.getBeginOffset() : token.getEndOffset()) + rules[j].getOffset();
687 
688                 if (rules[j].isBegin) {
689                     maxBeginOffset = Math.max(n, maxBeginOffset);
690                 } else {
691                     minEndOffset = Math.min(n, minEndOffset);
692                 }
693 
694 
695                 // if out of document range, look for the next shape
696                 if (n > token.getDocument().length() || n < 0) {
697                     continue nextShape;
698                 }
699 
700                 // check the operator
701                 boolean match = false;
702                 if (rules[j].isEqual) {
703                     match = (token.getDocument().charAt(n) ==  rules[j].getMnemonic());
704                 } else {
705                     match = (token.getDocument().charAt(n) !=  rules[j].getMnemonic());
706                 }
707 
708                 if (!match) {
709                     continue nextShape;
710                 }
711             }
712 
713             //make sure the compared token delimiters don't overlap <!-->
714             if (minEndOffset <= maxBeginOffset) {
715                 continue nextShape;
716             }
717 
718             node.setStart(shape.isStart());
719             node.setEnd(shape.isEnd());
720             node.setComment(shape.isComment());
721             node.setCdata(shape.isCdata);
722 
723             break nextShape;
724         }
725 
726     }
727 
728     /***
729      * <p>Extracts the node name from the {@link Token} if the {@link Node}
730      * is a starting or ending tag.</p>
731      *
732      * @param node target
733      */
734     protected void discoverNodeName(Node node) {
735         Token token = node.getToken();
736 
737         if (node.isStart() || node.isEnd()) {
738             // comments are treated special because and ending comment may will not
739             // have a node name <!-- <input > -->
740             if (node.isComment()) {
741 
742                 node.setName("--");
743 
744             } else if (node.isCdata()) {
745 
746                 node.setName("[CDATA[");
747 
748             } else {
749                 // find the node name delimiter
750                 //int e = token.getDocument().indexOf(" ", token.getBeginOffset() + 2);
751 
752                 //calc end of token body
753                 int etb = (node.isStart() && node.isEnd()) ? (token.getEndOffset() - 2)
754                         : (token.getEndOffset() - 1);
755 
756                 // find the start of the node attribute body
757                 int s = (!node.isStart() && node.isEnd()) ? token.getBeginOffset() + 2
758                         : token.getBeginOffset() + 1;
759 
760                 //look for the first whitespace
761                 int e = -1;
762                 indexOf: for (int i = s; i < etb; i++) {
763                     if (Character.isWhitespace(token.getDocument().charAt(i))) {
764                         e = i;
765                         break indexOf;
766                     }
767                 }
768 
769                 // end of token is the end of body
770                 if (e == -1) {
771                     e = etb;
772                 }
773 
774                 // return the full node name
775                 String nodeName = token.getDocument().substring(s, e);
776                 // separate the namespace
777                 e = nodeName.indexOf(':');
778                 if (e > -1) {
779                     node.setQname(nodeName.substring(0, e));
780                 }
781                 node.setName(nodeName.substring(e + 1));
782             }
783 
784         }
785 
786     }
787 
788     /***
789      * <p>If the {@link Node} is a starting tag and not a comment,
790      * use the {@link AttributeTokenizer} to realize the node attributes.</p>
791      *
792      * @param node target
793      */
794     protected void discoverNodeAttributes(Node node) {
795         Token token = node.getToken();
796         Attributes attributes = this.new Attributes();
797         node.setAttributes(attributes);
798 
799         // look for attribute in a beginning tag only
800         if (node.isStart() && (!node.isComment() && !node.isCdata())) {
801 
802             int e = (node.isStart() && node.isEnd()) ? (token.getEndOffset() - 2)
803                     : (token.getEndOffset() - 1);
804 
805             int s = -1;
806             indexOf: for (int i = token.getBeginOffset() + 2; i < e; i++) {
807                 if (Character.isWhitespace(token.getDocument().charAt(i))) {
808                     s = i;
809                     break indexOf;
810                 }
811             }
812 
813             if (s > -1 && s < e) {
814 
815                 // find the tokens and load them into the attributes map
816                 AttributeTokenizer tokenizer = new AttributeTokenizer(token
817                         .getDocument(), s, e, token.getLineNumber(), token.getLineBeginOffset());
818                 Iterator at = tokenizer.iterator();
819                 while (at.hasNext()) {
820                     Map.Entry attribute = (Map.Entry) at.next();
821                     attributes.add(attribute);
822                 }
823             }
824 
825         }
826 
827     }
828 
829     /***
830      * <p>Explicitly sets the <code>isEnd</code> {@link Node} property to <code>true</code> for
831      * self terminating tags.  Sets the {@link Node}'s <code>isWellFormed</code> property
832      * to <code>true</code> if the <code>isStart</code> and <code>isEnd</code>
833      * {@link Node} properties are <code>true</code>.</p>
834      *
835      * @param node target
836      */
837     protected void discoverNodeOverrides(Node node) {
838         //look for self terminating tags
839         if (node.isStart() && isSelfTerminating(node.getName())) {
840             node.setEnd(true);
841         }
842 
843         // begin and end tag found on a self terminating node <xxx/>
844         if (node.isStart() && node.isEnd()) {
845             node.setWellFormed(true);
846         }
847 
848     }
849 
850     /***
851      * <p>Defines a parsing {@link Parser.Rule} used to determine
852      * the {@link Parser.Shape} of a {@link Node}.</p>
853      */
854     static class Rule {
855         /***
856          * <p>The target char to check for in the {@link Token} document.</p>
857          */
858         private char mnemonic = ' ';
859 
860         /***
861          * <p>A boolen flag that indicates if the <code>offset</code> is from
862          * the begining of the {@link Token} offset or the ending offset.</p>
863          */
864         private boolean isBegin = false;
865 
866         /***
867          * <p>The offset from the start or end of the {@link Token} that the
868          * <code>mnemonic</code> should be found.</p>
869          */
870         private int offset = 0;
871         /***
872          * <p>A boolean value that determines the relational operator used
873          * to compare the <code>mnemonic</code> to the {@link Token} begin
874          * or ending offset plus the {@link Parser.Rule} offset.  If the value
875          * is <code>true</code> the equals operator is used; otherwise,
876          * the not equals operator is used in the comparison.</p>
877          */
878         private boolean isEqual = false;
879 
880         /***
881          * <p>Overloaded constructor for the immutable object.</p>
882          * @param mnemonic character looked for in the token
883          * @param isBegin boolean that determines if the begining or ending of the Token is used
884          * @param offset the offset from the begin or ending Token
885          * @param isEqual boolean that determines if the = or != operator is used to check the mnemonic
886          */
887         public Rule(char mnemonic, boolean isBegin, int offset, boolean isEqual) {
888             this.mnemonic = mnemonic;
889             this.isBegin = isBegin;
890             this.offset = offset;
891             this.isEqual = isEqual;
892         }
893         /***
894          * <p>Returns the character looked for in the {@link Token}.</p>
895          *
896          * @return searched for token
897          */
898         public char getMnemonic() {
899             return mnemonic;
900         }
901         /***
902          * <p>Returns <code>true</code> if the <code>mnemonic</code> is at the
903          * begin or end of the token plus the <code>offset</code>.</p>
904          *
905          * @return <code>true</code> search from the start
906          */
907         public boolean isBegin() {
908             return isBegin;
909         }
910         /***
911          * <p>Returns a positive or negative offset from the begin or ending
912          * {@link Token} offset withing the document.</p>
913          *
914          * @return offset for the begining or ending of the token
915          */
916         public int getOffset() {
917             return offset;
918         }
919 
920         /***
921          * <p>Returns <code>true</code> if the equal relational operator is
922          * used for the <code>mnemonic</code> comparison; otherwise the not
923          * equal operator is used.</p>
924          *
925          * @return use relational operator
926          */
927         public boolean isEqual() {
928             return isEqual;
929         }
930     }
931 
932     /***
933      * <p>This class defines the shape of the {@link Node} by characterizing
934      * if the {@link Token} is a begin, end or comment tag.</p>
935      */
936     static class Shape {
937 
938         /***
939          * <p>If <code>true</code> it indicates a starting node.</p>
940          */
941         private boolean isStart = false;
942 
943         /***
944          * <p>If <code>true</code> it indicates an ending node.</p>
945          */
946         private boolean isEnd = false;
947 
948         /***
949          * <p>If <code>true</code> it indicates a comment node.</p>
950          */
951         private boolean isComment = false;
952 
953 
954         /***
955          * <p>If <code>true</code> it indicates a CDATA node.</p>
956          */
957         private boolean isCdata = false;
958 
959 
960         /***
961          * <p>An array of {@link Parser.Rule}s used to determine if the
962          * {@link Node} matches the {@link Parser.Shape}.</p>
963          */
964         private Rule[] rules = null;
965 
966         /***
967          * <p>Overloaded constructor used to instantiate the immutable object.</p>
968          *
969          * @param isStart starting node
970          * @param isEnd ending node
971          * @param isComment comment node
972          * @param isCdata cdata node
973          * @param rules define the node
974          */
975         public Shape(boolean isStart, boolean isEnd, boolean isComment, boolean isCdata, Rule[] rules) {
976             this.isStart = isStart;
977             this.isEnd = isEnd;
978             this.isComment = isComment;
979             this.isCdata = isCdata;
980             this.rules = rules;
981         }
982 
983         /***
984          * <p>Returns <code>true</code> if the {@link Token} is a starting tag.</p>
985          *
986          * @return is a starting tag
987          */
988         public boolean isStart() {
989             return isStart;
990         }
991         /***
992          * <p>Returns <code>true</code> if the {@link Token} is an ending tag.</p>
993          *
994          * @return is a ending tag
995          */
996         public boolean isEnd() {
997             return isEnd;
998         }
999         /***
1000          * <p>Returns <code>true</code> if the {@link Token} is a comment tag.</p>
1001          *
1002          * @return is a comment
1003          */
1004         public boolean isComment() {
1005             return isComment;
1006         }
1007         /***
1008          * <p>Returns <code>true</code> if the {@link Token} is a CDATA tag.</p>
1009          *
1010          * @return is a cdata
1011          */
1012         public boolean isCdata() {
1013             return isCdata;
1014         }
1015 
1016         /***
1017          * <p>Returns the {@link Parser.Rule}s that define the <code>isStart</code>,
1018          * <code>isEnd</code> and <code>isComment</code> characteristics.</p>
1019          *
1020          * @return rules defining the type of node
1021          */
1022         public Rule[] getRules() {
1023             return rules;
1024         }
1025     }
1026 
1027 }