2009/05/20 - Apache Shale has been retired.

For more information, please explore the Attic.

View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to you under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * $Id: AttributeTokenizer.java 464373 2006-10-16 04:21:54Z rahul $
20   */
21  package org.apache.shale.clay.parser;
22  
23  import java.util.ArrayList;
24  import java.util.Iterator;
25  import java.util.Map;
26  
27  import org.apache.commons.logging.Log;
28  import org.apache.commons.logging.LogFactory;
29  import org.apache.shale.util.Messages;
30  
31  /***
32   * <p>
33   * Tokenizes a portion of the document for attributes. The complete document is
34   * passed by reference and new {@link Token} offsets are created for the name
35   * and value of the discovered attributes.
36   * </p>
37   */
38  
39  public class AttributeTokenizer {
40  
41      /***
42       * <p>
43       * Message resources for this class.
44       * </p>
45       */
46      private static Messages messages = new Messages(
47              "org.apache.shale.clay.Bundle", AttributeTokenizer.class
48              .getClassLoader());
49  
50      /***
51       * <p>
52       * Common logging utility.
53       * </p>
54       */
55      private static Log log;
56      static {
57          log = LogFactory.getLog(AttributeTokenizer.class);
58      }
59  
60      /***
61       * <p>
62       * Internal document buffer.
63       * </p>
64       */
65      private StringBuffer buffer = null;
66  
67      /***
68       * <p>
69       * Beginning offset of the starting node.
70       * </p>
71       */
72      private int beginOffset = 0;
73  
74      /***
75       * <p>
76       * Ending offset of the starting node.
77       * </p>
78       */
79      private int endOffset = 0;
80  
81      /***
82       * <p>Line number the target node is located in.</p>
83       */
84      private int lineNumber = 0;
85  
86      /***
87       * <p>Line begining document offset where the target node is located.</p>
88       */
89      private int lineBeginOffset = 0;
90  
91  
92      /***
93       * <p>
94       * Overloaded constructor that is passed the complete document and the
95       * starting and ending offset of the node body within the document.
96       * </p>
97       *
98       * @param buffer document
99       * @param beginOffset start index of node body in the document
100      * @param endOffset end index of node body in the document
101      * @param lineNumber line number of the node within the document
102      * @param lineBeginOffset index in the document that the line begins
103      */
104     public AttributeTokenizer(StringBuffer buffer, int beginOffset,
105             int endOffset, int lineNumber, int lineBeginOffset) {
106         this.buffer = buffer;
107         this.beginOffset = beginOffset;
108         this.endOffset = endOffset;
109         this.lineBeginOffset = lineBeginOffset;
110         this.lineNumber = lineNumber;
111 
112     }
113 
114     /***
115      * <p>
116      * Inner class implementing the {@link Token} interface. This class will
117      * define an attribute's key and value offsets
118      * </p>
119      */
120     private class TokenOffset implements Token {
121         /***
122          * <p>Starting offset of the token.</p>
123          */
124         private int beginOffset = 0;
125 
126         /***
127          * <p>Ending offset of the token.</p>
128          */
129         private int endOffset = 0;
130 
131         /***
132          * @param beginOffset token start index
133          * @param endOffset token end index
134          */
135         public TokenOffset(int beginOffset, int endOffset) {
136             this.beginOffset = beginOffset;
137             this.endOffset = endOffset;
138         }
139 
140         /***
141          * @return starting offset of the token in the document
142          */
143         public int getBeginOffset() {
144             return beginOffset;
145         }
146 
147         /***
148          * @return ending offset of the token in the document
149          */
150         public int getEndOffset() {
151             return endOffset;
152         }
153 
154         /***
155          * @return parsed document
156          */
157         public StringBuffer getDocument() {
158             return buffer;
159         }
160 
161         /***
162          * @return token text between the beginOffset and endOffset
163          */
164         public String getRawText() {
165             String pickel = null;
166             try {
167                 pickel = buffer.substring(beginOffset, endOffset);
168             } catch (RuntimeException e) {
169                 log.error(toString(), e);
170                 throw e;
171             }
172             return pickel;
173         }
174 
175         /***
176          * @return line number the token is found on within the document
177          */
178         public int getLineNumber() {
179            return lineNumber;
180         }
181 
182         /***
183          * @return offset within the document that the token line is found
184          */
185         public int getLineBeginOffset() {
186            return lineBeginOffset;
187         }
188 
189         /***
190          * @return description of the token
191          */
192         public String toString() {
193             return messages.getMessage("node.token.range",
194                     new Object[] {
195                 new Integer(beginOffset),
196                         new Integer(endOffset),
197                         new Integer(lineNumber),
198                         new Integer(lineBeginOffset)});
199         }
200 
201     }
202 
203     /***
204      * <p>
205      * This inner class implements the <code>Map.Entry</code> interfaces. It
206      * holds a reference to the key and value parts of an attribute. Both the
207      * key and value attributes are {@link Token} instances.
208      * </p>
209      */
210     private class AttributeEntry implements Map.Entry {
211         /***
212          * <p>Token offset of the attribute key.</p>
213          */
214         private TokenOffset key = null;
215 
216         /***
217          * <p>Token offset of the attribute value.</p>
218          */
219         private TokenOffset value = null;
220 
221         /***
222          * <p>
223          * Overloaded constructor is passed a {@link Token} for the key and
224          * value attributes.
225          * </p>
226          *
227          * @param key token key offset
228          * @param value token value offset
229          */
230         public AttributeEntry(TokenOffset key, TokenOffset value) {
231             this.key = key;
232             this.value = value;
233         }
234 
235         /***
236          * <p>
237          * Returns the attribute name {@link Token} offset.
238          * </p>
239          *
240          * @return TokenOffset for the attribute key
241          */
242         public Object getKey() {
243             return key;
244         }
245 
246         /***
247          * <p>
248          * Returns the attribute value {@link Token} offset.
249          * </p>
250          *
251          * @return TokenOffset of the attribute value
252          */
253         public Object getValue() {
254             return value;
255         }
256 
257         /***
258          * <p>
259          * Sets the attribute value {@link Token} offset.
260          * </p>
261          *
262          * @param value TokenOffset value
263          * @return value token offset
264          */
265         public Object setValue(Object value) {
266             this.value = (TokenOffset) value;
267             return value;
268         }
269 
270         /***
271          * @return description of the attribute
272          */
273         public String toString() {
274             StringBuffer buff = new StringBuffer();
275             TokenOffset key = (TokenOffset) getKey();
276             TokenOffset value = (TokenOffset) getValue();
277 
278             buff.append("key: [").append((key != null ? key.getRawText() : null))
279             .append("]\n").append("value: [")
280             .append((value != null ? value.getRawText() : null))
281             .append("]");
282 
283             return buff.toString();
284         }
285     }
286 
287     /***
288      * <p>
289      * The current offset within the <code>beginOffset</code> and
290      * <code>endOffset</code> of the Node within the document.
291      */
292     private int currOffset = 0;
293 
294     /***
295      * <p>
296      * Builds an <code>ArrayList</code> of
297      * {@link AttributeTokenizer.AttributeEntry} instances identifying
298      * name and value pairs.
299      * </p>
300      *
301      * @param tokenIndex populated attribute offset of a beging node body
302      */
303     protected synchronized void parse(ArrayList tokenIndex) {
304         currOffset = beginOffset;
305 
306         if (log.isDebugEnabled()) {
307             log.debug(messages.getMessage("attribute.range", new Object[] {
308                 new Integer(beginOffset), new Integer(endOffset) }));
309         }
310 
311         while (currOffset < endOffset) {
312             // skip leading spaces
313             int startOffset = currOffset;
314             while (Character.isWhitespace(buffer.charAt(currOffset))) {
315                 currOffset++;
316             }
317 
318             if (log.isDebugEnabled()) {
319                 if (currOffset > startOffset) {
320                     log.debug(messages
321                             .getMessage("attribute.skip.space",
322                             new Object[] { new Integer(currOffset
323                                     - startOffset) }));
324                 }
325             }
326 
327             // looks for the key value delimiter
328             TokenOffset key = nextToken(currOffset, " ", "=", true);
329             if (key == null) {
330                 break;
331             }
332 
333             boolean skipValue = false;
334             currOffset++;
335             String delim = " "; // old school html color=red
336             String otherDelim = "\"";
337             if (currOffset < buffer.length()
338                 && buffer.charAt(currOffset) == '"') {
339 
340                 // xmlish attribute
341                 delim = "\"";
342                 otherDelim = " ";
343                 currOffset++;
344             } else if (currOffset < buffer.length() && currOffset > 0
345                     && buffer.charAt(currOffset - 1) == ' ') {
346 
347                 //attribute without value
348                 currOffset--;        //back up <option selected value=
349                 skipValue = true;
350             }
351 
352             TokenOffset value = null;
353             if (!skipValue) {   // no value part <option selected value=xxx>
354                value = nextToken(currOffset, delim, otherDelim, false);
355             }
356 
357             tokenIndex.add(new AttributeEntry(key, value));
358 
359             currOffset++;
360             key = null;
361             value = null;
362         }
363 
364         if (log.isDebugEnabled()) {
365             log.debug(messages.getMessage("attributes.total.found",
366                     new Object[] { new Integer(tokenIndex.size()) }));
367         }
368 
369     }
370 
371     /***
372      * <p>
373      * Returns the next {@link Token} given an <code>startOffset</code> and a
374      * <code>endDelim</code>.
375      * </p>
376      *
377      * @param startOffset begining offset in the document
378      * @param endDelim primary token delimiter
379      * @param otherDelim secondary token delimiter
380      * @param isKey looking for an attribute name not a value
381      * @return next token offset
382      */
383     protected TokenOffset nextToken(int startOffset, String endDelim, String otherDelim, boolean isKey) {
384         //If isKey is true, we are looking for an attribute name with a endDelim or otherDelim.
385         //Pick the one that comes first.
386 
387         //If isKey is false we are looking for an attribute value.  The endDelim is the best guess
388         //and the otherDelim is the next best guess.
389         if (isKey) {
390             int offsetEnd = Math.min(buffer.indexOf(endDelim, startOffset), endOffset);
391             int offsetOther = Math.min(buffer.indexOf(otherDelim, startOffset), endOffset);
392             if (offsetEnd == -1) {
393                currOffset = offsetOther;
394             } else if (offsetOther == -1) {
395                currOffset = offsetOther;
396             } else {
397                currOffset = Math.min(offsetEnd, offsetOther);
398             }
399         } else {
400            currOffset = Math.min(buffer.indexOf(endDelim, startOffset), endOffset);
401            // try another delimiter
402            if (currOffset == -1) {
403               currOffset = Math.min(buffer.indexOf(otherDelim, startOffset), endOffset);
404            }
405         }
406 
407 
408         if (currOffset == -1) {
409             currOffset = endOffset;
410         }
411 
412         // look for the value delimiter or the end of the parse fragment,
413         // whichever comes first
414         if (currOffset > -1 && currOffset <= endOffset && startOffset < currOffset) {
415 
416             int e = currOffset;
417             //forgive an attribute with Inconsistent delimiters, color=red"
418             if (buffer.charAt(e - 1) == '"'
419                 || (Character.isWhitespace(buffer.charAt(e - 1))
420                 && buffer.charAt(e - 1) != ' ')) {
421               --e;
422             }
423 
424 
425             TokenOffset value = new TokenOffset(startOffset, e);
426 
427             if (log.isDebugEnabled()) {
428                 log.debug(messages.getMessage("attribute.token.range",
429                         new Object[] { new Integer(startOffset),
430                                 new Integer(e) }));
431             }
432 
433             return value;
434         }
435 
436         return null;
437     }
438 
439     /***
440      * <p>Inner class implementing the <code>Iterator</code>
441      * interface. This class is a decorator of a <code>ArrayList</code>
442      * of nodes.
443      * </p>
444      */
445     private class TokenIterator implements Iterator {
446 
447         /***
448          * <p>All the attribute entry tokens in the node body.</p>
449          */
450         private ArrayList tokenIndex = null;
451 
452         /***
453          * <p>Internal <code>tokenIndex</code> iterator.</p>
454          */
455         private Iterator ti = null;
456 
457         /***
458          * <p>Constructor parses the node body into a collection of
459          * {@link AttributeTokenizer.AttributeEntry}.
460          * </p>
461          */
462         public TokenIterator() {
463             tokenIndex = new ArrayList();
464             parse(tokenIndex);
465             ti = tokenIndex.iterator();
466         }
467 
468         /***
469          * <p>Retuns <code>true</code> if there are more
470          * {@link AttributeTokenizer.AttributeEntry} in the collection.
471          * </p>
472          *
473          * @return <code>true</code> if there are more tokens
474          */
475         public boolean hasNext() {
476             return ti.hasNext();
477         }
478 
479         /***
480          * <p>Retuns the next {@link AttributeTokenizer.AttributeEntry}
481          * in the collection.
482          * </p>
483          *
484          * @return returns the next token
485          */
486         public Object next() {
487             Map.Entry attribute = (Map.Entry) ti.next();
488             return attribute;
489         }
490 
491         /***
492          * <p>This method is not implemented.</p>
493          *
494          * @deprecated
495          */
496         public void remove() {
497             // NA
498         }
499     }
500 
501     /***
502      * <p>Returns an instance of an <code>Iterator</code> that
503      * will enumerate attributes in the document where the attributes
504      * are represented by a {@link AttributeTokenizer.AttributeEntry} instance.
505      * </p>
506      *
507      * @return returns a {@link AttributeTokenizer.TokenIterator} iterator.
508      */
509     public Iterator iterator() {
510         return new TokenIterator();
511     }
512 
513 }