2009/05/20 - Apache Shale has been retired.

For more information, please explore the Attic.


1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to you under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * $Id: NodeTokenizer.java 464373 2006-10-16 04:21:54Z rahul $
20   */
21  package org.apache.shale.clay.parser;
22  
23  import java.util.ArrayList;
24  import java.util.Iterator;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.shale.util.Messages;
29  
30  /***
31   * <p>
32   * Splits a document into tokens using the following delimiters "<>". The
33   * tokens are represented by a starting and ending offset so a bunch of strings
34   * are not created until the content is needed.
35   * </p>
36   */
37  
38  public class NodeTokenizer {
39  
40      /***
41       * <p>
42       * Message resources for this class.
43       * </p>
44       */
45      private static Messages messages = new Messages(
46              "org.apache.shale.clay.Bundle", NodeTokenizer.class
47              .getClassLoader());
48  
49      /***
50       * <p>
51       * Common logger utility instance.
52       * </p>
53       */
54      private static Log log;
55      static {
56          log = LogFactory.getLog(NodeTokenizer.class);
57      }
58  
59      /***
60       * <p>
61       * The complete document being parsed.
62       * </p>
63       */
64      private StringBuffer buffer = null;
65  
66      /***
67       * <p>
68       * Constructor with the complete document to parse.
69       * </p>
70       *
71       * @param buffer document
72       */
73      public NodeTokenizer(StringBuffer buffer) {
74          this.buffer = buffer;
75      }
76  
77      /***
78       *
79       * <p>
80       * Inner class implementing the {@link Token} interface.
81       * </p>
82       *
83       */
84      private class TokenOffset implements Token {
85          /***
86           * <p>Starting offset of the token.</p>
87           */
88          private int beginOffset = 0;
89          /***
90           * <p>Ending offset of the token.</p>
91           */
92          private int endOffset = 0;
93          /***
94           * <p>Line number the token was found on.</p>
95           */
96          private int lineNumber = 0;
97          /***
98           * <p>Offset the begining line was found.</p>
99           */
100         private int lineBeginOffset = 0;
101 
102         /***
103          * <p>
104          * Constructor requires the begining and ending offset within the
105          * document of the {@link Node}.
106          * </p>
107          *
108          * @param beginOffset starting offset
109          * @param endOffset ending offset
110          * @param lineNumber line the token is found within the document
111          * @param lineBeginOffset offset the line is in the document
112          */
113         public TokenOffset(int beginOffset, int endOffset, int lineNumber, int lineBeginOffset) {
114             this.beginOffset = beginOffset;
115             this.endOffset = endOffset;
116             this.lineNumber = lineNumber;
117             this.lineBeginOffset = lineBeginOffset;
118         }
119 
120         /***
121          * <p>
122          * Returns the begining offset within the document for the {@link Node}.
123          * </p>
124          *
125          * @return begining offset
126          */
127         public int getBeginOffset() {
128             return beginOffset;
129         }
130 
131         /***
132          * <p>
133          * Returns the ending offset within the document for the {@link Node}.</p>
134          *
135          * @return ending offset
136          */
137         public int getEndOffset() {
138             return endOffset;
139         }
140 
141         /***
142          * <p>
143          * Returns the complete document being parsed.
144          * </p>
145          *
146          * @return document
147          */
148         public StringBuffer getDocument() {
149             return buffer;
150         }
151 
152         /***
153          * <p>
154          * Returns the raw representation of the {@link Node} within the
155          * document identified by the {@link Token}.
156          * </p>
157          *
158          * @return node text for beginOffset to endOffset
159          */
160         public String getRawText() {
161             String pickel = null;
162             try {
163                 pickel = buffer.substring(beginOffset, endOffset);
164             } catch (RuntimeException e) {
165                 log.error(toString(), e);
166                 throw e;
167             }
168             return pickel;
169         }
170 
171         /***
172          * @return describes the objects state
173          */
174         public String toString() {
175             return messages.getMessage("node.token.range",
176                     new Object[] {
177                 new Integer(beginOffset),
178                         new Integer(endOffset),
179                         new Integer(lineNumber),
180                         new Integer(lineBeginOffset)});
181         }
182 
183         /***
184          * <p>Returns the line number in the document that the node starts.</p>
185          *
186          * @return line number
187          */
188         public int getLineNumber() {
189            return lineNumber;
190         }
191 
192         /***
193          * <p>Returns the line begining offset in the document that the node starts.</p>
194          *
195          * @return line begin offset within the document
196          */
197         public int getLineBeginOffset() {
198            return lineBeginOffset;
199         }
200 
201     }
202 
203     /***
204      * <p>
205      * This method is passed an empty <code>ArrayList</code> that should be
206      * populated into {@link Token} offsets.
207      * </p>
208      *
209      * @param tokenIndex all the document tokens
210      */
211     protected void index(ArrayList tokenIndex) {
212         int s = 0;
213         int lineNumber = 1;
214         int lineBeginOffset = 0;
215 
216         if (log.isDebugEnabled()) {
217             log.debug(messages.getMessage("node.document.size",
218                     new Object[] { new Integer(buffer.length()) }));
219         }
220 
221         for (int i = 0; i < buffer.length(); i++) {
222 
223             if (buffer.charAt(i) == '<') {
224                 if (i > s && s < i) {
225 
226                     TokenOffset offset = new TokenOffset(s, i, lineNumber, lineBeginOffset);
227                     tokenIndex.add(offset);
228 
229                     if (log.isDebugEnabled()) {
230                         log.debug(messages.getMessage("node.token.range",
231                                 new Object[] {
232                             new Integer(offset.getBeginOffset()),
233                                     new Integer(offset.getEndOffset()),
234                                     new Integer(offset.getLineNumber()),
235                                     new Integer(offset.getLineBeginOffset())}));
236                     }
237 
238                 }
239                 s = i;
240             } else if (buffer.charAt(i) == '>') {
241                 if (i > s) {
242                     TokenOffset offset = new TokenOffset(s, i + 1, lineNumber, lineBeginOffset);
243                     tokenIndex.add(offset);
244 
245                     if (log.isDebugEnabled()) {
246                         log.debug(messages.getMessage("node.token.range",
247                                 new Object[] {
248                             new Integer(offset.getBeginOffset()),
249                                     new Integer(offset.getEndOffset()),
250                                     new Integer(offset.getLineNumber()),
251                                     new Integer(offset.getLineBeginOffset())}));
252                     }
253                 }
254                 s = i + 1;
255             } else if (buffer.charAt(i) == '\n') {
256                 lineNumber++;
257                 lineBeginOffset = i;
258             }
259 
260         }
261 
262         if ((buffer.length()) > s + 1) {
263             TokenOffset offset = new TokenOffset(s, (buffer.length()), lineNumber, lineBeginOffset);
264             tokenIndex.add(offset);
265 
266             if (log.isDebugEnabled()) {
267                 log.debug(messages.getMessage("node.token.range",
268                         new Object[] {
269                     new Integer(offset.getBeginOffset()),
270                             new Integer(offset.getEndOffset()),
271                             new Integer(offset.getLineNumber()),
272                             new Integer(offset.getLineBeginOffset())}));
273             }
274 
275         }
276 
277     }
278 
279     /***
280      * <p>This inner class implements the <code>Iterator</code>
281      * interface and is used to enumerate the {@link Token}
282      * offset that define the document.  It's a decorator for
283      * an <code>Iterator</code> of a internal collection of
284      * node offsets.
285      * </p>
286      */
287     private class TokenIterator implements Iterator {
288 
289         /***
290          * <p>All the document tokens.</p>
291          */
292         private ArrayList tokenIndex = null;
293 
294         /***
295          * <p>Current tokenIndex iterator.</p>
296          */
297         private Iterator ti = null;
298 
299         /***
300          * <p>Constructor invokes the <code>index</code> method
301          * passing an <code>ArrayList</code> to populate with
302          * {@link Token} offsets.
303          * </p>
304          */
305         public TokenIterator() {
306             tokenIndex = new ArrayList();
307             index(tokenIndex);
308             ti = tokenIndex.iterator();
309         }
310 
311         /***
312          * <p>Retuns <code>true</code> if there are more {@link Token} node
313          * offsets within the parsed document.
314          * </p>
315          *
316          * @return <code>true</code> if there are more tokens
317          */
318         public boolean hasNext() {
319             return ti.hasNext();
320         }
321 
322         /***
323          * <p>Returns the next {@link Token} in the document.</p>
324          *
325          * @return the next document token
326          */
327         public Object next() {
328             TokenOffset offset = (TokenOffset) ti.next();
329             return offset;
330         }
331 
332         /***
333          * <p>Not supported.</p>
334          *
335          * @deprecated
336          */
337         public void remove() {
338             // NA
339         }
340     }
341 
342     /***
343      * <p>Returns an implementation of the <code>Iterator</code>
344      * interface to enumerate the nodes within the document.  Each
345      * node is defined using a {@link Token} interface.
346      * </p>
347      *
348      * @return TokenIterator
349      */
350     public Iterator iterator() {
351         return new TokenIterator();
352     }
353 
354 }