2009/05/20 - Apache Shale has been retired.
For more information, please explore the Attic.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.apache.shale.clay.parser;
22
23 import java.util.ArrayList;
24 import java.util.Iterator;
25
26 import org.apache.commons.logging.Log;
27 import org.apache.commons.logging.LogFactory;
28 import org.apache.shale.util.Messages;
29
30 /***
31 * <p>
32 * Splits a document into tokens using the following delimiters "<>". The
33 * tokens are represented by a starting and ending offset so a bunch of strings
34 * are not created until the content is needed.
35 * </p>
36 */
37
38 public class NodeTokenizer {
39
40 /***
41 * <p>
42 * Message resources for this class.
43 * </p>
44 */
45 private static Messages messages = new Messages(
46 "org.apache.shale.clay.Bundle", NodeTokenizer.class
47 .getClassLoader());
48
49 /***
50 * <p>
51 * Common logger utility instance.
52 * </p>
53 */
54 private static Log log;
55 static {
56 log = LogFactory.getLog(NodeTokenizer.class);
57 }
58
59 /***
60 * <p>
61 * The complete document being parsed.
62 * </p>
63 */
64 private StringBuffer buffer = null;
65
66 /***
67 * <p>
68 * Constructor with the complete document to parse.
69 * </p>
70 *
71 * @param buffer document
72 */
73 public NodeTokenizer(StringBuffer buffer) {
74 this.buffer = buffer;
75 }
76
77 /***
78 *
79 * <p>
80 * Inner class implementing the {@link Token} interface.
81 * </p>
82 *
83 */
84 private class TokenOffset implements Token {
85 /***
86 * <p>Starting offset of the token.</p>
87 */
88 private int beginOffset = 0;
89 /***
90 * <p>Ending offset of the token.</p>
91 */
92 private int endOffset = 0;
93 /***
94 * <p>Line number the token was found on.</p>
95 */
96 private int lineNumber = 0;
97 /***
98 * <p>Offset the begining line was found.</p>
99 */
100 private int lineBeginOffset = 0;
101
102 /***
103 * <p>
104 * Constructor requires the begining and ending offset within the
105 * document of the {@link Node}.
106 * </p>
107 *
108 * @param beginOffset starting offset
109 * @param endOffset ending offset
110 * @param lineNumber line the token is found within the document
111 * @param lineBeginOffset offset the line is in the document
112 */
113 public TokenOffset(int beginOffset, int endOffset, int lineNumber, int lineBeginOffset) {
114 this.beginOffset = beginOffset;
115 this.endOffset = endOffset;
116 this.lineNumber = lineNumber;
117 this.lineBeginOffset = lineBeginOffset;
118 }
119
120 /***
121 * <p>
122 * Returns the begining offset within the document for the {@link Node}.
123 * </p>
124 *
125 * @return begining offset
126 */
127 public int getBeginOffset() {
128 return beginOffset;
129 }
130
131 /***
132 * <p>
133 * Returns the ending offset within the document for the {@link Node}.</p>
134 *
135 * @return ending offset
136 */
137 public int getEndOffset() {
138 return endOffset;
139 }
140
141 /***
142 * <p>
143 * Returns the complete document being parsed.
144 * </p>
145 *
146 * @return document
147 */
148 public StringBuffer getDocument() {
149 return buffer;
150 }
151
152 /***
153 * <p>
154 * Returns the raw representation of the {@link Node} within the
155 * document identified by the {@link Token}.
156 * </p>
157 *
158 * @return node text for beginOffset to endOffset
159 */
160 public String getRawText() {
161 String pickel = null;
162 try {
163 pickel = buffer.substring(beginOffset, endOffset);
164 } catch (RuntimeException e) {
165 log.error(toString(), e);
166 throw e;
167 }
168 return pickel;
169 }
170
171 /***
172 * @return describes the objects state
173 */
174 public String toString() {
175 return messages.getMessage("node.token.range",
176 new Object[] {
177 new Integer(beginOffset),
178 new Integer(endOffset),
179 new Integer(lineNumber),
180 new Integer(lineBeginOffset)});
181 }
182
183 /***
184 * <p>Returns the line number in the document that the node starts.</p>
185 *
186 * @return line number
187 */
188 public int getLineNumber() {
189 return lineNumber;
190 }
191
192 /***
193 * <p>Returns the line begining offset in the document that the node starts.</p>
194 *
195 * @return line begin offset within the document
196 */
197 public int getLineBeginOffset() {
198 return lineBeginOffset;
199 }
200
201 }
202
203 /***
204 * <p>
205 * This method is passed an empty <code>ArrayList</code> that should be
206 * populated into {@link Token} offsets.
207 * </p>
208 *
209 * @param tokenIndex all the document tokens
210 */
211 protected void index(ArrayList tokenIndex) {
212 int s = 0;
213 int lineNumber = 1;
214 int lineBeginOffset = 0;
215
216 if (log.isDebugEnabled()) {
217 log.debug(messages.getMessage("node.document.size",
218 new Object[] { new Integer(buffer.length()) }));
219 }
220
221 for (int i = 0; i < buffer.length(); i++) {
222
223 if (buffer.charAt(i) == '<') {
224 if (i > s && s < i) {
225
226 TokenOffset offset = new TokenOffset(s, i, lineNumber, lineBeginOffset);
227 tokenIndex.add(offset);
228
229 if (log.isDebugEnabled()) {
230 log.debug(messages.getMessage("node.token.range",
231 new Object[] {
232 new Integer(offset.getBeginOffset()),
233 new Integer(offset.getEndOffset()),
234 new Integer(offset.getLineNumber()),
235 new Integer(offset.getLineBeginOffset())}));
236 }
237
238 }
239 s = i;
240 } else if (buffer.charAt(i) == '>') {
241 if (i > s) {
242 TokenOffset offset = new TokenOffset(s, i + 1, lineNumber, lineBeginOffset);
243 tokenIndex.add(offset);
244
245 if (log.isDebugEnabled()) {
246 log.debug(messages.getMessage("node.token.range",
247 new Object[] {
248 new Integer(offset.getBeginOffset()),
249 new Integer(offset.getEndOffset()),
250 new Integer(offset.getLineNumber()),
251 new Integer(offset.getLineBeginOffset())}));
252 }
253 }
254 s = i + 1;
255 } else if (buffer.charAt(i) == '\n') {
256 lineNumber++;
257 lineBeginOffset = i;
258 }
259
260 }
261
262 if ((buffer.length()) > s + 1) {
263 TokenOffset offset = new TokenOffset(s, (buffer.length()), lineNumber, lineBeginOffset);
264 tokenIndex.add(offset);
265
266 if (log.isDebugEnabled()) {
267 log.debug(messages.getMessage("node.token.range",
268 new Object[] {
269 new Integer(offset.getBeginOffset()),
270 new Integer(offset.getEndOffset()),
271 new Integer(offset.getLineNumber()),
272 new Integer(offset.getLineBeginOffset())}));
273 }
274
275 }
276
277 }
278
279 /***
280 * <p>This inner class implements the <code>Iterator</code>
281 * interface and is used to enumerate the {@link Token}
282 * offset that define the document. It's a decorator for
283 * an <code>Iterator</code> of a internal collection of
284 * node offsets.
285 * </p>
286 */
287 private class TokenIterator implements Iterator {
288
289 /***
290 * <p>All the document tokens.</p>
291 */
292 private ArrayList tokenIndex = null;
293
294 /***
295 * <p>Current tokenIndex iterator.</p>
296 */
297 private Iterator ti = null;
298
299 /***
300 * <p>Constructor invokes the <code>index</code> method
301 * passing an <code>ArrayList</code> to populate with
302 * {@link Token} offsets.
303 * </p>
304 */
305 public TokenIterator() {
306 tokenIndex = new ArrayList();
307 index(tokenIndex);
308 ti = tokenIndex.iterator();
309 }
310
311 /***
312 * <p>Retuns <code>true</code> if there are more {@link Token} node
313 * offsets within the parsed document.
314 * </p>
315 *
316 * @return <code>true</code> if there are more tokens
317 */
318 public boolean hasNext() {
319 return ti.hasNext();
320 }
321
322 /***
323 * <p>Returns the next {@link Token} in the document.</p>
324 *
325 * @return the next document token
326 */
327 public Object next() {
328 TokenOffset offset = (TokenOffset) ti.next();
329 return offset;
330 }
331
332 /***
333 * <p>Not supported.</p>
334 *
335 * @deprecated
336 */
337 public void remove() {
338
339 }
340 }
341
342 /***
343 * <p>Returns an implementation of the <code>Iterator</code>
344 * interface to enumerate the nodes within the document. Each
345 * node is defined using a {@link Token} interface.
346 * </p>
347 *
348 * @return TokenIterator
349 */
350 public Iterator iterator() {
351 return new TokenIterator();
352 }
353
354 }