Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.juneau.parser;
018
019import static org.apache.juneau.commons.utils.StringUtils.*;
020import static org.apache.juneau.commons.utils.ThrowableUtils.*;
021
022import java.io.*;
023
024import org.apache.juneau.commons.io.*;
025
026/**
027 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character.
028 *
029 * <p>
030 * Code is optimized to work with a 1 character buffer.
031 *
032 * <p>
033 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture
034 * characters from the previous mark point.
035 *
036 * <h5 class='section'>Notes:</h5><ul>
037 *    <li class='warn'>This class is not thread safe.
038 * </ul>
039 *
040 * <h5 class='section'>See Also:</h5><ul>
041 *    <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a>
042
043 * </ul>
044 */
045@SuppressWarnings("resource")
046public class ParserReader extends Reader implements Positionable {
047
048   /** Wrapped reader */
049   protected final Reader r;
050
051   private char[] buff;       // Internal character buffer
052   private int line = 1;      // Current line number
053   private int column;        // Current column number
054   private int iCurrent;      // Current pointer into character buffer
055   private int iMark = -1;    // Mark position in buffer
056   private int iEnd;          // The last good character position in the buffer
057   private boolean endReached, holesExist;
058   private final boolean unbuffered;
059
060   /**
061    * Constructor.
062    *
063    * @param pipe The parser input.
064    * @throws IOException Thrown by underlying stream.
065    */
066   public ParserReader(ParserPipe pipe) throws IOException {
067      this.unbuffered = pipe.unbuffered;
068      if (pipe.isString()) {
069         String in = pipe.getInputAsString();
070         this.r = new CharSequenceReader(in);
071         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
072      } else {
073         Reader _r = pipe.getReader();
074         if (_r instanceof ParserReader _r2)
075            this.r = _r2.r;
076         else
077            this.r = _r;
078         this.buff = new char[1024];
079      }
080      pipe.setPositionable(this);
081   }
082
083   /**
084    * No-op.
085    *
086    * <p>
087    * Input readers are closed in the {@link ParserPipe} class.
088    *
089    * @throws IOException If a problem occurred trying to read from the reader.
090    */
091   @Override /* Overridden from Reader */
092   public void close() throws IOException {
093      // No-op
094   }
095
096   /**
097    * Trims off the last character in the marking buffer.
098    *
099    * <p>
100    * Useful for removing escape characters from sequences.
101    *
102    * @return This object.
103    */
104   public final ParserReader delete() {
105      return delete(1);
106   }
107
108   /**
109    * Trims off the specified number of last characters in the marking buffer.
110    * Useful for removing escape characters from sequences.
111    *
112    * @param count The number of characters to delete.
113    * @return This object.
114    */
115   public final ParserReader delete(int count) {
116      for (var i = 0; i < count; i++)
117         buff[iCurrent - i - 1] = 127;
118      holesExist = true;
119      return this;
120   }
121
122   /**
123    * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage.
124    *
125    * @return The contents of the reusable character buffer as a string.
126    */
127   public final String getMarked() { return getMarked(0, 0); }
128
129   /**
130    * Same as {@link #getMarked()} except allows you to specify offsets into the buffer.
131    *
132    * <p>
133    * For example, to return the marked string, but trim the first and last characters, call the following:
134    * <p class='bjava'>
135    *    getFromMarked(1, -1);
136    * </p>
137    *
138    * @param offsetStart The offset of the start position.
139    * @param offsetEnd The offset of the end position.
140    * @return The contents of the reusable character buffer as a string.
141    */
142   public final String getMarked(int offsetStart, int offsetEnd) {
143      int offset = 0;
144
145      // Holes are \u00FF 'delete' characters that we need to get rid of now.
146      if (holesExist) {
147         for (var i = iMark; i < iCurrent; i++) {
148            char c = buff[i];
149            if (c == 127)
150               offset++;
151            else
152               buff[i - offset] = c;
153         }
154         holesExist = false;
155      }
156      int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset;
157      var s = new String(buff, start, len);
158      iMark = -1;
159      return s;
160   }
161
162   @Override /* Overridden from Positionable */
163   public Position getPosition() { return new Position(line, column); }
164
165   /**
166    * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}.
167    */
168   public final void mark() {
169      iMark = iCurrent;
170   }
171
172   /**
173    * Reads a numeric string from the specified reader.
174    *
175    * @return The parsed number string.
176    * @throws IOException Thrown by underlying stream.
177    */
178   public String parseNumberString() throws IOException {
179      mark();
180      int c = 0;
181      while (true) {
182         c = read();
183         if (c == -1)
184            break;
185         if (! isNumberChar((char)c)) {
186            unread();
187            break;
188         }
189      }
190      return getMarked();
191   }
192
193   /**
194    * Peeks the next character in the stream.
195    *
196    * <p>
197    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
198    *
199    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
200    * @throws IOException If a problem occurred trying to read from the reader.
201    */
202   public final int peek() throws IOException {
203      int c = read();
204      if (c != -1)
205         unread();
206      return c;
207   }
208
209   /**
210    * Same as {@link #peek()} but skips over any whitespace characters.
211    *
212    * <p>
213    * This is equivalent to doing a {@code read()} followed by an {@code unread()}.
214    *
215    * @return The peeked character, or (char)-1 if the end of the stream has been reached.
216    * @throws IOException If a problem occurred trying to read from the reader.
217    */
218   public final int peekSkipWs() throws IOException {
219      while (true) {
220         var c = read();
221         var isWs = Character.isWhitespace(c);
222         if (c != -1 && ! isWs)
223            unread();
224         if (! isWs)
225            return c;
226      }
227   }
228
229   /**
230    * Reads a single character.
231    *
232    * <p>
233    * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather
234    * returns them as two <jk>char</jk>s.
235    * Use {@link #readCodePoint()} to ensure proper handling of extended unicode.
236    *
237    * @return The character read, or -1 if the end of the stream has been reached.
238    * @throws IOException If a problem occurred trying to read from the reader.
239    */
240   @Override /* Overridden from Reader */
241   public final int read() throws IOException {
242      int c = readFromBuff();
243      if (c == -1)
244         return -1;
245      if (c == '\n') {
246         line++;
247         column = 0;
248      } else {
249         column++;
250      }
251      return c;
252   }
253
254   /**
255    * Subclasses can override this method to provide additional filtering.
256    *
257    * <p>
258    * Default implementation simply calls the same method on the underlying reader.
259    */
260   @Override /* Overridden from Reader */
261   public int read(char[] cbuf, int off, int len) throws IOException {
262      return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len);
263   }
264
265   /**
266    * Read the specified number of characters off the stream.
267    *
268    * @param num The number of characters to read.
269    * @return The characters packaged as a String.
270    * @throws IOException If a problem occurred trying to read from the reader.
271    */
272   public final String read(int num) throws IOException {
273      var c = new char[num];
274      for (var i = 0; i < num; i++) {
275         var c2 = read();
276         if (c2 == -1)
277            return new String(c, 0, i);
278         c[i] = (char)c2;
279      }
280      return new String(c);
281   }
282
283   /**
284    * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000).
285    *
286    * @return The character read, or -1 if the end of the stream has been reached.
287    * @throws IOException If a problem occurred trying to read from the reader.
288    */
289   public final int readCodePoint() throws IOException {
290      int c = read();
291
292      // Characters that take up 2 chars.
293      if (c >= 0xd800 && c <= 0xdbff) {
294         var low = read();
295         if (low >= 0xdc00 && low <= 0xdfff)
296            c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
297      }
298
299      return c;
300   }
301
302   /**
303    * Same as {@link #read()} but skips over any whitespace characters.
304    *
305    * @return The first non-whitespace character, or -1 if the end of stream reached.
306    * @throws IOException Thrown by underlying stream.
307    */
308   public final int readSkipWs() throws IOException {
309      while (true) {
310         var c = read();
311         if (c == -1 || ! Character.isWhitespace(c))
312            return c;
313      }
314   }
315
316   /**
317    * Replace the last read character in the buffer with the specified character.
318    *
319    * @param c The new character.
320    * @return This object.
321    * @throws IOException Thrown by underlying stream.
322    */
323   public final ParserReader replace(char c) throws IOException {
324      return replace(c, 1);
325   }
326
327   /**
328    * Replaces the last character in the marking buffer with the specified character.
329    *
330    * <p>
331    * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended
332    * unicode characters in order for the replacement to fit into the buffer.
333    *
334    * @param c The new character.
335    * @param offset The offset.
336    * @return This object.
337    * @throws IOException Thrown by underlying stream.
338    */
339   public final ParserReader replace(int c, int offset) throws IOException {
340      if (c < 0x10000) {
341         if (offset < 1)
342            throw ioex("Buffer underflow.");
343         buff[iCurrent - offset] = (char)c;
344      } else {
345         if (offset < 2)
346            throw ioex("Buffer underflow.");
347         c -= 0x10000;
348         buff[iCurrent - offset] = (char)(0xd800 + (c >> 10));
349         buff[iCurrent - offset + 1] = (char)(0xdc00 + (c & 0x3ff));
350         offset--;
351      }
352      // Fill in the gap with DEL characters.
353      for (var i = 1; i < offset; i++)
354         buff[iCurrent - i] = 127;
355      holesExist |= (offset > 1);
356      return this;
357   }
358
359   /**
360    * Pushes the last read character back into the stream.
361    *
362    * @return This object.
363    * @throws IOException If a problem occurred trying to read from the reader.
364    */
365   public ParserReader unread() throws IOException {
366      if (iCurrent <= 0)
367         throw ioex("Buffer underflow.");
368      iCurrent--;
369      if (column == 0)
370         line--;
371      else
372         column--;
373      return this;
374   }
375
376   private final int readFromBuff() throws IOException {
377      while (iCurrent >= iEnd) {
378         if (endReached)
379            return -1;
380
381         // If there's still space at the end of this buffer, fill it.
382         // Make sure there's at least 2 character spaces free for extended unicode characters.
383         if (iEnd + 1 < buff.length) {
384            int x = read(buff, iCurrent, buff.length - iEnd);
385            if (x == -1) {
386               endReached = true;
387               return -1;
388            }
389            iEnd += x;
390
391         } else {
392            // If we're currently marking, then we want to copy from the current mark point
393            // to the beginning of the buffer and then fill in the remainder of buffer.
394            if (iMark >= 0) {
395
396               // If we're marking from the beginning of the array, we double the size of the
397               // buffer.  This isn't likely to occur often.
398               if (iMark == 0) {
399                  var buff2 = new char[buff.length << 1];
400                  System.arraycopy(buff, 0, buff2, 0, buff.length);
401                  buff = buff2;
402
403                  // Otherwise, we copy what's currently marked to the beginning of the buffer.
404               } else {
405                  int copyBuff = iMark;
406                  System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff);
407                  iCurrent -= copyBuff;
408                  iMark -= copyBuff;
409               }
410               int expected = buff.length - iCurrent;
411
412               int x = read(buff, iCurrent, expected);
413               if (x == -1) {
414                  endReached = true;
415                  iEnd = iCurrent;
416                  return -1;
417               }
418               iEnd = iCurrent + x;
419            } else {
420               // Copy the last 10 chars in the buffer to the beginning of the buffer.
421               int copyBuff = Math.min(iCurrent, 10);
422               System.arraycopy(buff, iCurrent - copyBuff, buff, 0, copyBuff);
423
424               // Number of characters we expect to copy on the next read.
425               int expected = buff.length - copyBuff;
426               int x = read(buff, copyBuff, expected);
427               iCurrent = copyBuff;
428               if (x == -1) {
429                  endReached = true;
430                  iEnd = iCurrent;
431                  return -1;
432               }
433               iEnd = iCurrent + x;
434            }
435         }
436      }
437      return buff[iCurrent++];
438   }
439}