001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.juneau.parser; 018 019import static org.apache.juneau.commons.utils.StringUtils.*; 020import static org.apache.juneau.commons.utils.ThrowableUtils.*; 021 022import java.io.*; 023 024import org.apache.juneau.commons.io.*; 025 026/** 027 * Similar to a {@link java.io.PushbackReader} with a pushback buffer of 1 character. 028 * 029 * <p> 030 * Code is optimized to work with a 1 character buffer. 031 * 032 * <p> 033 * Additionally keeps track of current line and column number, and provides the ability to set mark points and capture 034 * characters from the previous mark point. 035 * 036 * <h5 class='section'>Notes:</h5><ul> 037 * <li class='warn'>This class is not thread safe. 038 * </ul> 039 * 040 * <h5 class='section'>See Also:</h5><ul> 041 * <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/SerializersAndParsers">Serializers and Parsers</a> 042 043 * </ul> 044 */ 045@SuppressWarnings("resource") 046public class ParserReader extends Reader implements Positionable { 047 048 /** Wrapped reader */ 049 protected final Reader r; 050 051 private char[] buff; // Internal character buffer 052 private int line = 1; // Current line number 053 private int column; // Current column number 054 private int iCurrent; // Current pointer into character buffer 055 private int iMark = -1; // Mark position in buffer 056 private int iEnd; // The last good character position in the buffer 057 private boolean endReached, holesExist; 058 private final boolean unbuffered; 059 060 /** 061 * Constructor. 062 * 063 * @param pipe The parser input. 064 * @throws IOException Thrown by underlying stream. 065 */ 066 public ParserReader(ParserPipe pipe) throws IOException { 067 this.unbuffered = pipe.unbuffered; 068 if (pipe.isString()) { 069 String in = pipe.getInputAsString(); 070 this.r = new CharSequenceReader(in); 071 this.buff = new char[in.length() < 1024 ? in.length() : 1024]; 072 } else { 073 Reader _r = pipe.getReader(); 074 if (_r instanceof ParserReader _r2) 075 this.r = _r2.r; 076 else 077 this.r = _r; 078 this.buff = new char[1024]; 079 } 080 pipe.setPositionable(this); 081 } 082 083 /** 084 * No-op. 085 * 086 * <p> 087 * Input readers are closed in the {@link ParserPipe} class. 088 * 089 * @throws IOException If a problem occurred trying to read from the reader. 090 */ 091 @Override /* Overridden from Reader */ 092 public void close() throws IOException { 093 // No-op 094 } 095 096 /** 097 * Trims off the last character in the marking buffer. 098 * 099 * <p> 100 * Useful for removing escape characters from sequences. 101 * 102 * @return This object. 103 */ 104 public final ParserReader delete() { 105 return delete(1); 106 } 107 108 /** 109 * Trims off the specified number of last characters in the marking buffer. 110 * Useful for removing escape characters from sequences. 111 * 112 * @param count The number of characters to delete. 113 * @return This object. 114 */ 115 public final ParserReader delete(int count) { 116 for (var i = 0; i < count; i++) 117 buff[iCurrent - i - 1] = 127; 118 holesExist = true; 119 return this; 120 } 121 122 /** 123 * Returns the contents of the reusable character buffer as a string, and resets the buffer for next usage. 124 * 125 * @return The contents of the reusable character buffer as a string. 126 */ 127 public final String getMarked() { return getMarked(0, 0); } 128 129 /** 130 * Same as {@link #getMarked()} except allows you to specify offsets into the buffer. 131 * 132 * <p> 133 * For example, to return the marked string, but trim the first and last characters, call the following: 134 * <p class='bjava'> 135 * getFromMarked(1, -1); 136 * </p> 137 * 138 * @param offsetStart The offset of the start position. 139 * @param offsetEnd The offset of the end position. 140 * @return The contents of the reusable character buffer as a string. 141 */ 142 public final String getMarked(int offsetStart, int offsetEnd) { 143 int offset = 0; 144 145 // Holes are \u00FF 'delete' characters that we need to get rid of now. 146 if (holesExist) { 147 for (var i = iMark; i < iCurrent; i++) { 148 char c = buff[i]; 149 if (c == 127) 150 offset++; 151 else 152 buff[i - offset] = c; 153 } 154 holesExist = false; 155 } 156 int start = iMark + offsetStart, len = iCurrent - iMark + offsetEnd - offsetStart - offset; 157 var s = new String(buff, start, len); 158 iMark = -1; 159 return s; 160 } 161 162 @Override /* Overridden from Positionable */ 163 public Position getPosition() { return new Position(line, column); } 164 165 /** 166 * Start buffering the calls to read() so that the text can be gathered from the mark point on calling {@code getFromMarked()}. 167 */ 168 public final void mark() { 169 iMark = iCurrent; 170 } 171 172 /** 173 * Reads a numeric string from the specified reader. 174 * 175 * @return The parsed number string. 176 * @throws IOException Thrown by underlying stream. 177 */ 178 public String parseNumberString() throws IOException { 179 mark(); 180 int c = 0; 181 while (true) { 182 c = read(); 183 if (c == -1) 184 break; 185 if (! isNumberChar((char)c)) { 186 unread(); 187 break; 188 } 189 } 190 return getMarked(); 191 } 192 193 /** 194 * Peeks the next character in the stream. 195 * 196 * <p> 197 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 198 * 199 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 200 * @throws IOException If a problem occurred trying to read from the reader. 201 */ 202 public final int peek() throws IOException { 203 int c = read(); 204 if (c != -1) 205 unread(); 206 return c; 207 } 208 209 /** 210 * Same as {@link #peek()} but skips over any whitespace characters. 211 * 212 * <p> 213 * This is equivalent to doing a {@code read()} followed by an {@code unread()}. 214 * 215 * @return The peeked character, or (char)-1 if the end of the stream has been reached. 216 * @throws IOException If a problem occurred trying to read from the reader. 217 */ 218 public final int peekSkipWs() throws IOException { 219 while (true) { 220 var c = read(); 221 var isWs = Character.isWhitespace(c); 222 if (c != -1 && ! isWs) 223 unread(); 224 if (! isWs) 225 return c; 226 } 227 } 228 229 /** 230 * Reads a single character. 231 * 232 * <p> 233 * Note that this method does NOT process extended unicode characters (i.e. characters above 0x10000), but rather 234 * returns them as two <jk>char</jk>s. 235 * Use {@link #readCodePoint()} to ensure proper handling of extended unicode. 236 * 237 * @return The character read, or -1 if the end of the stream has been reached. 238 * @throws IOException If a problem occurred trying to read from the reader. 239 */ 240 @Override /* Overridden from Reader */ 241 public final int read() throws IOException { 242 int c = readFromBuff(); 243 if (c == -1) 244 return -1; 245 if (c == '\n') { 246 line++; 247 column = 0; 248 } else { 249 column++; 250 } 251 return c; 252 } 253 254 /** 255 * Subclasses can override this method to provide additional filtering. 256 * 257 * <p> 258 * Default implementation simply calls the same method on the underlying reader. 259 */ 260 @Override /* Overridden from Reader */ 261 public int read(char[] cbuf, int off, int len) throws IOException { 262 return unbuffered ? r.read(cbuf, off, 1) : r.read(cbuf, off, len); 263 } 264 265 /** 266 * Read the specified number of characters off the stream. 267 * 268 * @param num The number of characters to read. 269 * @return The characters packaged as a String. 270 * @throws IOException If a problem occurred trying to read from the reader. 271 */ 272 public final String read(int num) throws IOException { 273 var c = new char[num]; 274 for (var i = 0; i < num; i++) { 275 var c2 = read(); 276 if (c2 == -1) 277 return new String(c, 0, i); 278 c[i] = (char)c2; 279 } 280 return new String(c); 281 } 282 283 /** 284 * Same as {@link #read()} but detects and combines extended unicode characters (characters above 0x10000). 285 * 286 * @return The character read, or -1 if the end of the stream has been reached. 287 * @throws IOException If a problem occurred trying to read from the reader. 288 */ 289 public final int readCodePoint() throws IOException { 290 int c = read(); 291 292 // Characters that take up 2 chars. 293 if (c >= 0xd800 && c <= 0xdbff) { 294 var low = read(); 295 if (low >= 0xdc00 && low <= 0xdfff) 296 c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00); 297 } 298 299 return c; 300 } 301 302 /** 303 * Same as {@link #read()} but skips over any whitespace characters. 304 * 305 * @return The first non-whitespace character, or -1 if the end of stream reached. 306 * @throws IOException Thrown by underlying stream. 307 */ 308 public final int readSkipWs() throws IOException { 309 while (true) { 310 var c = read(); 311 if (c == -1 || ! Character.isWhitespace(c)) 312 return c; 313 } 314 } 315 316 /** 317 * Replace the last read character in the buffer with the specified character. 318 * 319 * @param c The new character. 320 * @return This object. 321 * @throws IOException Thrown by underlying stream. 322 */ 323 public final ParserReader replace(char c) throws IOException { 324 return replace(c, 1); 325 } 326 327 /** 328 * Replaces the last character in the marking buffer with the specified character. 329 * 330 * <p> 331 * <c>offset</c> must be at least <c>1</c> for normal characters, and <c>2</c> for extended 332 * unicode characters in order for the replacement to fit into the buffer. 333 * 334 * @param c The new character. 335 * @param offset The offset. 336 * @return This object. 337 * @throws IOException Thrown by underlying stream. 338 */ 339 public final ParserReader replace(int c, int offset) throws IOException { 340 if (c < 0x10000) { 341 if (offset < 1) 342 throw ioex("Buffer underflow."); 343 buff[iCurrent - offset] = (char)c; 344 } else { 345 if (offset < 2) 346 throw ioex("Buffer underflow."); 347 c -= 0x10000; 348 buff[iCurrent - offset] = (char)(0xd800 + (c >> 10)); 349 buff[iCurrent - offset + 1] = (char)(0xdc00 + (c & 0x3ff)); 350 offset--; 351 } 352 // Fill in the gap with DEL characters. 353 for (var i = 1; i < offset; i++) 354 buff[iCurrent - i] = 127; 355 holesExist |= (offset > 1); 356 return this; 357 } 358 359 /** 360 * Pushes the last read character back into the stream. 361 * 362 * @return This object. 363 * @throws IOException If a problem occurred trying to read from the reader. 364 */ 365 public ParserReader unread() throws IOException { 366 if (iCurrent <= 0) 367 throw ioex("Buffer underflow."); 368 iCurrent--; 369 if (column == 0) 370 line--; 371 else 372 column--; 373 return this; 374 } 375 376 private final int readFromBuff() throws IOException { 377 while (iCurrent >= iEnd) { 378 if (endReached) 379 return -1; 380 381 // If there's still space at the end of this buffer, fill it. 382 // Make sure there's at least 2 character spaces free for extended unicode characters. 383 if (iEnd + 1 < buff.length) { 384 int x = read(buff, iCurrent, buff.length - iEnd); 385 if (x == -1) { 386 endReached = true; 387 return -1; 388 } 389 iEnd += x; 390 391 } else { 392 // If we're currently marking, then we want to copy from the current mark point 393 // to the beginning of the buffer and then fill in the remainder of buffer. 394 if (iMark >= 0) { 395 396 // If we're marking from the beginning of the array, we double the size of the 397 // buffer. This isn't likely to occur often. 398 if (iMark == 0) { 399 var buff2 = new char[buff.length << 1]; 400 System.arraycopy(buff, 0, buff2, 0, buff.length); 401 buff = buff2; 402 403 // Otherwise, we copy what's currently marked to the beginning of the buffer. 404 } else { 405 int copyBuff = iMark; 406 System.arraycopy(buff, copyBuff, buff, 0, buff.length - copyBuff); 407 iCurrent -= copyBuff; 408 iMark -= copyBuff; 409 } 410 int expected = buff.length - iCurrent; 411 412 int x = read(buff, iCurrent, expected); 413 if (x == -1) { 414 endReached = true; 415 iEnd = iCurrent; 416 return -1; 417 } 418 iEnd = iCurrent + x; 419 } else { 420 // Copy the last 10 chars in the buffer to the beginning of the buffer. 421 int copyBuff = Math.min(iCurrent, 10); 422 System.arraycopy(buff, iCurrent - copyBuff, buff, 0, copyBuff); 423 424 // Number of characters we expect to copy on the next read. 425 int expected = buff.length - copyBuff; 426 int x = read(buff, copyBuff, expected); 427 iCurrent = copyBuff; 428 if (x == -1) { 429 endReached = true; 430 iEnd = iCurrent; 431 return -1; 432 } 433 iEnd = iCurrent + x; 434 } 435 } 436 } 437 return buff[iCurrent++]; 438 } 439}