001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.juneau.uon;
018
019import static org.apache.juneau.commons.utils.ThrowableUtils.*;
020
021import java.io.*;
022
023import org.apache.juneau.parser.*;
024
025/**
026 * Same functionality as {@link ParserReader} except automatically decoded <c>%xx</c> escape sequences.
027 *
028 * <p>
029 * Escape sequences are assumed to be encoded UTF-8.  Extended Unicode (&gt;\u10000) is supported.
030 *
031 * <p>
032 * If decoding is enabled, the following character replacements occur so that boundaries are not lost:
033 * <ul>
034 *    <li><js>'&amp;'</js> -&gt; <js>'\u0001'</js>
035 *    <li><js>'='</js> -&gt; <js>'\u0002'</js>
036 * </ul>
037 *
038 * <h5 class='section'>See Also:</h5><ul>
039 *    <li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/UonBasics">UON Basics</a>
040
041 * </ul>
042 */
043@SuppressWarnings("resource")
044public class UonReader extends ParserReader {
045
046   private static int fromHexChar(int c) throws IOException {
047      if (c >= '0' && c <= '9')
048         return c - '0';
049      if (c >= 'a' && c <= 'f')
050         return 10 + c - 'a';
051      if (c >= 'A' && c <= 'F')
052         return 10 + c - 'A';
053      throw ioex("Invalid hex character ''{0}'' found in escape pattern.", c);
054   }
055
056   private final boolean decodeChars;
057
058   private final char[] buff;
059
060   // Writable properties.
061   private int iCurrent, iEnd;
062
063   /**
064    * Constructor.
065    *
066    * @param pipe The parser input.
067    * @param decodeChars Whether the input is URL-encoded.
068    * @throws IOException Thrown by underlying stream.
069    */
070   public UonReader(ParserPipe pipe, boolean decodeChars) throws IOException {
071      super(pipe);
072      this.decodeChars = decodeChars;
073      if (pipe.isString()) {
074         var in = pipe.getInputAsString();
075         this.buff = new char[in.length() < 1024 ? in.length() : 1024];
076      } else {
077         this.buff = new char[1024];
078      }
079   }
080
081   @Override /* Overridden from Reader */
082   public int read(char[] cbuf, int off, int len) throws IOException {
083
084      if (! decodeChars)
085         return super.read(cbuf, off, len);
086
087      // Copy any remainder to the beginning of the buffer.
088      var remainder = iEnd - iCurrent;
089      if (remainder > 0)
090         System.arraycopy(buff, iCurrent, buff, 0, remainder);
091      iCurrent = 0;
092
093      var expected = buff.length - remainder;
094
095      var x = super.read(buff, remainder, expected);
096      if (x == -1 && remainder == 0)
097         return -1;
098
099      iEnd = remainder + (x == -1 ? 0 : x);
100
101      var i = 0;
102      while (i < len) {
103         if (iCurrent >= iEnd)
104            return i;
105         var c = buff[iCurrent++];
106         if (c == '+') {
107            cbuf[off + i++] = ' ';
108         } else if (c == '&') {
109            cbuf[off + i++] = '\u0001';
110         } else if (c == '=') {
111            cbuf[off + i++] = '\u0002';
112         } else if (c != '%') {
113            cbuf[off + i++] = c;
114         } else {
115            var iMark = iCurrent - 1;  // Keep track of current position.
116
117            // Stop if there aren't at least two more characters following '%' in the buffer,
118            // or there aren't at least two more positions open in cbuf to handle double-char chars.
119            if (iMark + 2 >= iEnd || i + 2 > len) {
120               iCurrent--;
121               return i;
122            }
123
124            var b0 = readEncodedByte();
125            int cx;
126
127            // 0xxxxxxx
128            if (b0 < 128) {
129               cx = b0;
130
131            } else if (b0 < 192) {
132               // 10xxxxxx
133               throw ioex("Invalid hex value for first escape pattern in UTF-8 sequence: {0}", b0);
134
135            } else if (b0 < 224) {
136               // 110xxxxx 10xxxxxx
137               // 11000000(192) - 11011111(223)
138               cx = readUTF8(b0 - 192, 1);
139               if (cx == -1) {
140                  iCurrent = iMark;
141                  return i;
142               }
143
144            } else if (b0 < 240) {
145               // 1110xxxx 10xxxxxx 10xxxxxx
146               // 11100000(224) - 11101111(239)
147               cx = readUTF8(b0 - 224, 2);
148               if (cx == -1) {
149                  iCurrent = iMark;
150                  return i;
151               }
152
153            } else if (b0 < 248) {
154               // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
155               // 11110000(240) - 11110111(247)
156               cx = readUTF8(b0 - 240, 3);
157               if (cx == -1) {
158                  iCurrent = iMark;
159                  return i;
160               }
161
162            } else
163               throw ioex("Invalid hex value for first escape pattern in UTF-8 sequence: {0}", b0);
164
165            if (cx < 0x10000)
166               cbuf[off + i++] = (char)cx;
167            else {
168               cx -= 0x10000;
169               cbuf[off + i++] = (char)(0xd800 + (cx >> 10));
170               cbuf[off + i++] = (char)(0xdc00 + (cx & 0x3ff));
171            }
172         }
173      }
174      return i;
175   }
176
177   @Override /* Overridden from ParserReader */
178   public UonReader unread() throws IOException {
179      super.unread();
180      return this;
181   }
182
183   private int readEncodedByte() throws IOException {
184      if (iEnd <= iCurrent + 1)
185         throw ioex("Incomplete trailing escape pattern");
186      int h = buff[iCurrent++];
187      int l = buff[iCurrent++];
188      h = fromHexChar(h);
189      l = fromHexChar(l);
190      return (h << 4) + l;
191   }
192
193   private int readHex() throws IOException {
194      var c = buff[iCurrent++];
195      if (c != '%')
196         throw ioex("Did not find expected '%' character in UTF-8 sequence.");
197      return readEncodedByte();
198   }
199
200   private int readUTF8(int n, int numBytes) throws IOException {
201      if (iCurrent + numBytes * 3 > iEnd)
202         return -1;
203      for (var i = 0; i < numBytes; i++) {
204         n <<= 6;
205         n += readHex() - 128;
206      }
207      return n;
208   }
209}