/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.juneau.xml;

import static org.apache.juneau.commons.utils.StringUtils.*;
import static org.apache.juneau.commons.utils.ThrowableUtils.*;
import static org.apache.juneau.commons.utils.Utils.*;

import java.io.*;
import java.util.*;

import javax.xml.stream.*;

import org.apache.juneau.commons.io.*;
import org.apache.juneau.commons.lang.*;
import org.apache.juneau.xml.annotation.*;

/**
 * XML utility methods.
 *
 * <p>
 * The encode methods in this class write characters that cannot appear literally in a given XML context as
 * <c>_x####_</c> sequences (<c>####</c> being four hexadecimal digits), and {@link #decode(String, StringBuilder)}
 * reverses that translation.
 *
 * <h5 class='section'>See Also:</h5><ul>
 * 	<li class='link'><a class="doclink" href="https://juneau.apache.org/docs/topics/XmlBasics">XML Basics</a>

 * </ul>
 */
@SuppressWarnings("resource")
public class XmlUtils {

	// @formatter:off
	// Characters written as entity / character references inside element text.
	private static final AsciiMap REPLACE_TEXT = new AsciiMap()
		.append('&', "&amp;")
		.append('<', "&lt;")
		.append('>', "&gt;")
		.append((char)0x09, "&#x0009;")
		.append((char)0x0A, "&#x000a;")
		.append((char)0x0D, "&#x000d;");

	// Characters written as entity / character references inside attribute values.
	// Same as REPLACE_TEXT plus the two quote characters.
	private static final AsciiMap REPLACE_ATTR_VAL = new AsciiMap()
		.append('&', "&amp;")
		.append('<', "&lt;")
		.append('>', "&gt;")
		.append('"', "&quot;")
		.append('\'', "&apos;")
		.append((char)0x09, "&#x0009;")
		.append((char)0x0A, "&#x000a;")
		.append((char)0x0D, "&#x000d;");
	// @formatter:on

	/**
	 * Given a list of Strings and other Objects, combines Strings that are next to each other in the list.
	 *
	 * <p>
	 * The list is modified in place: each run of adjacent strings is replaced by a single concatenated string.
	 * Non-string entries are left untouched and terminate a run.
	 *
	 * @param value The list of text nodes to collapse.
	 * @return The same list.
	 */
	public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) {

		String prev = null;
		for (ListIterator<Object> i = value.listIterator(); i.hasNext();) {
			Object o = i.next();
			if (o instanceof String o2) {
				if (prev == null)
					prev = o2;
				else {
					prev += o2;
					// Drop the current string and the one before it, then insert the merged string in their place.
					i.remove();
					i.previous();
					i.remove();
					i.add(prev);
				}
			} else {
				prev = null;
			}
		}
		return value;
	}

	/**
	 * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters.
	 *
	 * <p>
	 * Two sequences have special meaning:
	 * <ul>
	 * 	<li><c>_x0000_</c> - The whole value represents <jk>null</jk>; <jk>null</jk> is returned as soon as it is
	 * 		encountered, regardless of any surrounding text.
	 * 	<li><c>_xE000_</c> - Represents an empty string; it contributes no characters to the result.
	 * </ul>
	 *
	 * @param value The string being decoded.
	 * @param sb
	 * 	The string builder to use as a scratch pad.
	 * 	<br>Can be <jk>null</jk>, in which case a new one is created.
	 * 	<br>Any existing content is discarded before use.
	 * @return The decoded string, or <jk>null</jk> if the input was <jk>null</jk> or contained <c>_x0000_</c>.
	 */
	public static String decode(String value, StringBuilder sb) {
		if (value == null)
			return null;
		// Fast path: without an underscore there cannot be an escape sequence.
		if (value.isEmpty() || value.indexOf('_') == -1)
			return value;
		if (sb == null)
			sb = new StringBuilder(value.length());
		else
			sb.setLength(0);  // Scratch pad may carry leftovers from a previous use.

		for (var i = 0; i < value.length(); i++) {
			var c = value.charAt(i);
			if (c == '_' && isEscapeSequence(value, i)) {

				var x = Integer.parseInt(value.substring(i + 2, i + 6), 16);

				// If we find _x0000_, then that means a null.
				// If we find _xE000_, then that means an empty string.
				if (x == 0)
					return null;
				else if (x != 0xE000)
					sb.append((char)x);

				// Skip the remaining 6 characters of the 7-character sequence (the loop increment skips the 7th).
				i += 6;
			} else {
				sb.append(c);
			}
		}
		return sb.toString();
	}

	/**
	 * Serializes and encodes the specified object as valid XML attribute name.
	 *
	 * <p>
	 * ASCII letters and <js>':'</js> are written as-is at any position, ASCII digits are written as-is at any
	 * position but the first, and <js>'_'</js> is written as-is unless it starts something that already looks
	 * like an <c>_x####_</c> sequence.
	 * Everything else is written as an <c>_x####_</c> sequence.
	 *
	 * @param w The writer to send the output to.
	 * @param value The object being serialized.  A <jk>null</jk> value is written as <c>_x0000_</c>.
	 * @return The same writer passed in.
	 * @throws IOException If a problem occurred.
	 */
	public static Writer encodeAttrName(Writer w, Object value) throws IOException {
		if (value == null)
			return w.append("_x0000_");
		var s = value.toString();

		if (needsAttrNameEncoding(s)) {
			for (var i = 0; i < s.length(); i++) {
				var c = s.charAt(i);
				var letterOrColon = c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':';
				var nonLeadingDigit = i != 0 && c >= '0' && c <= '9';
				if (letterOrColon || nonLeadingDigit)
					w.append(c);
				else if (c == '_' && ! isEscapeSequence(s, i))
					w.append(c);
				else
					appendPaddedHexChar(w, c);
			}
		} else {
			w.append(s);
		}

		return w;
	}

	/**
	 * Encodes the specified attribute value and sends the results to the specified writer.
	 *
	 * <p>
	 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
	 * writer.
	 * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, <js>'&gt;'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities.
	 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
	 *
	 * <p>
	 * A <jk>null</jk> value is written as <c>_x0000_</c>; an empty string writes nothing.
	 *
	 * @param w The writer to send the output to.
	 * @param value The object being encoded.
	 * @param trim
	 * 	Trim the text before serializing it.
	 * 	<br>Any whitespace character still present at the first or last position of the (possibly trimmed) value
	 * 	is encoded as an <c>_x####_</c> sequence.
	 * @return The same writer passed in.
	 */
	public static Writer encodeAttrValue(Writer w, Object value, boolean trim) {
		try {
			if (value == null)
				return w.append("_x0000_");
			var s = value.toString();
			if (s.isEmpty())
				return w;
			if (trim)
				s = s.trim();

			if (needsAttrValueEncoding(s)) {
				var len = s.length();
				for (var i = 0; i < len; i++) {
					var c = s.charAt(i);
					if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
						appendPaddedHexChar(w, c);
					else if (REPLACE_ATTR_VAL.contains(c))
						w.append(REPLACE_ATTR_VAL.get(c));
					else if (c == '_' && isEscapeSequence(s, i))
						// Escape the underscore so a literal "_x####_" in the input is not decoded later.
						appendPaddedHexChar(w, c);
					else if (isValidXmlCharacter(c))
						w.append(c);
					else
						appendPaddedHexChar(w, c);
				}
			} else {
				w.append(s);
			}
		} catch (IOException e) {
			throw toRex(e);
		}

		return w;
	}

	/**
	 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
	 *
	 * @param value The object being encoded.
	 * @return
	 * 	The encoded element name string.
	 * 	<br><c>_x0000_</c> for a <jk>null</jk> value and <c>_xE000_</c> for an empty string.
	 */
	public static String encodeElementName(Object value) {
		if (value == null)
			return "_x0000_";
		var s = value.toString();
		if (s.isEmpty())
			return "_xE000_";

		if (! needsElementNameEncoding(s))
			return s;

		try (var w = new StringBuilderWriter(s.length() * 2)) {
			return encodeElementNameInner(w, s).toString();
		} catch (IOException e) {
			throw toRex(e); // Never happens
		}
	}

	/**
	 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences.
	 *
	 * <p>
	 * A <jk>null</jk> value is written as <c>_x0000_</c>.
	 * Unlike {@link #encodeElementName(Object)}, an empty string is written as-is (i.e. nothing is written).
	 *
	 * @param w The writer to send the output to.
	 * @param value The object being encoded.
	 * @return The same writer passed in.
	 */
	public static Writer encodeElementName(Writer w, Object value) {
		try {
			if (value == null)
				return w.append("_x0000_");
			var s = value.toString();
			if (needsElementNameEncoding(s))
				return encodeElementNameInner(w, s);
			w.append(s);
		} catch (IOException e) {
			throw toRex(e);
		}
		return w;
	}

	/**
	 * Encodes the specified element text and sends the results to the specified writer.
	 *
	 * <p>
	 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified
	 * writer.
	 * <br>Encodes <js>'&amp;'</js>, <js>'&lt;'</js>, and <js>'&gt;'</js> as XML entities.
	 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences.
	 *
	 * <p>
	 * A <jk>null</jk> value is written as <c>_x0000_</c> and an empty string as <c>_xE000_</c>.
	 *
	 * @param w The writer to send the output to.
	 * @param value The object being encoded.
	 * @param trim Trim the text before serializing it.
	 * @param preserveWhitespace
	 * 	Specifies whether we're in preserve-whitespace mode.
	 * 	(e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}).
	 * 	<br>If <jk>false</jk>, a whitespace character at the first or last position is encoded as an
	 * 	<c>_x####_</c> sequence.
	 * 	If <jk>true</jk>, such characters are handled like any other character.
	 * @return The same writer passed in.
	 */
	public static Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) {

		try {
			if (value == null)
				return w.append("_x0000_");
			var s = value.toString();
			if (s.isEmpty())
				return w.append("_xE000_");
			if (trim)
				s = s.trim();

			if (needsTextEncoding(s)) {
				var len = s.length();
				for (var i = 0; i < len; i++) {
					var c = s.charAt(i);
					if ((i == 0 || i == len - 1) && Character.isWhitespace(c) && ! preserveWhitespace)
						appendPaddedHexChar(w, c);
					else if (REPLACE_TEXT.contains(c))
						w.append(REPLACE_TEXT.get(c));
					else if (c == '_' && isEscapeSequence(s, i))
						// Escape the underscore so a literal "_x####_" in the input is not decoded later.
						appendPaddedHexChar(w, c);
					else if (isValidXmlCharacter(c))
						w.append(c);
					else
						appendPaddedHexChar(w, c);
				}
			} else {
				w.append(s);
			}
		} catch (IOException e) {
			throw toRex(e);
		}

		return w;
	}

	/**
	 * Escapes invalid XML text characters to <c>_x####_</c> sequences.
	 *
	 * <p>
	 * Unlike {@link #encodeText(Writer, Object, boolean, boolean)}, no XML entities are produced here: characters
	 * are either copied as-is or written as <c>_x####_</c> sequences.
	 * A whitespace character at the first or last position is always written as an <c>_x####_</c> sequence.
	 *
	 * @param value The object being encoded.
	 * @return The encoded string, or <c>_x0000_</c> if the value was <jk>null</jk>.
	 */
	public static String escapeText(Object value) {
		if (value == null)
			return "_x0000_";
		var s = value.toString();

		try {
			if (! needsTextEncoding(s))
				return s;
			var len = s.length();
			var sw = new StringWriter(s.length() * 2);
			for (var i = 0; i < len; i++) {
				var c = s.charAt(i);
				if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
					appendPaddedHexChar(sw, c);
				else if (c == '_' && isEscapeSequence(s, i))
					appendPaddedHexChar(sw, c);
				else if (isValidXmlCharacter(c))
					sw.append(c);
				else
					appendPaddedHexChar(sw, c);
			}
			return sw.toString();
		} catch (IOException e) {
			throw toRex(e); // Never happens
		}
	}

	/**
	 * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations.
	 *
	 * <p>
	 * The annotations should be a parent-to-child ordering of annotations found on a class or method.
	 * Both lists are searched from the last entry to the first, <ja>@Xml</ja> annotations before
	 * <ja>@XmlSchema</ja> annotations, and the first namespace that can be resolved wins.
	 *
	 * @param xmls The list of <ja>@Xml</ja> annotations.
	 * @param schemas The list of <ja>@XmlSchema</ja> annotations.
	 * @return The namespace, or <jk>null</jk> if it couldn't be found.
	 */
	public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) {

		for (var i = xmls.size() - 1; i >= 0; i--) {
			var xml = xmls.get(i);
			var ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas);
			if (nn(ns))
				return ns;
		}

		for (var i = schemas.size() - 1; i >= 0; i--) {
			var schema = schemas.get(i);
			var ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas);
			if (nn(ns))
				return ns;
		}

		return null;
	}

	/**
	 * Utility method that converts the current event on the XML stream to something human-readable for debug purposes.
	 *
	 * @param r The XML stream reader whose current event is to be converted to a readable string.
	 * @return The event in human-readable form, or <js>"UNKNOWN"</js> for an unrecognized event type.
	 */
	public static String toReadableEvent(XMLStreamReader r) {
		// @formatter:off
		return switch (r.getEventType()) {
			case XMLStreamConstants.START_ELEMENT          -> "<" + r.getLocalName() + ">";
			case XMLStreamConstants.END_ELEMENT            -> "</" + r.getLocalName() + ">";
			case XMLStreamConstants.PROCESSING_INSTRUCTION -> "PROCESSING_INSTRUCTION";
			case XMLStreamConstants.CHARACTERS             -> "CHARACTERS=[" + r.getText() + "]";
			case XMLStreamConstants.COMMENT                -> "COMMENTS=[" + r.getText() + "]";
			case XMLStreamConstants.SPACE                  -> "SPACE=[" + r.getText() + "]";
			case XMLStreamConstants.START_DOCUMENT         -> "START_DOCUMENT";
			case XMLStreamConstants.END_DOCUMENT           -> "END_DOCUMENT";
			case XMLStreamConstants.ENTITY_REFERENCE       -> "ENTITY_REFERENCE";
			case XMLStreamConstants.ATTRIBUTE              -> "ATTRIBUTE";
			case XMLStreamConstants.DTD                    -> "DTD";
			case XMLStreamConstants.CDATA                  -> "CDATA=[" + r.getText() + "]";
			case XMLStreamConstants.NAMESPACE              -> "NAMESPACE";
			case XMLStreamConstants.NOTATION_DECLARATION   -> "NOTATION_DECLARATION";
			case XMLStreamConstants.ENTITY_DECLARATION     -> "ENTITY_DECLARATION";
			default                                        -> "UNKNOWN";
		};
		// @formatter:on
	}

	// Writes the specified character code to the writer as an "_x####_" sequence.
	// The "####" part is whatever toHex4(num) yields (presumably 4 upper-case hex digits, which is what
	// isHexCharacter() recognizes -- confirm against StringUtils.toHex4).
	private static Writer appendPaddedHexChar(Writer out, int num) throws IOException {
		out.append("_x");
		for (var c : toHex4(num))
			out.append(c);
		return out.append('_');
	}

	// Writes the string to the writer, replacing every character that is not accepted at its position in an
	// element name with an "_x####_" sequence.
	// An underscore that starts something already looking like "_x####_" is escaped as well so that it is not
	// mistaken for an escape sequence when decoding.
	private static Writer encodeElementNameInner(Writer w, String s) throws IOException {
		for (var i = 0; i < s.length(); i++) {
			var c = s.charAt(i);
			// @formatter:off
			if ((c >= 'A' && c <= 'Z')
					|| (c == '_' && ! isEscapeSequence(s,i))
					|| (c >= 'a' && c <= 'z')
					// The following are accepted anywhere except as the first character.
					|| (i != 0 && (
						c == '-'
						|| c == '.'
						|| (c >= '0' && c <= '9')
						|| c == '\u00b7'
						|| (c >= '\u0300' && c <= '\u036f')
						|| (c >= '\u203f' && c <= '\u2040')
					))
					|| (c >= '\u00c0' && c <= '\u00d6')
					|| (c >= '\u00d8' && c <= '\u00f6')
					|| (c >= '\u00f8' && c <= '\u02ff')
					|| (c >= '\u0370' && c <= '\u037d')
					|| (c >= '\u037f' && c <= '\u1fff')
					|| (c >= '\u200c' && c <= '\u200d')
					|| (c >= '\u2070' && c <= '\u218f')
					|| (c >= '\u2c00' && c <= '\u2fef')
					|| (c >= '\u3001' && c <= '\ud7ff')
					|| (c >= '\uf900' && c <= '\ufdcf')
					|| (c >= '\ufdf0' && c <= '\ufffd')) {
				// @formatter:on
				w.append(c);
			} else {
				appendPaddedHexChar(w, c);
			}
		}
		return w;
	}

	// Resolves a Namespace from a (prefix, namespace URI) pair in which either part may be empty, filling in the
	// missing part from the supplied annotations.
	// 'xmls' may be null, in which case only the schemas are searched.
	// Returns null if both parts are empty or if no prefix could be found for a URI.
	// Throws (via bex) if a prefix is specified but no matching URI can be found.
	private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) {

		// If both prefix and namespace specified, use that Namespace mapping.
		if (! (prefix.isEmpty() || ns.isEmpty()))
			return Namespace.of(prefix, ns);

		// If only prefix specified, need to search for namespaceURI.
		if (! prefix.isEmpty()) {
			if (nn(xmls))
				for (var xml2 : xmls)
					if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty())
						return Namespace.of(prefix, xml2.namespace());
			for (var schema : schemas) {
				if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty())
					return Namespace.of(prefix, schema.namespace());
				for (var xmlNs : schema.xmlNs())
					if (xmlNs.prefix().equals(prefix))
						return Namespace.of(prefix, xmlNs.namespaceURI());
			}
			throw bex("Found @Xml.prefix annotation with no matching URI. prefix=''{0}''", prefix);
		}

		// If only namespaceURI specified, need to search for prefix.
		if (! ns.isEmpty()) {
			if (nn(xmls))
				for (var xml2 : xmls)
					if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty())
						return Namespace.of(xml2.prefix(), ns);
			for (var schema : schemas) {
				if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty())
					return Namespace.of(schema.prefix(), ns);
				for (var xmlNs : schema.xmlNs())
					if (xmlNs.namespaceURI().equals(ns))
						return Namespace.of(xmlNs.prefix(), ns);
			}
		}

		return null;
	}

	// Returns true if the string at the specified position is of the form "_x####_"
	// where '#' are hexadecimal characters.
	private static boolean isEscapeSequence(String s, int i) {
		// @formatter:off
		return s.length() > i+6
			&& s.charAt(i) == '_'
			&& s.charAt(i+1) == 'x'
			&& isHexCharacter(s.charAt(i+2))
			&& isHexCharacter(s.charAt(i+3))
			&& isHexCharacter(s.charAt(i+4))
			&& isHexCharacter(s.charAt(i+5))
			&& s.charAt(i+6) == '_';
		// @formatter:on
	}

	// Returns true if the character is a hexadecimal character.
	// Only digits and upper-case A-F are recognized; lower-case a-f are not.
	private static boolean isHexCharacter(char c) {
		return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
	}

	// Returns true if the specified character can safely be used in XML text or an attribute.
	// Tab, LF and CR are deliberately not accepted here: encodeText/encodeAttrValue look them up in the
	// replacement maps before calling this method, and escapeText writes them as "_x####_" sequences.
	private static boolean isValidXmlCharacter(char c) {
		return (c >= 0x20 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD);
	}

	// Returns true if the string contains anything other than ASCII letters and digits, or does not start with
	// an ASCII letter.
	private static boolean needsAttrNameEncoding(String value) {
		// Note that this doesn't need to be perfect, just fast.
		for (var i = 0; i < value.length(); i++) {
			var c = value.charAt(i);
			if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')))
				return true;
		}
		return false;
	}

	// Returns true if encodeAttrValue() would write anything other than the string itself.
	private static boolean needsAttrValueEncoding(String value) {
		// See if we need to convert the string.
		// Conversion is somewhat expensive, so make sure we need to do so before hand.
		var len = value.length();
		for (var i = 0; i < len; i++) {
			var c = value.charAt(i);
			if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
				return true;
			if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value, i)))
				return true;
		}
		return false;
	}

	// Returns true if the string contains anything other than ASCII letters and digits, or starts with a digit.
	private static boolean needsElementNameEncoding(String value) {
		// Note that this doesn't need to be perfect, just fast.
		for (var i = 0; i < value.length(); i++) {
			var c = value.charAt(i);
			if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9')))
				return true;
		}
		return false;
	}

	// Returns true if the string has leading/trailing whitespace, a character with an entity replacement,
	// a character that is not valid XML text, or an underscore that starts an "_x####_" look-alike.
	private static boolean needsTextEncoding(String value) {
		// See if we need to convert the string.
		// Conversion is somewhat expensive, so make sure we need to do so before hand.
		var len = value.length();
		for (var i = 0; i < len; i++) {
			var c = value.charAt(i);
			if ((i == 0 || i == len - 1) && Character.isWhitespace(c))
				return true;
			if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value, i)))
				return true;
		}
		return false;
	}
}