001// *************************************************************************************************************************** 002// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file * 003// * distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * 004// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance * 005// * with the License. You may obtain a copy of the License at * 006// * * 007// * http://www.apache.org/licenses/LICENSE-2.0 * 008// * * 009// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an * 010// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * 011// * specific language governing permissions and limitations under the License. * 012// *************************************************************************************************************************** 013package org.apache.juneau.xml; 014 015import static org.apache.juneau.common.internal.StringUtils.*; 016import static org.apache.juneau.common.internal.ThrowableUtils.*; 017 018import java.io.*; 019import java.util.*; 020 021import javax.xml.stream.*; 022 023import org.apache.juneau.*; 024import org.apache.juneau.common.internal.*; 025import org.apache.juneau.internal.*; 026import org.apache.juneau.xml.annotation.*; 027 028/** 029 * XML utility methods. 030 * 031 * <h5 class='section'>See Also:</h5><ul> 032 * <li class='link'><a class="doclink" href="../../../../index.html#jm.XmlDetails">XML Details</a> 033 034 * </ul> 035 */ 036public final class XmlUtils { 037 038 //----------------------------------------------------------------------------------------------------------------- 039 // XML element names 040 //----------------------------------------------------------------------------------------------------------------- 041 042 /** 043 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences. 044 * 045 * @param w The writer to send the output to. 046 * @param value The object being encoded. 047 * @return The same writer passed in. 048 */ 049 public static final Writer encodeElementName(Writer w, Object value) { 050 try { 051 if (value == null) 052 return w.append("_x0000_"); 053 String s = value.toString(); 054 if (needsElementNameEncoding(s)) 055 return encodeElementNameInner(w, s); 056 w.append(s); 057 } catch (IOException e) { 058 throw asRuntimeException(e); 059 } 060 return w; 061 } 062 063 /** 064 * Encodes any invalid XML element name characters to <c>_x####_</c> sequences. 065 * 066 * @param value The object being encoded. 067 * @return The encoded element name string. 068 */ 069 public static final String encodeElementName(Object value) { 070 if (value == null) 071 return "_x0000_"; 072 String s = value.toString(); 073 if (s.isEmpty()) 074 return "_xE000_"; 075 076 try { 077 if (needsElementNameEncoding(s)) 078 try (Writer w = new StringBuilderWriter(s.length() * 2)) { 079 return encodeElementNameInner(w, s).toString(); 080 } 081 } catch (IOException e) { 082 throw asRuntimeException(e); // Never happens 083 } 084 085 return s; 086 } 087 088 private static final Writer encodeElementNameInner(Writer w, String s) throws IOException { 089 for (int i = 0; i < s.length(); i++) { 090 char c = s.charAt(i); 091 if ((c >= 'A' && c <= 'Z') 092 || (c == '_' && ! isEscapeSequence(s,i)) 093 || (c >= 'a' && c <= 'z') 094 || (i != 0 && ( 095 c == '-' 096 || c == '.' 097 || (c >= '0' && c <= '9') 098 || c == '\u00b7' 099 || (c >= '\u0300' && c <= '\u036f') 100 || (c >= '\u203f' && c <= '\u2040') 101 )) 102 || (c >= '\u00c0' && c <= '\u00d6') 103 || (c >= '\u00d8' && c <= '\u00f6') 104 || (c >= '\u00f8' && c <= '\u02ff') 105 || (c >= '\u0370' && c <= '\u037d') 106 || (c >= '\u037f' && c <= '\u1fff') 107 || (c >= '\u200c' && c <= '\u200d') 108 || (c >= '\u2070' && c <= '\u218f') 109 || (c >= '\u2c00' && c <= '\u2fef') 110 || (c >= '\u3001' && c <= '\ud7ff') 111 || (c >= '\uf900' && c <= '\ufdcf') 112 || (c >= '\ufdf0' && c <= '\ufffd')) { 113 w.append(c); 114 } else { 115 appendPaddedHexChar(w, c); 116 } 117 } 118 return w; 119 } 120 121 private static final boolean needsElementNameEncoding(String value) { 122 // Note that this doesn't need to be perfect, just fast. 123 for (int i = 0; i < value.length(); i++) { 124 char c = value.charAt(i); 125 if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && (c >= '0' && c <= '9'))) 126 return true; 127 } 128 return false; 129 } 130 131 //----------------------------------------------------------------------------------------------------------------- 132 // XML element text 133 //----------------------------------------------------------------------------------------------------------------- 134 135 /** 136 * Escapes invalid XML text characters to <c>_x####_</c> sequences. 137 * 138 * @param value The object being encoded. 139 * @return The encoded string. 140 */ 141 public static final String escapeText(Object value) { 142 if (value == null) 143 return "_x0000_"; 144 String s = value.toString(); 145 146 try { 147 if (! needsTextEncoding(s)) 148 return s; 149 final int len = s.length(); 150 StringWriter sw = new StringWriter(s.length()*2); 151 for (int i = 0; i < len; i++) { 152 char c = s.charAt(i); 153 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 154 appendPaddedHexChar(sw, c); 155 else if (c == '_' && isEscapeSequence(s,i)) 156 appendPaddedHexChar(sw, c); 157 else if (isValidXmlCharacter(c)) 158 sw.append(c); 159 else 160 appendPaddedHexChar(sw, c); 161 } 162 return sw.toString(); 163 } catch (IOException e) { 164 throw asRuntimeException(e); // Never happens 165 } 166 } 167 168 /** 169 * Encodes the specified element text and sends the results to the specified writer. 170 * 171 * <p> 172 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified 173 * writer. 174 * <br>Encodes <js>'&'</js>, <js>'<'</js>, and <js>'>'</js> as XML entities. 175 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. 176 * 177 * @param w The writer to send the output to. 178 * @param value The object being encoded. 179 * @param trim Trim the text before serializing it. 180 * @param preserveWhitespace 181 * Specifies whether we're in preserve-whitespace mode. 182 * (e.g. {@link XmlFormat#MIXED_PWS} or {@link XmlFormat#TEXT_PWS}. 183 * If <jk>true</jk>, leading and trailing whitespace characters will be encoded. 184 * @return The same writer passed in. 185 */ 186 public static final Writer encodeText(Writer w, Object value, boolean trim, boolean preserveWhitespace) { 187 188 try { 189 if (value == null) 190 return w.append("_x0000_"); 191 String s = value.toString(); 192 if (s.isEmpty()) 193 return w.append("_xE000_"); 194 if (trim) 195 s = s.trim(); 196 197 if (needsTextEncoding(s)) { 198 final int len = s.length(); 199 for (int i = 0; i < len; i++) { 200 char c = s.charAt(i); 201 if ((i == 0 || i == len-1) && Character.isWhitespace(c) && ! preserveWhitespace) 202 appendPaddedHexChar(w, c); 203 else if (REPLACE_TEXT.contains(c)) 204 w.append(REPLACE_TEXT.get(c)); 205 else if (c == '_' && isEscapeSequence(s,i)) 206 appendPaddedHexChar(w, c); 207 else if (isValidXmlCharacter(c)) 208 w.append(c); 209 else 210 appendPaddedHexChar(w, c); 211 } 212 } else { 213 w.append(s); 214 } 215 } catch (IOException e) { 216 throw asRuntimeException(e); 217 } 218 219 return w; 220 } 221 222 private static final boolean needsTextEncoding(String value) { 223 // See if we need to convert the string. 224 // Conversion is somewhat expensive, so make sure we need to do so before hand. 225 final int len = value.length(); 226 for (int i = 0; i < len; i++) { 227 char c = value.charAt(i); 228 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 229 return true; 230 if (REPLACE_TEXT.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i))) 231 return true; 232 } 233 return false; 234 } 235 236 private static AsciiMap REPLACE_TEXT = new AsciiMap() 237 .append('&', "&") 238 .append('<', "<") 239 .append('>', ">") 240 .append((char)0x09, "	") 241 .append((char)0x0A, "
") 242 .append((char)0x0D, "
"); 243 244 245 //----------------------------------------------------------------------------------------------------------------- 246 // XML attribute names 247 //----------------------------------------------------------------------------------------------------------------- 248 249 /** 250 * Serializes and encodes the specified object as valid XML attribute name. 251 * 252 * @param w The writer to send the output to. 253 * @param value The object being serialized. 254 * @return This object. 255 * @throws IOException If a problem occurred. 256 */ 257 public static final Writer encodeAttrName(Writer w, Object value) throws IOException { 258 if (value == null) 259 return w.append("_x0000_"); 260 String s = value.toString(); 261 262 if (needsAttrNameEncoding(s)) { 263 for (int i = 0; i < s.length(); i++) { 264 char c = s.charAt(i); 265 if (i == 0) { 266 if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':') 267 w.append(c); 268 else if (c == '_' && ! isEscapeSequence(s,i)) 269 w.append(c); 270 else 271 appendPaddedHexChar(w, c); 272 } else { 273 if ((c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == ':')) 274 w.append(c); 275 else if (c == '_' && ! isEscapeSequence(s,i)) 276 w.append(c); 277 else 278 appendPaddedHexChar(w, c); 279 } 280 } 281 } else { 282 w.append(s); 283 } 284 285 return w; 286 } 287 288 private static final boolean needsAttrNameEncoding(String value) { 289 // Note that this doesn't need to be perfect, just fast. 290 for (int i = 0; i < value.length(); i++) { 291 char c = value.charAt(i); 292 if (! (c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') || (i == 0 && ! (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'))) 293 return true; 294 } 295 return false; 296 } 297 298 //----------------------------------------------------------------------------------------------------------------- 299 // XML attribute values 300 //----------------------------------------------------------------------------------------------------------------- 301 302 /** 303 * Encodes the specified attribute value and sends the results to the specified writer. 304 * 305 * <p> 306 * Encodes any invalid XML text characters to <c>_x####_</c> sequences and sends the response to the specified 307 * writer. 308 * <br>Encodes <js>'&'</js>, <js>'<'</js>, <js>'>'</js>, <js>'"'</js>, and <js>'\''</js> as XML entities. 309 * <br>Encodes invalid XML text characters to <c>_x####_</c> sequences. 310 * 311 * @param w The writer to send the output to. 312 * @param value The object being encoded. 313 * @param trim 314 * Trim the text before serializing it. 315 * If <jk>true</jk>, leading and trailing whitespace characters will be encoded. 316 * @return The same writer passed in. 317 */ 318 public static final Writer encodeAttrValue(Writer w, Object value, boolean trim) { 319 try { 320 if (value == null) 321 return w.append("_x0000_"); 322 String s = value.toString(); 323 if (s.isEmpty()) 324 return w; 325 if (trim) 326 s = s.trim(); 327 328 if (needsAttrValueEncoding(s)) { 329 final int len = s.length(); 330 for (int i = 0; i < len; i++) { 331 char c = s.charAt(i); 332 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 333 appendPaddedHexChar(w, c); 334 else if (REPLACE_ATTR_VAL.contains(c)) 335 w.append(REPLACE_ATTR_VAL.get(c)); 336 else if (c == '_' && isEscapeSequence(s,i)) 337 appendPaddedHexChar(w, c); 338 else if (isValidXmlCharacter(c)) 339 w.append(c); 340 else 341 appendPaddedHexChar(w, c); 342 } 343 } else { 344 w.append(s); 345 } 346 } catch (IOException e) { 347 throw asRuntimeException(e); 348 } 349 350 return w; 351 } 352 353 private static final boolean needsAttrValueEncoding(String value) { 354 // See if we need to convert the string. 355 // Conversion is somewhat expensive, so make sure we need to do so before hand. 356 final int len = value.length(); 357 for (int i = 0; i < len; i++) { 358 char c = value.charAt(i); 359 if ((i == 0 || i == len-1) && Character.isWhitespace(c)) 360 return true; 361 if (REPLACE_ATTR_VAL.contains(c) || ! isValidXmlCharacter(c) || (c == '_' && isEscapeSequence(value,i))) 362 return true; 363 } 364 return false; 365 } 366 367 private static AsciiMap REPLACE_ATTR_VAL = new AsciiMap() 368 .append('&', "&") 369 .append('<', "<") 370 .append('>', ">") 371 .append('"', """) 372 .append('\'', "'") 373 .append((char)0x09, "	") 374 .append((char)0x0A, "
") 375 .append((char)0x0D, "
"); 376 377 378 //----------------------------------------------------------------------------------------------------------------- 379 // Decode XML text 380 //----------------------------------------------------------------------------------------------------------------- 381 382 /** 383 * Translates any _x####_ sequences (introduced by the various encode methods) back into their original characters. 384 * 385 * @param value The string being decoded. 386 * @param sb The string builder to use as a scratch pad. 387 * @return The decoded string. 388 */ 389 public static final String decode(String value, StringBuilder sb) { 390 if (value == null) 391 return null; 392 if (value.length() == 0 || value.indexOf('_') == -1) 393 return value; 394 if (sb == null) 395 sb = new StringBuilder(value.length()); 396 397 for (int i = 0; i < value.length(); i++) { 398 char c = value.charAt(i); 399 if (c == '_' && isEscapeSequence(value,i)) { 400 401 int x = Integer.parseInt(value.substring(i+2, i+6), 16); 402 403 // If we find _x0000_, then that means a null. 404 // If we find _xE000_, then that means an empty string. 405 if (x == 0) 406 return null; 407 else if (x != 0xE000) 408 sb.append((char)x); 409 410 i+=6; 411 } else { 412 sb.append(c); 413 } 414 } 415 return sb.toString(); 416 } 417 418 419 /** 420 * Given a list of Strings and other Objects, combines Strings that are next to each other in the list. 421 * 422 * @param value The list of text nodes to collapse. 423 * @return The same list. 424 */ 425 public static LinkedList<Object> collapseTextNodes(LinkedList<Object> value) { 426 427 String prev = null; 428 for (ListIterator<Object> i = value.listIterator(); i.hasNext();) { 429 Object o = i.next(); 430 if (o instanceof String) { 431 if (prev == null) 432 prev = o.toString(); 433 else { 434 prev += o; 435 i.remove(); 436 i.previous(); 437 i.remove(); 438 i.add(prev); 439 } 440 } else { 441 prev = null; 442 } 443 } 444 return value; 445 } 446 447 //----------------------------------------------------------------------------------------------------------------- 448 // Other methods 449 //----------------------------------------------------------------------------------------------------------------- 450 451 // Returns true if the specified character can safely be used in XML text or an attribute. 452 private static final boolean isValidXmlCharacter(char c) { 453 return (c >= 0x20 && c <= 0xD7FF) /*|| c == 0xA || c == 0xD*/ || (c >= 0xE000 && c <= 0xFFFD); 454 } 455 456 // Returns true if the string at the specified position is of the form "_x####_" 457 // where '#' are hexadecimal characters. 458 private static final boolean isEscapeSequence(String s, int i) { 459 return s.length() > i+6 460 && s.charAt(i) == '_' 461 && s.charAt(i+1) == 'x' 462 && isHexCharacter(s.charAt(i+2)) 463 && isHexCharacter(s.charAt(i+3)) 464 && isHexCharacter(s.charAt(i+4)) 465 && isHexCharacter(s.charAt(i+5)) 466 && s.charAt(i+6) == '_'; 467 } 468 469 // Returns true if the character is a hexadecimal character 470 private static final boolean isHexCharacter(char c) { 471 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'); 472 } 473 474 // Converts an integer to a hexadecimal string padded to 4 places. 475 private static final Writer appendPaddedHexChar(Writer out, int num) throws IOException { 476 out.append("_x"); 477 for (char c : toHex4(num)) 478 out.append(c); 479 return out.append('_'); 480 } 481 482 /** 483 * Find the namespace given a list of <ja>@Xml</ja> and <ja>@XmlSchema</ja> annotations. 484 * 485 * <p> 486 * The annotations should be a parent-to-child ordering of annotations found on a class or method. 487 * 488 * @param xmls The list of <ja>@Xml</ja> annotations. 489 * @param schemas The list of <ja>@XmlSchema</ja> annotations. 490 * @return The namespace, or <jk>null</jk> if it couldn't be found. 491 */ 492 public static Namespace findNamespace(List<Xml> xmls, List<XmlSchema> schemas) { 493 494 for (int i = xmls.size()-1; i >= 0; i--) { 495 Xml xml = xmls.get(i); 496 Namespace ns = findNamespace(xml.prefix(), xml.namespace(), xmls, schemas); 497 if (ns != null) 498 return ns; 499 } 500 501 for (int i = schemas.size()-1; i >= 0; i--) { 502 XmlSchema schema = schemas.get(i); 503 Namespace ns = findNamespace(schema.prefix(), schema.namespace(), null, schemas); 504 if (ns != null) 505 return ns; 506 } 507 508 return null; 509 } 510 511 private static Namespace findNamespace(String prefix, String ns, List<Xml> xmls, List<XmlSchema> schemas) { 512 513 // If both prefix and namespace specified, use that Namespace mapping. 514 if (! (prefix.isEmpty() || ns.isEmpty())) 515 return Namespace.of(prefix, ns); 516 517 // If only prefix specified, need to search for namespaceURI. 518 if (! prefix.isEmpty()) { 519 if (xmls != null) 520 for (Xml xml2 : xmls) 521 if (xml2.prefix().equals(prefix) && ! xml2.namespace().isEmpty()) 522 return Namespace.of(prefix, xml2.namespace()); 523 for (XmlSchema schema : schemas) { 524 if (schema.prefix().equals(prefix) && ! schema.namespace().isEmpty()) 525 return Namespace.of(prefix, schema.namespace()); 526 for (XmlNs xmlNs : schema.xmlNs()) 527 if (xmlNs.prefix().equals(prefix)) 528 return Namespace.of(prefix, xmlNs.namespaceURI()); 529 } 530 throw new BeanRuntimeException("Found @Xml.prefix annotation with no matching URI. prefix='"+prefix+"'"); 531 } 532 533 // If only namespaceURI specified, need to search for prefix. 534 if (! ns.isEmpty()) { 535 if (xmls != null) 536 for (Xml xml2 : xmls) 537 if (xml2.namespace().equals(ns) && ! xml2.prefix().isEmpty()) 538 return Namespace.of(xml2.prefix(), ns); 539 for (XmlSchema schema : schemas) { 540 if (schema.namespace().equals(ns) && ! schema.prefix().isEmpty()) 541 return Namespace.of(schema.prefix(), ns); 542 for (XmlNs xmlNs : schema.xmlNs()) 543 if (xmlNs.namespaceURI().equals(ns)) 544 return Namespace.of(xmlNs.prefix(), ns); 545 } 546 } 547 548 return null; 549 } 550 551 /** 552 * Utility method that converts the current event on the XML stream to something human-readable for debug purposes. 553 * 554 * @param r The XML stream reader whose current event is to be converted to a readable string. 555 * @return The event in human-readable form. 556 */ 557 public static final String toReadableEvent(XMLStreamReader r) { 558 int t = r.getEventType(); 559 if (t == 1) 560 return "<"+r.getLocalName()+">"; 561 if (t == 2) 562 return "</"+r.getLocalName()+">"; 563 if (t == 3) 564 return "PROCESSING_INSTRUCTION"; 565 if (t == 4) 566 return "CHARACTERS=[" + r.getText() + "]"; 567 if (t == 5) 568 return "COMMENTS=[" + r.getText() + "]"; 569 if (t == 6) 570 return "SPACE=[" + r.getText() + "]"; 571 if (t == 7) 572 return "START_DOCUMENT"; 573 if (t == 8) 574 return "END_DOCUMENT"; 575 if (t == 9) 576 return "ENTITY_REFERENCE"; 577 if (t == 10) 578 return "ATTRIBUTE"; 579 if (t == 11) 580 return "DTD"; 581 if (t == 12) 582 return "CDATA=["+r.getText()+"]"; 583 if (t == 13) 584 return "NAMESPACE"; 585 if (t == 14) 586 return "NOTATION_DECLARATION"; 587 if (t == 15) 588 return "ENTITY_DECLARATION"; 589 return "UNKNOWN"; 590 } 591}