git.aero2k.de Git - tmp/jakarta-migration.git/blob

   1 package org.collectionspace.services.common.xmljson;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.io.OutputStream;
   6 import java.util.Iterator;
   7 import java.util.Stack;
   8
   9 import javax.xml.namespace.QName;
  10 import javax.xml.stream.XMLEventReader;
  11 import javax.xml.stream.XMLInputFactory;
  12 import javax.xml.stream.XMLStreamConstants;
  13 import javax.xml.stream.XMLStreamException;
  14 import javax.xml.stream.events.Attribute;
  15 import javax.xml.stream.events.Namespace;
  16 import javax.xml.stream.events.StartElement;
  17 import javax.xml.stream.events.XMLEvent;
  18
  19 import com.fasterxml.jackson.core.JsonGenerationException;
  20 import com.fasterxml.jackson.databind.JsonMappingException;
  21 import com.fasterxml.jackson.databind.ObjectMapper;
  22
  23 /**
  24  * Converts a CSpace XML payload to a JSON payload.
  25  *
  26  * This class is not intended to serve as a general purpose XML to JSON
  27  * translator. It is instead a lightweight processor tuned for the kinds
  28  * of XML generated by CSpace, and the particular transformations needed
  29  * to generate JSON for CSpace. The XML input is expected to conform to
  30  * conventions (described below) of CSpace XML payloads.
  31  *
  32  * The conversion is performed as follows:
  33  * <ul>
  34  * <li>XML elements are converted to identically-named JSON fields.</li>
  35  * <li>XML attributes are converted to JSON fields prepended with "@".</li>
  36  * <li>XML namespace declarations are converted to JSON fields prepended with "@xmlns:".</li>
  37  * <li>Sibling XML elements that have the same name are converted to JSON arrays.</li>
  38  * </ul>
  39  *
  40  * This implementation is schema-unaware. It operates by examining only the input
  41  * document, without utilizing any XML schema information. This allows for speed
  42  * and simplicity, but has some consequences:
  43  *
  44  * <ul>
  45  * <li>Since type information is not available, all text content is converted to
  46  *     JSON strings.</li>
  47  * <li>Lists are inferred by the presence of multiple child elements with
  48  *     the same name. If an element contains only one child with a given name, it
  49  *     will not be converted to a JSON array, even if multiples are allowed by
  50  *     the XML schema.</li>
  51  * <li>Lists are not known ahead of time, and must be inferred by the presence of
  52  *     multiple identically-named children. This means that all children of an element
  53  *     must be known before JSON for that element can be generated. This makes it
  54  *     necessary to read the entire XML document into memory first, instead of
  55  *     doing a direct stream-to-stream conversion.</li>
  56  * </ul>
  57  *
  58  * Example:
  59  *
  60  * XML
  61  * <pre>
  62  * <document name="collectionobjects">
  63  *   <ns2:collectionspace_core xmlns:ns2="http://collectionspace.org/collectionspace_core/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  64  *     <createdBy>admin@core.collectionspace.org</createdBy>
  65  *     <createdAt>2016-07-27T04:31:38.290Z</createdAt>
  66  *   </ns2:collectionspace_core>
  67  *   <ns2:collectionobjects_common xmlns:ns2="http://collectionspace.org/services/collectionobject" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  68  *     <objectNumber>2016.1.1</objectNumber>
  69  *     <objectNameList>
  70  *       <objectNameGroup>
  71  *         <objectNameCurrency/>
  72  *         <objectNameLanguage/>
  73  *         <objectName>Object name</objectName>
  74  *         <objectNameSystem/>
  75  *         <objectNameType/>
  76  *         <objectNameNote/>
  77  *         <objectNameLevel/>
  78  *       </objectNameGroup>
  79  *       <objectNameGroup>
  80  *         <objectNameCurrency/>
  81  *         <objectNameLanguage/>
  82  *         <objectName>Another name</objectName>
  83  *         <objectNameSystem/>
  84  *         <objectNameType/>
  85  *         <objectNameNote/>
  86  *         <objectNameLevel/>
  87  *       </objectNameGroup>
  88  *     </objectNameList>
  89  *     <comments>
  90  *       <comment>Some comment text</comment>
  91  *       <comment>Another comment</comment>
  92  *     </comments>
  93  *   </ns2:collectionobjects_common>
  94  * </document>
  95  * </pre>
  96  *
  97  * JSON
  98  * <pre>
  99  * {
 100  *   "document": {
 101  *     "@name": "collectionobjects",
 102  *     "ns2:collectionspace_core": {
 103  *       "@xmlns:ns2": "http://collectionspace.org/collectionspace_core/",
 104  *       "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
 105  *       "createdBy": "admin@core.collectionspace.org",
 106  *       "createdAt": "2016-07-27T04:31:38.290Z"
 107  *     },
 108  *     "ns2:collectionobjects_common": {
 109  *       "@xmlns:ns2": "http://collectionspace.org/services/collectionobject",
 110  *       "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
 111  *       "objectNumber": "2016.1.1",
 112  *       "objectNameList": {
 113  *         "objectNameGroup": [
 114  *           {
 115  *             "objectNameCurrency": null,
 116  *             "objectNameLanguage": null,
 117  *             "objectName": "Object name",
 118  *             "objectNameSystem": null,
 119  *             "objectNameType": null,
 120  *             "objectNameNote": null,
 121  *             "objectNameLevel": null
 122  *           },
 123  *           {
 124  *             "objectNameCurrency": null,
 125  *             "objectNameLanguage": null,
 126  *             "objectName": "Another name",
 127  *             "objectNameSystem": null,
 128  *             "objectNameType": null,
 129  *             "objectNameNote": null,
 130  *             "objectNameLevel": null
 131  *           }
 132  *         ]
 133  *       },
 134  *       "comments": {
 135  *         "comment": [
 136  *           "Some comment text",
 137  *           "Another comment"
 138  *         ]
 139  *       }
 140  *     }
 141  *   }
 142  * }
 143  * </pre>
 144  *
 145  * The conversion algorithm assumes that the input XML adheres to the following
 146  * conventions:
 147  *
 148  * <ul>
 149  * <li>The XML does not contain mixed-content elements. Elements may contain text
 150  *     or child elements, but not both. If an element contains child elements,
 151  *     any text adjacent to those child elements is discarded.</li>
 152  * <li>The XML does not have namespace declarations or attributes on text elements.
 153  *     If namespace declarations or attributes appear on elements containing
 154  *     only text, they are discarded.</li>
 155  * <li>The XML does not contain sequences of identically-named elements that are
 156  *     interrupted by other elements; or if it does, those interruptions are not
 157  *     important. For example, the parent node below contains a list of item
 158  *     elements, interrupted by an other element:
 159  *
 160  *     <pre>
 161  *       <parent>
 162  *         <item>a</item>
 163  *         <item>b</item>
 164  *         <item>c</item>
 165  *         <other>uh oh</other>
 166  *         <item>d</item>
 167  *         <item>e</item>
 168  *       </parent>
 169  *     </pre>
 170  *
 171  *     This is translated to:
 172  *
 173  *     <pre>
 174  *       "parent": {
 175  *          "item": [
 176  *            "a",
 177  *            "b",
 178  *            "c",
 179  *            "d",
 180  *            "e"
 181  *          ],
 182  *          "other": "uh oh"
 183  *       }
 184  *     </pre>
 185  *
 186  *     All of the item children of parent are converted into a single
 187  *     list, so the placement of the other element is not retained in
 188  *     JSON.
 189  * </li>
 190  * </ul>
 191  *
 192  * This implementation uses a StAX parser to generate a lightweight
 193  * representation of the input XML document in memory, performs the
 194  * necessary transformations, and outputs a JSON rendering of the
 195  * transformed document. A direct stream-to-stream conversion is
 196  * not possible because of the need to collect identically-named
 197  * XML elements for output as a JSON array; for any element, all children
 198  * must be known before JSON for that element may be written to the
 199  * output stream.
 200  */
 201 public class XmlToJsonStreamConverter {
 202     /**
 203      * The StAX event reader used to parse the XML input stream.
 204      */
 205     protected XMLEventReader xmlEventReader;
 206
 207
 208     /**
 209      * The JSON output stream.
 210      */
 211     protected OutputStream jsonStream;
 212
 213     /**
 214      * A stack used to track the current state of XML parsing.
 215      * XmlNode instances are pushed onto the stack as elements
 216      * are entered, and popped off as elements are exited.
 217      */
 218     protected Stack<XmlNode> stack = new Stack<XmlNode>();
 219
 220     /**
 221      * The result of parsing the XML.
 222      */
 223     protected XmlNode parseResult = null;
 224
 225     /**
 226      * Creates an XmlToJsonStreamConverter that reads XML from an input stream,
 227      * and writes JSON to an output stream.
 228      *
 229      * @param in the XML input stream
 230      * @param out the JSON output stream
 231      * @throws XMLStreamException
 232      */
 233     public XmlToJsonStreamConverter(InputStream in, OutputStream out) throws XMLStreamException {
 234         XMLInputFactory factory = XMLInputFactory.newInstance();
 235
 236         xmlEventReader = factory.createXMLEventReader(in);
 237         jsonStream = out;
 238     }
 239
 240     /**
 241      * Performs the conversion.
 242      *
 243      * @throws XMLStreamException
 244      * @throws JsonGenerationException
 245      * @throws JsonMappingException
 246      * @throws IOException
 247      */
 248     public void convert() throws XMLStreamException, JsonGenerationException, JsonMappingException, IOException {
 249         // Read in the XML stream.
 250
 251         while(xmlEventReader.hasNext()) {
 252             XMLEvent event = xmlEventReader.nextEvent();
 253
 254             switch(event.getEventType()) {
 255                 case XMLStreamConstants.CHARACTERS:
 256                     onCharacters(event);
 257                     break;
 258                 case XMLStreamConstants.START_ELEMENT:
 259                     onStartElement(event);
 260                     break;
 261                 case XMLStreamConstants.END_ELEMENT:
 262                     onEndElement(event);
 263                     break;
 264                 case XMLStreamConstants.START_DOCUMENT:
 265                     onStartDocument(event);
 266                     break;
 267                 case XMLStreamConstants.END_DOCUMENT:
 268                     onEndDocument(event);
 269                     break;
 270             }
 271         }
 272
 273         // The XML has been parsed into parseResult.
 274         // Write it out as JSON.
 275
 276         ObjectMapper objectMapper = new ObjectMapper();
 277         objectMapper.writeValue(jsonStream, parseResult);
 278
 279         jsonStream.flush();
 280     }
 281
 282     /**
 283      * Event handler executed when the start of the XML document is
 284      * encountered in the input stream.
 285      *
 286      * @param event the event
 287      */
 288     protected void onStartDocument(XMLEvent event) {
 289         // Push an unnamed node on the stack to represent the
 290         // document.
 291
 292         stack.push(new XmlNode());
 293     }
 294
 295     /**
 296      * Event handler executed when the end of the XML document is
 297      * encountered in the input stream.
 298      *
 299      * @param event the event
 300      */
 301     protected void onEndDocument(XMLEvent event) {
 302         // The last remaining node on the stack should be
 303         // the one representing the document. Pop it and
 304         // store it in parseResult.
 305
 306         parseResult = stack.pop();
 307     }
 308
 309     /**
 310      * Event handler executed when the start of an XML element is
 311      * encountered in the input stream.
 312      *
 313      * @param event the event
 314      */
 315     @SuppressWarnings("unchecked")
 316     protected void onStartElement(XMLEvent event) {
 317         // Create a node to represent the element.
 318
 319         StartElement element = event.asStartElement();
 320         QName name = element.getName();
 321
 322         XmlNode node = new XmlNode(ConversionUtils.jsonFieldNameFromXMLQName(name));
 323
 324         // Add namespace declarations, if any.
 325
 326         Iterator<Namespace> nsIter = element.getNamespaces();
 327
 328         while(nsIter.hasNext()) {
 329             Namespace ns = nsIter.next();
 330
 331             node.addNamespace(ns.getPrefix(), ns.getNamespaceURI());
 332         }
 333
 334         // Add attributes, if any.
 335
 336         Iterator<Attribute> attrIter = element.getAttributes();
 337
 338         while(attrIter.hasNext()) {
 339             Attribute attr = attrIter.next();
 340
 341             node.addAttribute(attr.getName().toString(), attr.getValue());
 342         }
 343
 344         // Push the node onto the stack.
 345
 346         stack.push(node);
 347     }
 348
 349     /**
 350      * Event handler executed when the end of an XML element is
 351      * encountered in the input stream.
 352      *
 353      * @param event the event
 354      */
 355     protected void onEndElement(XMLEvent event) {
 356         // Pop the node corresponding to this element off the stack.
 357
 358         XmlNode node = stack.pop();
 359         XmlNode parent = stack.peek();
 360
 361         // Add the node to its parent. This is done here instead of
 362         // in onStartElement(), because we now know the entire contents
 363         // of the element. This gives us the possibility to prevent
 364         // adding elements that are empty. In onStartElement(), we don't
 365         // yet know if the element is going to be empty.
 366
 367         parent.addChild(node);
 368     }
 369
 370     /**
 371      * Event handler executed when character content is
 372      * encountered in the input stream.
 373      *
 374      * @param event the event
 375      */
 376     protected void onCharacters(XMLEvent event) {
 377         // Add the text to the parent element.
 378
 379         String text = event.asCharacters().getData();
 380         XmlNode parent = stack.peek();
 381
 382         parent.addText(text);
 383     }
 384 }