1 package org.collectionspace.services.common.xmljson;
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStream;
6 import java.util.Iterator;
7 import java.util.Stack;
9 import javax.xml.namespace.QName;
10 import javax.xml.stream.XMLEventReader;
11 import javax.xml.stream.XMLInputFactory;
12 import javax.xml.stream.XMLStreamConstants;
13 import javax.xml.stream.XMLStreamException;
14 import javax.xml.stream.events.Attribute;
15 import javax.xml.stream.events.Namespace;
16 import javax.xml.stream.events.StartElement;
17 import javax.xml.stream.events.XMLEvent;
19 import com.fasterxml.jackson.core.JsonGenerationException;
20 import com.fasterxml.jackson.databind.JsonMappingException;
21 import com.fasterxml.jackson.databind.ObjectMapper;
24 * Converts a CSpace XML payload to a JSON payload.
26 * This class is not intended to serve as a general purpose XML to JSON
27 * translator. It is instead a lightweight processor tuned for the kinds
28 * of XML generated by CSpace, and the particular transformations needed
29 * to generate JSON for CSpace. The XML input is expected to conform to
30 * conventions (described below) of CSpace XML payloads.
32 * The conversion is performed as follows:
34 * <li>XML elements are converted to identically-named JSON fields.</li>
35 * <li>XML attributes are converted to JSON fields prepended with "@".</li>
36 * <li>XML namespace declarations are converted to JSON fields prepended with "@xmlns:".</li>
37 * <li>Sibling XML elements that have the same name are converted to JSON arrays.</li>
40 * This implementation is schema-unaware. It operates by examining only the input
41 * document, without utilizing any XML schema information. This allows for speed
42 * and simplicity, but has some consequences:
 * <li>Since type information is not available, all text content is converted to JSON strings.</li>
47 * <li>Lists are inferred by the presence of multiple child elements with
48 * the same name. If an element contains only one child with a given name, it
49 * will not be converted to a JSON array, even if multiples are allowed by
50 * the XML schema.</li>
51 * <li>Lists are not known ahead of time, and must be inferred by the presence of
52 * multiple identically-named children. This means that all children of an element
53 * must be known before JSON for that element can be generated. This makes it
54 * necessary to read the entire XML document into memory first, instead of
55 * doing a direct stream-to-stream conversion.</li>
62 * <document name="collectionobjects">
63 * <ns2:collectionspace_core xmlns:ns2="http://collectionspace.org/collectionspace_core/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
64 * <createdBy>admin@core.collectionspace.org</createdBy>
65 * <createdAt>2016-07-27T04:31:38.290Z</createdAt>
66 * </ns2:collectionspace_core>
67 * <ns2:collectionobjects_common xmlns:ns2="http://collectionspace.org/services/collectionobject" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
68 * <objectNumber>2016.1.1</objectNumber>
71 * <objectNameCurrency/>
72 * <objectNameLanguage/>
73 * <objectName>Object name</objectName>
80 * <objectNameCurrency/>
81 * <objectNameLanguage/>
82 * <objectName>Another name</objectName>
90 * <comment>Some comment text</comment>
91 * <comment>Another comment</comment>
93 * </ns2:collectionobjects_common>
101 * "@name": "collectionobjects",
102 * "ns2:collectionspace_core": {
103 * "@xmlns:ns2": "http://collectionspace.org/collectionspace_core/",
104 * "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
105 * "createdBy": "admin@core.collectionspace.org",
106 * "createdAt": "2016-07-27T04:31:38.290Z"
108 * "ns2:collectionobjects_common": {
109 * "@xmlns:ns2": "http://collectionspace.org/services/collectionobject",
110 * "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
111 * "objectNumber": "2016.1.1",
112 * "objectNameList": {
113 * "objectNameGroup": [
115 * "objectNameCurrency": null,
116 * "objectNameLanguage": null,
117 * "objectName": "Object name",
118 * "objectNameSystem": null,
119 * "objectNameType": null,
120 * "objectNameNote": null,
121 * "objectNameLevel": null
124 * "objectNameCurrency": null,
125 * "objectNameLanguage": null,
126 * "objectName": "Another name",
127 * "objectNameSystem": null,
128 * "objectNameType": null,
129 * "objectNameNote": null,
130 * "objectNameLevel": null
136 * "Some comment text",
 * The conversion algorithm assumes that the input XML adheres to the following conventions:
149 * <li>The XML does not contain mixed-content elements. Elements may contain text
150 * or child elements, but not both. If an element contains child elements,
151 * any text adjacent to those child elements is discarded.</li>
152 * <li>The XML does not have namespace declarations or attributes on text elements.
153 * If namespace declarations or attributes appear on elements containing
154 * only text, they are discarded.</li>
155 * <li>The XML does not contain sequences of identically-named elements that are
156 * interrupted by other elements; or if it does, those interruptions are not
157 * important. For example, the parent node below contains a list of item
158 * elements, interrupted by an other element:
165 * <other>uh oh</other>
171 * This is translated to:
 * All of the item children of parent are converted into a single
 * list, so the placement of the other element is not retained in the JSON output.
192 * This implementation uses a StAX parser to generate a lightweight
193 * representation of the input XML document in memory, performs the
194 * necessary transformations, and outputs a JSON rendering of the
195 * transformed document. A direct stream-to-stream conversion is
196 * not possible because of the need to collect identically-named
197 * XML elements for output as a JSON array; for any element, all children
 * must be known before JSON for that element may be written to the output stream.
201 public class XmlToJsonStreamConverter {
/**
 * The StAX event reader used to parse the XML input stream.
 * Created in the constructor from the supplied input stream.
 */
protected XMLEventReader xmlEventReader;

/**
 * The JSON output stream.
 * The serialized parse result is written here at the end of convert().
 */
protected OutputStream jsonStream;

/**
 * A stack used to track the current state of XML parsing.
 * XmlNode instances are pushed onto the stack as elements
 * are entered, and popped off as elements are exited.
 */
protected Stack<XmlNode> stack = new Stack<XmlNode>();

/**
 * The result of parsing the XML.
 * Remains null until the end of the document has been reached,
 * at which point the node popped from the stack is stored here.
 */
protected XmlNode parseResult = null;
/**
 * Creates an XmlToJsonStreamConverter that reads XML from an input stream,
 * and writes JSON to an output stream.
 *
 * The StAX factory is configured to reject DTDs and external entities, so
 * that a payload cannot make the parser resolve external resources (XXE).
 *
 * @param in the XML input stream
 * @param out the JSON output stream
 * @throws XMLStreamException if an event reader cannot be created for the input stream
 */
public XmlToJsonStreamConverter(InputStream in, OutputStream out) throws XMLStreamException {
    XMLInputFactory factory = XMLInputFactory.newInstance();

    // The XML is supplied from outside this class, so harden the parser:
    // no DTD processing and no external entity resolution.
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

    xmlEventReader = factory.createXMLEventReader(in);

    // Remember the destination; convert() writes the JSON to it.
    jsonStream = out;
}
241 * Performs the conversion.
243 * @throws XMLStreamException
244 * @throws JsonGenerationException
245 * @throws JsonMappingException
246 * @throws IOException
248 public void convert() throws XMLStreamException, JsonGenerationException, JsonMappingException, IOException {
249 // Read in the XML stream.
251 while(xmlEventReader.hasNext()) {
252 XMLEvent event = xmlEventReader.nextEvent();
254 switch(event.getEventType()) {
255 case XMLStreamConstants.CHARACTERS:
258 case XMLStreamConstants.START_ELEMENT:
259 onStartElement(event);
261 case XMLStreamConstants.END_ELEMENT:
264 case XMLStreamConstants.START_DOCUMENT:
265 onStartDocument(event);
267 case XMLStreamConstants.END_DOCUMENT:
268 onEndDocument(event);
273 // The XML has been parsed into parseResult.
274 // Write it out as JSON.
276 ObjectMapper objectMapper = new ObjectMapper();
277 objectMapper.writeValue(jsonStream, parseResult);
283 * Event handler executed when the start of the XML document is
284 * encountered in the input stream.
286 * @param event the event
288 protected void onStartDocument(XMLEvent event) {
289 // Push an unnamed node on the stack to represent the
292 stack.push(new XmlNode());
296 * Event handler executed when the end of the XML document is
297 * encountered in the input stream.
299 * @param event the event
301 protected void onEndDocument(XMLEvent event) {
302 // The last remaining node on the stack should be
303 // the one representing the document. Pop it and
304 // store it in parseResult.
306 parseResult = stack.pop();
310 * Event handler executed when the start of an XML element is
311 * encountered in the input stream.
313 * @param event the event
315 @SuppressWarnings("unchecked")
316 protected void onStartElement(XMLEvent event) {
317 // Create a node to represent the element.
319 StartElement element = event.asStartElement();
320 QName name = element.getName();
322 XmlNode node = new XmlNode(ConversionUtils.jsonFieldNameFromXMLQName(name));
324 // Add namespace declarations, if any.
326 Iterator<Namespace> nsIter = element.getNamespaces();
328 while(nsIter.hasNext()) {
329 Namespace ns = nsIter.next();
331 node.addNamespace(ns.getPrefix(), ns.getNamespaceURI());
334 // Add attributes, if any.
336 Iterator<Attribute> attrIter = element.getAttributes();
338 while(attrIter.hasNext()) {
339 Attribute attr = attrIter.next();
341 node.addAttribute(attr.getName().toString(), attr.getValue());
344 // Push the node onto the stack.
350 * Event handler executed when the end of an XML element is
351 * encountered in the input stream.
353 * @param event the event
355 protected void onEndElement(XMLEvent event) {
356 // Pop the node corresponding to this element off the stack.
358 XmlNode node = stack.pop();
359 XmlNode parent = stack.peek();
361 // Add the node to its parent. This is done here instead of
362 // in onStartElement(), because we now know the entire contents
363 // of the element. This gives us the possibility to prevent
364 // adding elements that are empty. In onStartElement(), we don't
365 // yet know if the element is going to be empty.
367 parent.addChild(node);
371 * Event handler executed when character content is
372 * encountered in the input stream.
374 * @param event the event
376 protected void onCharacters(XMLEvent event) {
377 // Add the text to the parent element.
379 String text = event.asCharacters().getData();
380 XmlNode parent = stack.peek();
382 parent.addText(text);