Mega Code Archive

 
Categories / Java / Data Type
 

A simple XML parser that starts parsing right away and validates along the way

//Copyright 2007-2008 David Yu dyuproject@gmail.com
//------------------------------------------------------------------------
//Licensed under the Apache License, Version 2.0 (the "License");
//you may not use this file except in compliance with the License.
//You may obtain a copy of the License at 
//http://www.apache.org/licenses/LICENSE-2.0
//Unless required by applicable law or agreed to in writing, software
//distributed under the License is distributed on an "AS IS" BASIS,
//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//See the License for the specific language governing permissions and
//limitations under the License.

//package com.dyuproject.util.xml;

import java.io.IOException;
import java.io.InputStreamReader;

/**
 * A simple XML parser that starts parsing right away and validates along the way.
 * <p>
 * The parser is a character-driven state machine: every character read is 
 * dispatched first on the character itself and then on the current state.  
 * Element names, attributes and text are sliced directly out of the read 
 * buffer, using {@code mark} as the index of the character that precedes the 
 * token currently being collected ({@code -1} when nothing is being collected).
 * <p>
 * Known limitation of the state machine (kept as is): text that directly follows 
 * the end tag of a child element is not reported to the handler.
 * 
 * @author David Yu
 * @created Sep 17, 2008
 */
public final class XMLParser
{
    
    private static final int STATE_EL_STARTING = 1;
    private static final int STATE_EL_STARTED = 2;
    private static final int STATE_EL_ENDING = 3;
    private static final int STATE_EL_ENDED = 4;
    private static final int STATE_EL_ATTR_NAME_START = 5;
    private static final int STATE_EL_ATTR_VALUE_START = 6;
    private static final int STATE_EL_ATTR_VALUE_END = 7;
    
    private static final int STATE_EL_TEXT = 8;
    private static final int STATE_COMMENT_STARTING = 9;
    private static final int STATE_COMMENT_DASH_START = 10;
    private static final int STATE_COMMENT_STARTED = 11;
    private static final int STATE_COMMENT_DASH_END = 12;
    private static final int STATE_COMMENT_ENDING = 13;
    private static final int STATE_IGNORE = 14;
    private static final int STATE_CDATA_STARTING = 15;
    private static final int STATE_CDATA_STARTED = 16;
    private static final int STATE_CDATA_ENDING = 17;
    private static final int STATE_CDATA_ENDED = 18;
    
    private static int __defaultBufferSize = 4096;
    
    /**
     * Sets the buffer size used by 
     * {@link #parse(InputStreamReader, LazyHandler, boolean)}.
     */
    public static void setDefaultBufferSize(int size)
    {
        __defaultBufferSize = size;
    }
    
    /**
     * Lazily parses the given {@code reader} using the default buffer size 
     * {@link #__defaultBufferSize}.  The parsing can be terminated by 
     * the {@link LazyHandler} {@code handler} at any point.
     */
    public static void parse(InputStreamReader reader, LazyHandler handler, 
            boolean includeInnerText) throws IOException
    {
        parse(reader, handler, includeInnerText, __defaultBufferSize);
    }
    
    /**
     * Lazily parses the given {@code reader}.  The parsing can be terminated by 
     * the {@link LazyHandler} {@code handler} at any point (by returning 
     * {@code false} from one of its boolean callbacks).
     * <p>
     * The reported events do not depend on {@code bufferSize}: a token that 
     * straddles two reads is kept in the buffer (which grows if a single token 
     * is longer than the buffer), and a text run that straddles two reads is 
     * delivered through several {@link LazyHandler#characters} calls.
     * 
     * @param reader the source of characters; it is not closed by this method.
     * @param handler the receiver of the parse events, must not be null.
     * @param includeInnerText whether text and CDATA content is reported via 
     * {@link LazyHandler#characters}.
     * @param bufferSize the initial size of the read buffer, must be positive.
     * @throws IllegalArgumentException if {@code handler} is null or 
     * {@code bufferSize} is less than 1.
     * @throws IOException if reading fails or the input is detected to be 
     * invalid xml.
     */
    public static void parse(InputStreamReader reader, LazyHandler handler, 
            boolean includeInnerText, int bufferSize) throws IOException
    {
        if(handler==null)
            throw new IllegalArgumentException("LazyHandler arg must not be null.");
        if(bufferSize<1)
            throw new IllegalArgumentException("bufferSize must be positive: " + bufferSize);
        
        char[] cbuf = new char[bufferSize];
        int offset = 0;
        int len = 0;
        int state = 0;
        int stateBeforeComment = 0;
        // index of the char preceding the token being collected, -1 if none
        int mark = -1;
        // only ever compared against -1: set once the element name has been reported
        int elwsMark = -1;
        // index of the ':' separating namespace prefix and local name, -1 if none
        int nsMark = -1;
        String attrName = null;
        boolean dq = true;
        boolean searchRoot = true;
        
        while((len = reader.read(cbuf, offset, cbuf.length-offset))!=-1)
        {
            for(int i=0; i<len; i++, offset++)
            {
                char c = cbuf[offset];
                switch(c)
                {
                    case '<':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --< comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case 0:
                                state = STATE_EL_STARTING;
                                mark = offset;
                                continue;
                            case STATE_EL_ENDED:
                            case STATE_EL_STARTED:
                                stateBeforeComment = state;
                                state = STATE_EL_STARTING;
                                mark = offset;
                                continue;
                            case STATE_EL_TEXT:
                                stateBeforeComment = state;
                                state = STATE_EL_STARTING;
                                if(mark!=-1 && includeInnerText)
                                {
                                    handler.characters(cbuf, mark+1, offset-mark-1);
                                }
                                mark = offset;
                                continue;
                        }
                        continue;
                    
                    case '>':
                        switch(state)
                        {
                            case STATE_IGNORE:
                                if(stateBeforeComment==0)
                                    state = 0;
                                continue;
                            case STATE_EL_TEXT:// uncommented text
                            case STATE_COMMENT_STARTED:
                                continue;
                            case STATE_CDATA_ENDING:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_ENDING:
                                state = STATE_EL_ENDED;
                                if(!handler.endElement())
                                    return;
                                elwsMark = -1;
                                continue;
                            case STATE_EL_ATTR_NAME_START:
                            case STATE_EL_STARTING:
                                // elwsMark!=-1 means the name was already reported at 
                                // the first whitespace after it
                                if(elwsMark==-1)
                                {
                                    if(!reportStart(handler, cbuf, mark, nsMark, offset, searchRoot))
                                        return;
                                    searchRoot = false;
                                }
                                nsMark = -1;
                                elwsMark = -1;
                                state = STATE_EL_STARTED;
                                mark = -1;
                                continue;
                            case STATE_COMMENT_ENDING:
                                state = stateBeforeComment;
                                continue;
                            case STATE_CDATA_ENDED:
                                state = STATE_EL_TEXT;
                                if(mark!=-1 && includeInnerText)
                                {
                                    // exclude the two ']' that precede this '>'
                                    handler.characters(cbuf, mark+1, offset-2-mark-1);
                                }
                                mark = offset;
                                continue;
                        }
                        continue;
                    
                    case '/':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --/ comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_ATTR_NAME_START:
                                mark = -1;
                                state = STATE_EL_ENDING;
                                continue;
                            case STATE_EL_STARTED:
                                state = STATE_EL_TEXT;
                                mark = offset-1;
                                continue;
                            case STATE_EL_STARTING:
                                // Report the name of a self-closing element (<foo/>).  
                                // Skip when this is an end tag (</foo>, the '/' directly 
                                // follows the '<') or when the name was already reported 
                                // before its attributes (<foo a="b"/>).
                                if(elwsMark==-1 && mark+1!=offset)
                                {
                                    if(!reportStart(handler, cbuf, mark, nsMark, offset, searchRoot))
                                        return;
                                    searchRoot = false;
                                }
                                state = STATE_EL_ENDING;
                                elwsMark = -1;
                                nsMark = -1;
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case ':':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --: comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_STARTING:
                                // a second ':' in an element name
                                if(nsMark!=-1)
                                    throw new IOException("invalid xml.");
                                nsMark = offset;
                                continue;
                        }
                        continue;
                    
                    case '?':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --? comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_STARTING:
                                // uncommented text
                                if(stateBeforeComment==STATE_EL_TEXT)
                                    continue;
                                state = STATE_COMMENT_STARTING;
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case '!':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --! comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_STARTING:
                                state = STATE_COMMENT_STARTING;
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case '[':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --[ comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_COMMENT_STARTING:
                                state = STATE_CDATA_STARTING;
                                if(mark!=-1 && includeInnerText)
                                {
                                    handler.characters(cbuf, mark+1, offset-2-mark-1);
                                }
                                mark = -1;
                                continue;
                            case STATE_CDATA_STARTING:
                                state = STATE_CDATA_STARTED;
                                mark = offset;
                                continue;
                        }
                        continue;
                    
                    case ']':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --] comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_CDATA_STARTED:
                                state = STATE_CDATA_ENDING;
                                continue;
                            case STATE_CDATA_ENDING:
                                state = STATE_CDATA_ENDED;
                                continue;
                        }
                        continue;
                    
                    case '-':
                        switch(state)
                        {
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_STARTING:
                                state = STATE_COMMENT_DASH_START;
                                continue;
                            case STATE_COMMENT_DASH_START:
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_COMMENT_STARTED:
                                state = STATE_COMMENT_DASH_END;
                                continue;
                            case STATE_COMMENT_DASH_END:
                                state = STATE_COMMENT_ENDING;
                                continue;
                            case STATE_COMMENT_ENDING:// handle ---- text
                                state = STATE_COMMENT_STARTED;
                                continue;
                        }
                        continue;
                    
                    case '=':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --= comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_ATTR_NAME_START:
                                state = STATE_EL_ATTR_VALUE_START;
                                attrName = new String(cbuf, mark+1, offset-mark-1).trim();
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case '\'':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --' comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_ATTR_VALUE_START:
                                dq = false;
                                state = STATE_EL_ATTR_VALUE_END;
                                mark = offset;
                                continue;
                            case STATE_EL_ATTR_VALUE_END:
                                // a single quote inside a double-quoted value
                                if(dq)
                                    continue;
                                state = STATE_EL_STARTING;
                                handler.attribute(attrName, 
                                        new String(cbuf, mark+1, offset-mark-1).trim());
                                attrName = null;
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case '"':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_ENDING://handle --" comments
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_ATTR_VALUE_START:
                                dq = true;
                                state = STATE_EL_ATTR_VALUE_END;
                                mark = offset;
                                continue;
                            case STATE_EL_ATTR_VALUE_END:
                                // a double quote inside a single-quoted value
                                if(!dq)
                                    continue;
                                state = STATE_EL_STARTING;
                                handler.attribute(attrName, 
                                        new String(cbuf, mark+1, offset-mark-1).trim());
                                attrName = null;
                                mark = -1;
                                continue;
                        }
                        continue;
                    
                    case ' ':
                    case '\t':
                    case '\r':
                    case '\n':
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_EL_STARTING:
                                state = STATE_EL_ATTR_NAME_START;
                                // the first whitespace after '<name' ends the name
                                if(elwsMark==-1)
                                {
                                    if(!reportStart(handler, cbuf, mark, nsMark, offset, searchRoot))
                                        return;
                                    searchRoot = false;
                                }
                                nsMark = -1;
                                elwsMark = offset;
                                mark = offset;
                                continue;
                        }
                        continue;
                    
                    default:
                        switch(state)
                        {
                            case STATE_COMMENT_STARTED:
                            case STATE_IGNORE:
                                continue;
                            case STATE_COMMENT_DASH_END:
                            case STATE_COMMENT_ENDING:
                                state = STATE_COMMENT_STARTED;
                                continue;
                            case STATE_CDATA_ENDING:
                            case STATE_CDATA_ENDED:
                                state = STATE_CDATA_STARTED;
                                continue;
                            case STATE_EL_STARTED:
                                state = STATE_EL_TEXT;
                                if(includeInnerText)
                                    mark = offset-1;
                                continue;
                            case STATE_COMMENT_STARTING:
                                // '<!x' or '<?x' is only accepted before the root element
                                if(stateBeforeComment!=0)
                                    throw new IOException("invalid xml.");
                                mark = -1;
                                state = STATE_IGNORE;
                                continue;
                            case STATE_COMMENT_DASH_START:
                                throw new IOException("invalid xml.");
                        }
                        continue;
                }
            }
            
            // The chars read so far are consumed: decide what must survive in the 
            // buffer and where the next read continues.
            if(mark==-1)
            {
                // Nothing is being collected.  When text may start with the very 
                // next char, its mark (offset-1) must not collide with the -1 
                // sentinel, so index 0 is left unused in that case.
                offset = state==STATE_EL_STARTED ? 1 : 0;
            }
            else if(state==STATE_EL_TEXT || state==STATE_CDATA_STARTED)
            {
                if(includeInnerText)
                {
                    // flush the text collected so far (mark+1 .. offset-1)
                    int pending = offset-mark-1;
                    if(pending>0)
                        handler.characters(cbuf, mark+1, pending);
                    // The text run continues with the next read: park the mark on 
                    // index 0 so the continuation starts at mark+1 as usual.
                    mark = 0;
                    offset = 1;
                }
                else
                {
                    offset = 0;
                    mark = -1;
                }
            }
            else
            {
                // move the unfinished token (starting at mark) to the front
                int copyLen = offset - mark;
                if(mark>0)
                {
                    System.arraycopy(cbuf, mark, cbuf, 0, copyLen);
                    // nsMark indexes into the same token, shift it along
                    if(nsMark>mark)
                        nsMark -= mark;
                    mark = 0;
                }
                offset = copyLen;
            }
            
            if(offset==cbuf.length)
            {
                // No room left: a zero-length read would return 0 forever, so grow.
                char[] grown = new char[cbuf.length*2];
                System.arraycopy(cbuf, 0, grown, 0, offset);
                cbuf = grown;
            }
        }
    }
    
    /**
     * Extracts the element name (and namespace prefix, if {@code nsMark} is set) 
     * that lies between {@code mark} and {@code end} (both exclusive) and reports 
     * it as the root element if {@code root} is true, otherwise as a start element.
     * 
     * @return the value returned by the handler callback; false stops the parse.
     */
    private static boolean reportStart(LazyHandler handler, char[] cbuf, int mark, 
            int nsMark, int end, boolean root)
    {
        String name = null;
        String namespace = null;
        if(nsMark==-1)
            name = new String(cbuf, mark+1, end-mark-1).trim();
        else
        {
            namespace = new String(cbuf, mark+1, nsMark-mark-1).trim();
            name = new String(cbuf, nsMark+1, end-nsMark-1).trim();
        }
        return root ? handler.rootElement(name, namespace) : 
            handler.startElement(name, namespace);
    }

}

/**
 * Receives the events produced by {@link XMLParser}.  Returning {@code false} 
 * from any of the boolean callbacks terminates the parsing.
 */
interface LazyHandler
{
    
    /**
     * Callback that gets called only once upon traversing the root xml element.
     * The {@code namespace} is the prefix before the ':' of the element name, or 
     * null if there is none.
     */
    public boolean rootElement(String name, String namespace);
    
    /**
     * Callback after traversing the start of xml elements (E.g &lt;foo&gt;).
     * The {@code namespace} is the prefix before the ':' of the element name, or 
     * null if there is none.
     */
    public boolean startElement(String name, String namespace);
    
    /**
     * Callback after traversing the end of xml elements (E.g &lt;/foo&gt; or 
     * /&gt;).
     */
    public boolean endElement();
    
    /**
     * Callback after traversing the attributes of an element.
     */
    public void attribute(String name, String value);
    
    /**
     * Callback after traversing the text content of an element.  May be called 
     * more than once for a single text run, and with a {@code length} of zero.
     * The {@code data} array is the parser's read buffer, so its content is only 
     * valid during the call.
     */
    public void characters(char[] data, int start, int length);

}

/*

//Copyright 2007-2008 David Yu dyuproject@gmail.com
//------------------------------------------------------------------------
//Licensed under the Apache License, Version 2.0 (the "License");
//you may not use this file except in compliance with the License.
//You may obtain a copy of the License at 
//http://www.apache.org/licenses/LICENSE-2.0
//Unless required by applicable law or agreed to in writing, software
//distributed under the License is distributed on an "AS IS" BASIS,
//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//See the License for the specific language governing permissions and
//limitations under the License.

//package com.dyuproject.util.xml;

import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import junit.framework.TestCase;

//* @author David Yu
//* @created Sep 18, 2008

public class XMLParserTest extends TestCase
{
  
  static final String prefix = "com/dyuproject/util/xml/";
  
  static URL getResource(String resource)
  {
      return Thread.currentThread().getContextClassLoader().getResource(prefix + resource);
  }
  
  public void testNamespace() throws Exception
  {
      String url = "http://open.login.yahooapis.com/openid20/www.yahoo.com/xrds";
      HttpURLConnection con = (HttpURLConnection)new URL(url).openConnection();
      con.setRequestMethod("GET");
      con.setDefaultUseCaches(false);
      con.setInstanceFollowRedirects(false);
      con.setDoInput(true);
      con.connect();
      SimpleHandler handler = new SimpleHandler();
      InputStreamReader reader = new InputStreamReader(con.getInputStream());
      try
      {
          XMLParser.parse(reader, handler, true);
          Node xrds = handler.getNode();
          assertEquals("xrds", xrds.getNamespace());
          Node xrd = xrds.getNode("xrd");
          Node service = xrd.getNode("service");
          assertTrue(0!=service.getNodes("type").size());
          assertEquals("xrds", service.getLastNode().getNamespace());
      }
      finally
      {
          reader.close();
          con.disconnect();
      }
  }
  
  public void testSimple() throws Exception
  {
      SimpleHandler handler = new SimpleHandler();
      InputStreamReader reader = new InputStreamReader(getResource("simple.xml").openStream());
      try
      {
          XMLParser.parse(reader, handler, true);
          Node root = handler.getNode();
          assertEquals("root", root.getName());
          Node foo = root.getNode("foo");
          assertNotNull(foo);
          assertEquals(foo.getText().toString(), "baz");
          Node bar = foo.getNode("bar");
          assertNotNull(bar);
      }
      finally
      {
          reader.close();
      }
  }
  
  public void testTrimAndCDATA() throws Exception
  {
      SimpleHandler handler = new SimpleHandler();
      InputStreamReader reader = new InputStreamReader(getResource("xrds").openStream());
      try
      {
          XMLParser.parse(reader, handler, true);
          Node xrds = handler.getNode();
          assertEquals("xrds", xrds.getNamespace());
          Node xrd = xrds.getNode("xrd");
          Node service = xrd.getNode("service");
          assertTrue(0!=service.getNodes("type").size());
          assertEquals("xrds", service.getLastNode().getNamespace());
          Node foo = xrds.getNode("FOO");
          assertNotNull(foo);
          assertEquals(foo.getText().toString(), "I am a cdata text. yep\nyep");
          System.err.println(foo.getText().toString());
      }
      finally
      {
          reader.close();
      }
  }
  
  public void testSiteXrds() throws Exception
  {
      SimpleHandler handler = new SimpleHandler();
      InputStreamReader reader = new InputStreamReader(getResource("site-xrds").openStream());
      try
      {
          XMLParser.parse(reader, handler, true);
          Node xrds = handler.getNode();
          assertNotNull(xrds);
          assertEquals("xrds", xrds.getNamespace());
          assertEquals("XRDS", xrds.getName());
          Node signature = xrds.getNode("Signature");
          assertNotNull(signature);
          assertEquals("ds", signature.getNamespace());
          Node xrd = xrds.getNode("XRD");
          assertNotNull(xrd);
          Node canonicalID = xrd.getNode("CanonicalID");
          assertNotNull(canonicalID);
          assertEquals("dyuproject.com", canonicalID.getText().toString());
          Node service = xrd.getNode("Service");
          assertNotNull(service);
          Node uri = service.getNode("URI");
          assertNotNull(uri);
          assertEquals("https://www.google.com/a/dyuproject.com/o8/ud?be=o8", uri.getText().toString());
      }
      finally
      {
          reader.close();
      }
  }

}

*/