Sign In/My Account | View Cart  
advertisement


Listen Print Discuss

Pull Parsing in C# and Java
by Niel Bornstein | Pages: 1, 2

Java Pull Parsers

But pull parsers are not unique to the .NET world. The Java Community Process is currently working on a standard called StAX, the Streaming API for XML. This nascent API is, in turn, based upon several vendors' pull parser implementations, notably Apache's Xerces XNI, BEA's XML Stream API, XML Pull Parser 2, PullDOM (for Python), and, yes, Microsoft's XmlReader.

So how would we implement this same program in yet another pull parser, the Common API for XML Pull Parsing, or XPP? Let's take a look.

package com.xml;

import java.io.*;
import java.net.*;
import java.util.*;

import com.alexandriasc.xml.XMLWriter;
import org.xmlpull.v1.*;

public class RSSReader {

  public static void main(String [] args) {
    // create an instance of RSSReader
    RSSReader rssreader = new RSSReader();

    XMLWriter writer = null;
    try {
      String url = args[0];
      writer = new XMLWriter(new OutputStreamWriter(System.out),false);
      XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
      XmlPullParser parser = factory.newPullParser();
      InputStreamReader stream = new InputStreamReader(
        new URL(url).openStream());
      parser.setInput(stream);
      parser.setFeature(XmlPullParser.FEATURE_PROCESS_DOCDECL,false);
      rssreader.RSSToHtml(parser, writer);
    } catch (Exception e) {
      e.printStackTrace(System.err);
    } finally {
      try {
        writer.flush();
      } catch (IOException io) {
        io.printStackTrace(System.err);
      }
    }
  }

  public void RSSToHtml(XmlPullParser parser, XMLWriter writer)
  throws IOException, XmlPullParserException {
    // equivalent to XmlReader.MoveToContent()
    while (parser.next() != XmlPullParser.START_TAG
      && !parser.getName().equals("rss")) {
    }
    if (parser.getName().equals("rss")) {
      writer.beginElement("html");
      do {
        parser.next();
        if (parser.getEventType() == XmlPullParser.START_TAG
          && parser.getName().equals("channel")) {
          ChannelToHtml(parser, writer);
        } else if (parser.getEventType() == XmlPullParser.START_TAG
          && parser.getName().equals("item")) {
          ItemToHtml(parser, writer);
        }
      } while (parser.getEventType() != XmlPullParser.END_DOCUMENT);
      writer.endElement();
    } else {
      // not an RSS document!
    }
  }

  void ChannelToHtml(XmlPullParser parser, XMLWriter writer)
  throws IOException, XmlPullParserException {
    writer.beginElement("head");
    // scan header elements and pick out the title.
    while (!(parser.next() == XmlPullParser.END_TAG
      && parser.getName().equals("channel"))) {
      if (parser.getEventType() == XmlPullParser.START_TAG) {
        do {
          if (parser.getEventType() == XmlPullParser.START_TAG
            && parser.getName().equals("title")) {
            while (parser.next() != XmlPullParser.END_TAG) {
              if (parser.getEventType() == XmlPullParser.TEXT) {
                writer.writeElement("title",null,parser.getText());
                break;
              }
            }
            break;
          }
        } while (parser.next() != XmlPullParser.END_TAG);
        break;
      }
    }
    writer.endElement();

    writer.beginElement("body");
    // transform the items.
    do {
      if (parser.getEventType() == XmlPullParser.START_TAG 
        && parser.getName().equals("item")) {
        ItemToHtml(parser, writer);
      }
      parser.next();
    } while (parser.getEventType() != XmlPullParser.END_DOCUMENT);
    writer.endElement();
  }

  void ItemToHtml(XmlPullParser parser, XMLWriter writer)
  throws IOException, XmlPullParserException {
    writer.beginElement("p");

    String title = null, link = null, description = null;
    while (parser.next() != XmlPullParser.END_DOCUMENT
      && parser.getEventType() != XmlPullParser.END_TAG) {
      if (parser.getEventType() == XmlPullParser.START_TAG
        && parser.getName().equals("title")) {
        if (parser.next() == XmlPullParser.TEXT)
          title = parser.readText();
      } else if (parser.getEventType() == XmlPullParser.START_TAG
        && parser.getName().equals("link")) {
        if (parser.next() == XmlPullParser.TEXT)
          link = parser.readText();
      } else if (parser.getEventType() == XmlPullParser.START_TAG
        && parser.getName().equals("description")) {
        if (parser.next() == XmlPullParser.TEXT)
          description = parser.readText();
      }
    }
    HashMap attributes = new HashMap(1);
    attributes.put("href", link);
    writer.beginElement("a",attributes);
    writer.write(title);
    writer.endElement();

    writer.writeEmptyElement("br");

    writer.write(description);

    writer.endElement(); // end the "p" element
  }
}

C# Essentials

Related Reading

C# Essentials
By Ben Albahari, Peter Drayton, Brad Merrill

Most of our port was the reverse of our previous ports; for example, changing Console.Out to System.out, making method names start with lowercase letters, adding explicit throws clauses. The real meat of this port is in two areas.

The Parser

First, we're using XmlPullParser as a rough equivalent of XmlTextReader. One difference is that while we are able to instantiate an XmlTextReader directly in C# (remember, Microsoft is a one-stop shop), we have to use the Java XmlPullParserFactory to get a concrete implementation of the XmlPullParser interface. This should be a familiar exercise for anyone who's used JAXP or, for that matter, JDBC.

Once we have the parser, most of the method name equivalencies are obvious. Remember that in C# the == operator works just fine for strings, but in Java you must use the .equals() method; otherwise you'll be comparing object references rather than their values, not at all what we want to do. Also, you can't use a String as the expression in a switch...case statement in Java, so we've turned those into an if...else structure.

Another difference between the .NET XmlReader and the Java XmlPullParser has to do with the way in which events are pulled out of the XMLdocument. In the former, the ReadString() method will return all the text for the current element; while in the latter, next() must explicitly be called to position the parser at the text node before calling getText() or readText() to read the text.

This may be a minor difference, but it tends to make our port a little more difficult. To better handle this requirement, I've changed several while loops into do...while loops. This, unfortunately, makes it less than a simple port; the logic has changed, but not considerably.

The Writer

Second, there is no XmlTextWriter in Java, so we're using Alexandria Software Consulting's XmlHelper package, which contains a class called XMLWriter. Besides the naming of methods, XMLWriter operates almost identically to .NET's XmlWriter, except for two details.

First, XMLWriter has the notion of a collection of attributes, whereas XmlWriter requires you to write each attribute individually. In Java, we call beginElement(), passing the name and the Map of attributes, whereas in C#, we called WriteStartElement() followed by WriteAttributeString().

Second, XMLWriter has a writeEmptyElement() method, where XmlWriter requires you to call WriteStartElement() followed by WriteEndElement(). However, .NET automatically collapses an empty element into a short end element (in this case, <br />). .NET's way gives you the flexibility of determining whether the element is empty at runtime. If, however, you need to force an end tag, you can call WriteFullEndElement() instead of WriteEndElement().

Conclusion

A pull parser makes it much easier to process XML, especially when you are processing XML with a well-defined grammar like RSS. This code is much easier to understand and maintain since there's no complex state machine to build or maintain. In fact, this code is completely stateless; the pull parser keeps track of all the state for us. So in that sense a pull parser is a higher level way of processing XML than SAX.

Although my original code quite intentionally didn't do any error handling, error handling in a push model state machine adds even more complexity to an already complex model. The new RSSReader has clear placeholders for error handling code in the cases when the input doesn't comply with the expected RSS DTD.

Performance can be an important consideration in an XML parser. Notice the call to Skip() (in the C# version) when we find elements we're not interested in. In this case the XML parser can skip over entire subtrees of XML without having to call us back on every element, even ones we know we're not interested in. In this case we skip over the <image> elements and all their children. Second, in C# we could optimize out all the element name string comparisons and make the atomized pointer comparisons if we used the XmlReader's NameTable to pre-atomize those strings.

Finally, using an XML writer makes our output generation more robust. For example, it will correctly convert special characters -- <, &, etc. -- into their respective entity references. Because it maintains its own state internally, it never forgets which element to close after a convoluted series of while loops. And it will always produce XML output in the consistent and readable format of your choice.

And now for the inevitable comparison between .NET's XmlReader/XmlWriter and the equivalent functionality in Java. As usual, I'll say that in .NET, Microsoft has provided it all for you and, thus, it is undeniably simpler to learn and use. The C# version of our RSSReader is about 20% shorter than the Java version, which is great unless you work in one of those shops which still measures productivity in KLOCs. And the readability of the code itself is much greater in C#, although that probably can be chalked up at least in part to my own lack of skill in that conversion from while to do...while.

But the real bottom line remains that doing it the .NET way means that Microsoft provides all the standards-compliant tools that 90% of developers are likely to need, while the Java way still means putting together a solution from various pieces that you can scrounge from various sources. Some of those pieces come from the Java Community Process and thus represent peer-reviewed, formally approved APIs, but some come from a quick search of the Web, and in the end only you are qualified to judge their worthiness.


Comment on this articleHave you tried C# for XML applications? What about pull parsers versus SAX? Share your experience in our forum.
(* You must be a
member of XML.com to use this feature.)
Comment on this Article


Titles Only Titles Only Newest First
  • Returning a RSS Block
    2006-10-20 03:58:05 Phil_WBC [Reply]

    I've changed the code to use a specific URL and return the RRS Feed as a block to a page however I don't think I've done it correctly, can anyone tell me where I might have gone wrong? Thank you in advance!


    string url = "http://rss.news.yahoo.com/rss/topstories";
    XmlTextWriter writer = new XmlTextWriter(Console.Out);
    writer.Formatting = Formatting.Indented;
    HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
    WebResponse resp = wr.GetResponse();
    Stream stream = resp.GetResponseStream();
    XmlTextReader reader = new XmlTextReader(stream);
    reader.XmlResolver = null; // ignore the DTD
    reader.WhitespaceHandling = WhitespaceHandling.None;
    strRSS = rssreader.RSSToHtml(reader, writer);


    return strRSS;

    • Returning a RSS Block
      2006-10-20 06:15:41 Niel Bornstein [Reply]

      The problem with this approach is that rssreader.RSSToHtml() does not return anything. It expects to place its output in the XmlWriter that is passed in.


      What you want to do is have the XmlWriter write to a StringWriter instead of the console:


      ...
      StringWriter sw = new StringWriter();
      XmlTextWriter writer = new XmlTextWriter(sw);
      ...
      rssreader.RSSToHtml(reader, writer);
      return sw.ToString();

      • Returning a RSS Block
        2006-10-24 01:52:23 Phil_WBC [Reply]

        Thanks, this works perfectly. It's only returning one result (the top one) from the URL though, is it possible to return more?

        • Returning a RSS Block
          2006-10-25 08:28:24 Niel Bornstein [Reply]

          That could be a problem with the specific RSS feed or some bug in my code. RSS is really under-specified, so there's no way a small snippet like this could handle all the variants that are out there. Your best bet is find another RSS library that handles everything gracefully.

  • XML to HTML
    2003-01-22 08:17:59 CJ Varghese [Reply]

    I tried to use the code for C# in converting XML to HTML I get an error. On this line--> string url = args[0];
    error = Additional information: Index was outside the bounds of the array.


    it maybe something simple i'm forgetting.. also if you can tell me what would i chagne if i'm getting the xml from file rather than a URL.
    Thank you,
    cj

    • XML to HTML
      2003-01-22 08:39:35 Niel Bornstein [Reply]

      I guess I should have made it clearer that the program takes as an argument the URL of the file to parse. The error you got indicates that there were no arguments passed to the Main method. To invoke the program to read the XML at the URL http://xmlhack.com/rss.php, you should enter the following on the command line: "RSSReader http://xmlhack.com/rss.php".


      As to your second question, to take the input from a file instead of an HTTP URL, you would need to change some of the code inside the try/catch block of the Main method. It would need to look something like this:


      string url = args[0];
      XmlTextWriter writer = new XmlTextWriter(Console.Out);
      writer.Formatting = Formatting.Indented;
      XmlTextReader reader = new XmlTextReader(url);
      reader.XmlResolver = null; // ignore the DTD
      reader.WhitespaceHandling = WhitespaceHandling.None;
      rssreader.RSSToHtml(reader, writer);


      Invoke the program, passing in the filename as the argument. This would also work for HTTP URLs without a web proxy, by the way, as the XmlTextReader knows how to read a file directly from an HTTP URL as well as a local filename.

  • corrections about XMLPULL API ...
    2002-06-13 12:22:25 Aleksander Slominski [Reply]

    hi,


    first i wanted to compliment the article author about very good work.


    however as co-author of XMLPULL V1 API i would like to comment on issues brought in the article:


    (...) First, we're using XmlPullParser as a rough equivalent of XmlTextReader. One difference is that
    while we are able to instantiate an XmlTextReader directly in C# (remember, Microsoft is a
    one-stop shop), we have to use the Java XmlPullParserFactory to get a concrete implementation
    of the XmlPullParser interface. This should be
    a familiar exercise for anyone who's used JAXP or, for that matter, JDBC. (...)


    this allow to select alternate implementations of XMLPULL API
    (even such that works with WBXML) and does not lock users
    with one implementation (as XmlTextReader unfortunately does ...)


    (...) Another difference between the .NET XmlReader and the Java XmlPullParser has to
    do with the way in which events are pulled out of the XML document. In the former,
    the ReadString() method will return all the text for the
    current element; while in the latter, next() must explicitly be called
    to position the parser at the text node before
    calling getText() or readText() to read the text. (...)


    this was changed and in the newest XMLPULL API one can cal nextText()
    to get text content of element in one step.


    (...) This may be a minor difference, but it tends to make our port a little more difficult. To better
    handle this requirement, I've changed several while loops into do...while loops.
    This, unfortunately, makes it less than a simple port; the logic has changed, but not considerably. (...)


    so in short this is now fixed and change is no longer needed :-)


    there is one incompatibility introduced in the latest 1.0.8 release of XMLPULL API
    that is affecting your sample code: now it has parser.nextText() instead of parser.readText()
    consequently there i no need to call "if (parser.next() == XmlPullParser.TEXT)"
    (if possible please update sample as an appendix to the article).


    i have also modified RSSReader to correctly follow RSS DTD
    (http://my.netscape.com/publish/formats/rss-0.91.dtd)
    - the key difference is that channel content model allow title at any position and item is inside
    channel:
    <!ELEMENT channel (title | description | link | language | item+ | rating? | image? | textinput? |
    copyright? | pubDate? | lastBuildDate? | docs? | managingEditor? | webMaster? | skipHours? |
    skipDays?)*>
    <!ELEMENT item (title | link | description)*>


    the modified sample tries to validate input so it will detect if it is not RSS feed
    or if it has unexpected structure and will report it to the user
    - also for convenience now parser will skip also unknown
    top-level elements in channel. additionally though the modified sample is now more
    sophisticated it is now shorter and i think it demonstrates that XMLPULL API
    can work pretty good for parsing in Java! the modified sample is appended
    at the bottom of the message (and attached to email)


    also currently on XMLPULL-DEV mailing list we are working on a class
    to write XML to make it easy to do what you described with XMLPULL API
    and we welcome input from anybody interested in XML pull parsing and in XML
    serialization.


    thanks,


    alek


    ps. there is a bug in ItemToHtml() - assumption that link must be not null
    - if attribute value is null then Writer dies - fix it with
    (...) if(link != null) attributes.put("href", link);(...)
    the same fix is required for description to not print it if null:
    (...) if(description != null) writer.write(description); (...)
    - that is related to problem that as you can see in DTD
    item may or may not have link or description:
    <!ELEMENT item (title | link | description)*>


    ps2. this is modified version of the sample that was in the article:


    //package com.xml;
    // modifiad based by Aleksander Slominski
    // based on http://www.xml.com/pub/a/2002/05/22/parsing.html?page=2


    import java.io.*;
    import java.net.*;
    import java.util.*;


    import com.alexandriasc.xml.XMLWriter;
    import org.xmlpull.v1.*;


    public class RSSReader {


    public static void main(String [] args)
    {


    // create an instance of RSSReader
    RSSReader rssreader = new RSSReader();


    XMLWriter writer = null;
    try {
    writer = new XMLWriter(new OutputStreamWriter(System.out),false);
    XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
    String url = args[0];
    InputStreamReader stream = new InputStreamReader(
    new URL(url).openStream());
    parser.setInput(stream);
    rssreader.convertRSSToHtml(parser, writer);
    } catch (Exception e) {
    e.printStackTrace(System.err);
    }
    }


    public void convertRSSToHtml(XmlPullParser parser, XMLWriter writer)
    throws IOException, XmlPullParserException
    {
    // <!ELEMENT rss (channel)>
    if (parser.nextTag() == XmlPullParser.START_TAG
    && parser.getName().equals("rss"))
    {
    writer.beginElement("html");
    if (parser.nextTag() == XmlPullParser.START_TAG
    && parser.getName().equals("channel"))
    {
    convertChannelToHtml(parser, writer);
    parser.require(XmlPullParser.END_TAG, null, "channel");
    } else {
    new RuntimeException("expectd channel start tag not "+parser.getPositionDescription());
    }
    parser.nextTag();
    parser.require(XmlPullParser.END_TAG, null, "rss");
    writer.endElement();
    writer.flush();
    } else {
    throw new RuntimeException("expectd an RSS document at" + parser.getPositionDescription());
    }
    }


    public void convertChannelToHtml(XmlPullParser parser, XMLWriter writer)
    throws IOException, XmlPullParserException
    {
    // <!ELEMENT channel (title | description | link | language | item+ | rating? | image? | textinput? |
    copyright? | pubDate? | lastBuildDate? | docs? | managingEditor? | webMaster? | skipHours? |
    skipDays?)*>
    boolean seenBody = false; //assumption that title is before items ...
    while (parser.nextTag() != XmlPullParser.END_TAG) { // this guranteed by well formednes of XML &&
    parser.getName().equals("channel"))) {
    // if (parser.getEventType() == XmlPullParser.START_TAG) { //guranteed by nextTag
    // <!ELEMENT title (#PCDATA)>
    if(parser.getName().equals("title") && !seenBody) {
    writer.beginElement("head");
    writer.writeElement("title",null,parser.nextText());
    writer.endElement();
    } else if(parser.getName().equals("item")) {
    if(!seenBody) {
    writer.beginElement("body");
    seenBody = true;
    }
    convertItemToHtml(parser, writer);
    } else {
    // skip any element content including sub elements...
    int level = 1;
    while (level > 0) {
    switch(parser.next()) {
    case XmlPullParser.START_TAG: ++level; break;
    case XmlPullParser.END_TAG: --level; break;
    }
    }
    }
    }
    if(seenBody) writer.endElement();
    }


    public void convertItemToHtml(XmlPullParser parser, XMLWriter writer)
    throws IOException, XmlPullParserException
    {
    writer.beginElement("p");
    //<!ELEMENT item (title | link | description)*>
    String title = null, link = null, description = null;
    while (parser.nextTag() != XmlPullParser.END_TAG) {
    if (parser.getName().equals("title")) {
    title = parser.nextText();
    } else if (parser.getName().equals("link")) {
    link = parser.nextText();
    } else if (parser.getName().equals("description")) {
    description = parser.nextText();
    }
    }


    HashMap attributes = new HashMap(1);
    if(link != null) attributes.put("href", link);
    writer.beginElement("a",attributes);
    if(title != null) writer.write(title);
    writer.endElement();


    writer.writeEmptyElement("br");


    if(description != null) writer.write(description);


    writer.endElement(); // end the "p" element
    }
    }