[SoapRMI] xpath and fragments

Andy Harrison andyh_at_agaricus.co.uk
Mon, 19 Feb 2007 11:33:24 +0000


This is a multi-part message in MIME format.
--------------070505080500030808010600
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit

Hi,

I've been seeing different behaviour for XPath expressions depending on
whether I parse the entire document (builder.parse(xpp)) or parse only
a fragment (builder.parseFragment(xpp)).

Basically, I can't apply any XPath expressions to fragments! I have
attached a full example. I use the expression "//w:p", which should
return all paragraphs in a WordML document.

If anyone can explain this, I'd be most grateful. It seems I can only
get XPath to work against fragments if there are no namespace declarations.

thanks,
Andy Harrison

--------------070505080500030808010600
Content-Type: text/java;
 name="TestWordML.java"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="TestWordML.java"



import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;


import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;
import org.xmlpull.v1.builder.XmlAttribute;
import org.xmlpull.v1.builder.XmlDocument;
import org.xmlpull.v1.builder.XmlElement;
import org.xmlpull.v1.builder.XmlInfosetBuilder;
import org.xmlpull.v1.builder.xpath.Xb1XPath;


/**
 * Minimal reproduction: runs a basic XPath expression ("//w:p") against a
 * WordML document that is parsed either as a whole document or as a
 * fragment starting at the first element named "body".
 *
 * The mode is selected by the local flag <code>wholeDoc</code> in the
 * constructor. The element tree that was built is dumped to standard
 * output, followed by the number of nodes the XPath expression selected.
 *
 * @author andyh
 *
 */
public class TestWordML
{
	/** Input file used when no path is given on the command line. */
	private static final String DEFAULT_INPUT = "C:\\basics.xml";

	/** Namespace URI bound to the "w" prefix in the XPath expression. */
	private static final String WORDML_NS =
		"http://schemas.microsoft.com/office/word/2003/wordml";

	private Reader input;

	/**
	 * Runs the test.
	 *
	 * @param args optional; args[0] is the path of the WordML file to load,
	 *             defaulting to {@link #DEFAULT_INPUT} when absent
	 */
	public static void main(String[] args)
	{
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
		try
		{
			new TestWordML(new File(path));
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Parses the given file, dumps the resulting element tree and prints the
	 * number of nodes selected by "//w:p".
	 *
	 * @param reportFile the WordML file to load; if it does not exist or is
	 *                   not a regular file a message is written to standard
	 *                   error and nothing else happens
	 * @throws IOException if the file cannot be read
	 * @throws XmlPullParserException if the XML cannot be parsed, or (in
	 *                   fragment mode) no "body" element is found
	 */
	public TestWordML(File reportFile)
	throws IOException, XmlPullParserException
	{
		if (!reportFile.exists() || !reportFile.isFile())
		{
			System.err.println("Input file not found: " + reportFile);
			return;
		}

		XmlInfosetBuilder builder = XmlInfosetBuilder.newInstance();
		XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
		factory.setNamespaceAware(true);

		// NOTE: FileReader decodes using the platform default charset, not
		// the encoding named in the XML declaration.
		input = new BufferedReader(new FileReader(reportFile));
		try
		{
			XmlPullParser xpp = factory.newPullParser();
			xpp.setInput(input);

			/*two ways of parsing:
			 * 1. entire document - wholeDoc = true (works)
			 * 2. just a fragment - wholeDoc = false (fails)
			 */
			boolean wholeDoc = true;
			XmlElement e;

			if (wholeDoc)
			{
				//THIS WORKS!
				XmlDocument doc = builder.parse(xpp);
				e = doc.getDocumentElement();
			}
			else
			{
				//THIS DOES NOT!
				// Advance to the first start tag whose local name is "body".
				// The event type is tested before the name because getName()
				// returns null on anything that is not a start/end tag.
				int pos = toStartTag(xpp);
				while (pos != XmlPullParser.END_DOCUMENT
						&& !"body".equals(xpp.getName()))
				{
					pos = toStartTag(xpp);
				}
				if (pos == XmlPullParser.END_DOCUMENT)
				{
					throw new XmlPullParserException(
						"No 'body' element found in " + reportFile);
				}
				e = builder.parseFragment(xpp);
			}

			recurseAndDump(e,"");

			//try some simple xpath
			Xb1XPath xp = new Xb1XPath("//w:p");
			xp.addNamespace("w",WORDML_NS);
			//xp.addNamespace("v","urn:schemas-microsoft-com:vml");
			//xp.addNamespace("w10","urn:schemas-microsoft-com:office:word");
			//xp.addNamespace("sl","http://schemas.microsoft.com/schemaLibrary/2003/core");
			//xp.addNamespace("aml","http://schemas.microsoft.com/aml/2001/core");
			//xp.addNamespace("wx","http://schemas.microsoft.com/office/word/2003/auxHint");
			//xp.addNamespace("o","urn:schemas-microsoft-com:office:office");

			//DO IT
			List nodes = xp.selectNodes(e);
			System.out.println("\n***************\nNodes recovered: " + nodes.size());
		}
		finally
		{
			// Release the file even when parsing or XPath evaluation throws.
			input.close();
		}
	}


	/**
	 * Advances the parser until it is positioned on the next start tag or
	 * at the end of the document, whichever comes first.
	 *
	 * @param xpp the parser to advance
	 * @return the event type the parser stopped on: either
	 *         XmlPullParser.START_TAG or XmlPullParser.END_DOCUMENT
	 */
	private int toStartTag(XmlPullParser xpp)
	throws XmlPullParserException, IOException
	{
		int eventType = xpp.next();
		while(eventType != XmlPullParser.START_TAG &&
				eventType != XmlPullParser.END_DOCUMENT)
		{
			eventType = xpp.next();
		}
		return eventType;
	}


	/**
	 * Dump an element tree for a quick viewing.
	 *
	 * Prints one line per element (slash-separated path of element names
	 * followed by the element's namespace), one indented line per attribute
	 * (name and namespace), then recurses into child elements. Non-element
	 * children are skipped.
	 *
	 * @param e   the element to dump
	 * @param loc the path of the parent element; "" for the root
	 */
	private void recurseAndDump(XmlElement e, String loc)
	{
		loc += "/" + e.getName();
		System.out.println(loc + " >> " + e.getNamespace());

		Iterator attribs = e.attributes();
		while(attribs.hasNext())
		{
			XmlAttribute a = (XmlAttribute) attribs.next();
			System.out.println("\t +++ " + a.getName() + " -> " + a.getNamespace());
		}

		Iterator kids = e.children();
		while(kids.hasNext())
		{
			Object child = kids.next();
			if (child instanceof XmlElement)
			{
				recurseAndDump((XmlElement) child,loc);
			}
		}
	}
}



--------------070505080500030808010600
Content-Type: text/xml;
 name="basics.xml"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="basics.xml"

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" w:macrosPresent="no" w:embeddedObjPresent="no" w:ocxPresent="no" xml:space="preserve"><o:DocumentProperties><o:Title>Hello this is a test</o:Title><o:Author>Andy Harrison</o:Author><o:LastAuthor>Andy Harrison</o:LastAuthor><o:Revision>1</o:Revision><o:TotalTime>1</o:TotalTime><o:Created>2007-02-19T10:34:00Z</o:Created><o:LastSaved>2007-02-19T10:35:00Z</o:LastSaved><o:Pages>1</o:Pages><o:Words>8</o:Words><o:Characters>52</o:Characters><o:Company> </o:Company><o:Lines>1</o:Lines><o:Paragraphs>1</o:Paragraphs><o:CharactersWithSpaces>59</o:CharactersWithSpaces><o:Version>11.5604</o:Version></o:DocumentProperties><w:fonts><w:defaultFonts w:ascii="Times New Roman" w:fareast="Times New Roman" w:h-ansi="Times New Roman" w:cs="Times New Roman"/></w:fonts><w:styles><w:versionOfBuiltInStylenames w:val="4"/><w:latentStyles w:defLockedState="off" w:latentStyleCount="156"/><w:style w:type="paragraph" w:default="on" w:styleId="Normal"><w:name w:val="Normal"/><w:rPr><wx:font wx:val="Times New Roman"/><w:sz w:val="24"/><w:sz-cs w:val="24"/><w:lang w:val="EN-GB" w:fareast="EN-GB" w:bidi="AR-SA"/></w:rPr></w:style><w:style w:type="character" w:default="on" w:styleId="DefaultParagraphFont"><w:name w:val="Default Paragraph Font"/><w:semiHidden/></w:style><w:style w:type="table" w:default="on" w:styleId="TableNormal"><w:name w:val="Normal Table"/><wx:uiName wx:val="Table Normal"/><w:semiHidden/><w:rPr><wx:font wx:val="Times New Roman"/></w:rPr><w:tblPr><w:tblInd w:w="0" w:type="dxa"/><w:tblCellMar><w:top w:w="0" w:type="dxa"/><w:left w:w="108" w:type="dxa"/><w:bottom w:w="0" w:type="dxa"/><w:right w:w="108" w:type="dxa"/></w:tblCellMar></w:tblPr></w:style><w:style w:type="list" w:default="on" w:styleId="NoList"><w:name w:val="No List"/><w:semiHidden/></w:style></w:styles><w:docPr><w:view w:val="print"/><w:zoom w:percent="100"/><w:doNotEmbedSystemFonts/><w:proofState w:spelling="clean" w:grammar="clean"/><w:attachedTemplate w:val=""/><w:defaultTabStop w:val="720"/><w:punctuationKerning/><w:characterSpacingControl w:val="DontCompress"/><w:optimizeForBrowser/><w:validateAgainstSchema/><w:saveInvalidXML w:val="off"/><w:ignoreMixedContent w:val="off"/><w:alwaysShowPlaceholderText w:val="off"/><w:compat><w:breakWrappedTables/><w:snapToGridInCell/><w:wrapTextWithPunct/><w:useAsianBreakRules/><w:dontGrowAutofit/></w:compat></w:docPr><w:body><wx:sect><w:p><w:r><w:t>Hello this is a test.</w:t></w:r></w:p><w:p/><w:proofErr w:type="gramStart"/><w:p><w:r><w:t>Of namespaces and stuff.</w:t></w:r><w:proofErr w:type="gramEnd"/></w:p><w:p/><w:proofErr w:type="gramStart"/><w:p><w:r><w:t>So there.</w:t></w:r><w:proofErr w:type="gramEnd"/></w:p><w:p/><w:sectPr><w:pgSz w:w="11906" w:h="16838"/><w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="708" w:footer="708" w:gutter="0"/><w:cols w:space="708"/><w:docGrid w:line-pitch="360"/></w:sectPr></wx:sect></w:body></w:wordDocument>
--------------070505080500030808010600--