[SoapRMI] xpath and fragments
Andy Harrison
andyh_at_agaricus.co.uk
Mon, 19 Feb 2007 11:33:24 +0000
This is a multi-part message in MIME format.
--------------070505080500030808010600
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit
Hi,
I've been finding different behaviour for XPath expressions depending on
whether I parse the entire document (builder.parse(xpp)) or
parse just a fragment (builder.parseFragment(xpp)).
Basically, I can't apply any XPath expressions to fragments! I have
attached a full example. I use the expression "//w:p", which should
return all paragraphs in a WordML document.
If anyone can explain this, I'd be most grateful. It seems I can only
get XPath to work against fragments if there are no namespace declarations.
thanks,
Andy Harrison
--------------070505080500030808010600
Content-Type: text/java;
name="TestWordML.java"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="TestWordML.java"
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;
import org.xmlpull.v1.builder.XmlAttribute;
import org.xmlpull.v1.builder.XmlDocument;
import org.xmlpull.v1.builder.XmlElement;
import org.xmlpull.v1.builder.XmlInfosetBuilder;
import org.xmlpull.v1.builder.xpath.Xb1XPath;
/**
 * Runs a basic XPath expression ("//w:p") against a WordML document that is
 * parsed either as a whole document or as a fragment starting at the
 * <code>body</code> element, dumping the element tree and the number of
 * nodes the expression selected.
 *
 * @author andyh
 */
public class TestWordML
{
    /** Reader over the file being parsed; always closed before the constructor returns. */
    private Reader input;

    /**
     * Runs the test against the hard-coded sample file and prints the stack
     * trace of any failure.
     */
    public static void main(String[] args)
    {
        try
        {
            new TestWordML(new File("C:\\basics.xml"));
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Parses the given file, dumps its element tree to standard output and
     * reports how many nodes the XPath expression "//w:p" selects.
     * Does nothing when the path does not exist or is not a regular file.
     *
     * @param reportFile the WordML file to parse
     * @throws IOException if the file cannot be read or closed
     * @throws XmlPullParserException if the XML cannot be parsed, or if the
     *         fragment mode is selected and no <code>body</code> start tag
     *         is found
     */
    public TestWordML(File reportFile)
        throws IOException, XmlPullParserException
    {
        if (!reportFile.exists() || !reportFile.isFile())
        {
            return;
        }

        XmlInfosetBuilder builder = XmlInfosetBuilder.newInstance();
        XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
        factory.setNamespaceAware(true);

        input = new BufferedReader(new FileReader(reportFile));
        try
        {
            XmlPullParser xpp = factory.newPullParser();
            xpp.setInput(input);

            /* two ways of parsing:
             * 1. entire document - wholeDoc = true  (works)
             * 2. just a fragment - wholeDoc = false (fails)
             */
            boolean wholeDoc = true;
            XmlElement e;
            if (wholeDoc)
            {
                //THIS WORKS!
                XmlDocument doc = builder.parse(xpp);
                e = doc.getDocumentElement();
            }
            else
            {
                //THIS DOES NOT!
                //Advance to the start tag of w:body. The END_DOCUMENT check comes
                //first and the name comparison is null-safe, because getName()
                //returns null for events that are not start/end tags.
                int pos = toStartTag(xpp);
                while (pos != XmlPullParser.END_DOCUMENT
                    && !"body".equals(xpp.getName()))
                {
                    pos = toStartTag(xpp);
                }
                if (pos == XmlPullParser.END_DOCUMENT)
                {
                    throw new XmlPullParserException(
                        "no <body> start tag found in " + reportFile);
                }
                e = builder.parseFragment(xpp);
            }
            recurseAndDump(e, "");

            //try some simple xpath
            //NOTE(review): in XPath 1.0 a leading "//" is an absolute path that
            //starts at the document root. An element built by parseFragment may
            //not be attached to a document, which could explain the empty result
            //in fragment mode; a relative ".//w:p" may be worth trying - unverified.
            Xb1XPath xp = new Xb1XPath("//w:p");
            xp.addNamespace("w","http://schemas.microsoft.com/office/word/2003/wordml");
            //xp.addNamespace("v","urn:schemas-microsoft-com:vml");
            //xp.addNamespace("w10","urn:schemas-microsoft-com:office:word");
            //xp.addNamespace("sl","http://schemas.microsoft.com/schemaLibrary/2003/core");
            //xp.addNamespace("aml","http://schemas.microsoft.com/aml/2001/core");
            //xp.addNamespace("wx","http://schemas.microsoft.com/office/word/2003/auxHint");
            //xp.addNamespace("o","urn:schemas-microsoft-com:office:office");

            //DO IT
            List nodes = xp.selectNodes(e);
            System.out.println("\n***************\nNodes recovered: " + nodes.size());
        }
        finally
        {
            //Release the file handle even when parsing or the XPath query fails.
            input.close();
        }
    }

    /**
     * Advances the parser until the next start tag or the end of the document.
     *
     * @param xpp the parser to advance
     * @return <code>XmlPullParser.START_TAG</code> or
     *         <code>XmlPullParser.END_DOCUMENT</code>, whichever came first
     */
    private int toStartTag(XmlPullParser xpp)
        throws XmlPullParserException, IOException
    {
        int eventType = xpp.next();
        while (eventType != XmlPullParser.START_TAG
            && eventType != XmlPullParser.END_DOCUMENT)
        {
            eventType = xpp.next();
        }
        return eventType;
    }

    /**
     * Dump an element tree for a quick viewing: prints the slash-separated
     * path and namespace of the element, then the name and namespace of each
     * of its attributes, then recurses into its child elements.
     *
     * @param e   the element to dump
     * @param loc the path of the parent element ("" for the top element)
     */
    private void recurseAndDump(XmlElement e, String loc)
    {
        loc += "/" + e.getName();
        System.out.println(loc + " >> " + e.getNamespace());

        Iterator attribs = e.attributes();
        while (attribs.hasNext())
        {
            XmlAttribute a = (XmlAttribute) attribs.next();
            System.out.println("\t +++ " + a.getName() + " -> " + a.getNamespace());
        }

        Iterator kids = e.children();
        while (kids.hasNext())
        {
            Object child = kids.next();
            //children() can also yield non-element content; only elements are dumped
            if (child instanceof XmlElement)
            {
                recurseAndDump((XmlElement) child, loc);
            }
        }
    }
}
--------------070505080500030808010600
Content-Type: text/xml;
name="basics.xml"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="basics.xml"
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" w:macrosPresent="no" w:embeddedObjPresent="no" w:ocxPresent="no" xml:space="preserve"><o:DocumentProperties><o:Title>Hello this is a test</o:Title><o:Author>Andy Harrison</o:Author><o:LastAuthor>Andy Harrison</o:LastAuthor><o:Revision>1</o:Revision><o:TotalTime>1</o:TotalTime><o:Created>2007-02-19T10:34:00Z</o:Created><o:LastSaved>2007-02-19T10:35:00Z</o:LastSaved><o:Pages>1</o:Pages><o:Words>8</o:Words><o:Characters>52</o:Characters><o:Company> </o:Company><o:Lines>1</o:Lines><o:Paragraphs>1</o:Paragraphs><o:CharactersWithSpaces>59</o:CharactersWithSpaces><o:Version>11.5604</o:Version></o:DocumentProperties><w:fonts><w:defaultFonts w:ascii="Times New Roman" w:fareast="Times New Roman" w:h-ansi="Times New Roman" w:cs="Times New Roman"/></w:fonts><w:styles><w:versionOfBuiltInStylenames w:val="4"/><w:latentStyles w:defLockedState="off" w:latentStyleCount="156"/><w:style w:type="paragraph" w:default="on" w:styleId="Normal"><w:name w:val="Normal"/><w:rPr><wx:font wx:val="Times New Roman"/><w:sz w:val="24"/><w:sz-cs w:val="24"/><w:lang w:val="EN-GB" w:fareast="EN-GB" w:bidi="AR-SA"/></w:rPr></w:style><w:style w:type="character" w:default="on" w:styleId="DefaultParagraphFont"><w:name w:val="Default Paragraph Font"/><w:semiHidden/></w:style><w:style w:type="table" w:default="on" w:styleId="TableNormal"><w:name w:val="Normal Table"/><wx:uiName wx:val="Table Normal"/><w:semiHidden/><w:rPr><wx:font wx:val="Times New Roman"/></w:rPr><w:tblPr><w:tblInd w:w="0" w:type="dxa"/><w:tblCellMar><w:top w:w="0" w:type="dxa"/><w:left w:w="108" w:type="dxa"/><w:bottom w:w="0" w:type="dxa"/><w:right w:w="108" w:type="dxa"/></w:tblCellMar></w:tblPr></w:style><w:style w:type="list" w:default="on" w:styleId="NoList"><w:name w:val="No List"/><w:semiHidden/></w:style></w:styles><w:docPr><w:view w:val="print"/><w:zoom w:percent="100"/><w:doNotEmbedSystemFonts/><w:proofState w:spelling="clean" w:grammar="clean"/><w:attachedTemplate w:val=""/><w:defaultTabStop w:val="720"/><w:punctuationKerning/><w:characterSpacingControl w:val="DontCompress"/><w:optimizeForBrowser/><w:validateAgainstSchema/><w:saveInvalidXML w:val="off"/><w:ignoreMixedContent w:val="off"/><w:alwaysShowPlaceholderText w:val="off"/><w:compat><w:breakWrappedTables/><w:snapToGridInCell/><w:wrapTextWithPunct/><w:useAsianBreakRules/><w:dontGrowAutofit/></w:compat></w:docPr><w:body><wx:sect><w:p><w:r><w:t>Hello this is a test.</w:t></w:r></w:p><w:p/><w:proofErr w:type="gramStart"/><w:p><w:r><w:t>Of namespaces and stuff.</w:t></w:r><w:proofErr w:type="gramEnd"/></w:p><w:p/><w:proofErr w:type="gramStart"/><w:p><w:r><w:t>So there.</w:t></w:r><w:proofErr w:type="gramEnd"/></w:p><w:p/><w:sectPr><w:pgSz w:w="11906" w:h="16838"/><w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="708" w:footer="708" w:gutter="0"/><w:cols w:space="708"/><w:docGrid w:line-pitch="360"/></w:sectPr></wx:sect></w:body></w:wordDocument>
--------------070505080500030808010600--