changeset 1:0a487b4c948b

~ refactoring + easy xpath requests
author Alexandre Bertails <bertails@w3.org>
date Mon, 08 Feb 2010 19:07:39 -0500
parents 7b71d5d13193
children 3990c89ea03b
files src/main/scala/Main.scala src/test/scala/Test.scala
diffstat 2 files changed, 85 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/src/main/scala/Main.scala	Fri Feb 05 10:17:43 2010 -0500
+++ b/src/main/scala/Main.scala	Mon Feb 08 19:07:39 2010 -0500
@@ -5,7 +5,19 @@
 import net.sf.saxon.s9api._
 import javax.xml.transform.stream.StreamSource
 
-object Tidy {
+object Document {
+
+  /**
+   * utility method to convert an InputStream into a String by reading from this stream
+   */
+  def convertStreamToString(is:InputStream):String = scala.io.Source.fromInputStream(is).getLines.reduceLeft(_ + _)
+
+  /**
+   * utility method that invokes the jtidy library
+   * it generates XHTML output encoded as UTF-8
+   * encoding is the expected encoding of the input stream (in)
+   * see http://jtidy.sourceforge.net/howto.html
+   */
   def tidy(in:InputStream, out:OutputStream, encoding:String):Unit = {
     val tidy = new org.w3c.tidy.Tidy()
     tidy.setXHTML(true)
@@ -13,39 +25,80 @@
     tidy.setOutputEncoding("UTF-8");
     tidy.parse(in, out)
   }
+
+  /**
+   * utility method to extract a parsed XML from a URL
+   * jtidy is used to clean the document
+   */
+  def convertUrlToXdmNode(url:String):XdmNode = {
+
+    // we store the full bodies in bytearrays to be able to read from them
+    val body = new ByteArrayOutputStream() 
+    val tidyBody = new ByteArrayOutputStream()
+
+    // request the url using dispatch
+    // there is one handler to store the body
+    // there is another handler to get the Content-Type header
+    // see http://dispatch.databinder.net/Stdout_Walkthrough
+    val http = new Http
+    val (_, contentType) =
+      http(url >+ {
+	r => (r >>> body,
+	      r >:> { _("Content-Type") } )} )
+
+    // extract the charset from the Content-Type and choose UTF-8 as a default
+    val charset =
+      try {
+	val uniqContentType = contentType.toList(0)
+	"""charset=(.*)$""".r.findFirstMatchIn(uniqContentType).get.group(1)
+      } catch {
+	case _ => "UTF-8"
+      }
+
+    // call the jtidy library to generate a valid xhtml document
+    // the charset corresponds to the one from the http response
+    tidy(new ByteArrayInputStream(body.toByteArray), tidyBody, charset)
+
+    // val f = new FileWriter("/tmp/out.xml")
+    // f.write(tidyBody.toString)
+    // f.close()
+
+    // parse the XML document using Saxon
+    // see http://www.saxonica.com/documentation/xpath-api/s9api-xpath.html
+    val proc:Processor = new Processor(false)
+    val builder:DocumentBuilder = proc.newDocumentBuilder()
+    builder.setLineNumbering(true)
+    builder.setWhitespaceStrippingPolicy(WhitespaceStrippingPolicy.ALL)
+    val doc:XdmNode = builder.build(new StreamSource(new ByteArrayInputStream(tidyBody.toByteArray)))
+
+    // "closing a ByteArrayOutputStream has no effect", so we don't do it
+
+    // the value of this method is the document itself
+    doc
+  }
+
+  /**
+   * evaluates an xpath expression against a document
+   * xhtml namespace is assumed
+   */
+  def evaluateSingle(doc:XdmNode, xpath:String):XdmItem = {
+
+    val proc:Processor = doc.getProcessor
+    val xpathCompiler:XPathCompiler = proc.newXPathCompiler();
+    xpathCompiler.declareNamespace("xhtml", "http://www.w3.org/1999/xhtml")
+    val selector:XPathSelector = xpathCompiler.compile(xpath).load()
+    selector.setContextItem(doc)
+    selector.evaluateSingle()
+
+  }
+
 }
 
-case class Document() {
-
-  def convertStreamToString(is: InputStream) : String = scala.io.Source.fromInputStream(is).getLines.reduceLeft(_ + _)
-
-  val body = new ByteArrayOutputStream() 
-  val tidyBody = new ByteArrayOutputStream() 
-
-  val url = "http://www.w3.org/TR/2010/WD-xmldsig-properties-20100204/"
-
-  val http = new Http
-  val (_, contentType) =
-    http(url >+ {
-      r => (r >>> body,
-	    r >:> { _("Content-Type") } )} )
+case class Document(url:String) {
 
-  val charset =
-    try {
-      val uniqContentType = contentType.toList(0)
-      """charset=(.*)$""".r.findFirstMatchIn(uniqContentType).get.group(1)
-    } catch {
-      case _ => "UTF-8"
-    }
+  val doc:XdmNode = Document.convertUrlToXdmNode(url)
 
-  Tidy.tidy(new ByteArrayInputStream(body.toByteArray), tidyBody, charset)
-
-  val proc:Processor = new Processor(false)
-  val builder:DocumentBuilder = proc.newDocumentBuilder()
-  builder.setLineNumbering(true)
-  builder.setWhitespaceStrippingPolicy(WhitespaceStrippingPolicy.ALL)
-  builder.setDTDValidation(false)
-  val booksDoc:XdmNode = builder.build(new StreamSource(new ByteArrayInputStream(tidyBody.toByteArray)))
+  def evaluateSingle(xpath:String):XdmItem = Document.evaluateSingle(doc, xpath)
 
 
 }
--- a/src/test/scala/Test.scala	Fri Feb 05 10:17:43 2010 -0500
+++ b/src/test/scala/Test.scala	Mon Feb 08 19:07:39 2010 -0500
@@ -5,9 +5,9 @@
 
   test("") {
 
-    new Document()
+    val document = new Document("http://www.w3.org/TR/2010/WD-xmldsig-properties-20100204/")
 
-
+    println(document.evaluateSingle("//xhtml:h2"))
 
     assert(1 === 1)
   }
Set up and maintained by W3C Systems Team, please report bugs to sysreq@w3.org.

W3C would like to thank Microsoft who donated the server that allows us to run this service.