changeset 7:d8de87e77642

~ refactoring + editors extraction + tests
author Alexandre Bertails <bertails@w3.org>
date Fri, 26 Feb 2010 16:18:00 -0500
parents bff8c4d5d857
children f92c5ac20702
files project/build/Project.scala src/main/scala/Main.scala src/main/scala/Model.scala src/main/scala/XML.scala src/test/scala/DocumentTest.scala src/test/scala/ModelTest.scala src/test/scala/Test.scala
diffstat 7 files changed, 108 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/project/build/Project.scala	Thu Feb 25 19:53:50 2010 -0500
+++ b/project/build/Project.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -2,15 +2,6 @@
 
 class Project(info: ProjectInfo) extends DefaultProject(info) {
 
-  // http://www.w3.org/blog/systeam/2008/02/08/w3c_s_excessive_dtd_traffic
-  // http://sourceforge.net/apps/mediawiki/saxon/index.php?title=XML_Catalogs
-  // http://www.sagehill.net/docbookxsl/UseCatalog.html
-  // http://xml.apache.org/commons/components/resolver/resolver-article.html
-  // override def unmanagedClasspath = super.unmanagedClasspath +++ "dtds"
-  // System.setProperty("xml.catalog.files", "dtds/catalog.xml")
-  // System.setProperty("xml.catalog.verbosity", "1")
-  // -r org.apache.xml.resolver.tools.CatalogResolver -x org.apache.xml.resolver.tools.ResolvingXMLReader -y org.apache.xml.resolver.tools.ResolvingXMLReader
-
   val scalatools = "scala-tools" at "http://scala-tools.org/repo-snapshots"
   val smackRepo = "m2-repository-smack" at "http://maven.reucon.com/public"
   val databinder_net = "databinder.net repository" at "http://databinder.net/repo"
@@ -21,6 +12,5 @@
 
   val jtidy = "jtidy" % "jtidy" % "r938" from "http://downloads.sourceforge.net/project/jtidy/JTidy/r938/jtidy-r938.jar"
   val saxon = "saxon" % "saxon" % "9.2.0.5j" from "http://www.bertails.org/jar/saxonhe9-2-0-5j.jar"
-  // val xmlresolver = "xml-resolver" % "xml-resolver" % "1.2"
 
 }
--- a/src/main/scala/Main.scala	Thu Feb 25 19:53:50 2010 -0500
+++ b/src/main/scala/Main.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -3,41 +3,44 @@
 import org.w3c.util._
 import scala.xml.Elem
 
-case class Email(address:String)
-
-case class Editor(
-  firstname:String,
-  lastname:String,
-  email:Email,
-  middleName:Option[String] = None,
-  firstNameInitial:Option[String] = None,
-  middleNameInitial:Option[String] = None
-)
-
-
+/**
+ * represent a report
+ */
 case class Document(url:String) {
 
+  /**
+   * the inner parsed XML document from the url
+   */
   val doc:xml.Node = xml.Node(url)
 
-  val divHead = "//html:div[@class='head']"
+  final val divHead = "//html:div[@class='head']"
 
   def edregexp(v:String) = "'^" + v + """(s?)(\s+\([^)]+\)\s*)?(:?)$'"""
 
+  /**
+   * extract the editor section name
+   * basically, it's either "Editor" or "Author"
+   */
   def editorSectionName = {
-    lazy val editor = divHead + "//html:dl/html:dt[matches(normalize-space()," + edregexp("Editor") + ")]/text()"
+    val editor = divHead + "//html:dl/html:dt[matches(normalize-space()," + edregexp("Editor") + ")]/text()"
     lazy val author = divHead + "//html:dl/html:dt[matches(normalize-space()," + edregexp("Author") + ")]/text()"
     val esn = doc.evaluateSingle(editor) orElse doc.evaluateSingle(author)
     esn.get.toString
   }
 
-  def editorsList = {
+  /**
+   * get the list of editors
+   * adapted from http://www.w3.org/2001/10/trdoc-data.xsl template trd:getEditorsList
+   */
+  def editors:Seq[Editor] = {
+    // extract the right node
     val sectXPath = divHead + "//html:dl/html:dt[matches(normalize-space()," + edregexp(editorSectionName) + ")]"
     val sect = doc.evaluateSingle(sectXPath).get
+    // the rights dds are right under the previous section
     val ddsXPath = "$trd:sect/following-sibling::html:dd[preceding-sibling::html:dt[1] is $trd:sect]"
     val dds = sect.evaluate(ddsXPath, "sect" -> sect)
-    for (dd <- dds) {
-      println(dd.text)
-    }
+    // parse and inject the found editors into Editor
+    dds map (dd => Editor.parse(dd.text)) toSeq
   }
   
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/scala/Model.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -0,0 +1,37 @@
+package org.w3c.prc
+
+import scala.xml.Elem
+
+case class Email(address:String)
+
+case class Editor(
+  firstName:String,
+  lastName:String,
+  email:Option[Email] = None,
+  middleName:Option[String] = None,
+  firstNameInitial:Option[String] = None,
+  middleNameInitial:Option[String] = None
+) {
+  
+  override def toString = firstName + " " + lastName
+
+}
+
+object Editor {
+
+  final val nameChunkRegex = """(\(|,|-|&lt;)"""
+
+  /**
+   * parse an editor section
+   * this is a basic implementation
+   * TODO speak with Ian to improve it
+   */  
+  def parse(s:String):Editor = {
+    val tokens = s split nameChunkRegex
+    val editorPart = tokens(0).split(" ").filter(! _.isEmpty)
+    val firstName = editorPart(0)
+    val lastName = editorPart(1)
+    Editor(firstName=firstName, lastName=lastName)
+  }
+
+}
--- a/src/main/scala/XML.scala	Thu Feb 25 19:53:50 2010 -0500
+++ b/src/main/scala/XML.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -13,10 +13,11 @@
 
   final val DEFAULTCHARSET = "UTF-8"
 
-  // request the url using dispatch
-  // there is one handler to store the body
-  // there is another handler to get the Content-Type header
-  // see http://dispatch.databinder.net/Stdout_Walkthrough
+  /** request the url using dispatch
+   * there is one handler to store the body
+   * there is another handler to get the Content-Type header
+   * see http://dispatch.databinder.net/Stdout_Walkthrough
+   */
   def getBodyAndCharset(url:String):(InputStream, String) = {
 
     if (url startsWith "file://") {
@@ -69,9 +70,6 @@
     // parse the XML document using Saxon
     // see http://www.saxonica.com/documentation/xpath-api/s9api-xpath.html
     val proc:Processor = new Processor(false)
-    // proc.setConfigurationProperty("r", "org.apache.xml.resolver.tools.CatalogResolver")
-    // proc.setConfigurationProperty("x", "org.apache.xml.resolver.tools.ResolvingXMLReader")
-    // proc.setConfigurationProperty("y", "org.apache.xml.resolver.tools.ResolvingXMLReader")
     val builder:DocumentBuilder = proc.newDocumentBuilder()
     builder.setLineNumbering(true)
     builder.setWhitespaceStrippingPolicy(WhitespaceStrippingPolicy.ALL)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/scala/DocumentTest.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -0,0 +1,31 @@
+import org.scalatest.FunSuite
+import org.w3c.prc._
+
+class DocumentTest extends FunSuite {
+
+  test("PR-xlink11-20100225") {
+
+    val document = Document("file://src/test/resources/PR-xlink11-20100225.html")
+
+    assert("Editors:" === document.editorSectionName)
+
+    val expectedEditors = Set(Editor("Steve", "DeRose"),
+			      Editor("Eve", "Maler"),
+			      Editor("David", "Orchard"),
+			      Editor("Norman", "Walsh"))
+    assert(expectedEditors === document.editors.toList.toSet)
+
+  }
+
+  test("WD-xmldsig-properties-20100204") {
+
+    val document = Document("file://src/test/resources/WD-xmldsig-properties-20100204.html")
+
+    assert("Editor:" === document.editorSectionName)
+
+    val expectedEditors = Set(Editor("Frederick", "Hirsch"))
+    assert(expectedEditors === document.editors.toList.toSet)
+
+  }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/scala/ModelTest.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -0,0 +1,14 @@
+import org.scalatest.FunSuite
+import org.w3c.prc._
+
+class ModelTest extends FunSuite {
+
+  test("parse editors") {
+
+    assert(Editor("Steve", "DeRose") === Editor.parse("Steve DeRose, Brown University Scholarly Technology Group"))
+
+    assert(Editor("Norman", "Walsh") === Editor.parse("Norman Walsh, Mark Logic Corporation - Version 1.1"))
+
+  }
+
+}
--- a/src/test/scala/Test.scala	Thu Feb 25 19:53:50 2010 -0500
+++ b/src/test/scala/Test.scala	Fri Feb 26 16:18:00 2010 -0500
@@ -5,13 +5,6 @@
 
   test("") {
 
-    // val document = Document("file://src/test/resources/WD-xmldsig-properties-20100204.html")
-
-    val document = Document("file://src/test/resources/PR-xlink11-20100225.html")
-
-    println(document.editorSectionName)
-    println(document.editorsList)
-
   }
 
 }
Set up and maintained by the W3C Systems Team. Please report bugs to sysreq@w3.org.

W3C would like to thank Microsoft, who donated the server that allows us to run this service.