changeset 59:81832a781fd8

~ prepare the validator.nu experiment
author Alexandre Bertails <bertails@w3.org>
date Mon, 03 Oct 2011 11:01:15 -0400
parents 6568ebb8f3e1
children fa06de8a96bc
files project/Build.scala src/main/scala/Main.scala
diffstat 2 files changed, 11 insertions(+), 44 deletions(-) [+]
line wrap: on
line diff
--- a/project/Build.scala	Fri Sep 30 13:57:48 2011 +0200
+++ b/project/Build.scala	Mon Oct 03 11:01:15 2011 -0400
@@ -32,6 +32,7 @@
   val unfiltered_jetty = "net.databinder" %% "unfiltered-jetty" % "0.4.1"
   val avsl = "org.clapper" %% "avsl" % "0.3.1"
   val jTidy = "jtidy" % "jtidy" % "r938" from fromZip("http://sourceforge.net/projects/jtidy/files/JTidy/r938/jtidy-r938.zip", "jtidy-r938.jar")
+  val html5parser = "nu.validator.htmlparser" % "htmlparser" % "1.2.1" intransitive()
 }
 
 object BuildSettings {
@@ -66,6 +67,7 @@
       libraryDependencies += unfiltered_filter,
       libraryDependencies += unfiltered_jetty,
       libraryDependencies += jTidy,
+      libraryDependencies += html5parser,
       test in Assembly := {},
       jarName in Assembly := "pubrules-checker.jar"
     )
--- a/src/main/scala/Main.scala	Fri Sep 30 13:57:48 2011 +0200
+++ b/src/main/scala/Main.scala	Mon Oct 03 11:01:15 2011 -0400
@@ -70,7 +70,7 @@
 """
 You must provide a port and a path for the URLResolver
 usage: 
-java -jar target/pubrules-checker.jar 8080 src/main/resources
+  java -jar target/pubrules-checker.jar 8080 src/main/resources
 """)
         System.exit(1)
         null
@@ -129,7 +129,7 @@
 
         val paramsW = defaultParams ++ paramsNoURI + ( "doc_uri" -> spec )
 
-        pubrulesWrapper using paramsW applyOn (Source.fromTidy(spec)) getResponder()
+        pubrulesWrapper using paramsW applyOn Source.asHTML(spec) getResponder()
 
       }
     }
@@ -220,53 +220,18 @@
 
   def apply(url:String):Source = Source(new URL(url))
 
-  /** http://jtidy.sourceforge.net/apidocs/org/w3c/tidy/Tidy.html
-    * http://tidy.sourceforge.net/docs/quickref.html
-    * we may have to use a local tidy if jtidy is not enough:
-    *   https://lists.w3.org/Archives/Team/w3t-sys/2011SepOct/0162.html
-    */
-  def jtidy(in:InputStream, charset:String):InputStream = {
-    val tidy = new Tidy
-    // tidy_options = ["-n", "-asxml", "-q", "--force-output","yes", "--show-warnings", "no"]
-    tidy.setXHTML(true)
-    tidy.setInputEncoding(charset.toUpperCase)
-    tidy.setOutputEncoding("UFT-8")
-    tidy.setShowWarnings(false)
-    tidy.setShowErrors(0)
-    tidy.setQuiet(true)
-    val out = new ByteArrayOutputStream
-    tidy.parse(in, out)
-    new ByteArrayInputStream(out.toByteArray)
-//     println("\n\n\n\n\n\n===========================\n\n\n\n\n\n")
-//     tidy.parse(in, new FileOutputStream("/tmp/out.xml"))
-//     new FileInputStream("/tmp/out.xml")
-  }
+  import java.io._
+  import org.xml.sax._
+  import org.xml.sax.helpers.DefaultHandler
 
-
-  // def tidytest(url:String, charin:String, charout:String):Unit = {
-  //   val tidy = new Tidy
-  //   val in = new URL(url).openStream()
-  //   tidy.setXHTML(true)
-  //   tidy.setInputEncoding(charin)
-  //   tidy.setOutputEncoding(charout)
-  //   //tidy.setNumEntities(true)
-  //   tidy.parse(in, new FileOutputStream("/tmp/out.xml"))
-  // }
-
-  def fromTidy(url:URL):Source = {
-    val tidyURL = new URL("http://services.w3.org/tidy/tidy?docAddr=" + url.toString)
-    val streamSource = new StreamSource(tidyURL.openStream())
+  def asHTML(url:URL):Source = {
+    /* look into https://dvcs.w3.org/hg/validator-service/file/18921b197338/src/main/scala/HtmlParser.scala */
+    val streamSource = new StreamSource(url.openStream())
     streamSource.setSystemId(url.toString)
     Source(streamSource)
   }
 
-  // def fromTidyOld(url:URL):Source = {
-  //   val is:InputStream = Http(new Request(url.toString) >> (tidy _))
-  //   val streamSource = new StreamSource(is)
-  //   streamSource.setSystemId(url.toString)
-  //   Source(streamSource)
-  // }
+  def asHTML(url:String):Source = asHTML(new URL(url))
 
-  def fromTidy(url:String):Source = Source.fromTidy(new URL(url))
 
 }
Set up and maintained by the W3C Systems Team. Please report bugs to sysreq@w3.org.

W3C would like to thank Microsoft, which donated the server that allows us to run this service.