+ RDFXML parser
author Alexandre Bertails <alexandre@bertails.org>
Sat, 27 Nov 2010 17:58:38 -0500
changeset 283 98b3d35e6175
parent 282 19086fe774d2
child 284 f3a0b800e5f9
+ RDFXML parser
project/build/RDB2RDF.scala
rdfxml/src/main/scala/RDFXML.scala
rdfxml/src/test/scala/RDFXMLTest.scala
--- a/project/build/RDB2RDF.scala	Fri Nov 26 19:43:36 2010 -0500
+++ b/project/build/RDB2RDF.scala	Sat Nov 27 17:58:38 2010 -0500
@@ -16,6 +16,7 @@
   lazy val sql = project("sql", "sql", new SQL(_), rdb)
   lazy val rdf = project("rdf", "rdf", new RDF(_))
   lazy val turtle = project("turtle", "turtle", new Turtle(_), rdf)
+  lazy val rdfxml = project("rdfxml", "rdfxml", new RDFXML(_), rdf) // RDF/XML parser sub-project; wired like turtle, presumably depending on rdf
   lazy val sharedtestdata = project("sharedtestdata", "sharedtestdata", new SharedTestData(_), rdb, rdf, sql, turtle)
   lazy val directmapping = project("directmapping", "directmapping", new DirectMapping(_), rdb, rdf, sql, sharedtestdata)
   lazy val sparql = project("sparql", "sparql", new SPARQL(_), rdf)
@@ -33,6 +34,11 @@
 
   class Turtle(info: ProjectInfo) extends DefaultProject(info) with Common
 
+  class RDFXML(info: ProjectInfo) extends DefaultProject(info) with Common { // build definition of the rdfxml sub-project
+    val jena = "com.hp.hpl.jena" % "jena" % "2.6.3" // presumably the source of the com.hp.hpl.jena.rdf.arp classes imported by RDFXML.scala
+    val jenaIri = "com.hp.hpl.jena" % "iri" % "0.8" from "http://openjena.org/repo/com/hp/hpl/jena/iri/0.8/iri-0.8.jar" // NOTE(review): jar pinned to this explicit URL, confirm it is still reachable
+  }
+
   class SharedTestData(info: ProjectInfo) extends DefaultProject(info) with Common
 
   class DirectMapping(info: ProjectInfo) extends DefaultProject(info) with Common
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rdfxml/src/main/scala/RDFXML.scala	Sat Nov 27 17:58:38 2010 -0500
@@ -0,0 +1,103 @@
+package org.w3.sw.rdfxml
+
+import org.w3.sw.rdf._
+
+import com.hp.hpl.jena.rdf.arp._
+import org.xml.sax._
+import org.xml.sax.helpers._
+import java.io._
+import java.net.{MalformedURLException, URL, URLConnection}
+import java.util.{StringTokenizer, Enumeration, Hashtable}
+import java.net.URI
+
+/** Conversions from Jena ARP's parser types to the org.w3.sw.rdf model. */
+object RDFXML {
+
+  /**
+   * Maps an ARP resource to a blank node when anonymous, to an IRI node otherwise.
+   * http://jena.sourceforge.net/javadoc/com/hp/hpl/jena/rdf/arp/AResource.html
+   * note: see setUserData and getUserData for when BNode will be abstract
+   */
+  def toNode(a:AResource):Node =
+    if (a.isAnonymous) NodeBNode(BNode(a.getAnonymousID))
+    else NodeIRI(IRI(a.getURI))
+
+  /** Wraps the resource's URI as a predicate; no anonymity check is made here. */
+  def toPredicate(a:AResource):Predicate = PredicateIRI(IRI(a.getURI))
+
+  /**
+   * Builds a typed literal when ARP reports a datatype URI, a plain literal otherwise.
+   * A null or empty language is mapped to None (ARP is a Java API, hence the null guard).
+   */
+  def toLiteral(l:ALiteral):Literal =
+    Option(l.getDatatypeURI) match {
+      case Some(datatype) => TypedLiteral(l.toString, IRI(datatype))
+      case None =>
+        val lang = Option(l.getLang).filter(_.length > 0).map(LangTag(_))
+        PlainLiteral(l.toString, lang)
+    }
+
+}
+
+/** RDF/XML parser producing a Graph, backed by Jena's standalone ARP in lax error mode. */
+class RDFXML() {
+
+  import RDFXML._
+
+  /** Parses the content of `file`; the stream opened here is always closed. */
+  def toGraph(file:File):Graph = {
+    val in = new FileInputStream(file)
+    try toGraph(in) finally in.close()
+  }
+
+  def toGraph(rdfxml:String):Graph = toGraph(new StringReader(rdfxml))
+
+  /** Gives ARP the raw bytes, so decoding is left to the XML parser, not the platform charset. */
+  def toGraph(in:InputStream):Graph = parse(_.load(in))
+
+  /** Parses already-decoded characters; neither this nor the stream overload closes `in`. */
+  def toGraph(in:Reader):Graph = parse(_.load(in))
+
+  /** Runs `load` on a freshly configured ARP and returns the triples it emitted. */
+  private def parse(load:ARP => Unit):Graph = {
+    // the accumulator for the triples
+    val triples = scala.collection.mutable.Set[Triple]()
+
+    // the accumulators for the problems we encounter
+    var fatalErrors = List[SAXParseException]()
+    var errors = List[SAXParseException]()
+    var warnings = List[SAXParseException]()
+
+    // this ErrorHandler keeps track of all the problems during the parsing
+    val errorHandler = new ErrorHandler {
+      def fatalError(e:SAXParseException):Unit = fatalErrors ::= e
+      def error(e:SAXParseException):Unit = errors ::= e
+      def warning(e:SAXParseException):Unit = warnings ::= e
+    }
+
+    // this StatementHandler reads the parsed triples
+    val statementHandler = new StatementHandler {
+      def statement(s:AResource, p:AResource, o:ALiteral):Unit =
+        triples += Triple(SubjectNode(toNode(s)), toPredicate(p), ObjectLiteral(toLiteral(o)))
+      def statement(s:AResource, p:AResource, o:AResource):Unit =
+        triples += Triple(SubjectNode(toNode(s)), toPredicate(p), ObjectNode(toNode(o)))
+    }
+
+    // http://jena.sourceforge.net/ARP/standalone.html
+    val arp = new ARP
+    arp.getOptions.setLaxErrorMode()
+    arp.getHandlers.setErrorHandler(errorHandler)
+    arp.getHandlers.setStatementHandler(statementHandler)
+    load(arp)
+
+    // problems are only reported on stdout for now, parsing is best-effort
+    val problems = fatalErrors ++ errors ++ warnings
+    if (! problems.isEmpty) println(problems)
+
+    // returns an immutable set
+    triples.toSet
+  }
+
+}
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rdfxml/src/test/scala/RDFXMLTest.scala	Sat Nov 27 17:58:38 2010 -0500
@@ -0,0 +1,45 @@
+package org.w3.sw.rdfxml
+
+import org.w3.sw.rdf._
+
+import org.scalatest.FunSuite
+
+/** Parses the sample documents named after two online RDF validators and checks the triple count. */
+class RDFXMLTest extends FunSuite {
+
+  val parser = new RDFXML
+
+  test("default on http://www.rdfabout.com/demo/validator/") {
+
+    val rdfxml = """
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <rdf:Description rdf:about="http://www.rdfabout.com/">
+    <dc:title>rdf:about: About Resource Description Framework</dc:title>
+  </rdf:Description>
+</rdf:RDF>
+""" // "
+
+    val graph:Graph = parser.toGraph(rdfxml)
+
+    // one rdf:Description carrying one property element: exactly one triple
+    assert(graph.size === 1)
+  }
+
+  test("default on http://www.w3.org/RDF/Validator/") {
+
+    val rdfxml = """<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <rdf:Description rdf:about="http://www.w3.org/">
+    <dc:title>World Wide Web Consortium</dc:title> 
+  </rdf:Description>
+</rdf:RDF>
+""" // "
+
+    val graph:Graph = parser.toGraph(rdfxml)
+
+    // one rdf:Description carrying one property element: exactly one triple
+    assert(graph.size === 1)
+  }
+}