From d6a06fe09297f71ecba14f464194efb1dfbc3af7 Mon Sep 17 00:00:00 2001 From: Vitor Santos Costa Date: Fri, 13 Mar 2009 19:39:06 +0000 Subject: [PATCH] Experiment with porting SGML to YAP, and trying to preserve SWI code as much as possible. --- packages/sgml/COPYING | 6 + packages/sgml/ChangeLog | 716 +++ packages/sgml/DTD/HTML4.dcl | 88 + packages/sgml/DTD/HTML4.dtd | 1092 +++++ packages/sgml/DTD/HTML4.soc | 6 + packages/sgml/DTD/HTMLlat1.ent | 195 + packages/sgml/DTD/HTMLspec.ent | 77 + packages/sgml/DTD/HTMLsym.ent | 241 + packages/sgml/FILES | 20 + packages/sgml/INSTALL | 167 + packages/sgml/Makefile.in | 220 + packages/sgml/Makefile.mak | 79 + packages/sgml/RDF/.cvsignore | 9 + packages/sgml/RDF/.gitignore | 4 + packages/sgml/RDF/ChangeLog | 237 + packages/sgml/RDF/Makefile.in | 128 + packages/sgml/RDF/Makefile.mak | 51 + packages/sgml/RDF/README | 12 + packages/sgml/RDF/configure.in | 32 + packages/sgml/RDF/install-sh | 238 + packages/sgml/RDF/online.html | 177 + packages/sgml/RDF/online.pl | 457 ++ packages/sgml/RDF/pretty_print.pl | 167 + packages/sgml/RDF/rdf-parser | 35 + packages/sgml/RDF/rdf.html | 156 + packages/sgml/RDF/rdf.pl | 456 ++ packages/sgml/RDF/rdf2pl.doc | 475 ++ packages/sgml/RDF/rdf_diagram.pl | 492 +++ packages/sgml/RDF/rdf_ntriples.pl | 311 ++ packages/sgml/RDF/rdf_parser.pl | 724 +++ packages/sgml/RDF/rdf_test.pl | 347 ++ packages/sgml/RDF/rdf_triple.pl | 461 ++ packages/sgml/RDF/rdf_write.pl | 635 +++ packages/sgml/RDF/rdfs.rdfs | 237 + packages/sgml/RDF/rewrite.pl | 144 + packages/sgml/RDF/suite/ex_19.rdf | 12 + packages/sgml/RDF/suite/ok/ex_19.ok | 14 + packages/sgml/RDF/suite/ok/t1.ok | 1 + packages/sgml/RDF/suite/ok/t10.ok | 4 + packages/sgml/RDF/suite/ok/t11.ok | 1 + packages/sgml/RDF/suite/ok/t12.ok | 1 + packages/sgml/RDF/suite/ok/t13.ok | 1 + packages/sgml/RDF/suite/ok/t14.ok | 3 + packages/sgml/RDF/suite/ok/t15.ok | 3 + packages/sgml/RDF/suite/ok/t16.ok | 3 + packages/sgml/RDF/suite/ok/t17.ok | 3 + packages/sgml/RDF/suite/ok/t18.ok | 3 + packages/sgml/RDF/suite/ok/t19.ok | 4 + packages/sgml/RDF/suite/ok/t2.ok | 3 + packages/sgml/RDF/suite/ok/t20.ok | 4 + packages/sgml/RDF/suite/ok/t21.ok | 7 + packages/sgml/RDF/suite/ok/t22.ok | 5 + packages/sgml/RDF/suite/ok/t23.ok | 1 + packages/sgml/RDF/suite/ok/t24.ok | 5 + packages/sgml/RDF/suite/ok/t25.ok | 13 + packages/sgml/RDF/suite/ok/t26.ok | 8 + packages/sgml/RDF/suite/ok/t27.ok | 12 + packages/sgml/RDF/suite/ok/t28.ok | 3 + packages/sgml/RDF/suite/ok/t29.ok | 3 + packages/sgml/RDF/suite/ok/t3.ok | 3 + packages/sgml/RDF/suite/ok/t30.ok | 12 + packages/sgml/RDF/suite/ok/t31.ok | 17 + packages/sgml/RDF/suite/ok/t32.ok | 15 + packages/sgml/RDF/suite/ok/t33.ok | 2 + packages/sgml/RDF/suite/ok/t34.ok | 44 + packages/sgml/RDF/suite/ok/t35.ok | 8 + packages/sgml/RDF/suite/ok/t36.ok | 4 + packages/sgml/RDF/suite/ok/t37.ok | 7 + packages/sgml/RDF/suite/ok/t38.ok | 128 + packages/sgml/RDF/suite/ok/t39.ok | 9 + packages/sgml/RDF/suite/ok/t4.ok | 6 + packages/sgml/RDF/suite/ok/t40.ok | 4 + packages/sgml/RDF/suite/ok/t41.ok | 3 + packages/sgml/RDF/suite/ok/t42.ok | 4 + packages/sgml/RDF/suite/ok/t5.ok | 7 + packages/sgml/RDF/suite/ok/t6.ok | 4 + packages/sgml/RDF/suite/ok/t7.ok | 5 + packages/sgml/RDF/suite/ok/t8.ok | 3 + packages/sgml/RDF/suite/ok/t9.ok | 9 + packages/sgml/RDF/suite/ok/types.ok | 6 + packages/sgml/RDF/suite/ok/xmllit.ok | 2 + packages/sgml/RDF/suite/ok/xsdtypes.ok | 2 + packages/sgml/RDF/suite/t1.rdf | 10 + packages/sgml/RDF/suite/t10.rdf | 12 + packages/sgml/RDF/suite/t11.rdf | 7 + packages/sgml/RDF/suite/t12.rdf | 11 + packages/sgml/RDF/suite/t13.rdf | 9 + packages/sgml/RDF/suite/t14.rdf | 12 + packages/sgml/RDF/suite/t15.rdf | 11 + packages/sgml/RDF/suite/t16.rdf | 15 + packages/sgml/RDF/suite/t17.rdf | 15 + packages/sgml/RDF/suite/t18.rdf | 12 + packages/sgml/RDF/suite/t19.rdf | 17 + packages/sgml/RDF/suite/t2.rdf | 8 + packages/sgml/RDF/suite/t20.rdf | 17 + packages/sgml/RDF/suite/t21.rdf | 19 + packages/sgml/RDF/suite/t22.rdf | 17 + packages/sgml/RDF/suite/t23.rdf | 11 + packages/sgml/RDF/suite/t24.rdf | 15 + packages/sgml/RDF/suite/t25.rdf | 12 + packages/sgml/RDF/suite/t26.rdf | 16 + packages/sgml/RDF/suite/t27.rdf | 30 + packages/sgml/RDF/suite/t28.rdf | 15 + packages/sgml/RDF/suite/t29.rdf | 15 + packages/sgml/RDF/suite/t3.rdf | 15 + packages/sgml/RDF/suite/t30.rdf | 26 + packages/sgml/RDF/suite/t31.rdf | 36 + packages/sgml/RDF/suite/t32.rdf | 43 + packages/sgml/RDF/suite/t33.rdf | 18 + packages/sgml/RDF/suite/t34.rdf | 28 + packages/sgml/RDF/suite/t35.rdf | 15 + packages/sgml/RDF/suite/t36.rdf | 15 + packages/sgml/RDF/suite/t37.rdf | 15 + packages/sgml/RDF/suite/t38.rdf | 190 + packages/sgml/RDF/suite/t39.rdf | 25 + packages/sgml/RDF/suite/t4.rdf | 14 + packages/sgml/RDF/suite/t40.rdf | 11 + packages/sgml/RDF/suite/t41.rdf | 15 + packages/sgml/RDF/suite/t42.rdf | 16 + packages/sgml/RDF/suite/t5.rdf | 15 + packages/sgml/RDF/suite/t6.rdf | 15 + packages/sgml/RDF/suite/t7.rdf | 17 + packages/sgml/RDF/suite/t8.rdf | 16 + packages/sgml/RDF/suite/t9.rdf | 12 + packages/sgml/RDF/suite/types.rdf | 22 + packages/sgml/RDF/suite/xmllit.rdf | 19 + packages/sgml/RDF/suite/xsdtypes.rdf | 19 + packages/sgml/RDF/w3c_test.pl | 467 ++ packages/sgml/RDF/write_test.pl | 155 + packages/sgml/TODO | 26 + packages/sgml/Test/amp.sgml | 7 + packages/sgml/Test/att.xml | 3 + packages/sgml/Test/badxmlent.xml | 5 + packages/sgml/Test/bar.sgml | 8 + packages/sgml/Test/bat.sgml | 68 + packages/sgml/Test/cdata.sgml | 8 + packages/sgml/Test/ce.sgml | 7 + packages/sgml/Test/cent-nul.xml | 1 + packages/sgml/Test/cent-utf8.xml | 19 + packages/sgml/Test/cmt.sgml | 13 + packages/sgml/Test/comment.xml | 10 + packages/sgml/Test/conref.sgml | 8 + packages/sgml/Test/conref2.sgml | 10 + packages/sgml/Test/crlf.sgml | 11 + packages/sgml/Test/defent.sgml | 10 + packages/sgml/Test/entent.sgml | 10 + packages/sgml/Test/estag.sgml | 10 + packages/sgml/Test/foo.sgml | 7 + packages/sgml/Test/i.sgml | 12 + packages/sgml/Test/layout.xml | 11 + packages/sgml/Test/mapbug.sgml | 41 + packages/sgml/Test/ment.sgml | 7 + packages/sgml/Test/minus2.xml | 3 + packages/sgml/Test/netc.sgml | 12 + packages/sgml/Test/ng.sgml | 10 + packages/sgml/Test/noent.sgml | 14 + packages/sgml/Test/not.sgml | 13 + packages/sgml/Test/ok/amp.ok | 1 + packages/sgml/Test/ok/att.ok | 2 + packages/sgml/Test/ok/badxmlent.ok | 2 + packages/sgml/Test/ok/bar.ok | 1 + packages/sgml/Test/ok/bat.ok | 2 + packages/sgml/Test/ok/cdata.ok | 2 + packages/sgml/Test/ok/ce.ok | 1 + packages/sgml/Test/ok/cent-nul.ok | 2 + packages/sgml/Test/ok/cent-utf8.ok | 2 + packages/sgml/Test/ok/cmt.ok | 1 + packages/sgml/Test/ok/comment.ok | 2 + packages/sgml/Test/ok/conref.ok | 1 + packages/sgml/Test/ok/conref2.ok | 2 + packages/sgml/Test/ok/crlf.ok | 2 + packages/sgml/Test/ok/defent.ok | 2 + packages/sgml/Test/ok/entent.ok | 1 + packages/sgml/Test/ok/estag.ok | 2 + packages/sgml/Test/ok/foo.ok | 1 + packages/sgml/Test/ok/i.ok | 1 + packages/sgml/Test/ok/layout.ok | 2 + packages/sgml/Test/ok/mapbug.ok | 2 + packages/sgml/Test/ok/ment.ok | 2 + packages/sgml/Test/ok/minus2.ok | 2 + packages/sgml/Test/ok/netc.ok | 2 + packages/sgml/Test/ok/ng.ok | 2 + packages/sgml/Test/ok/noent.ok | 1 + packages/sgml/Test/ok/not.ok | 1 + packages/sgml/Test/ok/oma.ok | 2 + packages/sgml/Test/ok/omit1.ok | 1 + packages/sgml/Test/ok/per.ok | 1 + packages/sgml/Test/ok/pi.ok | 2 + packages/sgml/Test/ok/rcdata.ok | 1 + packages/sgml/Test/ok/rdefent.ok | 2 + packages/sgml/Test/ok/rsre.ok | 2 + packages/sgml/Test/ok/sdata.ok | 1 + packages/sgml/Test/ok/shortval.ok | 1 + packages/sgml/Test/ok/simple.ok | 2 + packages/sgml/Test/ok/sr.ok | 1 + packages/sgml/Test/ok/sr2.ok | 1 + packages/sgml/Test/ok/ugh.ok | 1 + packages/sgml/Test/ok/utf8-cent.ok | 2 + packages/sgml/Test/ok/utf8-ru.ok | 2 + packages/sgml/Test/ok/utf8.ok | 2 + packages/sgml/Test/ok/wchar.ok | 2 + packages/sgml/Test/ok/wcharlong.ok | 2 + packages/sgml/Test/oma.sgml | 8 + packages/sgml/Test/omit1.sgml | 14 + packages/sgml/Test/per.sgml | 9 + packages/sgml/Test/pi.xml | 3 + packages/sgml/Test/rcdata.sgml | 19 + packages/sgml/Test/rdefent.sgml | 9 + packages/sgml/Test/rsre.sgml | 18 + packages/sgml/Test/sdata.sgml | 8 + packages/sgml/Test/shortval.sgml | 6 + packages/sgml/Test/simple.xml | 1 + packages/sgml/Test/sr.sgml | 20 + packages/sgml/Test/sr2.sgml | 25 + packages/sgml/Test/test.pl | 163 + packages/sgml/Test/ugh.sgml | 9 + packages/sgml/Test/utf8-cent.xml | 7 + packages/sgml/Test/utf8-ru.xml | 3 + packages/sgml/Test/utf8.xml | 10 + packages/sgml/Test/wchar.xml | 11 + packages/sgml/Test/wcharlong.xml | 16 + packages/sgml/Test/wrtest.pl | 241 + packages/sgml/VERSION | 1 + packages/sgml/catalog.c | 672 +++ packages/sgml/catalog.h | 64 + packages/sgml/charmap.c | 104 + packages/sgml/configure.in | 114 + packages/sgml/dtd.h | 481 ++ packages/sgml/dtd2pl.1 | 51 + packages/sgml/dtd2pl.c | 91 + packages/sgml/error.c | 178 + packages/sgml/error.h | 46 + packages/sgml/install-sh | 238 + packages/sgml/iso_639.pl | 628 +++ packages/sgml/make.bat | 3 + packages/sgml/model.c | 524 +++ packages/sgml/model.h | 47 + packages/sgml/parser.c | 5602 ++++++++++++++++++++++++ packages/sgml/parser.h | 229 + packages/sgml/prolog.c | 520 +++ packages/sgml/prolog.h | 42 + packages/sgml/quote.c | 401 ++ packages/sgml/sgml.c | 455 ++ packages/sgml/sgml.doc | 1319 ++++++ packages/sgml/sgml.pl | 434 ++ packages/sgml/sgml2pl.c | 2401 ++++++++++ packages/sgml/sgml_mode.html | 115 + packages/sgml/sgml_mode.pl | 1081 +++++ packages/sgml/sgml_write.pl | 808 ++++ packages/sgml/sgmldefs.h | 85 + packages/sgml/utf8.c | 117 + packages/sgml/utf8.h | 45 + packages/sgml/util.c | 747 ++++ packages/sgml/util.h | 119 + packages/sgml/xml_unicode.c | 1260 ++++++ packages/sgml/xml_unicode.h | 41 + packages/sgml/xml_unicode.pl | 436 ++ packages/sgml/xmlns.c | 244 ++ packages/sgml/xmlns.h | 43 + packages/sgml/xsdp_types.pl | 214 + 260 files changed, 31894 insertions(+) create mode 100644 packages/sgml/COPYING create mode 100644 packages/sgml/ChangeLog create mode 100644 packages/sgml/DTD/HTML4.dcl create mode 100644 packages/sgml/DTD/HTML4.dtd create mode 100644 packages/sgml/DTD/HTML4.soc create mode 100644 packages/sgml/DTD/HTMLlat1.ent create mode 100644 packages/sgml/DTD/HTMLspec.ent create mode 100644 packages/sgml/DTD/HTMLsym.ent create mode 100644 packages/sgml/FILES create mode 100644 packages/sgml/INSTALL create mode 100644 packages/sgml/Makefile.in create mode 100644 packages/sgml/Makefile.mak create mode 100644 packages/sgml/RDF/.cvsignore create mode 100644 packages/sgml/RDF/.gitignore create mode 100644 packages/sgml/RDF/ChangeLog create mode 100644 packages/sgml/RDF/Makefile.in create mode 100644 packages/sgml/RDF/Makefile.mak create mode 100644 packages/sgml/RDF/README create mode 100644 packages/sgml/RDF/configure.in create mode 100755 packages/sgml/RDF/install-sh create mode 100644 packages/sgml/RDF/online.html create mode 100644 packages/sgml/RDF/online.pl create mode 100644 packages/sgml/RDF/pretty_print.pl create mode 100755 packages/sgml/RDF/rdf-parser create mode 100644 packages/sgml/RDF/rdf.html create mode 100644 packages/sgml/RDF/rdf.pl create mode 100644 packages/sgml/RDF/rdf2pl.doc create mode 100644 packages/sgml/RDF/rdf_diagram.pl create mode 100644 packages/sgml/RDF/rdf_ntriples.pl create mode 100644 packages/sgml/RDF/rdf_parser.pl create mode 100644 packages/sgml/RDF/rdf_test.pl create mode 100644 packages/sgml/RDF/rdf_triple.pl create mode 100644 packages/sgml/RDF/rdf_write.pl create mode 100644 packages/sgml/RDF/rdfs.rdfs create mode 100644 packages/sgml/RDF/rewrite.pl create mode 100644 packages/sgml/RDF/suite/ex_19.rdf create mode 100644 packages/sgml/RDF/suite/ok/ex_19.ok create mode 100644 packages/sgml/RDF/suite/ok/t1.ok create mode 100644 packages/sgml/RDF/suite/ok/t10.ok create mode 100644 packages/sgml/RDF/suite/ok/t11.ok create mode 100644 packages/sgml/RDF/suite/ok/t12.ok create mode 100644 packages/sgml/RDF/suite/ok/t13.ok create mode 100644 packages/sgml/RDF/suite/ok/t14.ok create mode 100644 packages/sgml/RDF/suite/ok/t15.ok create mode 100644 packages/sgml/RDF/suite/ok/t16.ok create mode 100644 packages/sgml/RDF/suite/ok/t17.ok create mode 100644 packages/sgml/RDF/suite/ok/t18.ok create mode 100644 packages/sgml/RDF/suite/ok/t19.ok create mode 100644 packages/sgml/RDF/suite/ok/t2.ok create mode 100644 packages/sgml/RDF/suite/ok/t20.ok create mode 100644 packages/sgml/RDF/suite/ok/t21.ok create mode 100644 packages/sgml/RDF/suite/ok/t22.ok create mode 100644 packages/sgml/RDF/suite/ok/t23.ok create mode 100644 packages/sgml/RDF/suite/ok/t24.ok create mode 100644 packages/sgml/RDF/suite/ok/t25.ok create mode 100644 packages/sgml/RDF/suite/ok/t26.ok create mode 100644 packages/sgml/RDF/suite/ok/t27.ok create mode 100644 packages/sgml/RDF/suite/ok/t28.ok create mode 100644 packages/sgml/RDF/suite/ok/t29.ok create mode 100644 packages/sgml/RDF/suite/ok/t3.ok create mode 100644 packages/sgml/RDF/suite/ok/t30.ok create mode 100644 packages/sgml/RDF/suite/ok/t31.ok create mode 100644 packages/sgml/RDF/suite/ok/t32.ok create mode 100644 packages/sgml/RDF/suite/ok/t33.ok create mode 100644 packages/sgml/RDF/suite/ok/t34.ok create mode 100644 packages/sgml/RDF/suite/ok/t35.ok create mode 100644 packages/sgml/RDF/suite/ok/t36.ok create mode 100644 packages/sgml/RDF/suite/ok/t37.ok create mode 100644 packages/sgml/RDF/suite/ok/t38.ok create mode 100644 packages/sgml/RDF/suite/ok/t39.ok create mode 100644 packages/sgml/RDF/suite/ok/t4.ok create mode 100644 packages/sgml/RDF/suite/ok/t40.ok create mode 100644 packages/sgml/RDF/suite/ok/t41.ok create mode 100644 packages/sgml/RDF/suite/ok/t42.ok create mode 100644 packages/sgml/RDF/suite/ok/t5.ok create mode 100644 packages/sgml/RDF/suite/ok/t6.ok create mode 100644 packages/sgml/RDF/suite/ok/t7.ok create mode 100644 packages/sgml/RDF/suite/ok/t8.ok create mode 100644 packages/sgml/RDF/suite/ok/t9.ok create mode 100644 packages/sgml/RDF/suite/ok/types.ok create mode 100644 packages/sgml/RDF/suite/ok/xmllit.ok create mode 100644 packages/sgml/RDF/suite/ok/xsdtypes.ok create mode 100644 packages/sgml/RDF/suite/t1.rdf create mode 100644 packages/sgml/RDF/suite/t10.rdf create mode 100644 packages/sgml/RDF/suite/t11.rdf create mode 100644 packages/sgml/RDF/suite/t12.rdf create mode 100644 packages/sgml/RDF/suite/t13.rdf create mode 100644 packages/sgml/RDF/suite/t14.rdf create mode 100644 packages/sgml/RDF/suite/t15.rdf create mode 100644 packages/sgml/RDF/suite/t16.rdf create mode 100644 packages/sgml/RDF/suite/t17.rdf create mode 100644 packages/sgml/RDF/suite/t18.rdf create mode 100644 packages/sgml/RDF/suite/t19.rdf create mode 100644 packages/sgml/RDF/suite/t2.rdf create mode 100644 packages/sgml/RDF/suite/t20.rdf create mode 100644 packages/sgml/RDF/suite/t21.rdf create mode 100644 packages/sgml/RDF/suite/t22.rdf create mode 100644 packages/sgml/RDF/suite/t23.rdf create mode 100644 packages/sgml/RDF/suite/t24.rdf create mode 100644 packages/sgml/RDF/suite/t25.rdf create mode 100644 packages/sgml/RDF/suite/t26.rdf create mode 100644 packages/sgml/RDF/suite/t27.rdf create mode 100644 packages/sgml/RDF/suite/t28.rdf create mode 100644 packages/sgml/RDF/suite/t29.rdf create mode 100644 packages/sgml/RDF/suite/t3.rdf create mode 100644 packages/sgml/RDF/suite/t30.rdf create mode 100644 packages/sgml/RDF/suite/t31.rdf create mode 100644 packages/sgml/RDF/suite/t32.rdf create mode 100644 packages/sgml/RDF/suite/t33.rdf create mode 100644 packages/sgml/RDF/suite/t34.rdf create mode 100644 packages/sgml/RDF/suite/t35.rdf create mode 100644 packages/sgml/RDF/suite/t36.rdf create mode 100644 packages/sgml/RDF/suite/t37.rdf create mode 100644 packages/sgml/RDF/suite/t38.rdf create mode 100644 packages/sgml/RDF/suite/t39.rdf create mode 100644 packages/sgml/RDF/suite/t4.rdf create mode 100644 packages/sgml/RDF/suite/t40.rdf create mode 100644 packages/sgml/RDF/suite/t41.rdf create mode 100644 packages/sgml/RDF/suite/t42.rdf create mode 100644 packages/sgml/RDF/suite/t5.rdf create mode 100644 packages/sgml/RDF/suite/t6.rdf create mode 100644 packages/sgml/RDF/suite/t7.rdf create mode 100644 packages/sgml/RDF/suite/t8.rdf create mode 100644 packages/sgml/RDF/suite/t9.rdf create mode 100644 packages/sgml/RDF/suite/types.rdf create mode 100644 packages/sgml/RDF/suite/xmllit.rdf create mode 100644 packages/sgml/RDF/suite/xsdtypes.rdf create mode 100644 packages/sgml/RDF/w3c_test.pl create mode 100644 packages/sgml/RDF/write_test.pl create mode 100644 packages/sgml/TODO create mode 100644 packages/sgml/Test/amp.sgml create mode 100644 packages/sgml/Test/att.xml create mode 100644 packages/sgml/Test/badxmlent.xml create mode 100644 packages/sgml/Test/bar.sgml create mode 100644 packages/sgml/Test/bat.sgml create mode 100644 packages/sgml/Test/cdata.sgml create mode 100644 packages/sgml/Test/ce.sgml create mode 100644 packages/sgml/Test/cent-nul.xml create mode 100644 packages/sgml/Test/cent-utf8.xml create mode 100644 packages/sgml/Test/cmt.sgml create mode 100644 packages/sgml/Test/comment.xml create mode 100644 packages/sgml/Test/conref.sgml create mode 100644 packages/sgml/Test/conref2.sgml create mode 100644 packages/sgml/Test/crlf.sgml create mode 100644 packages/sgml/Test/defent.sgml create mode 100644 packages/sgml/Test/entent.sgml create mode 100644 packages/sgml/Test/estag.sgml create mode 100644 packages/sgml/Test/foo.sgml create mode 100644 packages/sgml/Test/i.sgml create mode 100644 packages/sgml/Test/layout.xml create mode 100644 packages/sgml/Test/mapbug.sgml create mode 100644 packages/sgml/Test/ment.sgml create mode 100644 packages/sgml/Test/minus2.xml create mode 100644 packages/sgml/Test/netc.sgml create mode 100644 packages/sgml/Test/ng.sgml create mode 100644 packages/sgml/Test/noent.sgml create mode 100644 packages/sgml/Test/not.sgml create mode 100644 packages/sgml/Test/ok/amp.ok create mode 100644 packages/sgml/Test/ok/att.ok create mode 100644 packages/sgml/Test/ok/badxmlent.ok create mode 100644 packages/sgml/Test/ok/bar.ok create mode 100644 packages/sgml/Test/ok/bat.ok create mode 100644 packages/sgml/Test/ok/cdata.ok create mode 100644 packages/sgml/Test/ok/ce.ok create mode 100644 packages/sgml/Test/ok/cent-nul.ok create mode 100644 packages/sgml/Test/ok/cent-utf8.ok create mode 100644 packages/sgml/Test/ok/cmt.ok create mode 100644 packages/sgml/Test/ok/comment.ok create mode 100644 packages/sgml/Test/ok/conref.ok create mode 100644 packages/sgml/Test/ok/conref2.ok create mode 100644 packages/sgml/Test/ok/crlf.ok create mode 100644 packages/sgml/Test/ok/defent.ok create mode 100644 packages/sgml/Test/ok/entent.ok create mode 100644 packages/sgml/Test/ok/estag.ok create mode 100644 packages/sgml/Test/ok/foo.ok create mode 100644 packages/sgml/Test/ok/i.ok create mode 100644 packages/sgml/Test/ok/layout.ok create mode 100644 packages/sgml/Test/ok/mapbug.ok create mode 100644 packages/sgml/Test/ok/ment.ok create mode 100644 packages/sgml/Test/ok/minus2.ok create mode 100644 packages/sgml/Test/ok/netc.ok create mode 100644 packages/sgml/Test/ok/ng.ok create mode 100644 packages/sgml/Test/ok/noent.ok create mode 100644 packages/sgml/Test/ok/not.ok create mode 100644 packages/sgml/Test/ok/oma.ok create mode 100644 packages/sgml/Test/ok/omit1.ok create mode 100644 packages/sgml/Test/ok/per.ok create mode 100644 packages/sgml/Test/ok/pi.ok create mode 100644 packages/sgml/Test/ok/rcdata.ok create mode 100644 packages/sgml/Test/ok/rdefent.ok create mode 100644 packages/sgml/Test/ok/rsre.ok create mode 100644 packages/sgml/Test/ok/sdata.ok create mode 100644 packages/sgml/Test/ok/shortval.ok create mode 100644 packages/sgml/Test/ok/simple.ok create mode 100644 packages/sgml/Test/ok/sr.ok create mode 100644 packages/sgml/Test/ok/sr2.ok create mode 100644 packages/sgml/Test/ok/ugh.ok create mode 100644 packages/sgml/Test/ok/utf8-cent.ok create mode 100644 packages/sgml/Test/ok/utf8-ru.ok create mode 100644 packages/sgml/Test/ok/utf8.ok create mode 100644 packages/sgml/Test/ok/wchar.ok create mode 100644 packages/sgml/Test/ok/wcharlong.ok create mode 100644 packages/sgml/Test/oma.sgml create mode 100644 packages/sgml/Test/omit1.sgml create mode 100644 packages/sgml/Test/per.sgml create mode 100644 packages/sgml/Test/pi.xml create mode 100644 packages/sgml/Test/rcdata.sgml create mode 100644 packages/sgml/Test/rdefent.sgml create mode 100644 packages/sgml/Test/rsre.sgml create mode 100644 packages/sgml/Test/sdata.sgml create mode 100644 packages/sgml/Test/shortval.sgml create mode 100644 packages/sgml/Test/simple.xml create mode 100644 packages/sgml/Test/sr.sgml create mode 100644 packages/sgml/Test/sr2.sgml create mode 100644 packages/sgml/Test/test.pl create mode 100644 packages/sgml/Test/ugh.sgml create mode 100644 packages/sgml/Test/utf8-cent.xml create mode 100644 packages/sgml/Test/utf8-ru.xml create mode 100644 packages/sgml/Test/utf8.xml create mode 100644 packages/sgml/Test/wchar.xml create mode 100644 packages/sgml/Test/wcharlong.xml create mode 100644 packages/sgml/Test/wrtest.pl create mode 100644 packages/sgml/VERSION create mode 100644 packages/sgml/catalog.c create mode 100644 packages/sgml/catalog.h create mode 100644 packages/sgml/charmap.c create mode 100644 packages/sgml/configure.in create mode 100644 packages/sgml/dtd.h create mode 100644 packages/sgml/dtd2pl.1 create mode 100644 packages/sgml/dtd2pl.c create mode 100644 packages/sgml/error.c create mode 100644 packages/sgml/error.h create mode 100755 packages/sgml/install-sh create mode 100644 packages/sgml/iso_639.pl create mode 100755 packages/sgml/make.bat create mode 100644 packages/sgml/model.c create mode 100644 packages/sgml/model.h create mode 100644 packages/sgml/parser.c create mode 100644 packages/sgml/parser.h create mode 100644 packages/sgml/prolog.c create mode 100644 packages/sgml/prolog.h create mode 100644 packages/sgml/quote.c create mode 100644 packages/sgml/sgml.c create mode 100644 packages/sgml/sgml.doc create mode 100644 packages/sgml/sgml.pl create mode 100644 packages/sgml/sgml2pl.c create mode 100644 packages/sgml/sgml_mode.html create mode 100644 packages/sgml/sgml_mode.pl create mode 100644 packages/sgml/sgml_write.pl create mode 100644 packages/sgml/sgmldefs.h create mode 100644 packages/sgml/utf8.c create mode 100644 packages/sgml/utf8.h create mode 100644 packages/sgml/util.c create mode 100644 packages/sgml/util.h create mode 100644 packages/sgml/xml_unicode.c create mode 100644 packages/sgml/xml_unicode.h create mode 100644 packages/sgml/xml_unicode.pl create mode 100644 packages/sgml/xmlns.c create mode 100644 packages/sgml/xmlns.h create mode 100644 packages/sgml/xsdp_types.pl diff --git a/packages/sgml/COPYING b/packages/sgml/COPYING new file mode 100644 index 000000000..bfdcd7e58 --- /dev/null +++ b/packages/sgml/COPYING @@ -0,0 +1,6 @@ +This library is distributed under the LGPL licence terms. For details +visit http://www.gnu.org/copyleft/lesser.html. + +Holders of the SWI-Prolog commercial license (see +http://www.swi.psy.uva.nl/projects/SWI-Prolog/) may use this software +under the same conditions as SWI-Prolog. diff --git a/packages/sgml/ChangeLog b/packages/sgml/ChangeLog new file mode 100644 index 000000000..9b756031c --- /dev/null +++ b/packages/sgml/ChangeLog @@ -0,0 +1,716 @@ +[Feb 16 2009] + + * FIXED: Release HTML DTDs when using load_html_file/2 in a thread. + +[Feb 15 2009] + + * FIXED: Memory leak in SGML parser. + +[Jan 21 2009] + + * FIXED: RDF writer: write valid XML if the namespace entity contains characters + that must be %-escaped. Jacopo Urbani. + +[Jan 13 2009] + + * FIXED: RDF/XML could save illegal XML if multiple namespaces are used + for predicates where one namespace is a prefix of another one. Jacopo Urbani. +[Dec 19 2008] + + * FIXED: Correct usage of content_length option in sgml_parse/3 when using callbacks. Needed to deal with streaming input for parsing RDF. + + * ADDED: Pass content_length through process_rdf/3 +[Oct 13 2008] + + * CLEANUP: Avoid repetition and warnings on option processing predicates. Matt Lilley. +[Sep 11 2008] + + * PORT: Add AC_PREREQ to configure.h for systems that auto-select autoconf + versions. After tip by Ulrich Neumerkel. + +[Aug 11 2008] + + * INSTALL: Remove all configure files from the git repository +[May 20 2008] + + * MODIFIED: Streams in socket.pl and ssl.pl appeared unbuffered. They + are now fully buffered. + +[Apr 18 2008] + + * MODIFIED: Renamed hash_term/2 to term_hash/2. Added hash_term/2 to + library(backcomp), so most code should not notice this. +[Mar 1 2008] + + * ENHANCED: Allow for GC from PL_handle_signals(), providing GC for foreign + code building large structures. This is now used by the SGML/XML parser + to avoid running unnecessarily out of stack. + +[Feb 28 2008] + + * FIXED: rdf_write_xml/2 loops if it encounters an rdf:Bag. + +[Feb 18 2008] + + * FIXED: More fixes for proper handling of rdf:Bag + +[Feb 13 2008] + + * FIXED: Emit rdf:Bag attributes (etc.) as rdf:li + + * FIXED: possible failure in rdf_write_xml with http://t-d-b.org? + +[Jan 23 2008] + + * PORT: Bug#346: Allow overriding COFLAGS and CWFLAGS in package + configuration. Keri Harris. +[Jan 14 2008] + + * FIXED: Bug#343: Handling bnodes in rdf_write_xml. Yver Raimond. + + * FIXED: Added support for rdf:NodeID to rdf_write_xml/2. Yves Raimond. + +[Dec 13 2007] + + * FIXED: library(rdf_write) to deal with operators. Related to Bug#332 + +[Nov 12 2007] + + * FIXED: Bug#320: Memory leak when parsing SGML NOTATIONs. Keri Harris. + +Oct 30, 2007 + + * FIXED: xml_write/3 quoting of < in attributes. Dmitry Kuzmin. + +Sep 11, 2007 + + * FIXED: xml_write/3 for multi-valued attributes such as IDREFS, NAMES, + etc. Victor de Boer and Anjo Anjewierden. + +Jun 6, 2007 + + * MODIFIED: Avoid recursive expansion of entities, unless marked as + SGML content. + * FIXED: Strict XML comment syntax. Jacco van Ossenbruggen. + * FIXED: give error on bad entities in XML mode. Jacco van Ossenbruggen. + +Feb 6, 2007 + + * MODIFIED: xml_quote_attribute no longer maps ' to ' See note in + quote.c + +Nov 15, 2006 + + * FIXED: properly pass instructions. + +Oct 27, 2006 + + * ENHANCEMENT: Started branch XML_UNICODE to provide support for Unicode + filenames, tags and elements. + +Aug 28, 2006 + + * DOCUMENTATION: Moved to sgml.doc, using the same system as the + remainder of the system. + +Jul 25, 2006 + + * FIXED: space(default) handling. Juho Östman. + +Jun 20, 2006 + + * FIXED: illegal read. Can cause wrong data and/or crashes. + +Feb 16, 2006 + + * ENHANCED: various improvements of the sgml_write.pl library by Richard + O'Keefe. + +Feb 12, 2006 + + * ENHANCED: xml_write/3: if encoding is `text', write the data as UTF-8 + +Feb 9, 2006 + + * FIXED: valgrind detected memory error. Not sure whether or not it was + a real error. + +Feb 1, 2006 + + * PORT: Detect inline behaviour of compiler in configure + +Jan 19, 2006 + + * FIXED: Allow for different deparators in in/excluded namegroup. + Anjo Anjewierden. + +Jul 7, 2005 + + * ADDED: entiry(+Name, +Value) to (re-)define CDATA entity values. + +Mar 31, 2005 + + * ADDED: make exceptions in call-backs from sgml_parse/3 return the + parser immediately with the given exception. + +Mar 29, 2005 + + * ADDED: layout(Bool) option to the xml_write/3 and friends predciates. + +Mar 21, 2005 + + * ADDED: specify dialect for loading DTDs. After mail from Sebastien Cabot. + + * FIXED: Avoid error on thread_at_exit when using in single-threaded + Prolog. Anjo Anjewierden. + +Mar 6, 2005 + + * ADDED: improved namespace and indentation support for xml_write/3. + +Mar 5, 2005 + + * ADDED: xml_is_dom/1. Option header(Bool) to xml_write to suppress + writing the header. + +Mar 4, 2005 + + * ADDED: xmlns support to xml_write/3. + + * ENHANCED: library(sgml_write): Indent attributes if there are too many. + +Mar 01, 2005 + + * ADDED: encoding argument to xml_quote_* and xml_name/2 predicates. + +Feb 24, 2005 + + * ADDED: handle encoding="US-ASCII" for XML documents. + +Feb 22, 2005 + + * MODIFIED: Use Sgetcode() rather than Sgetc() to get data from a Prolog + stream. Extended parser to accept characters outside 0..255 range. + +Feb 7, 2005 + + * ADDED: when converting attributes to integers on 32-bit machines, + exploit 64-bit Prolog integers. + +Jan 6, 2005 + + * FIXED: Documentation + +Dec 21, 2004 + + * ADDED: library(sgml_write), providing conversion of parsed data to + a file. + +Dec 20, 2004 + + * Working wide-character version. Consequences: + - CDATA attributes and content never contain entities anymore + as all text can now be represented. + - Limit on quoted CDATA attribute values (was 2048 characters) + has been removed. + +Dec 19, 2004 + + * Start working on wide-character support: introducing wide-character + output buffers. + +Dec 15, 2004 + + * FIXED: Bug#212: Disallow � character entities. + +Nov 25, 2004 + + * FIXED: utf-8 tests, avoid conflict with UTF-8 support in Prolog + +Sep 13, 2004 + + * ENHANCED: load_html_file/2: add shorttag(false) to the load_structure/3 + options for better parsing of common errornous HTML pages. + +Aug 26, 2004 + + * FIXED: Bug#177: catalog files from $SGML_CATALOG_FILES were not + honoured. Simon Ambler. Also made catalog management thread-safe. + +Jul 22, 2004 + + * MODIFIED: Updated XML-Schema (XSD) namespace in xsdp_types.pl + +Apr 28, 2004 + + * ADDED: iso_639.pl: ISO-639 language identifiers. + +Apr 27, 2004 + + * ADDED: first version of xsdp_types.pl, a module to do type checking + and type conversion for XSD (XML Schema DataTypes). + +Apr 26, 2004 + + * FIXED: Bug#149: Buffer overflow reading too long NAME, NMTOKEN, etc. + Fabien Todescato. + +Jan 17, 2004 + + * FIXED: handling switching from SGML to XML mode. + Fabien Todescato. + +Jan 9, 2004 + + * FIXED: avoid sharing DTD objects between threads. Fabien Todescato. + +Nov 20, 2003 + + * ADDED: xml_name/1 to test an atom to refer to a valid XML name. + +Nov 3, 2003 + + * FIXED: Error parsing SHORTREF declaration that has whitespace at the + end. Richard O'Keefe. + +Sep 12, 2003 + + * FIXED: Memory leak in load_structure/3 (not freeing the parser data). + Petter Egesund. + +Jul 8, 2003 + + * ADDED: handle UTF-8 sequences producing characters that cannot be + represented as character entities. Suggested by C. M. Sperberg-McQueen. + + * FIXED: handling &#X, where 128<=X<256 with UTF-8 decoding enabled. + C. M. Sperberg-McQueen. + +Jun 9, 2003 + + * FIXED: SGML SHORTREF declaration performed case-insensitive entity + lookup. Richard O'Keefe. + +Jun 4, 2003 + + * ENHANCED: type the conflicting CDATA in #PCDATA that violates the DTD + +May 23, 2003 + + * FIXED: More catalog trouble. Richard O'Keefe. + +May 22, 2003 + + * FIXED: Lookup of system identifiers through the catalogue (avoid early + tagging with file:) Richard O'Keefe. + +May 21, 2003 + + * FIXED: Handle -- in element-names (Bijan Parsia). + +May 8, 2003 + + * Fix registering catalog files + +May 5, 2003 + + * Incorporated better CATALOG parsing by Richard O'Keefe. May be incomplete + or incorrect. + +* FIXED: call(end, Goal) passing the element-name incorrectly (breaks + process_rdf/3). + +* FIXED: delay unification of value in load_structure/3. + +* FIXED: Properly expand UTF-8 values in CDATA attributes + +* FIXED: Properly handle unquoted attributes at the end of an empty element + in XML mode + +* FIXED: #CONREF attribute handling in elements with declared content + (CDATA/RCDATA). Richard O'Keefe. + +SWI-Prolog VERSION 5.0.10 +========================= + +* FIXED: properly handle elements that have only an declaration. Richard O'Keefe. + +* MODIFIED/ADDED: qualify_attributes option for xmlns mode. Default is + now *not* to qualify attributes. + +* FIXED: Handle elements inside shorttag values. Thanks Richard O'Keefe + for providing a clear description how to handle this. + +SWI-Prolog VERSION 5.0.9 +======================== + +* FIXED: correct handling of content_length(Len) option (was reading one + character too many). Oops, normal reading was broken. Fixed. + +* FIXED: Correctly handle marked sections ending in ]]]> rather than ]]> + Spotted by Adrian Boyko. + +* ADDED: xml_quote_attribute/2 and xml_quote_cdata/2, providing simple + quote-support. + +* FIXED: parse(content) option to the parser (multiple bugs) required for + process_rdf/3 (Girish Padmalayam). + +* FIXED: dtd2pl: printing of NAMEOF and NMTOKEN attributes + (Joseph Wayne Norton). + +* FIXED: load_structure/3 to pass doctype(_) to the correct place. + +* ADDED: space(sgml) option. Was documented but omitted. + + +SWI-Prolog VERSION 4.0.11 +========================= + +* MODIFIED: When processing non-validated data in space-preserve mode, emit + ALL blank data (Richard O'Keefe). + +* FIXED: close file after reading DTD from a file. (Anjo Anjewierden) + +* FIXED: handling of (a) +(b) element declaraction (Richard O'Keefe) + +* ADDED: Warning when redefining (parameter) entities. + +* ENHANCED/FIXED: Process entities refering to files directly. This + reduces memory needs and fixes path-problems in recursive includes. + This applies for both normal and parameter entities. + +* ADDED: Test for legal value in attributes with type NAMEOF (a|b) + +* FIXED: Handle (a,b), (a&b) and (a|b) as equivalent in ATTLIST type + declaration. + +* FIXED: option shorttag was spelled shortag. + +* FIXED: Ensure the output is properly closed, even if the parser encounters + an unexpected end-of-file. + + +SWI-Prolog VERSION 4.0.6 +======================== + +* FIXED: Data overwriting in if local stuff + is too long (Andrew Dadakov). + +* FIXED: Report elements not in the DTD allowed by the ANY model + as error (Andrew Dadakov). + +* FIXED: Call sgml_nomem() after strdup in case of out-of-memory + (Richard O'Keefe). + +* PORT: Removed alloca() usage (Richard O'Keefe). + +* LIMIT: Make maximum string length 2048 to allow parsing HTML4 DTD. + Maybe we should remove comments while fetching parameter-entity + values? + +* FIXED: Include #FIXED and defaulted arguments for omitted tags + (reported by Richard O'Keefe). + +* ADDED: handle plain silently if the catalog contains + a DOCTYPE doc file.dtd entry. + +* FIXED: Various issues in attribute handling with new routines supplied + by Richard O'Keefe. Added shorttag(Bool) to options you can set. + +* ADDED: `make check' to run the test-suite. + +* FIXED: Handling of & (Richard O'Keefe). + +* FIXED: dtd2pl to use model(Model) to avoid amibiguity between CDATA and + (CDATA) model. Richard O'Keefe. + +* FIXED: Various output aspects of the sgml driver program, notably case and + character escaping problems. By Richard O'Keefe. + +* ADDED: Ignore SGML declaration in + +* FIXED: Do not require ; after character-entity + +* ADDED: Expand character and parameter entities while parsing literals + in a DTD (Richard O'Keefe). + + +VERSION 1.0.13 +============== + +* FIXED: Interpret as "> (well, + dependent on the delimiters). (Richard O'Keefe). + +* FIXED: Allow omitted end-tag for RCDATA and CDATA elements (only effective + when hitting the end of the file). + +* FIXED: Proper parsing of declaration. Also fixed + dtd_property(DTD, notations(Notations)) and modified + dtd_property(DTD, notation(N, Decl)). + +* ADDED: (Richard O'Keefe). + +* FIXED: Ignore newline after entity + +* FIXED: Allow omitted ; expanding entities in CDATA attributes + +* FIXED: Possible crash with SHORTREF endding in &#RE. + +* FIXED: avoid crash on illegal syntax in DTD omited-tag declaration + +VERSION 1.0.12 +============== + +* FIXED: More white-space issues in handling quoted attribute values. + +* ADDED: XML NAMECHARS to the default set. This seems to match HTML-4 + better and will only in exceptional cases harm normal SGML processing. + +* ADDED/MODIFIED: Attributes not in the source, but with a default or + fixed value declared in the DTD are now included in the output. The + old behaviour can be used using the defaults(false) option to + load_structure/3. + +* FIXED: expand parameter-entities in by Milan Zamazal for integration + as Debian (linux) package. + + +VERSION 1.0.10 +============== + +* UPDATE: calls to select/3 for compatibility to SWI-Prolog 3.4 + +* FIXED: Allow , using a new state S_DECL0 after seeing < + to decide whether this is a non-escaped < in CDATA or a real tag. Also + changed comment-handling to avoid the need for parser->previous_char. + +* ADDED: sgml utility allow for parsing stdin. + +* PERFORMANCE: Improved expand_[p]entities(), providing about 10% + overall improvement. + +* FIXED: crash in sgml (demo-)driver app causing a crash if the input + file has no extension (Richard O'Keefe). + +* ADDED: support for RCDATA declared-content elements + +* FIXED: SGML-mode: immediately close EMPTY elements (i.e. do not allow + for a closing ). + +* IMPROVED: Error reporting on elements not in the DTD. + +* ADDED: get_sgml_parser(Parser, dtd(-DTD)). + +VERSION 1.0.8 +============= + +* FIXED: Problem in windows version regarding text/binary file and \ <-> / + difference. + +* FIXED: Problem finding HTML4 .soc file + +* FIXED: entity_file() to avoid a crash if the entity is not in the catalog. + +* ADDED: Parse not-quoted attribute values that require quotes by parsing + upto the next layout character. Give a warning. For example: +
is parsed as
, but a warning is displayed. + +* FIXED: Crash if start of file is not a declaration but ordinary non-blank + text. + +* ADDED: Improved source-location handling and some additional call-backs + to the call-back interface, preparing the parser for supporting SGML + syntax highlighting and checking editors. See get_sgml_parser/2 and + sgml_parse/2 predicates. + +VERSION 1.0.7 +============= + +* IMPROVED: Redirect all memory management to print a fatal error message + instead of crashing. + +* IMPROVED: Handle the (illegal) comment + more elegantly: assume and generate a + warning if text was found outside --comment--. + +* FIXED: Some issues in marked-section handling + +* ADDED: Direct error reporting of sgml application through callback for + demo purposes. Message now includes the current dialect. + +* FIXED: sgml application: do not convert case in XML mode. + +* FIXED: Do not report error on not-declared xmlns attribute in XML mode. + +* Cleaned Makefiles + +VERSION 1.0.6 +============= + +* ADDED: SGML based documentation of the package with converters to LaTeX + and HTML in Prolog. + +* ADDED/MODIFIED: number(NumberMode) option to the load_structure/3. + By default NUMBER and NUMBERS tokens are now parsed as tokens of digits. + Using number(integer) they are converted to Prolog integers, provided + they fit (Richard O'Keefe). + +* ADDED: If a document is parsed without DTD and a catalog is provided, + try finding a matching DTD file from the catalog. If successful + print a warning and load this DTD. + +* FIXED: Ensure the -xml flag to the sgml utility works again. + +* INSTALL: Check for existence of runtex + +* INSTALL: Do not cache Prolog related variables. + +* FIXED: signed/unsigned character handling in catalog.c (Richard O'Keefe) + +* MODIFIED: Representation for source-locations. + + +VERSION 1.0.5 +============= + +* MODIFIED: Get SYSTEM entities relative to the file in which the + entity is declared. Get PUBLIC entities from the second argument + if the catalog (entry) is not found. + +* FIXED: Deal with elements only containing shortref references + +* MODIFIED: Only insert missing close-tags that should not be on closing + outer environments, not because an element is encountered that fits in + an outer environment. Improves error behaviour. + +* ADDED: Print default list values in dtd2pl + +* ADDED: Support for the content-model ANY. + +* ADDED: option -xml to pl2dtd to force it loading a DTD in XML mode. + +* FIXED: Do not expand SHORTREF inside CDATA marked sections and elements. + +* ADDED: Properly deal with NUTOKEN: handling DTD default and validate type. + +* FIXED: Expand parameter entities in SHORTREF declaraction (Richard O'Keefe). + +* FIXED: Handling of < as last character of a CDATA element + +* FIXED: proper handling of &#RE; in SHORTREF + +* FIXED: Open CDATA element when encountering a CDATA entity. + +* FIXED: SHORTTAG with attributes: ) + +* ADDED: Prolog interface: map multi-valued attributes to a list. + +* FIXED: ESIS output of the sgml utility to provide better compliant output + for attributes (before the open-tag, including the type and in UPPERCASE). + +* ADDED: Handling and friends. + +* FIXED: Added &#RS;, &#RE;, &#TAB; and &#SPACE; (Richard O'Keefe) + +* FIXED: Processing instruction is now (i.e. optional closing ?) + Q: Is this XML or SGML? Check docs! + +* FIXED: Line-number info for DOCTYPE declarations holding [declaractions]. + +* FIXED: NUMBER attribute declaration with default (skip layout afterwards) + +* ADDED: Support for NOTATION attribute + +* FIXED: memory allocation/free bug in model-reduction ((a|b)*) + +* ADDED: Passing processing instructions to the user + +* FIXED: Independent case-sensitivity switch for entities (SGML entities are + case sensitive). + +* ADDED: SGML Blank-space handling + +* FIXED: Completeness test for A&B? + +* ADDED: Dummy make check (Richard O'Keefe) + + +VERSION 1.0.1 +============= + +* ADDED: Programmable XML namespace canonisation. + +* ADDED: White space handling using the xlm:space attribute as well as + allow for setting the initial default as option to load_structure. + +* FIXED: Allow for (i.e. without CDATA type decl). + +* FIXED: Allow for (i.e. no public nor system) + +* FIXED: Avoid crash on bad attribute-list + +* ADDED: Reporting errors and warnings through print_message/2. + +* FIXED: Handling of end-of-file in CDATA diff --git a/packages/sgml/DTD/HTML4.dcl b/packages/sgml/DTD/HTML4.dcl new file mode 100644 index 000000000..db46db0f9 --- /dev/null +++ b/packages/sgml/DTD/HTML4.dcl @@ -0,0 +1,88 @@ + \ No newline at end of file diff --git a/packages/sgml/DTD/HTML4.dtd b/packages/sgml/DTD/HTML4.dtd new file mode 100644 index 000000000..557f2372c --- /dev/null +++ b/packages/sgml/DTD/HTML4.dtd @@ -0,0 +1,1092 @@ + + + + + ... + + + ... + + + + The URI used as a system identifier with the public identifier allows + the user agent to download the DTD and entity sets as needed. + + The FPI for the Strict HTML 4.0 DTD is: + + "-//W3C//DTD HTML 4.0//EN" + + and its URI is: + + http://www.w3.org/TR/REC-html40/strict.dtd + + Authors should use the Strict DTD unless they need the + presentation control for user agents that don't (adequately) + support style sheets. + + If you are writing a document that includes frames, use + the following FPI: + + "-//W3C//DTD HTML 4.0 Frameset//EN" + + with the URI: + + http://www.w3.org/TR/REC-html40/frameset.dtd + + The following URIs are supported in relation to HTML 4.0 + + "http://www.w3.org/TR/REC-html40/strict.dtd" (Strict DTD) + "http://www.w3.org/TR/REC-html40/loose.dtd" (Loose DTD) + "http://www.w3.org/TR/REC-html40/frameset.dtd" (Frameset DTD) + "http://www.w3.org/TR/REC-html40/HTMLlat1.ent" (Latin-1 entities) + "http://www.w3.org/TR/REC-html40/HTMLsymbol.ent" (Symbol entities) + "http://www.w3.org/TR/REC-html40/HTMLspecial.ent" (Special entities) + + These URIs point to the latest version of each file. To reference + this specific revision use the following URIs: + + "http://www.w3.org/TR/REC-html40-971218/strict.dtd" + "http://www.w3.org/TR/REC-html40-971218/loose.dtd" + "http://www.w3.org/TR/REC-html40-971218/frameset.dtd" + "http://www.w3.org/TR/REC-html40-971218/HTMLlat1.ent" + "http://www.w3.org/TR/REC-html40-971218/HTMLsymbol.ent" + "http://www.w3.org/TR/REC-html40-971218/HTMLspecial.ent" + +--> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%HTMLlat1; + + +%HTMLsymbol; + + +%HTMLspecial; + + + + + + + + + + + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + +]]> + + + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + + diff --git a/packages/sgml/DTD/HTML4.soc b/packages/sgml/DTD/HTML4.soc new file mode 100644 index 000000000..bc40206d5 --- /dev/null +++ b/packages/sgml/DTD/HTML4.soc @@ -0,0 +1,6 @@ +OVERRIDE YES +SGMLDECL HTML4.dcl +DOCTYPE HTML HTML4.dtd +PUBLIC "-//W3C//ENTITIES Latin1//EN//HTML" HTMLlat1.ent +PUBLIC "-//W3C//ENTITIES Special//EN//HTML" HTMLspec.ent +PUBLIC "-//W3C//ENTITIES Symbols//EN//HTML" HTMLsym.ent diff --git a/packages/sgml/DTD/HTMLlat1.ent b/packages/sgml/DTD/HTMLlat1.ent new file mode 100644 index 000000000..7632023a8 --- /dev/null +++ b/packages/sgml/DTD/HTMLlat1.ent @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/packages/sgml/DTD/HTMLspec.ent b/packages/sgml/DTD/HTMLspec.ent new file mode 100644 index 000000000..29011cc2b --- /dev/null +++ b/packages/sgml/DTD/HTMLspec.ent @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/packages/sgml/DTD/HTMLsym.ent b/packages/sgml/DTD/HTMLsym.ent new file mode 100644 index 000000000..2a6250ba9 --- /dev/null +++ b/packages/sgml/DTD/HTMLsym.ent @@ -0,0 +1,241 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/packages/sgml/FILES b/packages/sgml/FILES new file mode 100644 index 000000000..5f58b0bb0 --- /dev/null +++ b/packages/sgml/FILES @@ -0,0 +1,20 @@ +Files in the SGML/XML parser: + +catalog.c Handle SGML_CATALOG_FILES +catalog.h +charmap.c Character classification and mapping +dtd.h Include file for DTD primitives +dtd2pl.c Main file for dtd2pl: convert DTD to Prolog +error.c Utilities for generating Prolog exceptions +error.h +model.c State engine for handling DDT content model +model.h +parser.c The parser itself +parser.h +prolog.c Print DTD in Prolog source (used by dtd2pl) +prolog.h +sgml.c Main file for sgml (stand-alone parser) +sgml2pl.c SWI-Prolog foreign code wrapper +sgmldefs.h General definitions +util.c Basic character handling and utilities +util.h diff --git a/packages/sgml/INSTALL b/packages/sgml/INSTALL new file mode 100644 index 000000000..0338fbce2 --- /dev/null +++ b/packages/sgml/INSTALL @@ -0,0 +1,167 @@ +Basic Installation +================== + + These are generic installation instructions. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, a file +`config.cache' that saves the results of its tests to speed up +reconfiguring, and a file `config.log' containing compiler output +(useful mainly for debugging `configure'). + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If at some point `config.cache' +contains results you don't want to keep, you may remove or edit it. + + The file `configure.in' is used to create `configure' by a program +called `autoconf'. You only need `configure.in' if you want to change +it or regenerate `configure' using a newer version of `autoconf'. + +The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. If you're + using `csh' on an old version of System V, you might need to type + `sh ./configure' instead to prevent `csh' from trying to execute + `configure' itself. + + Running `configure' takes awhile. While running, it prints some + messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package. + + 4. Type `make install' to install the programs and any data files and + documentation. + + 5. You can remove the program binaries and object files from the + source directory by typing `make clean'. To also remove the files + that `configure' created (so you can compile the package for a + different kind of computer), type `make distclean'. + +Compilers and Options +===================== + + Some systems require unusual options for compilation or linking that +the `configure' script does not know about. You can give `configure' +initial values for variables by setting them in the environment. Using +a Bourne-compatible shell, you can do that on the command line like +this: + CC=c89 CFLAGS=-O2 LIBS=-lposix ./configure + +Or on systems that have the `env' program, you can do it like this: + env CPPFLAGS=-I/usr/local/include LDFLAGS=-s ./configure + +Using a Different Build Directory +================================= + + You can compile the package in a different directory from the one +containing the source code. Doing so allows you to compile it on more +than one kind of computer at the same time. To do this, you must use a +version of `make' that supports the `VPATH' variable, such as GNU +`make'. `cd' to the directory where you want the object files and +executables to go and run the `configure' script. `configure' +automatically checks for the source code in the directory that +`configure' is in and in `..'. + +Installation Names +================== + + By default, `make install' will install the package's files in +`/usr/local/bin', `/usr/local/man', etc. You can specify an +installation prefix other than `/usr/local' by giving `configure' the +option `--prefix=PATH'. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +give `configure' the option `--exec-prefix=PATH', the package will use +PATH as the prefix for installing programs and libraries. +Documentation and other data files will still use the regular prefix. + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + +Optional Features +================= + + Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + +Specifying the System Type +========================== + + There may be some features `configure' can not figure out +automatically, but needs to determine by the type of host the package +will run on. Usually `configure' can figure that out, but if it prints +a message saying it can not guess the host type, give it the +`--host=TYPE' option. TYPE can either be a short name for the system +type, such as `sun4', or a canonical name with three fields: + CPU-COMPANY-SYSTEM + +See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the host type. + + If you are building compiler tools for cross-compiling, you can also +use the `--target=TYPE' option to select the type of system they will +produce code for and the `--build=TYPE' option to select the type of +system on which you are compiling the package. + +Sharing Defaults +================ + + If you want to set default values for `configure' scripts to share, +you can create a site shell script called `config.site' that gives +default values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. +A warning: not all `configure' scripts look for a site script. + +Operation Controls +================== + + `configure' recognizes the following options to control how it +operates. + +`--cache-file=FILE' + Save the results of the tests in FILE instead of `config.cache'. + Set FILE to `/dev/null' to disable caching, for debugging + `configure'. + +`--help' + Print a summary of the options to `configure', and exit. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`--version' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`configure' also accepts some other, not widely useful, options. + diff --git a/packages/sgml/Makefile.in b/packages/sgml/Makefile.in new file mode 100644 index 000000000..ada0db777 --- /dev/null +++ b/packages/sgml/Makefile.in @@ -0,0 +1,220 @@ +################################################################ +# SWI-Prolog `sgml2pl' package +# Author: Jan Wielemaker. jan@swi.psy.uva.nl +# Copyright: LGPL (see COPYING or www.gnu.org +################################################################ + +.SUFFIXES: .tex .dvi .doc .pl + +ifeq (@PROLOG_SYSTEM@,yap) + +prefix = @prefix@ +ROOTDIR = $(prefix) +EROOTDIR = @exec_prefix@ + +srcdir=@srcdir@ + +BINDIR = $(EROOTDIR)/bin +LIBDIR=$(EROOTDIR)/lib +YAPLIBDIR=$(EROOTDIR)/lib/Yap +SHAREDIR=$(ROOTDIR)/share/Yap + +SHELL=@SHELL@ +PL=@EXTEND_DYNLOADER_PATH@ $(DESTDIR)$(BINDIR)/yap $(DESTDIR)$(YAPLIBDIR)/startup +CHRDIR=$(SHAREDIR)/chr +EXDIR=$(CHRDIR)/examples/chr +LN_S=@LN_S@ +# +# +CC=@CC@ +CFLAGS= @CFLAGS@ $(YAP_EXTRAS) $(DEFS) -I$(srcdir) -I../.. -I$(srcdir)/../../include +MKINDEX=true + +LD=$(CC) + +TARGETS= sgml2pl@SHLIB_SUFFIX@ + +else # SWI + +PL=@PL@ +LD=@LD@ +PLLD=@PLLD@ +PLBASE=@PLBASE@ +PLARCH=@PLARCH@ +PKGDOC=$(PLBASE)/doc/packages +PCEHOME=../xpce +PLLIB=$(PLBASE)/library +PLFOREIGN=$(PLBASE)/lib/$(PLARCH) +DESTDIR= +SO=@SO@ + +DOCTOTEX=$(PCEHOME)/bin/doc2tex +PLTOTEX=$(PCEHOME)/bin/pl2tex +LATEX=latex +DOC=sgml +TEX=$(DOC).tex +DVI=$(DOC).dvi +PDF=$(DOC).pdf +HTML=$(DOC).html + +CC=@CC@ +COFLAGS=@COFLAGS@ +CWFLAGS=@CWFLAGS@ +CMFLAGS=@CMFLAGS@ +CIFLAGS=-I. -I@PLINCL@ +CFLAGS=$(COFLAGS) $(CWFLAGS) $(CMFLAGS) $(CIFLAGS) @DEFS@ +LDFLAGS=$(COFLAGS) + +LDSOFLAGS=@LDSOFLAGS@ + +TARGETS= sgml2pl.@SO@ + +endif #YAP/SWI + +INSTALL=@INSTALL@ +INSTALL_PROGRAM=@INSTALL_PROGRAM@ +INSTALL_DATA=@INSTALL_DATA@ + +LIBOBJ= parser.o util.o charmap.o catalog.o model.o xmlns.o utf8.o \ + xml_unicode.o +PLOBJ= $(LIBOBJ) error.o sgml2pl.o quote.o +SGMLOBJ= $(LIBOBJ) sgml.o +DTD2PLOBJ= $(LIBOBJ) dtd2pl.o prolog.o + +HDRS= $(srcdir)/catalog.h $(srcdir)/dtd.h \ + $(srcdir)/model.h $(srcdir)/prolog.h \ + $(srcdir)/utf8.h $(srcdir)/xmlns.h \ + $(srcdir)/config.h $(srcdir)/error.h \ + $(srcdir)/parser.h $(srcdir)/sgmldefs.h $(srcdir)/util.h + +ALLCSRC= $(LIBOBJ:.o=.c) \ + $(PLOBJ:.o=.c) $(SGMLOBJ:.o=.c) $(DTD2PLOBJ:.o=.c) \ + $(HDRS) + +LIBPL= $(srcdir)/sgml.pl $(srcdir)/xsdp_types.pl \ + $(srcdir)/iso_639.pl $(srcdir)/sgml_write.pl +PROGRAMS= dtd2pl sgml +DTDFILES= HTML4.dcl HTML4.dtd \ + HTML4.soc \ + HTMLlat1.ent \ + HTMLspec.ent HTMLsym.ent + +ifeq (@PROLOG_SYSTEM@,yap) + +%.o : $(srcdir)/%.c + $(CC) $(CFLAGS) $(SHLIB_CFLAGS) -o $@ -c $< + +@DO_SECOND_LD@sgml2pl@SHLIB_SUFFIX@: $(PLOBJ) +@DO_SECOND_LD@ @SHLIB_LD@ -o sgml2pl@SHLIB_SUFFIX@ $(PLOBJ) + +all: $(TARGETS) $(PROGRAMS) + +install: $(TARGETS) $(LIBPL) install-dtd + mkdir -p $(DESTDIR)$(YAPLIBDIR) + $(INSTALL_PROGRAM) $(TARGETS) $(DESTDIR)$(YAPLIBDIR) + $(INSTALL_DATA) $(LIBPL) $(DESTDIR)$(SHAREDIR) + $(PL) -f none -g make -t halt + +install-dtd:: + mkdir -p $(DESTDIR)$(SHAREDIR)/DTD + for f in $(DTDFILES); do \ + $(INSTALL_DATA) $(srcdir)/DTD/$$f $(DESTDIR)$(SHAREDIR)/DTD; \ + done + +else + +sgml2pl.@SO@: $(PLOBJ) + $(PLLD) -shared -o $@ $(PLOBJ) + +all: coflags $(TARGETS) $(PROGRAMS) + +coflags:: + @echo $(COFLAGS) > .coflags + +install: $(TARGETS) $(LIBPL) install-dtd + mkdir -p $(DESTDIR)$(PLBASE)/lib/$(PLARCH) + $(INSTALL_PROGRAM) $(TARGETS) $(DESTDIR)$(PLFOREIGN) + $(INSTALL_DATA) $(LIBPL) $(DESTDIR)$(PLLIB) + $(PL) -f none -g make -t halt + +install-dtd:: + mkdir -p $(DESTDIR)$(PLBASE)/library/DTD + for f in $(DTDFILES); do \ + $(INSTALL_DATA) DTD/$$f $(DESTDIR)$(PLBASE)/library/DTD; \ + done + +endif + +ln-install:: + @$(MAKE) INSTALL_DATA=../ln-install install + +rpm-install: install + +pdf-install:: + mkdir -p $(DESTDIR)$(PKGDOC) + $(INSTALL_DATA) sgml.pdf $(DESTDIR)$(PKGDOC) + +html-install:: + mkdir -p $(DESTDIR)$(PKGDOC) + $(INSTALL_DATA) sgml.html $(DESTDIR)$(PKGDOC) + +check:: + $(PL) -f Test/test.pl -g test,halt + $(PL) -f Test/wrtest.pl -g test,halt + +uninstall:: + (cd $(PLBASE)/lib/$(PLARCH) && rm -f $(TARGETS)) + (cd $(PLBASE)/library && rm -f $(LIBPL)) + $(PL) -f none -g make -t halt + +dtd2pl: $(DTD2PLOBJ) + $(LD) $(LDFLAGS) -o $@ $(DTD2PLOBJ) + +sgml: $(SGMLOBJ) + $(LD) $(LDFLAGS) -o $@ $(SGMLOBJ) + +tags: TAGS + +TAGS: $(ALLCSRC) + etags $(ALLCSRC) + +################################################################ +# Documentation +################################################################ + +doc: $(PDF) $(HTML) +pdf: $(PDF) +html: $(HTML) + +$(HTML): $(TEX) + latex2html $(DOC) + mv html/index.html $@ + rmdir html + +$(PDF): $(TEX) + ../../man/runtex --pdf $(DOC) + +$(TEX): $(DOCTOTEX) + +.doc.tex: + $(DOCTOTEX) $*.doc > $*.tex +.pl.tex: + $(PLTOTEX) $*.pl > $*.tex + +docclean:: + rm -f $(TEX) + rm -rf html + ../../man/runtex --clean $(DOC) + +################################################################ +# Clean +################################################################ + +clean:: + rm -f $(PLOBJ) *~ *.o *% a.out core config.log + +distclean: clean + rm -f $(TARGETS) $(PROGRAMS) + rm -f config.cache config.h config.status Makefile + rm -f .coflags + diff --git a/packages/sgml/Makefile.mak b/packages/sgml/Makefile.mak new file mode 100644 index 000000000..54f801f7d --- /dev/null +++ b/packages/sgml/Makefile.mak @@ -0,0 +1,79 @@ +################################################################ +# Build the SWI-Prolog XML/SGML package for MS-Windows +# +# Author: Jan Wielemaker +# +# Use: +# nmake /f Makefile.mak +# nmake /f Makefile.mak install +################################################################ + +PLHOME=..\.. +!include ..\..\src\rules.mk +PKGDLL=sgml2pl + +LIBOBJ= parser.obj util.obj charmap.obj catalog.obj \ + model.obj xmlns.obj utf8.obj xml_unicode.obj +OBJ= $(LIBOBJ) sgml2pl.obj error.obj quote.obj +SGMLOBJ= $(LIBOBJ) sgml.obj +DTDFILES= HTML4.dcl HTML4.dtd HTML4.soc \ + HTMLlat1.ent HTMLspec.ent HTMLsym.ent +DTDDIR= $(PLBASE)\library\DTD + +all: $(PKGDLL).dll + +$(PKGDLL).dll: $(OBJ) + $(LD) /dll /out:$@ $(LDFLAGS) $(OBJ) $(PLLIB) $(LIBS) + +sgml.exe: $(SGMLOBJ) + $(LD) $(LDFLAGS) /out:$@ $(SGMLOBJ) $(LIBS) + +!IF "$(CFG)" == "rt" +install: idll +!ELSE +install: idtd idll ilib +!ENDIF + +idll:: + copy $(PKGDLL).dll "$(PLBASE)\bin" +!IF "$(PDB)" == "true" + copy $(PKGDLL).pdb "$(PLBASE)\bin" +!ENDIF + +ilib:: + copy sgml.pl "$(PLBASE)\library" + copy xsdp_types.pl "$(PLBASE)\library" + copy iso_639.pl "$(PLBASE)\library" + copy sgml_write.pl "$(PLBASE)\library" + $(MAKEINDEX) + +idtd:: + @if not exist "$(DTDDIR)\$(NULL)" $(MKDIR) "$(DTDDIR)" + @echo "Installing DTD files in $(DTDDIR)" + @for %f in ($(DTDFILES)) do \ + @copy DTD\%f "$(DTDDIR)" + @echo "done" + +uninstall:: + del "$(PLBASE)\bin\$(PKGDLL).dll" + del "$(PLBASE)\library\sgml.pl" + del "$(PLBASE)\library\xsdp_types.pl" + del "$(PLBASE)\library\iso_639.pl" + del "$(PLBASE)\library\sgml_write.pl" + $(MAKEINDEX) + +html-install:: + copy sgml.html "$(PKGDOC)" + +xpce-install:: + +check:: + cd Test && "$(PLCON)" -f test.pl -g test,halt. + +clean:: + if exist *.obj del *.obj + if exist *~ del *~ + +distclean: clean + -DEL *.dll *.lib *.exp *.pdb *.ilk 2>nul + diff --git a/packages/sgml/RDF/.cvsignore b/packages/sgml/RDF/.cvsignore new file mode 100644 index 000000000..9ac5f50c6 --- /dev/null +++ b/packages/sgml/RDF/.cvsignore @@ -0,0 +1,9 @@ +.plrc +config.h +Online-requests +Literature +Tests +Makefile +config.log +config.status +config.cache diff --git a/packages/sgml/RDF/.gitignore b/packages/sgml/RDF/.gitignore new file mode 100644 index 000000000..14a0668cc --- /dev/null +++ b/packages/sgml/RDF/.gitignore @@ -0,0 +1,4 @@ +configure +rdf2pl.html +rdf2pl.pdf +rdf2pl.tex diff --git a/packages/sgml/RDF/ChangeLog b/packages/sgml/RDF/ChangeLog new file mode 100644 index 000000000..1c65a32fa --- /dev/null +++ b/packages/sgml/RDF/ChangeLog @@ -0,0 +1,237 @@ +[Jan 21 2009] + + * FIXED: RDF writer: write valid XML if the namespace entity contains characters + that must be %-escaped. Jacopo Urbani. + +[Jan 13 2009] + + * FIXED: RDF/XML could save illegal XML if multiple namespaces are used + for predicates where one namespace is a prefix of another one. Jacopo Urbani. +[Dec 19 2008] + + * ADDED: Pass content_length through process_rdf/3 +[Sep 11 2008] + + * PORT: Add AC_PREREQ to configure.h for systems that auto-select autoconf + versions. After tip by Ulrich Neumerkel. + +[Aug 11 2008] + + * INSTALL: Remove all configure files from the git repository +[Apr 18 2008] + + * MODIFIED: Renamed hash_term/2 to term_hash/2. Added hash_term/2 to + library(backcomp), so most code should not notice this. +[Feb 28 2008] + + * FIXED: rdf_write_xml/2 loops if it encounters an rdf:Bag. + +[Feb 18 2008] + + * FIXED: More fixes for proper handling of rdf:Bag + +[Feb 13 2008] + + * FIXED: Emit rdf:Bag attributes (etc.) as rdf:li + + * FIXED: possible failure in rdf_write_xml with http://t-d-b.org? + +[Jan 14 2008] + + * FIXED: Bug#343: Handling bnodes in rdf_write_xml. Yver Raimond. + + * FIXED: Added support for rdf:NodeID to rdf_write_xml/2. Yves Raimond. + +[Dec 13 2007] + + * FIXED: library(rdf_write) to deal with operators. Related to Bug#332 + +Oct 29, 2007 + + * FIXED: extract namespaces used in typed literals. Yves Raimond. + +Jul 9, 2007 + + * FIXED: xml:lang with empty literals. Jochem Liem. + +Jun 14, 2007 + + * FIXED: return rdf:parseType="Literal" as literal of type + rdf:XMLLiteral. + +Jan 18, 2007 + + * ADDED: embedded(Bool) option to process_rdf/3 + +Jun 25, 2006 + + * CLEANUP: Delete unused global variable. + +Jun 5, 2006 + + * FIXED: use UTF-8 decoder from new library(utf8). Our own internal one + was broken. + +Apr 25, 2006 + + * FIXED: decoding of unicode-URIs using UTF-8 over %XX%XX. + +Apr 13, 2006 + + * ADDED: library rdf_write to write an RDF file from a list of triples. + +Dec 8, 2005 + + * FIXED: xmlns attributes in descriptions. Bijan Parsia + +Nov 23, 2005 + + * ADDED: option db(DB) to parse_rdf/2. + +Nov 10, 2005 + + * COMMENT: Comment use of IRI + +Jul 7, 2005 + + * ADDED: Pass entity(Name, Value) to XML parser + +Jul 5, 2005 + + * FIXED: Perform proper URI decoding to Unicode atoms + +Jul 4, 2005 + + * FIXED: Make "make check" work from the build directory. + +Mar 31, 2005 + + * FIXED: memory leak in process_rdf/2 + +Mar 29, 2005 + + * FIXED: dataType --> datatype (Vangelis Vassiliadis) + +Oct 21, 2004 + + * FIXED: Bug#196: avoid need for autoloading. Sandro Hawke. + +Sep 13, 2004 + + * ADDED: namespaces(-NameSpaces) option to load_rdf/3 and process_rdf/3 + to query the document namespace declarations. + +Aug 13, 2004 + + * MODIFIED: load_rdf/3 no longer returns resources as Prefix:URI, but + instead returns the plain atoms. + +Jul 31, 2004 + + * ADDED: converted rdf_nt.pl into public rdf_ntriples.pl library for + loading data in the W3C ntriple format. + +Jun 29, 2004 + + * FIXED: sharing code for blank nodes. Broken in recent cleanup. + +Jun 17, 2004 + + * CLEANUP: start/end of loading a file, use option/3 from library + +Apr 21, 2004 + + * ADDED: Support for xml:lang and rdf:dataType attributes. + + * INTERNAL: Use global variables rather than assert/retract for + keeping track of the state for process_rdf/3. + +Nov 29, 2003 + + * ADDED: warning for incorrect and multiple definitions of rdf:ID. + Maarten Menken. + +Nov 7, 2003 + + * ADDED: process_rdf/3: allow processing from a stream + +Oct 6, 2003 + + * MODIFIED: synopsis of process_rdf/3 to + process_rdf(+File, :OnTriples, +Options) for consistency and to allow + extending the option list. Old calls are mapped to the new. + + * Added option blank_nodes(share) to load_rdf/3 and process_rdf/3. + +Aug 18, 2003 + + * Guarantee that anonymous ids start with __ + +Mar 20, 2003 + + * Fixed exception in cleanup. Dominique de Waleffe. + +Feb 28, 2003 + + * ADDED: Use BaseURI to create non-conflicting anonymous resources. + + * FIXED: rdf_parser:global_id to ignore xml:base for absolute URIs + +Jan 17, 2003 + + * FIXED: online.pl (web frontend) to avoid using goal/1 option for the + xml parser. Now uses 4.0.8 clib memfile library primitives. + + * FIXED: process_rdf/3, Peter Marks. + + * ADDED: parseType="Collection", satisfying the W3C Working Draft + 8 November 2002. + +################################################################ +# Sumary of incompatibilities: +# +# Many problems in nested bag handling, changing output of +# suite/t5.rdf +# +# Content of Alt-container was incorrectly rendered as rdf:li +# instead of rdf:_1, rdf:_2, etc (suite/t27.rdf). +################################################################ + + * ADDED: Translate rdf:li predicates into _1, _2, etc. + + * FIXED: parseType=Literal to avoid extraneous [..] around the value + +Oct 28, 2002 + + * CLEANUP: pass base-uri as attribute, preparing for xml:base and making + the parser ready for multi-threading. + +Sep 16, 2002 + + * MODIFIED: Allow for unqualified attribute-names + + * ADDED: "make check" + +Older entries + + * FIXED: Type-exception in atom_chars/2 + + * FIXED: handling mixed literal and object + (space canonisation problem). + + * CGI Demo: report errors generated before a fatal exception. + + * FIXED: handling of propertyElt of the form + rdf:ID="myid" rdf:parseType="Resource" + + * For objects, map NameSpace:Local to the simple concatenation of the two. + This implies: + + # Subjects are always atoms + # Predicates are NameSpace:LocalName or simply Name + # Objects are atoms (URI) or literal(Value) + + * Warn on things that cannot be converted into an RDF-object rather then + failing silently. + + * Removed some undesirable choice-points. diff --git a/packages/sgml/RDF/Makefile.in b/packages/sgml/RDF/Makefile.in new file mode 100644 index 000000000..37bdc1a3c --- /dev/null +++ b/packages/sgml/RDF/Makefile.in @@ -0,0 +1,128 @@ +################################################################ +# SWI-Prolog `RDF' package +# Author: Jan Wielemaker. jan@swi.psy.uva.nl +# Copyright: LGPL (see COPYING or www.gnu.org +################################################################ + +.SUFFIXES: .tex .dvi .doc .pl + +SHELL=@SHELL@ +PLBASE=@PLBASE@ +PLARCH=@PLARCH@ +PL=@PL@ +XPCEBASE=$(PLBASE)/xpce +PKGDOC=$(PLBASE)/doc/packages +PCEHOME=../../xpce +DESTDIR= + +CGISCRIPT=rdf-parser +CGIDIR=/etc/httpd/cgi-bin +CGIURL=http://gollem.science.uva.nl/cgi-bin +WEBPAGE=/swi40/prolog/packages/rdf-online.html +CGIGRP=www + +DOCTOTEX=$(PCEHOME)/bin/doc2tex +PLTOTEX=$(PCEHOME)/bin/pl2tex +RUNTEX=../../../man/runtex +LATEX=latex +DOC=rdf2pl +TEX=$(DOC).tex +DVI=$(DOC).dvi +PDF=$(DOC).pdf +HTML=$(DOC).html + +INSTALL=@INSTALL@ +INSTALL_PROGRAM=@INSTALL_PROGRAM@ +INSTALL_DATA=@INSTALL_DATA@ + +LIBPL= rdf.pl rdf_parser.pl rdf_triple.pl rewrite.pl \ + rdf_ntriples.pl rdf_write.pl +XPCEPL= rdf_diagram.pl + +all: + @echo "Nothing to do for this package" + +install: $(LIBPL) xpce-install + $(INSTALL_DATA) $(LIBPL) $(DESTDIR)$(PLBASE)/library + $(PL) -f none -g make -t halt + +ln-install:: + @$(MAKE) INSTALL_DATA=../../ln-install install + +xpce-install: $(XPCEPL) + if [ -d $(DESTDIR)$(XPCEBASE) ]; then \ + $(INSTALL) -m 644 $(XPCEPL) $(DESTDIR)$(XPCEBASE)/prolog/lib; \ + fi + +rpm-install: install + +pdf-install:: + $(INSTALL_DATA) $(DOC).pdf $(DESTDIR)$(PKGDOC) + +html-install:: + $(INSTALL_DATA) $(DOC).html $(DESTDIR)$(PKGDOC) + +uninstall: + (cd $(PLBASE)/library && rm -f $(LIBPL)) + $(PL) -f none -g make -t halt + +check:: + $(PL) -f rdf_test.pl -g test,halt -t 'halt(1)' + $(PL) -q -f write_test.pl -g run_tests,halt -t 'halt(1)' + +################################################################ +# CGI INSTALL +################################################################ + +cgi-install: $(CGIDIR)/$(CGISCRIPT) \ + Online-requests \ + $(WEBPAGE) + +cgi-uninstall: + rm -f $(CGIDIR)/$(CGISCRIPT) $(WEBPAGE) + +Online-requests: + mkdir $@ + chmod 775 $@ + chgrp $(CGIGRP) $@ + +$(WEBPAGE): online.html + sed 's%@ACTION@%$(CGIURL)/$(CGISCRIPT)%' online.html > $@ + +$(CGIDIR)/$(CGISCRIPT): rdf-parser Makefile + sed -e "s%@BASEDIR@%`pwd`%" -e s%@SWI@%$(PL)% rdf-parser > $@ + chmod 755 $@ + +################################################################ +# Documentation +################################################################ + +doc: $(PDF) $(HTML) +pdf: $(PDF) +html: $(HTML) + +$(HTML): $(TEX) + latex2html $(DOC) + mv html/index.html $@ + +$(PDF): $(TEX) + $(RUNTEX) --pdf $(DOC) + +$(TEX): $(DOCTOTEX) + +.doc.tex: + $(DOCTOTEX) $*.doc > $*.tex +.pl.tex: + $(PLTOTEX) $*.pl > $*.tex + +################################################################ +# Clean +################################################################ + +clean: + rm -f *~ *% config.log + +distclean: clean + rm -f $(TARGETS) config.h config.cache config.status Makefile + rm -f $(TEX) + $(RUNTEX) --clean $(DOC) diff --git a/packages/sgml/RDF/Makefile.mak b/packages/sgml/RDF/Makefile.mak new file mode 100644 index 000000000..cc68c9957 --- /dev/null +++ b/packages/sgml/RDF/Makefile.mak @@ -0,0 +1,51 @@ +################################################################ +# Build the SWI-Prolog RDF package for MS-Windows +# NOTE: This package requires the SGML package +# +# Author: Jan Wielemaker +# +# Use: +# nmake /f Makefile.mak +# nmake /f Makefile.mak install +################################################################ + +PLHOME=..\..\.. +!include $(PLHOME)\src\rules.mk + +all: + +!IF "$(CFG)" == "rt" +install:: +!ELSE +install:: + copy rdf.pl "$(PLBASE)\library" + copy rdf_parser.pl "$(PLBASE)\library" + copy rdf_triple.pl "$(PLBASE)\library" + copy rewrite.pl "$(PLBASE)\library" + copy rdf_ntriples.pl "$(PLBASE)\library" + copy rdf_write.pl "$(PLBASE)\library" + $(MAKEINDEX) +!ENDIF + +xpce-install:: + copy rdf_diagram.pl "$(PLBASE)\xpce\prolog\lib" + $(MAKEINDEX) + +html-install:: + copy rdf2pl.html "$(PKGDOC)" + +uninstall:: + del "$(PLBASE)\library\rdf.pl" + del "$(PLBASE)\library\rdf_parser.pl" + del "$(PLBASE)\library\rdf_triple.pl" + del "$(PLBASE)\library\rewrite.pl" + del "$(PLBASE)\library\rdf_ntriples.pl" + del "$(PLBASE)\library\rdf_write.pl" + $(MAKEINDEX) + +clean:: + if exist *~ del *~ + +distclean: clean + + diff --git a/packages/sgml/RDF/README b/packages/sgml/RDF/README new file mode 100644 index 000000000..149fd04c3 --- /dev/null +++ b/packages/sgml/RDF/README @@ -0,0 +1,12 @@ +---+ RDF/XML parser and writer + +This directory implements the RDF/XML parser on top of the SWI-Prolog +XML parser. The main entry point is provided by load_rdf/3 or the +call-back version process_rdf/3. + +In addition, rdf_write.pl provides writing (serialization) of an RDD/XML +document from a list of triples. + + * [[load_rdf/3]] + * [[process_rdf/3]] + * [[rdf_write_xml/2]] diff --git a/packages/sgml/RDF/configure.in b/packages/sgml/RDF/configure.in new file mode 100644 index 000000000..2c15deec2 --- /dev/null +++ b/packages/sgml/RDF/configure.in @@ -0,0 +1,32 @@ +dnl Process this file with autoconf to produce a configure script. + +AC_INIT(install-sh) +AC_PREREQ([2.50]) +AC_CONFIG_HEADER(config.h) + +AC_SUBST(PL) +AC_SUBST(PLBASE) +AC_SUBST(PLARCH) + +# Do not cache this, it changes too often in many configurations +unset ac_cv_prog_PL + +if test -z "$PLINCL"; then +plcandidates="swi-prolog swipl pl" +AC_CHECK_PROGS(PL, $plcandidates, "none") +if test $PL = "none"; then + AC_ERROR("Cannot find SWI-Prolog. SWI-Prolog must be installed first") +else + AC_CHECKING("Running $PL -dump-runtime-variables") + eval `$PL -dump-runtime-variables` +fi +AC_MSG_RESULT(" PLBASE=$PLBASE") +AC_MSG_RESULT(" PLARCH=$PLARCH") +else +PL=../../pl.sh +fi + +AC_CHECK_PROGS(MAKE, gmake make, "make") +AC_PROG_INSTALL + +AC_OUTPUT(Makefile) diff --git a/packages/sgml/RDF/install-sh b/packages/sgml/RDF/install-sh new file mode 100755 index 000000000..ab74c882e --- /dev/null +++ b/packages/sgml/RDF/install-sh @@ -0,0 +1,238 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. +# + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +tranformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/packages/sgml/RDF/online.html b/packages/sgml/RDF/online.html new file mode 100644 index 000000000..8d8925eee --- /dev/null +++ b/packages/sgml/RDF/online.html @@ -0,0 +1,177 @@ + + + + +Online SWI-Prolog RDF parser demo + + + +

Online SWI-Prolog RDF parser demo

+ +

+This page provides an online demonstration of an RDF parser written in SWI-Prolog and distributed as part of the +normal SWI-Prolog distribution. This RDF parser has a couple of attractive +properties: + +

+
Small
+Both in terms of source-code (< 1200 lines) and executable. + +
Fast
+Parses about 1.2 Mbytes/sec on an AMD 1600+ running SWI-Prolog 5.1.6 and +SuSE Linux 8.1 (tested on a 9MB RDFS file containing 179403 triples). + +
Conforming
+This parser conforms to http://www.w3.org/TR/rdf-syntax-grammar/, +W3C Working Draft 8 November 2002. It includes the revised bag +syntax, xml:base, parseType="Collection" +and nodeID features. + +
Prolog based
+Many people regard the Prolog programming language a good vehicle to +reason about RDF statements. Having a simple and fast Prolog-based RDF +parser makes life easier. + +
Portability
+The RDF parser itself is written in ISO Prolog. The XML parser is +written in ANSI-C. There is no standard for interfacing Prolog and C, +but the interface is relatively small. + +
Discussion Page
+There is a + +discussion page on this parser on the SWI-Prolog collaborative +(twiki web)
+ +A more detailed description of this packages is available in in this +document. The sources of the parser are included in the full +source for SWI-Prolog. The individual source files can also be +examined through the +cvsweb service. + +

+


+Please write your RDF description into the text-area below or select a +local file using the File: item and submit it. If anything goes +wrong, please mail Jan +Wielemaker. + +

+The RDF-data submitted is kept anonymously on our server and +might be used by us to examine problems with our RDF parser. We do not +publish this material. The result-page provides a form for +attaching a comment to the stored RDF statement. + +

+

+ +
+ +
File: + + + + + +
+ +
+
+ +


Notes

+
rdf:resource, etc.
+The specification and discussion on the rdf interest group yielded no +satisfactory solution how to deal with RDF attributes that are not in +the RDF namespace such as ID, resource, etc. This parser +interprets such attributes in the namespace of the element, so the +statement below is not interpreted as a propertyElt with value +me but as a typedNode with predicate +resource and value literal(me). + +
+  <s:Creator resource="#me">
+
+ + + + + + + + + diff --git a/packages/sgml/RDF/online.pl b/packages/sgml/RDF/online.pl new file mode 100644 index 000000000..891bc7807 --- /dev/null +++ b/packages/sgml/RDF/online.pl @@ -0,0 +1,457 @@ +/* $Id$ + + Part of SWI-Prolog RDF parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + + +:- use_module(library(cgi)). +:- use_module(library(sgml)). +:- use_module(rdf). +:- use_module(rdf_parser). +:- use_module(rewrite). +:- use_module(pretty_print). + +term_expansion(F, T) :- rew_term_expansion(F, T). +goal_expansion(F, T) :- rew_goal_expansion(F, T). + +:- dynamic new_rdf_namespace/1. + +parse(Text, RDFTerm, Triples) :- + parse_atom(Text, Term), + ( find_rdf(Term, RDFTerm) + -> true + ; RDFTerm = Term + ), + xml_to_rdf(RDFTerm, [], Triples). + +find_rdf(Term, RDFTerm) :- + RDFTerm = element(NS:'RDF', _, _), + term_member(RDFTerm, Term), !, + ( rdf_name_space(NS) + -> true + ; assert(rdf_parser:rdf_name_space(NS)), + assert(new_rdf_namespace(NS)) + ). + +term_member(X, X). +term_member(X, Compound) :- + compound(Compound), + arg(_, Compound, Arg), + term_member(X, Arg). + +% parse_atom(+Atom, -Term, +Options +% +% Parse and atom into a structured term + +parse_atom(Atom, Term) :- + atom_to_memory_file(Atom, MemFile), + open_memory_file(MemFile, read, Stream), + new_sgml_parser(Parser, []), + set_sgml_parser(Parser, dialect(xmlns)), + set_sgml_parser(Parser, space(sgml)), + sgml_parse(Parser, + [ source(Stream), + document(Term) + ]), + free_sgml_parser(Parser), + close(Stream), + free_memory_file(MemFile). + + + /******************************* + * HTML GENERATION * + *******************************/ + +:- op(100, fx, #). +:- op(110, xfx, ::). + +emit([]) :- !. +emit([H|T]) :- !, + emit(H), + emit(T). +emit(Fmt-Args) :- !, + format(Fmt, Args), + retractall(nl_done(_)). +emit(#Term) :- !, + #Term. +emit(#Term::Content) :- !, + #Term::Content. +emit(Atom) :- + write(Atom), + retractall(nl_done(_)). + +#Term::Content :- + Term =.. [Name|Attributes], + layout(before(open, Name)), + format('<~w', [Name]), + attlist(Attributes), + format('>', []), + retractall(nl_done(_)), + layout(after(open, Name)), + emit(Content), + end_tag(Name). +#pre(Text) :- !, + sgml_quote(Text, Quoted), + #pre::Quoted. +#box(Text) :- !, + box(Text, '#e0e0e0'). +#box(Text, Colour) :- !, + box(Text, Colour). +#Term :- + Term =.. [Name|Attributes], + layout(before(open, Name)), + format('<~w', [Name]), + attlist(Attributes), + format('>', []), + retractall(nl_done(_)), + layout(after(open, Name)), + end_tag(Name). + +end_tag(Name) :- + blines(Name, _, o), !. +end_tag(Name) :- + layout(before(close, Name)), + format('', [Name]), + retractall(nl_done(_)), + layout(after(close, Name)). + + +layout(before(open, Name)) :- + blines(Name, N-_, _), !, + nls(N). +layout(after(open, Name)) :- + blines(Name, _-N, _), !, + nls(N). +layout(before(close, Name)) :- + blines(Name, _, N-_), !, + nls(N). +layout(after(close, Name)) :- + blines(Name, _, _-N), !, + nls(N). +layout(_) :- + retractall(nl_done(_)). + +:- dynamic + nl_done/1. + +nls(N) :- + ( nl_done(Done) + -> true + ; Done = 0 + ), + ToDo is N - Done, + New is max(N, Done), + retractall(nl_done(Done)), + assert(nl_done(New)), + do_nl(ToDo). + +do_nl(N) :- + N > 0, !, + nl, + NN is N - 1, + do_nl(NN). +do_nl(_). + +blines(tr, 1-0, 0-0). +blines(table, 2-1, 1-1). +blines(form, 2-1, 1-1). +blines(h1, 2-0, 0-1). +blines(h2, 2-0, 0-2). +blines(h3, 2-0, 0-2). +blines(h4, 2-0, 0-2). +blines(p, 2-1, o). % omitted end-tag + +attlist([]). +attlist([Name=Value|T]) :- !, + sgml_quote_value(Value, Quoted), + format(' ~w=~w', [Name, Quoted]), + attlist(T). +attlist([Name|T]) :- + format(' ~w', [Name]), + attlist(T). + +head(Title) :- + emit([ 'Content-type: text/html\n\n', + '\n', + '\n', + '~w~n'-[Title], + '\n\n', + '\n' + ]). +foot :- + emit([ '\n', + '\n' + ]). + + +pre(Text) :- + sgml_quote(Text, Quoted), + #pre::Quoted. + +box(Text, Colour) :- + emit('

\n'), + #table(width='80%', align=center, border=6, bgcolor=Colour):: + [#tr::[#td(nowrap)::[#pre(Text)]]]. + + + /******************************* + * QUOTING * + *******************************/ + +sgml_quote_value(Value, Arg) :- + atom_chars(Value, Chars), + ( name_chars(Chars) + -> Arg = Value + ; sgml_quote_chars(Chars, Quoted), + atom_chars(Arg, Quoted) + ). + +name_chars([H|T]) :- + char_type(H, alpha), + all_alnum(T). + +all_alnum([]). +all_alnum([H|T]) :- + char_type(H, csymf), + all_alnum(T). + +sgml_quote_chars(L, ['"'|T]) :- + sgml_quote2(L, T, ['"']). + +sgml_quote2([], T, T). +sgml_quote2([H|T0], List, Rest) :- + sgml_quote_char(H, List, T), !, + sgml_quote2(T0, T, Rest). +sgml_quote2([H|T0], [H|T], Rest) :- + sgml_quote2(T0, T, Rest). + +sgml_quote_char('<', [&, l, t, ;|T], T). +sgml_quote_char('>', [&, g, t, ;|T], T). +sgml_quote_char('&', [&, a, m, p, ;|T], T). +sgml_quote_char('"', [&, q, u, o, t, ;|T], T). +%sgml_quote_char('\'', [&, a, p, o, s, ;|T], T). + +sgml_quote(Text, Quoted) :- + atom_chars(Text, Chars), + sgml_quote2(Chars, QuotedChars, []), + atom_chars(Quoted, QuotedChars). + + + /******************************* + * PAGE GENERATION * + *******************************/ + +parsed(Time, Triples) :- + length(Triples, Len), + #h2::'RDF statement parsed successfully', + #p::[ 'Your RDF statement has been parsed in ~2f seconds, '-[Time], + 'creating ', #b::Len, ' triples. ', + 'Please find the created triples in the table below.' + ], + ( getenv('HTTP_REFERER', Referer) + -> #p::[ 'If you want to try another RDF statement, please go ', + 'back to ', #a(href=Referer)::'the request form', '.' + ] + ; true + ). + +rdf_table(Triples) :- + maplist(triple_row, Triples, TripleRows), + #p, + #table(caption='RDF triples', + align=center, border=2, cellpadding=3):: + [ #tr::[#th::'Subject', #th::'Predicate', #th::'Object'] + | TripleRows + ]. + +triple_row(rdf(Subj, Pred, Obj), #tr::[#td::S,#td::P,#td::O]) :- + cell(Subj, S), + cell(Pred, P), + cell(Obj, O). + +cell(rdf:Local, [#em::rdf, :, #b::Local]) :- !. +cell(literal(X), [#b::'literal(', X, #b::')']) :- !. +cell(each(X), [#b::'each(', X, #b::')']) :- !. +cell(pefix(X), [#b::'prefix(', X, #b::')']) :- !. +cell(NS:Local, [NS, :, #b::Local]) :- !. +cell(V, [T]) :- + sformat(T, '~p', [V]). + + /******************************* + * ERRORS * + *******************************/ + +show_errors :- + getenv('ERROR_FILE', File), + size_file(File, Size), + Size > 0, !, + read_file(File, Data), + #h4::[#font(color=red):: + 'The following errors occurred while processing your request'], + #p, + #box(Data, '#ff8c00'). +show_errors. + +show_new_namepace :- + new_rdf_namespace(NS), !, + #h4::[#font(color=red)::'Warning: unofficial RDF Namespace'], + #p::['It appears your RDF description uses the unofficial ', + 'name space ', #b::NS, '. ', + 'This name space has been added for RDF.' + ]. +show_new_namepace. + + + /******************************* + * COMMENT * + *******************************/ + +comment(TextId) :- + #h4::'


Comment', + #p::[ 'If you do not agree with the output or have other comments, ', + 'Please write them in the text-area below and submit them' + ], + getenv('REQUEST_URI', Script), + #form(method=post, action=Script):: + [ #input(type=hidden, name=id, value=TextId), + #table(align=center):: + [ #tr::[#td::[#textarea(name=comment, cols=64, rows=10)]], + #tr::[#td(align=right)::['E-mail: ', #input(name=mail)]], + #tr::[#td(align=right)::[#input(type=submit)]] + ] + ]. + + + /******************************* + * REQUEST * + *******************************/ + +request_location('Online-requests'). + +% Save the request and return a local identifier for it. + +save_request(Text, Id) :- + request_dir(Dir, Date), + concat_atom([Dir, /, Date], DateDir), + ensure_dir(DateDir), + between(1, 10000, N), + concat_atom([DateDir, /, N, '.rdf'], File), + \+ exists_file(File), !, + open(File, write, Fd), + format(Fd, '~w~n', [Text]), + close(Fd), + concat_atom([Date, /, N], Id). + +request_dir(BaseDir, Date) :- + get_time(Time), + convert_time(Time, Y, M, D, _, _, _, _), + request_location(BaseDir), + concat_atom([D, -, M, -, Y], Date). + +ensure_dir(Dir) :- + exists_directory(Dir), !. +ensure_dir(Dir) :- + make_directory(Dir). + +save_comment(Id, Mail, Comment) :- + request_location(Base), + concat_atom([Base, '/', Id], FileBase), + absolute_file_name(FileBase, AbsFileBase), + absolute_file_name(Base, AbsBase), + sub_atom(AbsFileBase, 0, _, _, AbsBase), % verify in tree + atom_concat(AbsFileBase, '.cmt', CmtFile), + open(CmtFile, write, Fd), + format(Fd, 'E-mail: ~w~n~n~w~n', [Mail, Comment]), + close(Fd). + + + /******************************* + * ENTRY * + *******************************/ + +main :- + cgi_get_form(Arguments), + ( ( memberchk(attachment(Text), Arguments), + Text \== '' + ; memberchk(rdf(Text), Arguments) + ) + -> save_request(Text, TextId), + ( OldTime is cputime, + parse(Text, _Prolog, Triples), + Time is cputime - OldTime + -> head('RDF Triples'), + parsed(Time, Triples), + show_errors, + show_new_namepace, + rdf_table(Triples), + comment(TextId), + foot + ; head('Failed to parse'), + #p::[ 'I failed to parse your request' ], + show_errors, + comment(TextId), + foot + ), + halt + ; memberchk(comment(Comment), Arguments), + memberchk(id(Id), Arguments), + memberchk(mail(Mail), Arguments) + -> save_comment(Id, Mail, Comment), + head('Thanks for comment'), + #p::'Thank you for your comments', + foot, + halt + ). +main :- + head('Failed'), + #p::[ 'This CGI-script failed to understand your request' ], + foot, + halt. + +go :- + catch(main, E, error(E)). + +error(E) :- + message_to_string(E, Msg), + head('Failed to parse'), + show_errors, + #p::[ 'An exception was raised while parsing your request:' ], + #pre(Msg), + foot, + halt. + + + /******************************* + * TEST * + *******************************/ + +test :- + read_file('suite/t1.rdf', Text), + catch(parse(Text, _Prolog, Triples), E, error(E)), + head('RDF Triples'), + rdf_table(Triples), + foot. + + + /******************************* + * UTIL * + *******************************/ + +read_file(File, Atom) :- + open(File, read, Fd), + get_code(Fd, C), + read_stream(C, Fd, Chars), + close(Fd), + atom_codes(Atom, Chars). + +read_stream(-1, _, []) :- !. +read_stream(C0, Fd, [C0|T]) :- + get_code(Fd, C), + read_stream(C, Fd, T). diff --git a/packages/sgml/RDF/pretty_print.pl b/packages/sgml/RDF/pretty_print.pl new file mode 100644 index 000000000..74c737d26 --- /dev/null +++ b/packages/sgml/RDF/pretty_print.pl @@ -0,0 +1,167 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(dia_pretty_print, + [ pretty_print/1 + ]). + +:- require([ atom_length/2 + , between/3 + , forall/2 + , is_list/1 + , member/2 + , memberchk/2 + ]). + + +pretty_print(Term) :- + numbervars(Term, 0, _), + pp(Term, 0), + write('.'), nl, fail. +pretty_print(_). + + +pp(Term, _Indent) :- + atomic(Term), !, + writeq(Term). +pp(Var, _Indent) :- + var(Var), !, + write(Var). +pp(Var, _Indent) :- + Var = '$VAR'(_), !, + print(Var). +pp('$aref'(Name), _Indent) :- !, + write(Name). +pp(Module:Term, Indent) :- + atomic(Module), !, + writeq(Module), write(:), + pp(Term, Indent). +pp([A1 = V1|ArgList], Indent) :- % [] is done by `atomic'! + is_list(ArgList), + forall(member(A, ArgList), A = (_ = _)), + longest_attribute([A1 = V1|ArgList], 0, L), !, + NewIndent is Indent + 2, + ( L > 9, Indent < 25, length(ArgList, Args), Args > 1 + -> ArgIndent is Indent + 4, + ValGoal = (nl, indent(ArgIndent)) + ; ArgIndent is Indent + 6 + L, + ValGoal = write(' ') + ), + write('[ '), + pp(A1, Indent), term_length(A1, L1), + tab(L-L1), write(' ='), ValGoal, + pp(V1, ArgIndent), + forall(member(A = V, ArgList), + (write(','), nl, + indent(NewIndent), + pp(A, Indent), term_length(A, LA), tab(L-LA), + write(' ='), ValGoal, pp(V, ArgIndent))), + nl, + indent(Indent), + write(']'). +pp([H|T], Indent) :- + is_list(T), !, + write('[ '), + NewIndent is Indent + 2, + pp(H, NewIndent), + forall(member(E, T), + (write(','), nl, + indent(NewIndent), + pp(E, NewIndent))), + nl, + indent(Indent), + write(']'). +pp(Term, Indent) :- + functor(Term, Name, 2), + current_op(_, Type, Name), + memberchk(Type, [xfx, yfx]), !, + arg(1, Term, A1), + arg(2, Term, A2), + pp(A1, Indent), format(' ~q ', [Name]), pp(A2, Indent). +pp(Term, Indent) :- + functor(Term, Name, _Arity), + atom_length(Name, L), + NewIndent is Indent + L + 1, + format('~q(', Name), + ( term_argument_length(Term, AL), + NewIndent + AL < 72 + -> Wrap = nowrap + ; Wrap = wrap + ), + forall(generate_arg(I, Term, Arg), + pparg(I, Arg, Wrap, NewIndent)), + write(')'). + +generate_arg(ArgN, Term, Arg) :- + functor(Term, _, Arity), + between(1, Arity, ArgN), + arg(ArgN, Term, Arg). + +pparg(1, Term, _, Indent) :- !, + pp(Term, Indent). +pparg(_, Term, wrap, Indent) :- !, + write(','), nl, + indent(Indent), + pp(Term, Indent). +pparg(_, Term, _, Indent) :- + write(', '), + pp(Term, Indent). + +longest_attribute([], L, L). +longest_attribute([A = _|T], L0, L) :- + term_length(A, AL), + max(L0, AL, L1), + longest_attribute(T, L1, L). + +term_length(A, AL) :- + atomic(A), !, + atom_length(A, AL). +term_length(Var, AL) :- + var(Var), !, + AL = 1. +term_length('$VAR'(N), AL) :- + varname(N, L), + length(L, AL). +term_length('$aref'(N), AL) :- + atom_length(N, AL). + +term_argument_length(Term, L) :- + term_argument_length(Term, 1, 0, L). + +term_argument_length(Term, A, L0, L) :- + arg(A, Term, Arg), !, + term_length(Arg, AL), + L1 is AL + L0, + NA is A + 1, + term_argument_length(Term, NA, L1, L). +term_argument_length(_, _, L, L). + + +max(A, B, M) :- + ( A >= B + -> M = A + ; M = B + ). + + +varname(N, [C]) :- + N < 26, !, + C is N + 0'A. +varname(N, [C1, C2]) :- + C1 is N // 26 + 0'A, + C2 is N mod 26 + 0'A. + +indent(I) :- + Tabs is I // 8, + forall(between(1, Tabs, _), put(9)), + Spaces is I mod 8, + tab(Spaces). diff --git a/packages/sgml/RDF/rdf-parser b/packages/sgml/RDF/rdf-parser new file mode 100755 index 000000000..d2f196e7f --- /dev/null +++ b/packages/sgml/RDF/rdf-parser @@ -0,0 +1,35 @@ +#!/bin/bash +# +# This is not the normal parser, just the front-end for the CGI interface. +# The real CGI stuff is written in Prolog in the file online.pl. The +# request is in online.html +# +# The RDF parser itself is just a Prolog library. See rdf2pl.{html,pdf}. + +base=@BASEDIR@ +tmp=/tmp/rdf-parser-$$ +export ERROR_FILE=$tmp + +ulimit -t 20 # seconds CPU time limit + +function error() +{ cat << _EOM_ +Content-type: text/plain + +Sorry, an internal error occurred. For details, see below. + +_EOM_ + cat $tmp + rm -r $tmp + exit 0 +} + +cd $base > $tmp 2>&1 +@SWI@ -f none -F none -t halt \ + -g "load_files(online,[silent(true)]),go" 2>$tmp + +case $? in + 0) rm -f $tmp + exit 0 ;; + *) error ;; +esac diff --git a/packages/sgml/RDF/rdf.html b/packages/sgml/RDF/rdf.html new file mode 100644 index 000000000..a0c089676 --- /dev/null +++ b/packages/sgml/RDF/rdf.html @@ -0,0 +1,156 @@ + + + + +Online SWI-Prolog RDF parser demo + + + +

Online SWI-Prolog RDF parser demo

+ +

+This page provides an online demonstration of an RDF parser written in +SWI-Prolog. +This RDF parser has a couple of attractive properties: + +

+
Small
+Both in terms of source-code (< 700 lines) and executable. +
Fast
+Parses about 400 Kbytes/sec on a Pentium-II/450. +
Conforming
+This parser conforms to +http://www.w3.org/TR/REC-rdf-syntax. +
Prolog based
+Many people regard the Prolog programming language a good vehicle to +reason about RDF statements. Having a simple and fast Prolog-based RDF +parser makes life easier. +
Portability
+The RDF parser itself is written in ISO Prolog. The XML parser is +written in ANSI-C. There is no standard for interfacing Prolog and C, +but the interface is relatively small. +
+ +A more detailed description of this packages is available in in this document. The sources of the parser are +included into the sgml +packages for SWI-Prolog. The individual source files can also be +examined through the +cvsweb service, which also provides access to the most recent + +changes + +

+


+Please write your RDF description into the text-area below or select a +local file using the File: item and submit it. If anything goes +wrong, please mail Jan +Wielemaker. + +

+The RDF-data submitted is kept anonymously on our server and +might be used by us to examine problems with our RDF parser. We do not +publish this material. The result-page provides a form for +attaching a comment to the stored RDF statement. + + + + +

+

+ +
+ +
File: + + + + +
+ +
+
+ +


Notes

+
rdf:resource, etc.
+The specification and discussion on the rdf interest group yielded no +satisfactory solution how to deal with RDF attributes that are not in +the RDF namespace such as ID, resource, etc. This parser +interprets such attributes in the namespace of the element, so the +statement below is not interpreted as a propertyElt with value +me but as a typedNode with predicate +resource and value literal(me). + +
+  <s:Creator resource="#me">
+
+ + + + + + + + + + + + + + + diff --git a/packages/sgml/RDF/rdf.pl b/packages/sgml/RDF/rdf.pl new file mode 100644 index 000000000..7bc083d8c --- /dev/null +++ b/packages/sgml/RDF/rdf.pl @@ -0,0 +1,456 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 2002-2007, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + + +:- module(rdf, + [ load_rdf/2, % +File, -Triples + load_rdf/3, % +File, -Triples, +Options + xml_to_rdf/3, % +XML, -Triples, +Options + process_rdf/3 % +File, :OnTriples, +Options + ]). + +:- meta_predicate(process_rdf(+, :, +)). + +:- use_module(library(sgml)). % Basic XML loading +:- use_module(library(option)). % option/3 +:- use_module(library(lists)). +:- use_module(rdf_parser). % Basic parser +:- use_module(rdf_triple). % Generate triples + +%% load_rdf(+File, -Triples) is det. +%% load_rdf(+File, -Triples, +Options) is det. +% +% Parse an XML file holding an RDF term into a list of RDF triples. +% see rdf_triple.pl for a definition of the output format. Options: +% +% * base_uri(+URI) +% URI to use as base +% +% * expand_foreach(+Bool) +% Apply each(Container, Pred, Object) on the members of +% Container +% +% * namespaces(-Namespaces:list(NS=URL)) +% Return list of namespaces declared using xmlns:NS=URL in +% the document. This can be used to update the namespace +% list with rdf_register_ns/2. +% +% @see Use process_rdf/3 for processing large documents in +% _|call-back|_ style. + +load_rdf(File, Triples) :- + load_rdf(File, Triples, []). + +load_rdf(File, Triples, Options0) :- + entity_options(Options0, EntOptions, Options1), + meta_options(Options1, Options), + init_ns_collect(Options, NSList), + load_structure(File, + [ RDFElement + ], + [ dialect(xmlns), + space(sgml), + call(xmlns, rdf:on_xmlns) + | EntOptions + ]), + rdf_start_file(Options, Cleanup), + call_cleanup(xml_to_rdf(RDFElement, Triples0, Options), + rdf_end_file(Cleanup)), + exit_ns_collect(NSList), + post_process(Options, Triples0, Triples). + +entity_options([], [], []). +entity_options([H|T0], Entities, Rest) :- + ( H = entity(_,_) + -> Entities = [H|ET], + entity_options(T0, ET, Rest) + ; Rest = [H|RT], + entity_options(T0, Entities, RT) + ). + + +%% xml_to_rdf(+XML, -Triples, +Options) + +xml_to_rdf(XML, Triples, Options) :- + is_list(Options), !, + xml_to_plrdf(XML, RDF, Options), + rdf_triples(RDF, Triples). +xml_to_rdf(XML, BaseURI, Triples) :- + atom(BaseURI), !, + xml_to_rdf(XML, Triples, [base_uri(BaseURI)]). + + + /******************************* + * POST-PROCESSING * + *******************************/ + +post_process([], Triples, Triples). +post_process([expand_foreach(true)|T], Triples0, Triples) :- !, + expand_each(Triples0, Triples1), + post_process(T, Triples1, Triples). +post_process([_|T], Triples0, Triples) :- !, + post_process(T, Triples0, Triples). + + + /******************************* + * EXPAND * + *******************************/ + +expand_each(Triples0, Triples) :- + select(rdf(each(Container), Pred, Object), + Triples0, Triples1), !, + each_triples(Triples1, Container, Pred, Object, Triples2), + expand_each(Triples2, Triples). +expand_each(Triples, Triples). + +each_triples([], _, _, _, []). +each_triples([H0|T0], Container, P, O, + [H0, rdf(S,P,O)|T]) :- + H0 = rdf(Container, rdf:A, S), + member_attribute(A), !, + each_triples(T0, Container, P, O, T). +each_triples([H|T0], Container, P, O, [H|T]) :- + each_triples(T0, Container, P, O, T). + +member_attribute(A) :- + sub_atom(A, 0, _, _, '_'). % must check number? + + + /******************************* + * BIG FILES * + *******************************/ + +%% process_rdf(+Input, :OnObject, +Options) +% +% Process RDF from Input. Input is either an atom or a term of the +% format stream(Handle). For each encountered description, call +% OnObject(+Triples) to handle the triples resulting from the +% description. Defined Options are: +% +% * base_uri(+URI) +% Determines the reference URI. +% +% * db(DB) +% When loading from a stream, the source is taken from +% this option or -if non-existent- from base_uri. +% +% * lang(LanguageID) +% Set initial language (as xml:lang) +% +% * convert_typed_literal(:Convertor) +% Call Convertor(+Type, +Content, -RDFObject) to create +% a triple rdf(S, P, RDFObject) instead of rdf(S, P, +% literal(type(Type, Content)). +% +% * namespaces(-Namespaces:list(NS=URL)) +% Return list of namespaces declared using xmlns:NS=URL in +% the document. This can be used to update the namespace +% list with rdf_register_ns/2. +% +% * entity(Name, Value) +% Overrule entity values found in the file +% +% * embedded(Boolean) +% If =true=, do not give warnings if rdf:RDF is embedded +% in other XML data. + +process_rdf(File, OnObject, Options0) :- + is_list(Options0), !, + entity_options(Options0, EntOptions, Options1), + meta_options(Options1, Options2), + process_options(Options2, ProcessOptions, Options), + option(base_uri(BaseURI), Options, []), + rdf_start_file(Options, Cleanup), + strip_module(OnObject, Module, Pred), + nb_setval(rdf_object_handler, Module:Pred), + nb_setval(rdf_options, Options), + nb_setval(rdf_state, -), + init_ns_collect(Options, NSList), + ( File = stream(In) + -> Source = BaseURI + ; is_stream(File) + -> In = File, + option(db(Source), Options, BaseURI) + ; open(File, read, In, [type(binary)]), + Close = In, + Source = File + ), + new_sgml_parser(Parser, [dtd(DTD)]), + def_entities(EntOptions, DTD), + set_sgml_parser(Parser, file(Source)), + set_sgml_parser(Parser, dialect(xmlns)), + set_sgml_parser(Parser, space(sgml)), + do_process_rdf(Parser, In, NSList, Close, Cleanup, ProcessOptions). +process_rdf(File, BaseURI, OnObject) :- + process_rdf(File, OnObject, [base_uri(BaseURI)]). + +def_entities([], _). +def_entities([entity(Name, Value)|T], DTD) :- !, + def_entity(DTD, Name, Value), + def_entities(T, DTD). +def_entities([_|T0], DTD) :- + def_entities(T0, DTD). + +def_entity(DTD, Name, Value) :- + open_dtd(DTD, [], Stream), + xml_quote_attribute(Value, QValue), + format(Stream, '~n', [Name, QValue]), + close(Stream). + + +do_process_rdf(Parser, In, NSList, Close, Cleanup, Options) :- + call_cleanup(( sgml_parse(Parser, + [ source(In), + call(begin, rdf:on_begin), + call(xmlns, rdf:on_xmlns) + | Options + ]), + exit_ns_collect(NSList) + ), + cleanup_process(Close, Cleanup, Parser)). + +cleanup_process(In, Cleanup, Parser) :- + ( var(In) + -> true + ; close(In) + ), + free_sgml_parser(Parser), + nb_delete(rdf_options), + nb_delete(rdf_object_handler), + nb_delete(rdf_state), + nb_delete(rdf_nslist), + rdf_end_file(Cleanup). + +on_begin(NS:'RDF', Attr, _) :- + rdf_name_space(NS), !, + nb_getval(rdf_options, Options0), + modify_state(Attr, Options0, Options), + nb_setval(rdf_state, Options). +on_begin(Tag, Attr, Parser) :- + nb_getval(rdf_state, Options), + ( Options == (-) + -> nb_getval(rdf_options, RdfOptions), + ( memberchk(embedded(true), RdfOptions) + -> true + ; print_message(warning, rdf(unexpected(Tag, Parser))) + ) + ; get_sgml_parser(Parser, line(Start)), + get_sgml_parser(Parser, file(File)), + sgml_parse(Parser, + [ document(Content), + parse(content) + ]), + nb_getval(rdf_object_handler, OnTriples), + element_to_plrdf(element(Tag, Attr, Content), Objects, Options), + rdf_triples(Objects, Triples), + call(OnTriples, Triples, File:Start) + ). + +%% on_xmlns(+NS, +URL, +Parser) +% +% Build up the list of encountered xmlns:NS=URL declarations. We +% use destructive assignment here as an alternative to +% assert/retract, ensuring thread-safety and better performance. + +on_xmlns(NS, URL, _Parser) :- + ( nb_getval(rdf_nslist, List), + List = list(L0) + -> nb_linkarg(1, List, [NS=URL|L0]) + ; true + ). + +init_ns_collect(Options, NSList) :- + ( option(namespaces(NSList), Options, -), + NSList \== (-) + -> nb_setval(rdf_nslist, list([])) + ; nb_setval(rdf_nslist, -), + NSList = (-) + ). + +exit_ns_collect(NSList) :- + ( NSList == (-) + -> true + ; nb_getval(rdf_nslist, list(NSList)) + ). + +modify_state([], Options, Options). +modify_state([H|T], Options0, Options) :- + modify_state1(H, Options0, Options1), + modify_state(T, Options1, Options). + +modify_state1(xml:base = Base0, Options0, Options) :- !, + remove_fragment(Base0, Base), + set_option(base_uri(Base), Options0, Options). +modify_state1(xml:lang = Lang, Options0, Options) :- !, + set_option(lang(Lang), Options0, Options). +modify_state1(_, Options, Options). + +%% remove_fragment(+URI, -WithoutFragment) +% +% When handling xml:base, we must delete the possible fragment. + +remove_fragment(URI, Plain) :- + sub_atom(URI, B, _, _, #), !, + sub_atom(URI, 0, B, _, Plain). +remove_fragment(URI, URI). + + +set_option(Opt, Options0, [Opt|Options]) :- + functor(Opt, F, A), + functor(VO, F, A), + delete(Options0, VO, Options). + + +%% meta_options(+OptionsIn, -OptionsOut) +% +% Do module qualification for options that are module sensitive. + +:- module_transparent + meta_options/2. + +meta_options([], []). +meta_options([Name=Value|T0], List) :- + atom(Name), !, + Opt =.. [Name, Value], + meta_options([Opt|T0], List). +meta_options([H0|T0], [H|T]) :- + ( H0 = convert_typed_literal(Handler) + -> strip_module(Handler, M, P), + H = convert_typed_literal(M:P) + ; H = H0 + ), + meta_options(T0, T). + + +process_options(Options, Process, RestOptions) :- + select_option(content_length(Len), Options, RestOptions), !, + Process = [content_length(Len)]. +process_options(Options, [], Options). + + + /******************************* + * MESSAGES * + *******************************/ + +:- multifile + prolog:message/3. + +% Catch messages. sgml/4 is generated by the SGML2PL binding. + +prolog:message(rdf(unparsed(Data))) --> + { phrase(unparse_xml(Data), XML) + }, + [ 'RDF: Failed to interpret "~s"'-[XML] ]. +prolog:message(rdf(shared_blank_nodes(N))) --> + [ 'RDF: Shared ~D blank nodes'-[N] ]. +prolog:message(rdf(not_a_name(Name))) --> + [ 'RDF: argument to rdf:ID is not an XML name: ~p'-[Name] ]. +prolog:message(rdf(redefined_id(Id))) --> + [ 'RDF: rdf:ID ~p: multiple definitions'-[Id] ]. +prolog:message(rdf(unexpected(Tag, Parser))) --> + { get_sgml_parser(Parser, file(File)), + get_sgml_parser(Parser, line(Line)) + }, + [ 'RDF: ~w:~d: Unexpected element ~w'-[File, Line, Tag] ]. + + + /******************************* + * XML-TO-TEXT * + *******************************/ + +unparse_xml([]) --> !, + []. +unparse_xml([H|T]) --> !, + unparse_xml(H), + unparse_xml(T). +unparse_xml(Atom) --> + { atom(Atom) + }, !, + atom(Atom). +unparse_xml(element(Name, Attr, Content)) --> + "<", + identifier(Name), + attributes(Attr), + ( { Content == [] + } + -> "/>" + ; ">", + unparse_xml(Content) + ). + +attributes([]) --> + []. +attributes([H|T]) --> + attribute(H), + attributes(T). + +attribute(Name=Value) --> + " ", + identifier(Name), + "=", + value(Value). + +identifier(NS:Local) --> !, + "{", atom(NS), "}", + atom(Local). +identifier(Local) --> + atom(Local). + +atom(Atom, Text, Rest) :- + atom_codes(Atom, Chars), + append(Chars, Rest, Text). + +value(Value) --> + { atom_codes(Value, Chars) + }, + "\"", + quoted(Chars), + "\"". + +quoted([]) --> + []. +quoted([H|T]) --> + quote(H), !, + quoted(T). + +quote(0'<) --> "<". +quote(0'>) --> ">". +quote(0'") --> """. +quote(0'&) --> "&". +quote(X) --> [X]. + + + /******************************* + * XREF * + *******************************/ + +:- multifile prolog:meta_goal/2. +prolog:meta_goal(process_rdf(_,G,_), [G+2]). diff --git a/packages/sgml/RDF/rdf2pl.doc b/packages/sgml/RDF/rdf2pl.doc new file mode 100644 index 000000000..31b3732a7 --- /dev/null +++ b/packages/sgml/RDF/rdf2pl.doc @@ -0,0 +1,475 @@ +\documentclass[11pt]{article} +\usepackage{pl} +\usepackage{html} +\usepackage{times} + +\onefile +\htmloutput{html} % Output directory +\htmlmainfile{index} % Main document file +\bodycolor{white} % Page colour + +\newcommand{\elem}[1]{{\tt\string<#1\string>}} + +\begin{document} + +\title{SWI-Prolog RDF parser} +\author{Jan Wielemaker \\ + HCS, \\ + University of Amsterdam \\ + The Netherlands \\ + E-mail: \email{jan@swi-prolog.org}} + +\maketitle + +\begin{abstract} +\url[RDF]{http://www.w3.org/RDF/} ({\bf R}esource {\bf D}escription {\bf +F}ormat) is a \url[W3C]{http://www.w3.org/} standard for expressing +meta-data about web-resources. It has two representations providing +the same semantics. RDF documents are normally transferred as XML +documents using the RDF-XML syntax. This format is unsuitable for +processing. The parser defined here converts an RDF-XML document into +the \jargon{triple} notation. The library \pllib{rdf_write} creates +an RDF/XML document from a list of triples. +\end{abstract} + +\vfill + +\tableofcontents + +\vfill +\vfill + +\newpage + +\section{Introduction} + +RDF is a promising standard for representing meta-data about documents +on the web as well as exchanging frame-based data (e.g. ontologies). RDF +is often associated with `semantics on the web'. It consists of a formal +data-model defined in terms of \jargon{triples}. In addition, a +\jargon{graph} model is defined for visualisation and an XML application +is defined for exchange. + +`Semantics on the web' is also associated with the Prolog programming +language. It is assumed that Prolog is a suitable vehicle to reason with +the data expressed in RDF models. Most of the related web-infra +structure (e.g. XML parsers, DOM implementations) are defined in Java, +Perl, C or C+{+}. + +Various routes are available to the Prolog user. Low-level XML parsing +is due to its nature best done in C or C+{+}. These languages produce +fast code. As XML/SGML are at the basis of most of the other web-related +formats we will benefit most here. XML and SGML, being very stable +specifications, make fast compiled languages even more attractive. + +But what about RDF? RDF-XML is defined in XML, and provided with a +Prolog term representing the XML document processing it according to the +RDF syntax is quick and easy in Prolog. The alternative, getting yet +another library and language attached to the system, is getting less +attractive. In this document we explore the suitability of Prolog for +processing XML documents in general and into RDF in particular. + + +\section{Parsing RDF in Prolog} + +We realised an RDF compiler in Prolog on top of the {\bf sgml2pl} +package (providing a name-space sensitive XML parser). The +transformation is realised in two passes. + +The first pass rewrites the XML term into a Prolog term conveying the +same information in a more friendly manner. This transformation is +defined in a high-level pattern matching language defined on top of +Prolog with properties similar to DCG (Definite Clause Grammar). + +The source of this translation is very close to the BNF notation used by +the \url[specification]{http://www.w3.org/TR/REC-rdf-syntax/}, so +correctness is `obvious'. Below is a part of the definition for RDF +containers. Note that XML elements are represented using a term of the +format: + +\begin{quote} + \term{element}{Name, [AttrName = Value...], [Content ...]} +\end{quote} + +\begin{code} +memberElt(LI) ::= + \referencedItem(LI). +memberElt(LI) ::= + \inlineItem(LI). + +referencedItem(LI) ::= + element(\rdf(li), + [ \resourceAttr(LI) ], + []). + +inlineItem(literal(LI)) ::= + element(\rdf(li), + [ \parseLiteral ], + LI). +inlineItem(description(description, _, _, Properties)) ::= + element(\rdf(li), + [ \parseResource ], + \propertyElts(Properties)). +inlineItem(LI) ::= + element(\rdf(li), + [], + [\rdf_object(LI)]), !. % inlined object +inlineItem(literal(LI)) ::= + element(\rdf(li), + [], + [LI]). % string value +\end{code} + +Expression in the rule that are prefixed by the \verb$\$ operator acts +as invocation of another rule-set. The body-term is converted into +a term where all rule-references are replaced by variables. The +resulting term is matched and translation of the arguments is achieved +by calling the appropriate rule. Below is the Prolog code for the +{\bf referencedItem} rule: + +\begin{code} +referencedItem(A, element(B, [C], [])) :- + rdf(li, B), + resourceAttr(A, C). +\end{code} + +Additional code can be added using a notation close to the Prolog +DCG notation. Here is the rule for a description, producing +properties both using {\bf propAttrs} and {\bf propertyElts}. + +\begin{code} +description(description, About, BagID, Properties) ::= + element(\rdf('Description'), + \attrs([ \?idAboutAttr(About), + \?bagIdAttr(BagID) + | \propAttrs(PropAttrs) + ]), + \propertyElts(PropElts)), + { !, append(PropAttrs, PropElts, Properties) + }. +\end{code} + + +\section{Predicates} + +The parser is designed to operate in various environments and therefore +provides interfaces at various levels. First we describe the top level +defined in \pllib{rdf}, simply parsing a RDF-XML file into a list of +triples. Please note these are {\em not} asserted into the database +because it is not necessarily the final format the user wishes to reason +with and it is not clean how the user wants to deal with multiple RDF +documents. Some options are using global URI's in one pool, in Prolog +modules or using an additional argument. + +\begin{description} + \predicate{load_rdf}{2}{+File, -Triples} +Same as \term{load_rdf}{File, Triples, []}. + + \predicate{load_rdf}{3}{+File, -Triples, +Options} +Read the RDF-XML file \arg{File} and return a list of \arg{Triples}. +\arg{Options} defines additional processing options. Currently defined +options are: + + \begin{description} + \termitem{base_uri}{BaseURI} +If provided local identifiers and identifier-references are globalised +using this URI. If omited or the atom \verb$[]$, local identifiers are +not tagged. + + \termitem{blank_nodes}{Mode} +If \arg{Mode} is \const{share} (default), blank-node properties (i.e.\ +complex properties without identifier) are reused if they result in +exactly the same triple-set. Two descriptions are shared if their +intermediate description is the same. This means they should produce the +same set of triples in the same order. The value \const{noshare} creates +a new resource for each blank node. + + \termitem{expand_foreach}{Boolean} +If \arg{Boolean} is \const{true}, expand \const{rdf:aboutEach} into +a set of triples. By default the parser generates +\term{rdf}{each(Container), Predicate, Subject}. + + \termitem{lang}{Lang} +Define the initial language (i.e.\ pretend there is an \const{xml:lang} +declaration in an enclosing element). + + \termitem{ignore_lang}{Bool} +If \const{true}, \const{xml:lang} declarations in the document are +ignored. This is mostly for compatibility with older versions of +this library that did not support language identifiers. + + \termitem{convert_typed_literal}{:ConvertPred} +If the parser finds a literal with the \const{rdf:datatype}=\arg{Type} +attribute, call \term{ConvertPred}{+Type, +Content, -Literal}. +\arg{Content} is the XML element contentas returned by the XML +parser (a list). The predicate must unify \arg{Literal} +with a Prolog representation of \arg{Content} according to +\arg{Type} or throw an exception if the conversion cannot be made. + +This option servers two purposes. First of all it can be used +to ignore type declarations for backward compatibility of this +library. Second it can be used to convert typed literals to +a meaningful Prolog representation. E.g.\ convert '42' to the +Prolog integer 42 if the type is \const{xsd:int} or a related +type. + + \termitem{namespaces}{-List} +Unify \arg{List} with a list of \arg{NS}=\arg{URL} for each +encountered \const{xmlns}:\arg{NS}=\arg{URL} declaration found +in the source. + + \termitem{entity}{+Name, +Value} +Overrule entity declaration in file. As it is common practice +to declare namespaces using entities in RDF/XML, this option +allows for changing the namespace without changing the file. +Multiple of these options are allowed. + \end{description} + +The \arg{Triples} list is a list of \term{rdf}{Subject, Predicate, +Object} triples. \arg{Subject} is either a plain resource (an atom), +or one of the terms \term{each}{URI} or \term{prefix}{URI} with the +obvious meaning. \arg{Predicate} is either a plain atom for +explicitely non-qualified names or a term +\mbox{\arg{NameSpace}{\bf :}\arg{Name}}. If \arg{NameSpace} is the +defined RDF name space it is returned as the atom \const{rdf}. +Finally, \arg{Object} is a URI, a \arg{Predicate} or a term of the +format \term{literal}{Value} for literal values. \arg{Value} is +either a plain atom or a parsed XML term (list of atoms and elements). +\end{description} + + +\subsection{RDF Object representation} \label{sec:rdfobject} + +The \emph{Object} (3rd) part of a triple can have several different +types. If the object is a resource it is returned as either a plain +atom or a term \mbox{\arg{NameSpace}{\bf :}\arg{Name}}. If it is a +literal it is returned as \term{literal}{Value}, where \arg{Value} +takes one of the formats defined below. + +\begin{itemlist} + \item [An atom] +If the literal \arg{Value} is a plain atom is a literal value not +subject to a datatype or \const{xml:lang} qualifier. + + \item [\term{lang}{LanguageID, Atom}] +If the literal is subject to an \const{xml:lang} qualifier +\arg{LanguageID} specifies the language and \arg{Atom} the +actual text. + + \item [A list] +If the literal is an XML literal as created by +\mbox{parseType="Literal"}, the raw output of the XML parser for the +content of the element is returned. This content is a list of +\term{element}{Name, Attributes, Content} and atoms for CDATA parts as +described with the SWI-Prolog \url[SGML/XML +parser]{http://www.swi-prolog.org/packages/sgml2pl.html} + + \item [\term{type}{Type, StringValue}] +If the literal has an \verb$rdf:datatype=$\arg{Type} a term of this +format is returned. +\end{itemlist} + + +\subsection{Name spaces} + +XML name spaces are identified using a URI. Unfortunately various URI's +are in common use to refer to RDF. The \file{rdf_parser.pl} module +therefore defines the namespace as a multifile/1 predicate, that can be +extended by the user. For example, to parse the \url[Netscape +OpenDirectory]{http://www.mozilla.org/rdf/doc/inference.html} +\file{structure.rdf} file, the following declarations are used: + +\begin{code} +:- multifile + rdf_parser:rdf_name_space/1. + +rdf_parser:rdf_name_space('http://www.w3.org/TR/RDF/'). +rdf_parser:rdf_name_space('http://directory.mozilla.org/rdf'). +rdf_parser:rdf_name_space('http://dmoz.org/rdf'). +\end{code} + +The initial definition of this predicate is given below. + +\begin{code} +rdf_name_space('http://www.w3.org/1999/02/22-rdf-syntax-ns#'). +rdf_name_space('http://www.w3.org/TR/REC-rdf-syntax'). +\end{code} + + +\subsection{Low-level access} + +The above defined load_rdf/[2,3] is not always suitable. For example, it +cannot deal with documents where the RDF statement is embedded in an XML +document. It also cannot deal with really large documents (e.g.\ the +Netscape OpenDirectory project, currently about 90 MBytes), without huge +amounts of memory. + +For really large documents, the {\bf sgml2pl} parser can be programmed +to handle the content of a specific element (i.e. \elem{rdf:RDF}) +element-by-element. The parsing primitives defined in this section +can be used to process these one-by-one. + +\begin{description} + \predicate{xml_to_rdf}{3}{+XML, +BaseURI, -Triples} +Process an XML term produced by load_structure/3 using the +\term{dialect}{xmlns} output option. \arg{XML} is either +a complete \elem{rdf:RDF} element, a list of RDF-objects +(container or description) or a single description of container. + + \predicate{process_rdf}{3}{+Input, :OnTriples, +Options} + +Exploits the call-back interface of {\bf sgml2pl}, calling +\term{\arg{OnTriples}}{Triples, File:Line} with the list of triples +resulting from a single top level RDF object for each RDF element in the +input as well as the source-location where the description started. +\arg{Input} is either a file name or term \term{stream}{Stream}. When +using a stream all triples are associated to the value of the +\const{base_uri} option. This predicate can be used to process arbitrary +large RDF files as the file is processed object-by-object. The example +below simply asserts all triples into the database: + +\begin{code} +assert_list([], _). +assert_list([H|T], Source) :- + assert(H), + assert_list(T, Source). + +?- process_rdf('structure,rdf', assert_list, []). +\end{code} + +\arg{Options} are described with load_rdf/3. The option +\const{expand_foreach} is not supported as the container may be in a +different description. Additional it provides \const{embedded}: + + \begin{description} + \termitem{embedded}{Boolean} +The predicate process_rdf/3 processes arbitrary XML documents, only +interpreting the content of \const{rdf:RDF} elements. If this option +is \const{false} (default), it gives a warning on elements that are +not processed. The option \term{embedded}{true} can be used to +process RDF embedded in \jargon{xhtml} without warnings. + \end{description} + +\end{description} + + + + +\section{Writing RDF graphs} + +The library \pllib{rdf_write} provides the inverse of load_rdf/2 using +the predicate rdf_write_xml/2. In most cases the RDF parser is used in +combination with the Semweb package providing \pllib{semweb/rdf_db}. +This library defines rdf_save/2 to save a named RDF graph from the +database to a file. This library writes a list of rdf terms to a stream. +It has been developed for the SeRQL server which computes an RDF graph +that needs to be transmitted in an HTTP request. As we see this as a +typical use-case scenario the library only provides writing to a stream. + +\begin{description} + \predicate{rdf_write_xml}{2}{+Stream, +Triples} +Write an RDF/XML document to \arg{Stream} from the list of \arg{Triples}. +\arg{Stream} must use one of the following Prolog stream encodings: +\const{ascii}, \const{iso_latin_1} or \const{utf8}. Characters that +cannot be represented in the encoding are represented as XML entities. +Using ASCII is a good idea for documents that can be represented almost +completely in ASCII. For more international documents using UTF-8 creates +a more compact document that is easier to read. + +\begin{code} +rdf_write(File, Triples) :- + open(File, write, Out, [encoding(utf8)]), + call_cleanup(rdf_write_xml(Out, Triples), + close(Out)). +\end{code} +\end{description} + + +\section{Testing the RDF translator} + +A test-suite and driver program are provided by \file{rdf_test.pl} in +the source directory. To run these tests, load this file into Prolog in +the distribution directory. The test files are in the directory +\file{suite} and the proper output in \file{suite/ok}. Predicates +provided by \file{rdf_test.pl}: + +\begin{description} + \predicate{suite}{1}{+N} +Run test \arg{N} using the file \file{suite/tN.rdf} and display the +RDF source, the intermediate Prolog representation and the resulting +triples. + \predicate{passed}{1}{+N} +Process \file{suite/tN.rdf} and store the resulting triples in +\file{suite/ok/tN.pl} for later validation by test/0. + \predicate{test}{0}{} +Run all tests and classify the result. +\end{description} + +\appendix + +\section{Metrics} + +It took three days to write and one to document the Prolog RDF parser. +A significant part of the time was spent understanding the RDF +specification. + +The size of the source (including comments) is given in the table +below. + +\begin{center} +\begin{tabular}{|rrr|l|l|} +\hline +\bf lines & \bf words & \bf bytes & \bf file & \bf function \\ +\hline + 109 & 255 & 2663 & rdf.pl & Driver program \\ + 312 & 649 & 6416 & rdf_parser.pl & 1-st phase parser \\ + 246 & 752 & 5852 & rdf_triple.pl & 2-nd phase parser \\ + 126 & 339 & 2596 & rewrite.pl & rule-compiler \\ +\hline + 793 & 1995 & 17527 & total & \\ +\hline +\end{tabular} +\end{center} + + +We also compared the performance using an RDF-Schema file generated by +\url[Protege-2000]{http://www.smi.stanford.edu/projects/protege/} and +interpreted as RDF. This file contains 162 descriptions in 50 Kbytes, +resulting in 599 triples. Environment: Intel Pentium-II/450 with +384 Mbytes memory running SuSE Linux 6.3. + +The parser described here requires 0.15 seconds excluding 0.13 seconds +Prolog startup time to process this file. The \url[Pro +Solutions]{http://www.pro-solutions.com/rdfdemo/} parser (written in +Perl) requires 1.5 seconds exluding 0.25 seconds startup time. + + +\section{Installation} + +\subsection{Unix systems} + +Installation on Unix system uses the commonly found {\em configure}, +{\em make} and {\em make install} sequence. SWI-Prolog should be +installed before building this package. If SWI-Prolog is not installed +as \program{pl}, the environment variable \env{PL} must be set to the +name of the SWI-Prolog executable. Installation is now accomplished +using: + +\begin{code} +% ./configure +% make +% make install +\end{code} + +This installs the Prolog library files in \file{$PLBASE/library}, where +\file{$PLBASE} refers to the SWI-Prolog `home-directory'. + +\subsection{Windows} + +Run the file \file{setup.pl} by double clicking it. This will install +the required files into the SWI-Prolog directory and update the +library directory. + +\end{document} + + diff --git a/packages/sgml/RDF/rdf_diagram.pl b/packages/sgml/RDF/rdf_diagram.pl new file mode 100644 index 000000000..c028ca1e4 --- /dev/null +++ b/packages/sgml/RDF/rdf_diagram.pl @@ -0,0 +1,492 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2002 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(rdf_diagram, + [ rdf_diagram_from_file/1 % +File + ]). +:- use_module(library(pce)). +:- use_module(library(pce_tagged_connection)). +:- use_module(library(autowin)). +:- use_module(library(pce_report)). +:- use_module(library(print_graphics)). +:- use_module(library(rdf_parser)). % get access to declared namespaces + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +This file defines the class rdf_diagram, a window capable of showing a +set of triples. + +The predicate rdf_diagram_from_file(+File) is a simple demo and useful +tool to show RDF from simple RDF files. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + + + /******************************* + * SIMPLE ENTRY * + *******************************/ + +% rdf_diagram_from_file(+File) +% +% Show the triples from File in a window. + +rdf_diagram_from_file(File) :- + absolute_file_name(File, + [ access(read), + extensions([rdf,rdfs,owl,'']) + ], AbsFile), + load_rdf(AbsFile, Triples, + [ expand_foreach(true) + ]), + new(D, rdf_diagram(string('RDF diagram for %s', File))), + send(new(report_dialog), below, D), + send(D, triples, Triples), + send(D, open). + + + /******************************* + * CLASS RDF-DIAGRAM * + *******************************/ + +:- pce_begin_class(rdf_diagram, auto_sized_picture, + "Show set of RDF triples in a window"). +:- use_class_template(print_graphics). + +variable(auto_layout, bool := @on, both, "Automatically layout on resize"). +variable(type_in_node, bool := @on, both, "Display type inside node"). + +initialise(D, Label:[name]) :-> + send_super(D, initialise, Label), + send(D, scrollbars, both), + send(D, fill_popup), + send(D, resize_message, + if(and(D?auto_layout == @on, + D?focus_recogniser == @nil), + message(D, layout))). + +fill_popup(D) :-> + send(D, popup, new(P, popup)), + send_list(P, append, + [ menu_item(layout, message(D, layout)), + gap, + menu_item(print, message(D, print)) + ]). + +:- pce_group(triples). + +append(D, Triple:prolog) :-> + "Append and rdf(Subject, Predicate, Object) triple":: + ( subject_name(Triple, SubjectName), + get(D, resource, SubjectName, Subject), + ( get(D, type_in_node, @on), + is_type(Triple) + -> object_resource(Triple, ObjectName), + send(Subject, type, ObjectName) + ; predicate_name(Triple, PredName), + ( object_resource(Triple, ObjectName) + -> get(D, resource, ObjectName, Object) + ; object_literal(Triple, Literal) + -> get(D, literal, Literal, Object) + ), + send(Subject, connect, PredName, Object) + ) + -> true + ; term_to_atom(Triple, Atom), + ignore(send(D, report, error, + 'Failed to display triple: %s', Atom)) + ). + +triples(D, Triples:prolog) :-> + "Show disgram from Prolog triples":: + send(D, clear), + forall(member(T, Triples), + send(D, append, T)), + send(D, layout). + +resource(D, Resource:name) :-> + "Add Resource to diagram":: + get(D, resource, Resource, @on, _). + +resource(D, Resource:name, Create:[bool], Subject:rdf_resource) :<- + "Get reference for a subject or create one":: + ( get(D, member, Resource, Subject) + -> true + ; Create \== @off, + get(D, create_resource, Resource, Subject), + send(D, display, Subject, D?visible?center) + ). + +literal(D, Value:prolog, Gr:rdf_literal) :<- + "Display a literal. Don't try to re-use":: + ( literal_name(Value, Name), + get(D, member, Name, Gr) + -> true + ; get(D, create_literal, Value, Gr), + send(D, display, Gr, D?visible?center) + ). + + +create_resource(D, Resource:name, Subject:rdf_resource) :<- + "Create visualisation of Resource":: + new(Subject, rdf_resource(Resource, D)). + + +create_literal(_D, Value:prolog, Gr:rdf_literal) :<- + "Create visualisation of literal":: + new(Gr, rdf_literal(Value)). + + +node_label(_D, Resource:name, Label:name) :<- + "Generate label to show for a node":: + local_name(Resource, Label). + + +:- pce_group(layout). + +layout(D) :-> + "Produce automatic layout":: + new(Nodes, chain), + send(D?graphicals, for_all, + if(message(@arg1, instance_of, rdf_any), + message(Nodes, append, @arg1))), + send(Nodes?head, layout, 2, 40, + iterations := 200, + area := D?visible, + network := Nodes). + +copy_layout(D, From:rdf_diagram, Subst:prolog) :-> + "Copy the layout from another windows":: + send(D?graphicals, for_some, + message(D, copy_location, @arg1, From, prolog(Subst))). + +copy_location(_D, Obj:graphical, From:rdf_diagram, Subst:prolog) :-> + "Copy location of a single RDF object":: + ( send(Obj, instance_of, rdf_any) + -> ( get(Obj, name, Name), + find(From, Name, Subst, FromObj) + -> format('Copied location of ~p from ~p~n', [Obj, FromObj]), + get(FromObj, center, Center), + send(Obj, center, Center) + ) + ; true + ). + +find(D, Name, _Subst, Obj) :- + get(D, member, Name, Obj). +find(D, Name, Subst, Obj) :- + member(Name=AltName, Subst), + atom_concat('_:', AltName, FullAltName), + get(D, member, FullAltName, Obj). +find(D, Name, Subst, _) :- + format('Cannot find ~w in ~p, Subst =~n', [Name, D]), + pp(Subst), + fail. + + +:- pce_end_class(rdf_diagram). + + + /******************************* + * SHAPES * + *******************************/ + +:- pce_begin_class(rdf_connection, tagged_connection, + "Represents a triple"). + +:- pce_global(@rdf_link, new(link(link, link, + line(0,0,0,0,second)))). + +initialise(C, Gr1:graphical, Gr2:graphical, Pred:name, Ctx:[object]) :-> + "Create from predicate":: + send_super(C, initialise, Gr1, Gr2, @rdf_link), + send(C, tag, rdf_label(Pred, italic, Ctx)). + +ideal_length(C, Len:int) :<- + "Layout: compute the desired length":: + get(C, height, H), + ( H < 40 + -> get(C, tag, Tag), + get(Tag, width, W), + Len is W + 30 + ; Len = 40 + ). + +:- pce_end_class(rdf_connection). + +:- pce_begin_class(rdf_any(name), figure, + "Represent an RDF resource or literal"). + +handle(w/2, 0, link, north). +handle(w, h/2, link, east). +handle(w/2, h, link, south). +handle(0, h/2, link, west). + +initialise(F, Ref:name) :-> + "Create visualisation":: + send_super(F, initialise), + send(F, name, Ref). + +connect(F, Pred:name, Object:graphical) :-> + new(_C, rdf_connection(F, Object, Pred, F)). + +:- pce_global(@rdf_any_recogniser, + make_rdf_any_recogniser). +:- pce_global(@rdf_any_popup, + make_rdf_any_popup). + +make_rdf_any_recogniser(G) :- + new(M1, move_gesture(left)), + new(M2, move_network_gesture(left, c)), + new(P, popup_gesture(@receiver?popup)), + new(G, handler_group(M1, M2, P)). + +popup(_F, Popup:popup) :<- + "Create popup menu":: + Popup = @rdf_any_popup. + +make_rdf_any_popup(Popup) :- + new(Popup, popup), + Gr = @arg1, + send(Popup, append, + menu_item(layout, message(Gr, layout))). + +event(F, Ev:event) :-> + ( \+ send(Ev, is_a, ms_right_down), + send_super(F, event, Ev) + -> true + ; send(@rdf_any_recogniser, event, Ev) + ). + +node_label(F, Resource:name, Label:name) :<- + "Return label to use for a resource":: + get(F, device, Dev), + ( send(Dev, has_get_method, node_label) + -> get(Dev, node_label, Resource, Label) + ; local_name(Resource, Label) + ). + +:- pce_end_class(rdf_any). + + +:- pce_begin_class(move_network_gesture, move_gesture, + "Move network of connected graphicals"). + +variable(outline, box, get, + "Box used to indicate move"). +variable(network, chain*, both, + "Stored value of the network"). +variable(origin, point, get, + "Start origin of network"). + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +The gesture maintains an outline, the selection to be moved and the +positon where the move orginiated. The outline itself is given a +normal move_gesture to make it move on dragging. This move_gesture +should operate on the same button and modifier. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +initialise(G, B:[button_name], M:[modifier]) :-> + send(G, send_super, initialise, B, M), + send(G, slot, outline, new(Box, box(0,0))), + send(G, slot, origin, point(0,0)), + send(Box, texture, dotted), + send(Box, recogniser, move_gesture(G?button, G?modifier)). + +initiate(G, Ev:event) :-> + get(Ev, receiver, Gr), + get(Gr, device, Dev), + get(G, outline, Outline), + get(Gr, network, Network), + send(G, network, Network), + new(Union, area(0,0,0,0)), + send(Network, for_all, message(Union, union, @arg1?area)), + send(G?origin, copy, Union?position), + send(Outline, area, Union), + send(Union, done), + send(Dev, display, Outline), + ignore(send(Ev, post, Outline)). + +drag(G, Ev) :-> + send(Ev, post, G?outline). + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Terminate. First undisplay the outline. Next calculate by how much +the outline has been dragged and move all objects of the selection by +this amount. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +terminate(G, Ev:event) :-> + ignore(send(G, drag, Ev)), + get(G, outline, Outline), + send(Outline, device, @nil), + get(Outline?area?position, difference, G?origin, Offset), + get(G, network, Network), + send(Network, for_all, message(@arg1, relative_move, Offset)), + send(G, network, @nil). + +:- pce_end_class(move_network_gesture). + + + +:- pce_begin_class(rdf_label, text, + "Label for an RDF relation"). + +variable(resource, name, get, "Represented predicate"). + +initialise(L, Pred:name, Font:font, Context:[object]) :-> + ( Context == @default + -> local_name(Pred, Label) + ; get(Context, node_label, Pred, Label) + ), + send_super(L, initialise, Label, center, Font), + send(L, slot, resource, Pred), + send(L, background, @default). + +:- pce_global(@rdf_label_recogniser, + make_rdf_label_recogniser). + +make_rdf_label_recogniser(G) :- + new(G, handler_group), + send(G, append, + handler(area_enter, message(@receiver, identify))), + send(G, append, + handler(area_exit, message(@receiver, report, status, ''))), + send(G, append, popup_gesture(new(P, popup))), + send_list(P, append, + [ menu_item(copy, + message(@display, copy, @arg1?resource)) + ]). + +event(F, Ev:event) :-> + ( send_super(F, event, Ev) + -> true + ; send(@rdf_label_recogniser, event, Ev) + ). + +identify(L) :-> + send(L, report, status, '%s', L?resource). + +:- pce_end_class. + + + +:- pce_begin_class(rdf_resource, rdf_any, + "Represent an RDF resource"). + +initialise(F, Ref:name, Ctx:[object]) :-> + "Create visualisation":: + send_super(F, initialise, Ref), + send(F, display, ellipse(100, 50), point(-50,-25)), + send(F, display, new(T, rdf_label(Ref, normal, Ctx))), + send(T, center, point(0,0)). + +type(F, Type:name) :-> + send(F, display, new(TL, rdf_label(Type, small, F))), + send(TL, center, point(0,14)), + get(F, member, ellipse, E), + send(E, shadow, 2). + +identify(F) :-> + send(F, report, status, 'Resource %s', F?name). + +:- pce_end_class(rdf_resource). + + +:- pce_begin_class(rdf_literal, rdf_any, + "Represent an RDF literal value"). + +variable(value, prolog, get, "Represented literal value"). + +initialise(F, Value:prolog) :-> + "Create visualisation":: + send(F, slot, value, Value), + literal_label(Value, Label), + atom_concat('__lit:', Label, Id), + send_super(F, initialise, Id), + send(F, display, new(B, box)), + send(B, fill_pattern, colour(grey80)), + send(B, pen, 0), + send(F, display, new(T, text(Label, center))), + send(T, center, point(0,0)), + send(F, fit). + +literal_label(literal(Value0), Value) :- !, + literal_label(Value0, Value). +literal_label(xml(Value0), Value) :- !, + literal_label(Value0, Value). +literal_label(Value, Value) :- + atomic(Value), !. +literal_label(Value, Label) :- + term_to_atom(Value, Label). + +literal_name(Value, Name) :- + literal_label(Value, Label), + atom_concat('__lit:', Label, Name). + +fit(F) :-> + "Make box fit contents":: + get(F, member, text, Text), + get(Text?area, clone, Area), + send(Area, increase, 3), + get(F, member, box, Box), + send(Box, area, Area). + +:- pce_end_class(rdf_literal). + + + + + + + /******************************* + * PRIMITIVES * + *******************************/ + +subject_name(rdf(Name0, _, _), Name) :- + resource_name(Name0, Name). +predicate_name(rdf(_, Name0, _), Name) :- + resource_name(Name0, Name). +object_resource(rdf(_, _, Name0), Name) :- + resource_name(Name0, Name). +object_literal(rdf(_,_,Literal), Literal). + + +resource_name(Name, Name) :- + atom(Name), !. +resource_name(rdf:Local, Name) :- !, % known namespaces + concat_atom([rdf, :, Local], Name). +resource_name(NS:Local, Name) :- !, + atom_concat(NS, Local, Name). +resource_name(node(Anon), Name) :- % Not for predicates + atom_concat('_:', Anon, Name). + +is_type(rdf(_, rdf:type, _)) :- !. % our parser +is_type(rdf(_, Pred, _)) :- % our parser + atom(Pred), + rdf_name_space(NS), + atom_concat(NS, type, Pred), !. + +% local_name(+Resource, -Label) +% +% Return easy readable local name + +local_name(Resource, Local) :- + sub_atom(Resource, _, _, A, #), + sub_atom(Resource, _, A, 0, Local), + \+ sub_atom(Local, _, _, _, #), !. +local_name(Resource, Local) :- + atom_concat('rdf:', Local, Resource), !. +local_name(Resource, Local) :- + file_base_name(Resource, Local), + Local \== ''. +local_name(Resource, Resource). + diff --git a/packages/sgml/RDF/rdf_ntriples.pl b/packages/sgml/RDF/rdf_ntriples.pl new file mode 100644 index 000000000..bbc303a51 --- /dev/null +++ b/packages/sgml/RDF/rdf_ntriples.pl @@ -0,0 +1,311 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2002 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(rdf_ntriples, + [ load_rdf_ntriples/2, % +File, -Triples + rdf_ntriple_part/4 % +Field, -Value, + ]). + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +This module parses n-triple files as defined by the W3C RDF working in +http://www.w3.org/TR/rdf-testcases/#ntriples. This format is a +simplified version of the RDF N3 notation used in the *.nt files that +are used to describe the normative outcome of the RDF test-cases. + +The returned list terms are of the form + + rdf(Subject, Predicate, Object) + +where + + # Subject + is an atom or node(Id) for anonymous nodes + + # Predicate + is an atom + + # Object + is an atom, node(Id), literal(Atom) or xml(Atom) +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + + +% load_rdf_ntriples(+Source, -Triples) +% +% Load a file or stream to a list of rdf(S,P,O) triples. + +load_rdf_ntriples(File, Triples) :- + open_nt_file(File, In, Close), + call_cleanup(stream_to_triples(In, Triples), Close). + +% open_nt_file(+Input, -Stream, -Close) +% +% Open Input, returning Stream and a goal to cleanup Stream if it +% was opened. + +open_nt_file(stream(Stream), Stream, true) :- !. +open_nt_file(Stream, Stream, true) :- + is_stream(Stream), !. +open_nt_file(Spec, Stream, close(Stream)) :- + absolute_file_name(Spec, + [ access(read), + extensions([nt,'']) + ], Path), + open(Path, read, Stream). + + +% rdf_ntriple_part(+Type, -Value, ) +% +% Parse one of the fields of an ntriple. This is used for the +% SWI-Prolog Sesame (www.openrdf.org) implementation to realise +% /servlets/removeStatements. I do not think public use of this +% predicate should be stimulated. + +rdf_ntriple_part(subject, Subject) --> + subject(Subject). +rdf_ntriple_part(predicate, Predicate) --> + predicate(Predicate). +rdf_ntriple_part(object, Object) --> + predicate(Object). + + +% stream_to_triples(+Stream, -ListOfTriples) +% +% Read Stream, returning all its triples + +stream_to_triples(In, Triples) :- + read_line_to_codes(In, Line), + ( Line == end_of_file + -> Triples = [] + ; phrase(line(Triples, Tail), Line), + stream_to_triples(In, Tail) + ). + +line(Triples, Tail) --> + wss, + ( comment + -> {Triples = Tail} + ; triple(Triple) + -> {Triples = [Triple|Tail]} + ). + +comment --> + "#", !, + skip_rest. +comment --> + end_of_input. + +triple(rdf(Subject, Predicate, Object)) --> + subject(Subject), ws, wss, + predicate(Predicate), ws, wss, + object(Object), wss, ".", wss. + +subject(Subject) --> + uniref(Subject), !. +subject(Subject) --> + node_id(Subject). + +predicate(Predicate) --> + uniref(Predicate). + +object(Object) --> + uniref(Object), !. +object(Object) --> + node_id(Object). +object(Object) --> + literal(Object). + + +uniref(URI) --> + "<", + escaped_uri_codes(Codes), + ">", !, + { atom_codes(URI, Codes) + }. + +node_id(node(Id)) --> % anonymous nodes + "_:", + name_start(C0), + name_codes(Codes), + { atom_codes(Id, [C0|Codes]) + }. + +literal(Literal) --> + lang_string(Literal), !. +literal(Literal) --> + xml_string(Literal). + + +% name_start(-Code) +% name_codes(-ListfCodes) +% +% Parse identifier names + +name_start(C) --> + [C], + { code_type(C, alpha) + }. + +name_codes([C|T]) --> + [C], + { code_type(C, alnum) + }, !, + name_codes(T). +name_codes([]) --> + []. + + +% escaped_uri_codes(-CodeList) +% +% Decode string holding %xx escaped characters. + +escaped_uri_codes([]) --> + []. +escaped_uri_codes([C|T]) --> + "%", [D0,D1], !, + { code_type(D0, xdigit(V0)), + code_type(D1, xdigit(V1)), + C is V0<<4 + V1 + }, + escaped_uri_codes(T). +escaped_uri_codes([C|T]) --> + "\\u", [D0,D1,D2,D3], !, + { code_type(D0, xdigit(V0)), + code_type(D1, xdigit(V1)), + code_type(D2, xdigit(V2)), + code_type(D3, xdigit(V3)), + C is V0<<12 + V1<<8 + V2<<4 + V3 + }, + escaped_uri_codes(T). +escaped_uri_codes([C|T]) --> + [C], + escaped_uri_codes(T). + + +% lang_string() +% +% Process a language string + +lang_string(String) --> + "\"", + string(Codes), + "\"", !, + { atom_codes(Atom, Codes) + }, + ( langsep + -> language(Lang), + { String = literal(lang(Lang, Atom)) + } + ; "^^" + -> uniref(Type), + { String = literal(type(Type, Atom)) + } + ; { String = literal(Atom) + } + ). + +langsep --> + "-". +langsep --> + "@". + +% xml_string(String) +% +% Handle xml"..." + +xml_string(xml(String)) --> + "xml\"", % really no whitespace? + string(Codes), + "\"", + { atom_codes(String, Codes) + }. + +string([]) --> + []. +string([C0|T]) --> + string_char(C0), + string(T). + +string_char(0'\\) --> + "\\\\". +string_char(0'") --> + "\\\"". +string_char(10) --> + "\\n". +string_char(13) --> + "\\r". +string_char(9) --> + "\\t". +string_char(C) --> + "\\u", + '4xdigits'(C). +string_char(C) --> + "\\u", + '4xdigits'(C0), + '4xdigits'(C1), + { C is C0<<16 + C1 + }. +string_char(C) --> + [C]. + +'4xdigits'(C) --> + [C0,C1,C2,C3], + { code_type(C0, xdigit(V0)), + code_type(C1, xdigit(V1)), + code_type(C2, xdigit(V2)), + code_type(C3, xdigit(V3)), + + C is V0<<12 + V1<<8 + V2<<4 + V3 + }. + +% language(-Lang) +% +% Return xml:lang language identifier. + +language(Lang) --> + lang_code(C0), + lang_codes(Codes), + { atom_codes(Lang, [C0|Codes]) + }. + +lang_code(C) --> + [C], + { C \== 0'., + \+ code_type(C, white) + }. + +lang_codes([C|T]) --> + lang_code(C), !, + lang_codes(T). +lang_codes([]) --> + []. + + + /******************************* + * BASICS * + *******************************/ + +skip_rest(_,[]). + +ws --> + [C], + { code_type(C, white) + }. + +end_of_input([], []). + + +wss --> + ws, !, + wss. +wss --> + []. diff --git a/packages/sgml/RDF/rdf_parser.pl b/packages/sgml/RDF/rdf_parser.pl new file mode 100644 index 000000000..fa74f8f74 --- /dev/null +++ b/packages/sgml/RDF/rdf_parser.pl @@ -0,0 +1,724 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 2002-2006, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(rdf_parser, + [ xml_to_plrdf/3, % +XMLTerm, -RDFTerm, +Options + element_to_plrdf/3, % +ContentList, -RDFTerm, +Options + rdf_name_space/1 + ]). +:- use_module(rewrite). +:- use_module(library(sgml)). % xml_name/1 +:- use_module(library(lists)). +:- use_module(library(url)). +:- use_module(library(utf8)). + +:- op(500, fx, \?). % Optional (attrs) + +term_expansion(F, T) :- rew_term_expansion(F, T). +goal_expansion(F, T) :- rew_goal_expansion(F, T). + +:- multifile rdf_name_space/1. +:- dynamic rdf_name_space/1. + +%% rdf_name_space(?URL) is nondet. +% +% True if URL must be handled as rdf: Determines special handling +% of rdf:about, rdf:resource, etc. + + +rdf_name_space('http://www.w3.org/1999/02/22-rdf-syntax-ns#'). +rdf_name_space('http://www.w3.org/TR/REC-rdf-syntax'). + + +%% xml_to_plrdf(+RDFElementOrObject, -RDFTerm, +Options) +% +% Translate an XML (using namespaces) term into an Prolog term +% representing the RDF data. This term can then be fed into +% rdf_triples/[2,3] to create a list of RDF triples. +% +% if `BaseURI' == [], local URI's are not globalised. + + +xml_to_plrdf(Element, RDF, Options) :- + is_list(Element), !, + rewrite(\xml_content_objects(RDF, Options), Element). +xml_to_plrdf(Element, RDF, Options) :- + rewrite(\xml_objects(RDF, Options), Element). + +%% element_to_plrdf(+DOM, -RDFTerm, +Options) +% +% Rewrite a single XML element. + +element_to_plrdf(Element, RDF, Options) :- + rewrite(\nodeElementList(RDF, Options), [Element]). + +xml_objects(Objects, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\xml_objects(Objects, Options), E) + }. +xml_objects(Objects, Options) ::= + element((\rdf('RDF'), !), + _, + \nodeElementList(Objects, Options)), + !. +xml_objects(Objects, Options) ::= + element(_, _, \xml_content_objects(Objects, Options)). + +xml_content_objects([], _) ::= + []. +xml_content_objects([H|T], Options) ::= + [ \xml_objects(H, Options) + | \xml_content_objects(T, Options) + ]. + + +nodeElementList([], _Options) ::= + [], !. +nodeElementList(L, Options) ::= + [ (\ws, !) + | \nodeElementList(L, Options) + ]. +nodeElementList([H|T], Options) ::= + [ \nodeElementOrError(H, Options) + | \nodeElementList(T, Options) + ]. + +nodeElementOrError(H, Options) ::= + \nodeElement(H, Options), !. +nodeElementOrError(unparsed(Data), _Options) ::= + Data. + +nodeElement(container(Type, Id, Elements), Options) ::= + \container(Type, Id, Elements, Options), !. % compatibility +nodeElement(description(Type, About, BagID, Properties), Options) ::= + \description(Type, About, BagID, Properties, Options). + + + /******************************* + * DESCRIPTION * + *******************************/ + +description(Type, About, BagID, Properties, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\description(Type, About, BagID, Properties, Options), E) + }. +description(description, About, BagID, Properties, Options) ::= + element(\rdf('Description'), + \attrs([ \?idAboutAttr(About, Options), + \?bagIdAttr(BagID, Options) + | \propAttrs(PropAttrs, Options) + ]), + \propertyElts(PropElts, Options)), + { !, append(PropAttrs, PropElts, Properties) + }. +description(Type, About, BagID, Properties, Options) ::= + element(Type, + \attrs([ \?idAboutAttr(About, Options), + \?bagIdAttr(BagID, Options) + | \propAttrs(PropAttrs, Options) + ]), + \propertyElts(PropElts, Options)), + { append(PropAttrs, PropElts, Properties) + }. + +propAttrs([], _) ::= + [], !. +propAttrs([H|T], Options) ::= + [ \propAttr(H, Options) + | \propAttrs(T, Options) + ]. + +propAttr(rdf:type = URI, Options) ::= + \rdf_or_unqualified(type) = \uri(URI, Options), !. +propAttr(Name = Literal, Options) ::= + Name = Value, + { mkliteral(Value, Literal, Options) + }. + +propertyElts([], _) ::= + [], !. +propertyElts(Elts, Options) ::= + [ (\ws, !) + | \propertyElts(Elts, Options) + ]. +propertyElts([H|T], Options) ::= + [ \propertyElt(H, Options) + | \propertyElts(T, Options) + ]. + +propertyElt(E, Options) ::= + \propertyElt(Id, Name, Value, Options), + { mkprop(Name, Value, Prop), + ( var(Id) + -> E = Prop + ; E = id(Id, Prop) + ) + }. + +mkprop(NS:Local, Value, rdf:Local = Value) :- + rdf_name_space(NS), !. +mkprop(Name, Value, Name = Value). + + +propertyElt(Id, Name, Value, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\propertyElt(Id, Name, Value, Options), E) + }. + % 5.14 emptyPropertyElt +propertyElt(Id, Name, Value, Options) ::= + element(Name, A, \all_ws), + { !, + rewrite(\emptyPropertyElt(Id, Value, Options), A) + }. + +propertyElt(_, Name, description(description, Id, _, Properties), Options) ::= + element(Name, + \attrs([ \parseResource, + \?idAboutAttr(Id, Options) + ]), + \propertyElts(Properties, Options)), + !. +propertyElt(_, Name, Literal, Options) ::= + element(Name, + \attrs([ \parseLiteral + ]), + Content), + { !, + literal_value(Content, Literal, Options) + }. +propertyElt(Id, Name, collection(Elements), Options) ::= + element(Name, + \attrs([ \parseCollection, + \?idAttr(Id, Options) + ]), + \nodeElementList(Elements, Options)). +propertyElt(Id, Name, Literal, Options) ::= + element(Name, + \attrs([ \typeAttr(Type, Options), + \?idAttr(Id, Options) + ]), + Content), + { typed_literal(Type, Content, Literal, Options) + }. +propertyElt(Id, Name, Literal, Options) ::= + element(Name, + \attrs([ \?idAttr(Id, Options) + ]), + [ Value ]), + { atom(Value), !, + mkliteral(Value, Literal, Options) + }. +propertyElt(Id, Name, Value, Options) ::= + element(Name, + \attrs([ \?idAttr(Id, Options) + ]), + \an_rdf_object(Value, Options)), !. +propertyElt(Id, Name, unparsed(Value), Options) ::= + element(Name, + \attrs([ \?idAttr(Id, Options) + ]), + Value). + +emptyPropertyElt(Id, Literal, Options) ::= + \attrs([ \?idAttr(Id, Options), + \?parseLiteral + | \noMoreAttrs + ]), + { !, + mkliteral('', Literal, Options) + }. +emptyPropertyElt(Id, + description(description, About, BagID, Properties), + Options) ::= + \attrs([ \?idAttr(Id, Options), + \?aboutResourceEmptyElt(About, Options), + \?bagIdAttr(BagID, Options), + \?parseResource + | \propAttrs(Properties, Options) + ]), !. + +aboutResourceEmptyElt(about(URI), Options) ::= + \resourceAttr(URI, Options), !. +aboutResourceEmptyElt(node(URI), _Options) ::= + \nodeIDAttr(URI). + +%% literal_value(+In, -Value, +Options) +% +% Create the literal value for rdf:parseType="Literal" attributes. +% The content is the Prolog XML DOM tree for the literal. +% +% @tbd Note that the specs demand a canonical textual representation +% of the XML data as a Unicode string. For now the user can +% achieve this using the convert_typed_literal hook. + +literal_value(Value, literal(type(rdf:'XMLLiteral', Value)), _). + +%% mkliteral(+Atom, -Object, +Options) +% +% Translate attribute value Atom into an RDF object using the +% lang(Lang) option from Options. + +mkliteral(Text, literal(Val), Options) :- + atom(Text), + ( memberchk(lang(Lang), Options), + Lang \== '' + -> Val = lang(Lang, Text) + ; Val = Text + ). + +%% typed_literal(+Type, +Content, -Literal, +Options) +% +% Handle a literal attribute with rdf:datatype=Type qualifier. NB: +% possibly it is faster to use a global variable for the +% conversion hook. + +typed_literal(Type, Content, literal(Object), Options) :- + memberchk(convert_typed_literal(Convert), Options), !, + ( catch(call(Convert, Type, Content, Object), E, true) + -> ( var(E) + -> true + ; Object = E + ) + ; Object = error(cannot_convert(Type, Content), _) + ). +typed_literal(Type, [Text], literal(type(Type, Text)), _Options) :- !. +typed_literal(Type, Content, literal(type(Type, Content)), _Options). + + +idAboutAttr(id(Id), Options) ::= + \idAttr(Id, Options), !. +idAboutAttr(about(About), Options) ::= + \aboutAttr(About, Options), !. +idAboutAttr(node(About), _Options) ::= + \nodeIDAttr(About), !. +idAboutAttr(AboutEach, Options) ::= + \aboutEachAttr(AboutEach, Options). + +%% an_rdf_object(-Object, +OptionsURI) +% +% Deals with an object, but there may be spaces around. I'm still +% not sure where to deal with these. Best is to ask the XML parser +% to get rid of them, So most likely this code will change if this +% happens. + +an_rdf_object(Object, Options) ::= + [ \nodeElement(Object, Options) + ], !. +an_rdf_object(Object, Options) ::= + [ (\ws, !) + | \an_rdf_object(Object, Options) + ]. +an_rdf_object(Object, Options) ::= + [ \nodeElement(Object, Options), + \ws + ], !. + +ws ::= + A, + { atom(A), + atom_chars(A, Chars), + all_blank(Chars), ! + }. +ws ::= + pi(_). + +all_ws ::= + [], !. +all_ws ::= + [\ws | \all_ws]. + +all_blank([]). +all_blank([H|T]) :- + char_type(H, space), % SWI-Prolog specific + all_blank(T). + + + /******************************* + * RDF ATTRIBUTES * + *******************************/ + +idAttr(Id, Options) ::= + \rdf_or_unqualified('ID') = \uniqueid(Id, Options). + +bagIdAttr(Id, Options) ::= + \rdf_or_unqualified(bagID) = \globalid(Id, Options). + +aboutAttr(About, Options) ::= + \rdf_or_unqualified(about) = \uri(About, Options). + +nodeIDAttr(About) ::= + \rdf_or_unqualified(nodeID) = About. + +% Not allowed in current RDF! + +aboutEachAttr(each(AboutEach), Options) ::= + \rdf_or_unqualified(aboutEach) = \uri(AboutEach, Options), !. +aboutEachAttr(prefix(Prefix), Options) ::= + \rdf_or_unqualified(aboutEachPrefix) = \uri(Prefix, Options), !. + +resourceAttr(URI, Options) ::= + \rdf_or_unqualified(resource) = \uri(URI, Options). + +typeAttr(Type, Options) ::= + \rdf_or_unqualified(datatype) = \uri(Type, Options). + +uri(URI, Options) ::= + A, + { memberchk(base_uri(Base), Options), + Base \== [] + -> canonical_uri(A, Base, URI) + ; sub_atom(A, 0, _, _, #) + -> sub_atom(A, 1, _, 0, URI) + ; url_iri(A, URI) + }. + +globalid(Id, Options) ::= + A, + { make_globalid(A, Options, Id) + }. + +uniqueid(Id, Options) ::= + A, + { unique_xml_name(A), + make_globalid(A, Options, Id) + }. + +unique_xml_name(Name) :- + ( xml_name(Name) + -> true + ; print_message(warning, rdf(not_a_name(Name))) + ). + +make_globalid(In, Options, Id) :- + ( memberchk(base_uri(Base), Options), + Base \== [] + -> ( is_absolute_url(In) + -> url_iri(In, Id) + ; concat_atom([Base, In], #, Id0), + url_iri(Id0, Id) + ) + ; sub_atom(In, 0, _, _, #) + -> sub_atom(In, 1, _, 0, Id) + ; url_iri(In, Id) + ). + + +%% canonical_uri(+In, +Base, -Absolute) +% +% Make the URI absolute and decode special sequences. For the last +% clause, which is the correct order? + +canonical_uri('', Base, Base) :- !. % '' expands to xml:base +canonical_uri(URI0, [], URI) :- !, % do not use one + url_iri(URI0, URI). +canonical_uri(URI, Base, Global) :- % use our generic library + global_url(URI, Base, Global0), + url_iri(Global0, Global). + + + /******************************* + * CONTAINERS * + *******************************/ + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Note that containers are no longer part of the definition. We'll keep +the code and call it conditionally if we must. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +container(_, _, _, _) ::= + _, + { \+ current_prolog_flag(rdf_container, true), + !, fail + }. +container(Type, Id, Elements, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\container(Type, Id, Elements, Options), E) + }. +container(Type, Id, Elements, Options) ::= + element(\containertype(Type), + \attrs([ \?idAttr(Id, Options) + | \memberAttrs(Elements) + ]), + []), !. +container(Type, Id, Elements, Options) ::= + element(\containertype(Type), + \attrs([ \?idAttr(Id, Options) + ]), + \memberElts(Elements, Options)). + +containertype(Type) ::= + \rdf(Type), + { containertype(Type) + }. + +containertype('Bag'). +containertype('Seq'). +containertype('Alt'). + +memberElts([], _) ::= + []. +memberElts([H|T], Options) ::= + [ \memberElt(H, Options) + | \memberElts(T, Options) + ]. + +memberElt(LI, Options) ::= + \referencedItem(LI, Options). +memberElt(LI, Options) ::= + \inlineItem(LI, Options). + +referencedItem(LI, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\referencedItem(LI, Options), E) + }. +referencedItem(LI, Options) ::= + element(\rdf_or_unqualified(li), + [ \resourceAttr(LI, Options) ], + []). + +inlineItem(Item, Options0) ::= + E0, + { modify_state(E0, Options0, E, Options), !, + rewrite(\inlineItem(Item, Options), E) + }. +inlineItem(Literal, Options) ::= + element(\rdf_or_unqualified(li), + [ \parseLiteral ], + Value), + literal_value(Value, Literal, Options). +inlineItem(description(description, _, _, Properties), Options) ::= + element(\rdf_or_unqualified(li), + [ \parseResource ], + \propertyElts(Properties, Options)). +inlineItem(LI, Options) ::= + element(\rdf_or_unqualified(li), + [], + [\nodeElement(LI, Options)]), !. % inlined object +inlineItem(Literal, Options) ::= + element(\rdf_or_unqualified(li), + [], + [Text]), + { mkliteral(Text, Literal, Options) + }. + +memberAttrs([]) ::= + []. +memberAttrs([H|T]) ::= + [ \memberAttr(H) + | \memberAttrs(T) + ]. + +memberAttr(li(Id, Value)) ::= % Id should be _ + \rdf(Id) = Value. + +parseLiteral ::= \rdf_or_unqualified(parseType) = 'Literal'. +parseResource ::= \rdf_or_unqualified(parseType) = 'Resource'. +parseCollection ::= \rdf_or_unqualified(parseType) = 'Collection'. + + + /******************************* + * PRIMITIVES * + *******************************/ + +rdf(Tag) ::= + NS:Tag, + { rdf_name_space(NS), ! + }. + +rdf_or_unqualified(Tag) ::= + Tag. +rdf_or_unqualified(Tag) ::= + NS:Tag, + { rdf_name_space(NS), ! + }. + + + /******************************* + * BASICS * + *******************************/ + +attrs(Bag) ::= + L0, + { do_attrs(Bag, L0) + }. + +do_attrs([], _) :- !. +do_attrs([\?H|T], L0) :- !, % optional + ( select(X, L0, L), + rewrite(\H, X) + -> true + ; L = L0 + ), + do_attrs(T, L). +do_attrs([H|T], L0) :- + select(X, L0, L), + rewrite(H, X), !, + do_attrs(T, L). +do_attrs(C, L) :- + rewrite(C, L). + +% \noMoreAttrs +% +% Check attribute-list is empty. Reserved xml: attributes are +% excluded from this test. + +noMoreAttrs ::= + [], !. +noMoreAttrs ::= + [ xml:_=_ + | \noMoreAttrs + ]. + +%% modify_state(+Element0, +Options0, -Element, -Options) +% +% If Element0 contains xml:base = Base, strip it from the +% attributes list and update base_uri(_) in the Options +% +% It Element0 contains xml:lang = Lang, strip it from the +% attributes list and update lang(_) in the Options +% +% Remove all xmlns=_, xmlns:_=_ and xml:_=_. Only succeed +% if something changed. + +modify_state(E0, O0, E, O) :- + modify_states([base, lang, xmlns], M, E0, O0, E, O), + M \== []. + +modify_states([], [], E, O, E, O). +modify_states([How|TH0], [How|TH], E0, O0, E, O) :- + modify_state(How, E0, O0, E1, O1), !, + modify_states(TH0, TH, E1, O1, E, O). +modify_states([_|TH0], TH, E0, O0, E, O) :- + modify_states(TH0, TH, E0, O0, E, O). + + +modify_state(base, + element(Name, Attrs0, Content), Options0, + element(Name, Attrs, Content), Options) :- + select(xml:base=Base1, Attrs0, Attrs), !, + ( select(base_uri(Base0), Options0, Options1) + -> true + ; Base0 = [], + Options1 = Options0 + ), + remove_fragment(Base1, Base2), + canonical_uri(Base2, Base0, Base), + Options = [base_uri(Base)|Options1]. +modify_state(lang, element(Name, Attrs0, Content), Options0, + element(Name, Attrs, Content), Options) :- + select(xml:lang=Lang, Attrs0, Attrs), + \+ memberchk(ignore_lang(true), Options0), !, + delete(Options0, lang(_), Options1), + ( Lang == '' + -> Options = Options1 + ; Options = [lang(Lang)|Options1] + ). +modify_state(xmlns, + element(Name, Attrs0, Content), Options, + element(Name, Attrs, Content), Options) :- + clean_xmlns_attr(Attrs0, Attrs), + Attrs \== Attrs0. + +clean_xmlns_attr([], []). +clean_xmlns_attr([H=_|T0], T) :- + xml_attr(H), !, + clean_xmlns_attr(T0, T). +clean_xmlns_attr([H|T0], [H|T]) :- + clean_xmlns_attr(T0, T). + +xml_attr(xmlns). +xml_attr(xmlns:_). +xml_attr(xml:_). + + +%% remove_fragment(+URI, -WithoutFragment) +% +% When handling xml:base, we must delete the possible fragment. + +remove_fragment(URI, Plain) :- + sub_atom(URI, B, _, _, #), !, + sub_atom(URI, 0, B, _, Plain). +remove_fragment(URI, URI). + + + /******************************* + * HELP PCE-EMACS A BIT * + *******************************/ + +:- multifile + emacs_prolog_colours:term_colours/2, + emacs_prolog_colours:goal_classification/2. + +expand(c(X), _, X) :- !. +expand(In, Pattern, Colours) :- + compound(In), !, + In =.. [F|Args], + expand_list(Args, PatternArgs, ColourArgs), + Pattern =.. [F|PatternArgs], + Colours = functor(F) - ColourArgs. +expand(X, X, classify). + +expand_list([], [], []). +expand_list([H|T], [PH|PT], [CH|CT]) :- + expand(H, PH, CH), + expand_list(T, PT, CT). + +:- discontiguous + term_expansion/2. + +term_expansion(term_colours(C), + emacs_prolog_colours:term_colours(Pattern, Colours)) :- + expand(C, Pattern, Colours). + +term_colours((c(head(+(1))) ::= c(match), {c(body)})). +term_colours((c(head(+(1))) ::= c(match))). + +emacs_prolog_colours:goal_classification(\_, expanded). + +:- dynamic + prolog:meta_goal/2. +:- multifile + prolog:meta_goal/2, + prolog:called_by/2. + +prolog:meta_goal(rewrite(A, _), [A]). +prolog:meta_goal(\A, [A+1]). + +prolog:called_by(attrs(Attrs, _Term), Called) :- + findall(G+1, sub_term(\?G, Attrs), Called, Tail), + findall(G+1, sub_term(\G, Attrs), Tail). + + diff --git a/packages/sgml/RDF/rdf_test.pl b/packages/sgml/RDF/rdf_test.pl new file mode 100644 index 000000000..fd95634ce --- /dev/null +++ b/packages/sgml/RDF/rdf_test.pl @@ -0,0 +1,347 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(rdf_test, + [ suite/1, % +Test-number + test_dir/1, % +Directory + test_file/1, % +File + time_file/1, % +File + passed/1, % +Test-numberOrFile + test/0, % run whole suite + show_ok/1 % +Test + ]). + +:- multifile + user:file_search_path/2. + +user:file_search_path(library, ..). +user:file_search_path(foreign, ..). + +:- use_module(library(sgml)). +:- use_module(rdf_parser). +:- use_module(rdf_triple). +:- use_module(rdf). +:- use_module(pretty_print). + +:- set_prolog_flag(rdf_container, true). + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Test file for the SWI-Prolog RDF parser. Toplevel predicates: + + # test/0 + Run all tests from the `suite' directory and validate the + the result if the correct result is stored in a .ok file. + + # suite(N) + Run test on suite/t.rdf, showing RDF, intermediate + representation and triples on the console. + + # passed(N) + Parse suite/t.rdf and save the result in suite/t.ok + +The intention is to write tests, use suite/1 to make sure they are +parsed correctly and then run passed/1 to save the correct answer, so +running test/0 can validate all results. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +suite(N) :- + concat_atom(['suite/t', N, '.rdf'], File), + test_file(File). + +test_file(File) :- + rdf_reset_ids, + format('************* Test ~w ***~n', [File]), + cat(File), + load_structure(File, + [ RDFElement ], + [ dialect(xmlns), + space(sgml) + ]), + rdf_start_file([], Cleanup), + xml_to_plrdf(RDFElement, RDF, []), + rdf_end_file(Cleanup), + format('============= Prolog term ==============~n', []), + pretty_print(RDF), + rdf_triples(RDF, Triples), + format('============= Triples ==================~n', []), + write_triples(Triples). + +time_file(File) :- + time(load_rdf(File, Triples)), + length(Triples, Len), + format('Created ~w triples~n', [Len]). + +passed(Id) :- + integer(Id), !, + concat_atom(['suite/t', Id, '.rdf'], File), + passed(File). +passed(File) :- + rdf_reset_ids, + ok_file(File, OkFile), + load_rdf(File, Triples), + open(OkFile, write, Fd, [encoding(utf8)]), + save_triples(Triples, Fd), + close(Fd), + length(Triples, N), + format('Saved ~d triples to ~w~n', [N, OkFile]). + +:- dynamic failed/1. + +test :- + test(load_rdf), + test(process_rdf). + +test(How) :- + retractall(failed(_)), + test_dir(suite, How), + findall(F, failed(F), Failed), + ( Failed == [] + -> true + ; length(Failed, N), + format('ERROR: ~w tests failed~n', [N]), + fail + ). + + +test_dir(Dir) :- + test_dir(Dir, load_rdf). + +test_dir(Dir, How) :- + format('Tests from "~w" [~w]: ', [Dir, How]), + atom_concat(Dir, '/*.rdf', Pattern), + expand_file_name(Pattern, TestFiles), + maplist(test(How), TestFiles), + format(' done~n'). + +test(How, File) :- + format('.'), flush_output, + rdf_reset_ids, + ok_file(File, OkFile), + ( call(How, File, Triples) + -> ( catch(open(OkFile, read, Fd, [encoding(utf8)]), _, fail) + -> ( read_triples(Fd, OkTriples), + close(Fd), + compare_triples(Triples, OkTriples, _Subst) + -> true + ; assert(failed(File)), + format('~N~w: WRONG ANSWER~n', [File]) + ) + ; format('~N~w: (no .ok file)~n', [File]) + ) + ; assert(failed(File)), + format('~N~w: PARSE FAILED~n', [File]) + ). + +ok_file(File, OkFile) :- + file_base_name(File, BaseFile), + file_name_extension(Base, _, BaseFile), + file_directory_name(File, Dir), + concat_atom([Dir, /, ok, /, Base, '.ok'], OkFile). + + +save_triples([], _). +save_triples([H|T], Fd) :- + format(Fd, '~q.~n', [H]), + save_triples(T, Fd). + +read_triples(Fd, Terms) :- + read(Fd, T0), + read_triples(T0, Fd, Terms). + +read_triples(end_of_file, _, []) :- !. +read_triples(rdf(S0,P0,O0), Fd, [rdf(S,P,O)|R]) :- + global_ref(S0, S), + global_ref(P0, P), + global_obj(O0, O), + read(Fd, T1), + read_triples(T1, Fd, R). + +global_ref(rdf:Local, Global) :- + rdf_name_space(NS), !, + atom_concat(NS, Local, Global). +global_ref(NS:Local, Global) :- !, + atom_concat(NS, Local, Global). +global_ref(URI, URI). + +global_obj(literal(X), literal(X)) :- !. +global_obj(Local, Global) :- + global_ref(Local, Global). + + +write_triples([]) :- !. +write_triples([H|T]) :- !, + write_triple(H), + write_triples(T). + +write_triple(Triple) :- + is_rdf_triple(Triple), !, + Triple = rdf(S,P,O), + format('{~p, ~p, ~p}~n', [S,P,O]). +write_triple(Triple) :- + format('@@@@@ Bad Triple: ~p~n', [Triple]), + fail. + +cat(File) :- + open(File, read, Fd), + copy_stream_data(Fd, user_output), + close(Fd). + +:- dynamic triple/1. + +process_rdf(File, Triples) :- + retractall(triple(_)), + process_rdf(File, assert_triples, []), + findall(T, retract(triple(T)), Triples). + +assert_triples([], _). +assert_triples([H|T], Loc) :- + assert(triple(H)), + assert_triples(T, Loc). + + + /******************************* + * VALIDATE * + *******************************/ + +is_rdf_triple(rdf(Subject, Predicate, Object)) :- + is_subject(Subject), + is_predicate(Predicate), + is_object(Object). + +is_subject(0) :- !, fail. % Variables +is_subject(URI) :- is_uri(URI), !. +is_subject(each(URI)) :- is_uri(URI), !. +is_subject(prefix(Pattern)) :- + atom(Pattern), !. + +is_predicate(0) :- !, fail. +is_predicate(rdf:RdfPred) :- !, + is_rdf_predicate(RdfPred). +is_predicate(NS:Pred) :- !, + atom(NS), + atom(Pred). +is_predicate(Pred) :- + atom(Pred). + +is_object(0) :- !, + fail. +is_object(literal(XML)) :- !, + is_xml(XML). +is_object(rdf:RdfType) :- !, + is_rdf_type(RdfType). +is_object(URI) :- + is_uri(URI). + +is_object(Subject) :- + is_subject(Subject), !. +is_object(Pred) :- + is_predicate(Pred), !. + +is_uri(URI) :- atom(URI). + +is_xml(_XML). % for now + +is_rdf_predicate(RdfPred) :- atom(RdfPred). + +is_rdf_type(RdfType) :- atom(RdfType). + + /******************************* + * UTIL * + *******************************/ + +% find_rdf(+XMLTerm, -RDFTerm) +% +% If the document contains an embedded RDF term, return it, else +% return the whole document. The latter is a bit dubious, but good +% for the purpose of this test-file + +find_rdf(Term, RDFTerm) :- + RDFTerm = element(NS:'RDF', _, _), + term_member(RDFTerm, Term), !, + ( rdf_name_space(NS) + -> true + ; assert(rdf_parser:rdf_name_space(NS)), + assert(new_rdf_namespace(NS)) + ). +find_rdf(Term, Term). + +term_member(X, X). +term_member(X, Compound) :- + compound(Compound), + arg(_, Compound, Arg), + term_member(X, Arg). + + /******************************* + * COMPARING * + *******************************/ + +% compare_triples(+PlRDF, +NTRDF, -Substitions) +% +% Compare two models and if they are equal, return a list of +% PlID = NTID, mapping NodeID elements. + + +compare_triples(A, B, Substitutions) :- + compare_list(A, B, [], Substitutions), !. + +compare_list([], [], S, S). +compare_list([H1|T1], In2, S0, S) :- + select(H2, In2, T2), + compare_triple(H1, H2, S0, S1), + compare_list(T1, T2, S1, S). + +compare_triple(rdf(Subj1,P1,O1), rdf(Subj2, P2, O2), S0, S) :- + compare_field(Subj1, Subj2, S0, S1), + compare_field(P1, P2, S1, S2), + compare_field(O1, O2, S2, S). + +compare_field(X, X, S, S) :- !. +compare_field(literal(X), xml(X), S, S) :- !. % TBD +compare_field(rdf:Name, Atom, S, S) :- + atom(Atom), + rdf_parser:rdf_name_space(NS), + atom_concat(NS, Name, Atom), !. +compare_field(NS:Name, Atom, S, S) :- + atom(Atom), + atom_concat(NS, Name, Atom), !. +compare_field(X, Id, S, S) :- + memberchk(X=Id, S), !. +compare_field(X, Y, S, [X=Y|S]) :- + \+ memberchk(X=_, S), + node_id(X), + node_id(Y), + format('Assume ~w = ~w~n', [X, Y]). + +node_id(node(_)) :- !. +node_id(X) :- + atom(X), + generated_prefix(Prefix), + sub_atom(X, 0, _, _, Prefix), !. + +generated_prefix('Bag__'). +generated_prefix('Seq__'). +generated_prefix('Alt__'). +generated_prefix('Description__'). +generated_prefix('Statement__'). + + /******************************* + * SHOW DIAGRAM * + *******************************/ + +show_ok(Test) :- + ok_file(Test, File), + open(File, read, Fd, [encoding(utf8)]), + read_triples(Fd, OkTriples), + close(Fd), + new(D, rdf_diagram(string('Ok for %s', File))), + send(D, triples, OkTriples), + send(D, open). diff --git a/packages/sgml/RDF/rdf_triple.pl b/packages/sgml/RDF/rdf_triple.pl new file mode 100644 index 000000000..54ba6fcf8 --- /dev/null +++ b/packages/sgml/RDF/rdf_triple.pl @@ -0,0 +1,461 @@ +/* $Id$ + + Part of SWI-Prolog RDF parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(rdf_triple, + [ rdf_triples/2, % +Parsed, -Tripples + rdf_triples/3, % +Parsed, -Tripples, +Tail + rdf_reset_ids/0, % Reset gensym id's + rdf_start_file/2, % +Options, -Cleanup + rdf_end_file/1, % +Cleanup + anon_prefix/1 % Prefix for anonynmous resources + ]). +:- use_module(library(gensym)). +:- use_module(rdf_parser). + +/** Create triples from intermediate representation + +Convert the output of xml_to_rdf/3 from library(rdf) into a list of +triples of the format described below. The intermediate representation +should be regarded a proprietary representation. + + rdf(Subject, Predicate, Object). + +Where `Subject' is + + * Atom + The subject is a resource + + * each(URI) + URI is the URI of an RDF Bag + + * prefix(Pattern) + Pattern is the prefix of a fully qualified Subject URI + +And `Predicate' is + + * Atom + The predicate is always a resource + +And `Object' is + + * Atom + URI of Object resource + + * literal(Value) + Literal value (Either a single atom or parsed XML data) +*/ + +%% rdf_triples(+Term, -Triples) is det. +%% rdf_triples(+Term, -Tridpples, +Tail) is det. +% +% Convert an object as parsed by rdf.pl into a list of rdf/3 +% triples. The identifier of the main object created is returned +% by rdf_triples/3. +% +% Input is the `content' of the RDF element in the format as +% generated by load_structure(File, Term, [dialect(xmlns)]). +% rdf_triples/3 can process both individual descriptions as +% well as the entire content-list of an RDF element. The first +% mode is suitable when using library(sgml) in `call-back' mode. + +rdf_triples(RDF, Tripples) :- + rdf_triples(RDF, Tripples, []). + +rdf_triples([]) --> !, + []. +rdf_triples([H|T]) --> !, + rdf_triples(H), + rdf_triples(T). +rdf_triples(Term) --> + triples(Term, _). + +%% triples(-Triples, -Id, +In, -Tail) +% +% DGC set processing the output of xml_to_rdf/3. In Id, the identifier +% of the main description or container is returned. + +triples(container(Type, Id, Elements), Id) --> !, + { container_id(Type, Id) + }, + rdf(Id, rdf:type, rdf:Type), + container(Elements, 1, Id). +triples(description(Type, About, BagId, Props), Subject) --> + { var(About), + var(BagId), + share_blank_nodes(true) + }, !, + ( { shared_description(description(Type, Props), Subject) + } + -> [] + ; { make_id('__Description', Id) + }, + triples(description(Type, about(Id), BagId, Props), Subject), + { assert_shared_description(description(Type, Props), Subject) + } + ). +triples(description(description, IdAbout, BagId, Props), Subject) --> !, + { description_id(IdAbout, Subject) + }, + properties(Props, BagId, Subject). +triples(description(Type, IdAbout, BagId, Props), Subject) --> + { description_id(IdAbout, Subject), + name_to_type_uri(Type, TypeURI) + }, + properties([ rdf:type = TypeURI + | Props + ], BagId, Subject). +triples(unparsed(Data), Id) --> + { make_id('__Error', Id), + print_message(error, rdf(unparsed(Data))) + }, + []. + + +name_to_type_uri(NS:Local, URI) :- !, + atom_concat(NS, Local, URI). +name_to_type_uri(URI, URI). + + /******************************* + * CONTAINERS * + *******************************/ + +container([], _, _) --> + []. +container([H0|T0], N, Id) --> + li(H0, N, Id), + { NN is N + 1 + }, + container(T0, NN, Id). + +li(li(Nid, V), _, Id) --> !, + rdf(Id, rdf:Nid, V). +li(V, N, Id) --> + triples(V, VId), !, + { atom_concat('_', N, Nid) + }, + rdf(Id, rdf:Nid, VId). +li(V, N, Id) --> + { atom_concat('_', N, Nid) + }, + rdf(Id, rdf:Nid, V). + +container_id(_, Id) :- + nonvar(Id), !. +container_id(Type, Id) :- + container_base(Type, Base), + make_id(Base, Id). + +container_base('Bag', '__Bag'). +container_base('Seq', '__Seq'). +container_base('Alt', '__Alt'). + + + /******************************* + * DESCRIPTIONS * + *******************************/ + +:- thread_local + node_id/2, % nodeID --> ID + unique_id/1. % known rdf:ID + +rdf_reset_node_ids :- + retractall(node_id(_,_)), + retractall(unique_id(_)). + +description_id(Id, Id) :- + var(Id), !, + make_id('__Description', Id). +description_id(about(Id), Id). +description_id(id(Id), Id) :- + ( unique_id(Id) + -> print_message(error, rdf(redefined_id(Id))) + ; assert(unique_id(Id)) + ). +description_id(each(Id), each(Id)). +description_id(prefix(Id), prefix(Id)). +description_id(node(NodeID), Id) :- + ( node_id(NodeID, Id) + -> true + ; make_id('__Node', Id), + assert(node_id(NodeID, Id)) + ). + +properties(PlRDF, BagId, Subject) --> + { nonvar(BagId) + }, !, + rdf(BagId, rdf:type, rdf:'Bag'), + properties(PlRDF, 1, Statements, [], Subject), + fill_bag(Statements, 1, BagId). +properties(PlRDF, _BagId, Subject) --> + properties(PlRDF, 1, [], [], Subject). + + +fill_bag([], _, _) --> + []. +fill_bag([H|T], N, BagId) --> + { NN is N + 1, + atom_concat('_', N, ElemId) + }, + rdf(BagId, rdf:ElemId, H), + fill_bag(T, NN, BagId). + + +properties([], _, Bag, Bag, _) --> + []. +properties([H0|T0], N, Bag0, Bag, Subject) --> + property(H0, N, NN, Bag0, Bag1, Subject), + properties(T0, NN, Bag1, Bag, Subject). + +%% property(Property, N, NN, Subject)// is det. +% +% Generate triples for {Subject, Pred, Object}. Also generates +% triples for Object if necessary. +% +% @param Property One of +% +% * Pred = Object +% Used for normal statements +% * id(Id, Pred = Object) +% Used for reified statements + +property(Pred0 = Object, N, NN, BagH, BagT, Subject) --> % inlined object + triples(Object, Id), !, + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, Id, _, BagH, BagT). +property(Pred0 = collection(Elems), N, NN, BagH, BagT, Subject) --> !, + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, Object, _Id, BagH, BagT), + collection(Elems, Object). +property(Pred0 = Object, N, NN, BagH, BagT, Subject) --> !, + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, Object, _Id, BagH, BagT). +property(id(Id, Pred0 = Object), N, NN, BagH, BagT, Subject) --> + triples(Object, ObjectId), !, + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, ObjectId, Id, BagH, BagT). +property(id(Id, Pred0 = collection(Elems)), N, NN, BagH, BagT, Subject) --> !, + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, Object, Id, BagH, BagT), + collection(Elems, Object). +property(id(Id, Pred0 = Object), N, NN, BagH, BagT, Subject) --> + { li_pred(Pred0, Pred, N, NN) + }, + statement(Subject, Pred, Object, Id, BagH, BagT). + +%% statement(+Subject, +Pred, +Object, +Id, +BagH, -BagT) +% +% Add a statement to the model. If nonvar(Id), we reinify the +% statement using the given Id. + +statement(Subject, Pred, Object, Id, BagH, BagT) --> + rdf(Subject, Pred, Object), + { BagH = [Id|BagT] + -> statement_id(Id) + ; BagT = BagH + }, + ( { nonvar(Id) + } + -> rdf(Id, rdf:type, rdf:'Statement'), + rdf(Id, rdf:subject, Subject), + rdf(Id, rdf:predicate, Pred), + rdf(Id, rdf:object, Object) + ; [] + ). + + +statement_id(Id) :- + nonvar(Id), !. +statement_id(Id) :- + make_id('__Statement', Id). + +%% li_pred(+Pred, -Pred, +Nth, -NextNth) +% +% Transform rdf:li predicates into _1, _2, etc. + +li_pred(rdf:li, rdf:Pred, N, NN) :- !, + NN is N + 1, + atom_concat('_', N, Pred). +li_pred(Pred, Pred, N, N). + +%% collection(+Elems, -Id) +% +% Handle the elements of a collection and return the identifier +% for the whole collection in Id. + +collection([], Nil) --> + { global_ref(rdf:nil, Nil) + }. +collection([H|T], Id) --> + triples(H, HId), + { make_id('__List', Id) + }, + rdf(Id, rdf:type, rdf:'List'), + rdf(Id, rdf:first, HId), + rdf(Id, rdf:rest, TId), + collection(T, TId). + + +rdf(S0, P0, O0) --> + { global_ref(S0, S), + global_ref(P0, P), + global_obj(O0, O) + }, + [ rdf(S, P, O) ]. + + +global_ref(URI, URI) :- + var(URI), !. +global_ref(rdf:Local, Global) :- + rdf_name_space(NS), !, + atom_concat(NS, Local, Global). +global_ref(NS:Local, Global) :- !, + atom_concat(NS, Local, Global). +global_ref(URI, URI). + +global_obj(V, V) :- + var(V), !. +global_obj(literal(type(Local, X)), literal(type(Global, X))) :- !, + global_ref(Local, Global). +global_obj(literal(X), literal(X)) :- !. +global_obj(Local, Global) :- + global_ref(Local, Global). + + + /******************************* + * SHARING * + *******************************/ + +:- thread_local + shared_description/3, % +Hash, +Term, -Subject + share_blank_nodes/1, % Boolean + shared_nodes/1. % counter + +reset_shared_descriptions :- + retractall(shared_description(_,_,_)), + retractall(shared_nodes(_)). + +shared_description(Term, Subject) :- + term_hash(Term, Hash), + shared_description(Hash, Term, Subject), + ( retract(shared_nodes(N)) + -> N1 is N + 1 + ; N1 = 1 + ), + assert(shared_nodes(N1)). + + +assert_shared_description(Term, Subject) :- + term_hash(Term, Hash), + assert(shared_description(Hash, Term, Subject)). + + + /******************************* + * START/END * + *******************************/ + +%% rdf_start_file(+Options, -Cleanup) is det. +% +% Initialise for the translation of a file. + +rdf_start_file(Options, Cleanup) :- + rdf_reset_node_ids, % play safe + reset_shared_descriptions, + set_bnode_sharing(Options, C1), + set_anon_prefix(Options, C2), + add_cleanup(C1, C2, Cleanup). + +%% rdf_end_file(:Cleanup) is det. +% +% Cleanup reaching the end of an RDF file. + +rdf_end_file(Cleanup) :- + rdf_reset_node_ids, + ( shared_nodes(N) + -> print_message(informational, rdf(shared_blank_nodes(N))) + ; true + ), + reset_shared_descriptions, + Cleanup. + +set_bnode_sharing(Options, erase(Ref)) :- + option(blank_nodes(Share), Options, noshare), + ( Share == share + -> assert(share_blank_nodes(true), Ref), ! + ; Share == noshare + -> fail % next clause + ; throw(error(domain_error(share, Share), _)) + ). +set_bnode_sharing(_, true). + +set_anon_prefix(Options, erase(Ref)) :- + option(base_uri(BaseURI), Options, []), + BaseURI \== [], !, + concat_atom(['__', BaseURI, '#'], AnonBase), + asserta(anon_prefix(AnonBase), Ref). +set_anon_prefix(_, true). + +add_cleanup(true, X, X) :- !. +add_cleanup(X, true, X) :- !. +add_cleanup(X, Y, (X, Y)). + + + /******************************* + * UTIL * + *******************************/ + +%% anon_prefix(-Prefix) is semidet. +% +% If defined, it is the prefix used to generate a blank node. + +:- thread_local + anon_prefix/1. + +make_id(For, ID) :- + anon_prefix(Prefix), !, + atom_concat(Prefix, For, Base), + gensym(Base, ID). +make_id(For, ID) :- + gensym(For, ID). + +anon_base('__Bag'). +anon_base('__Seq'). +anon_base('__Alt'). +anon_base('__Description'). +anon_base('__Statement'). +anon_base('__List'). +anon_base('__Node'). + +%% rdf_reset_ids is det. +% +% Utility predicate to reset the gensym counters for the various +% generated identifiers. This simplifies debugging and matching +% output with the stored desired output (see rdf_test.pl). + +rdf_reset_ids :- + anon_prefix(Prefix), !, + ( anon_base(Base), + atom_concat(Prefix, Base, X), + reset_gensym(X), + fail + ; true + ). +rdf_reset_ids :- + ( anon_base(Base), + reset_gensym(Base), + fail + ; true + ). diff --git a/packages/sgml/RDF/rdf_write.pl b/packages/sgml/RDF/rdf_write.pl new file mode 100644 index 000000000..e2c86c29a --- /dev/null +++ b/packages/sgml/RDF/rdf_write.pl @@ -0,0 +1,635 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: J.Wielemak@uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 2004-2009, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(rdf_write, + [ rdf_write_xml/2 % +Stream, +Triples + ]). +:- use_module(library('semweb/rdf_db')). +:- use_module(library(lists)). +:- use_module(library(sgml)). +:- use_module(library(sgml_write)). +:- use_module(library(assoc)). +:- use_module(library(pairs)). +:- use_module(library(debug)). + + +/** Write RDF/XML from a list of triples + +This module writes an RDF/XML document from a list of triples of the +format rdf(Subject, Predicate, Object). It is primarily intended for +communicating computed RDF model fragments to external programs using +RDF/XML. + +When used from the HTTP library, use the following code: + +== +reply_graph(RDF) :- + format('Content-type: application/rdf+xml; charset=UTF-8~n~n'), + rdf_write_xml(current_output, RDF). +== + +@author Jan Wielemaker +@see library(semweb/rdf_db) offers saving a named graph directly from + the RDF database. +*/ + + + /******************************* + * WRITE RDFXML * + *******************************/ + +%% rdf_write_xml(+Out:stream, +Triples:list(rdf(S,P,O))) is det. +% +% Write an RDF/XML serialization of Triples to Out. + +rdf_write_xml(Out, Triples) :- + sort(Triples, Unique), + rdf_write_header(Out, Unique), + node_id_map(Unique, AnonIDs), + rdf_write_triples(Unique, AnonIDs, Out), + rdf_write_footer(Out). + + + /******************************* + * HEADER/FOOTER * + *******************************/ + +%% rdf_write_header(+Out, +Triples) +% +% Save XML document header, doctype and open the RDF environment. +% This predicate also sets up the namespace notation. + +rdf_write_header(Out, Triples) :- + xml_encoding(Out, Enc, Encoding), + format(Out, '~n', [Encoding]), + format(Out, '', [Id, NSText]), + fail + ; true + ), + format(Out, '~N]>~n~n', []), + format(Out, '~n', []). + + +xml_encoding(Out, Enc, Encoding) :- + stream_property(Out, encoding(Enc)), + ( xml_encoding_name(Enc, Encoding) + -> true + ; throw(error(domain_error(rdf_encoding, Enc), _)) + ). + +xml_encoding_name(ascii, 'US-ASCII'). +xml_encoding_name(iso_latin_1, 'ISO-8859-1'). +xml_encoding_name(utf8, 'UTF-8'). + +%% xml_escape_parameter_entity(+In, -Out) is det. +% +% Escape % as % for entity declarations. + +xml_escape_parameter_entity(In, Out) :- + sub_atom(In, _, _, _, '%'), !, + atom_codes(In, Codes), + phrase(escape_parent(Codes), OutCodes), + atom_codes(Out, OutCodes). +xml_escape_parameter_entity(In, In). + +escape_parent([]) --> []. +escape_parent([H|T]) --> + ( { H == 37 } + -> "%" + ; [H] + ), + escape_parent(T). + +%% used_namespaces(+Triples:list(rdf(S,P,O)), -List:atom) is det. +% +% Return the list of namespace abbreviations used in a set of +% triples. + +used_namespaces(Triples, NSList) :- + decl_used_predicate_ns(Triples), + resources(Triples, Resources), + empty_assoc(A0), + put_assoc(rdf, A0, *, A1), % needed for rdf:RDF + res_used_namespaces(Resources, _NoNS, A1, A), + assoc_to_keys(A, NSList). + + +res_used_namespaces([], [], A, A). +res_used_namespaces([Resource|T], NoNS, A0, A) :- + ns(NS, Full), + Full \== '', + atom_concat(Full, _Local, Resource), !, + put_assoc(NS, A0, *, A1), + res_used_namespaces(T, NoNS, A1, A). +res_used_namespaces([R|T0], [R|T], A0, A) :- + res_used_namespaces(T0, T, A0, A). + +%% resources(+Triples:list(rdf(S,P,O)), -Resources:list(atom)) is det. +% +% Resources is the set of resources referenced in Triples. + +resources(Triples, Resources) :- + phrase(resources(Triples), Raw), + sort(Raw, Resources). + +resources([]) --> + []. +resources([rdf(S,P,O)|T]) --> + [S,P], + object_resources(O), + resources(T). + +object_resources(Atom) --> + { atom(Atom) }, !, + [ Atom ]. +object_resources(literal(type(Type, _))) --> !, + [ Type ]. +object_resources(_) --> + []. + +%% decl_used_predicate_ns(+Triples:list(rdf(S,P,O))) +% +% For every URL used as a predicate we *MUST* define a namespace +% as we cannot use names holding /, :, etc. as XML identifiers. + +:- thread_local + predicate_ns/2. + +decl_used_predicate_ns(Triples) :- + retractall(predicate_ns(_,_)), + ( member(rdf(_,P,_), Triples), + decl_predicate_ns(P), + fail + ; true + ). + +decl_predicate_ns(Pred) :- + predicate_ns(Pred, _), !. +decl_predicate_ns(Pred) :- + rdf_global_id(NS:_Local, Pred), + assert(predicate_ns(Pred, NS)), !. +decl_predicate_ns(Pred) :- + is_bag_li_predicate(Pred), !. +decl_predicate_ns(Pred) :- + atom_codes(Pred, Codes), + append(NSCodes, LocalCodes, Codes), + xml_codes(LocalCodes), !, + ( NSCodes \== [] + -> atom_codes(NS, NSCodes), + ( ns(Id, NS) + -> assert(predicate_ns(Pred, Id)) + ; between(1, infinite, N), + atom_concat(ns, N, Id), + \+ ns(Id, _) + -> rdf_register_ns(Id, NS), + print_message(informational, + rdf(using_namespace(Id, NS))) + ), + assert(predicate_ns(Pred, Id)) + ; assert(predicate_ns(Pred, -)) % no namespace used + ). + +xml_codes([]). +xml_codes([H|T]) :- + xml_code(H), + xml_codes(T). + +xml_code(X) :- + code_type(X, csym), !. +xml_code(0'-). % ' + + +rdf_write_footer(Out) :- + format(Out, '~n', []). + + + /******************************* + * ANONYMOUS IDS * + *******************************/ + +%% node_id_map(+Triples, -IdMap) is det. +% +% Create an assoc Resource -> NodeID for those anonymous resources +% in Triples that need a NodeID. This implies all anonymous +% resources that are used multiple times as object value. + +node_id_map(Triples, IdMap) :- + anonymous_objects(Triples, Objs), + msort(Objs, Sorted), + empty_assoc(IdMap0), + nodeid_map(Sorted, 0, IdMap0, IdMap). + +anonymous_objects([], []). +anonymous_objects([rdf(_,_,O)|T0], Anon) :- + rdf_is_bnode(O), !, + Anon = [O|T], + anonymous_objects(T0, T). +anonymous_objects([_|T0], T) :- + anonymous_objects(T0, T). + +nodeid_map([], _, Map, Map). +nodeid_map([H,H|T0], Id, Map0, Map) :- !, + remove_leading(H, T0, T), + atom_concat(bn, Id, NodeId), + put_assoc(H, Map0, NodeId, Map1), + Id2 is Id + 1, + nodeid_map(T, Id2, Map1, Map). +nodeid_map([_|T], Id, Map0, Map) :- + nodeid_map(T, Id, Map0, Map). + +remove_leading(H, [H|T0], T) :- !, + remove_leading(H, T0, T). +remove_leading(_, T, T). + + + /******************************* + * TRIPLES * + *******************************/ + +rdf_write_triples(Triples, NodeIDs, Out) :- + rdf_write_triples(Triples, NodeIDs, Out, [], Anon), + rdf_write_anon(Anon, NodeIDs, Out, Anon). + +rdf_write_triples([], _, _, Anon, Anon). +rdf_write_triples([H|T0], NodeIDs, Out, Anon0, Anon) :- + arg(1, H, S), + subject_triples(S, [H|T0], T, OnSubject), + ( rdf_is_bnode(S) + -> rdf_write_triples(T, NodeIDs, Out, [anon(S,_,OnSubject)|Anon0], Anon) + ; rdf_write_subject(OnSubject, S, NodeIDs, Out, Anon0), + rdf_write_triples(T, NodeIDs, Out, Anon0, Anon) + ). + +subject_triples(S, [H|T0], T, [H|M]) :- + arg(1, H, S), !, + subject_triples(S, T0, T, M). +subject_triples(_, T, T, []). + + +rdf_write_anon([], _, _, _). +rdf_write_anon([anon(Subject, Done, Triples)|T], NodeIDs, Out, Anon) :- + Done \== true, !, + Done = true, + rdf_write_subject(Triples, Subject, NodeIDs, Out, Anon), + rdf_write_anon(T, NodeIDs, Out, Anon). +rdf_write_anon([_|T], NodeIDs, Out, Anon) :- + rdf_write_anon(T, NodeIDs, Out, Anon). + +rdf_write_subject(Triples, Subject, NodeIDs, Out, Anon) :- + rdf_write_subject(Triples, Out, Subject, NodeIDs, -, 0, Anon), !, + format(Out, '~n', []). +rdf_write_subject(_, Subject, _, _, _) :- + throw(error(rdf_save_failed(Subject), 'Internal error')). + +rdf_write_subject(Triples, Out, Subject, NodeIDs, DefNS, Indent, Anon) :- + rdf_equal(rdf:type, RdfType), + select(rdf(_, RdfType,Type), Triples, Triples1), + rdf_id(Type, DefNS, TypeId), + xml_is_name(TypeId), !, + format(Out, '~*|<', [Indent]), + rdf_write_id(Out, TypeId), + save_about(Out, Subject, NodeIDs), + save_attributes(Triples1, DefNS, Out, NodeIDs, TypeId, Indent, Anon). +rdf_write_subject(Triples, Out, Subject, NodeIDs, _DefNS, Indent, Anon) :- + format(Out, '~*| format(Out,' rdf:nodeID="~w"', [NodeID]) + ; true + ). +save_about(Out, Subject, _) :- + stream_property(Out, encoding(Encoding)), + rdf_value(Subject, QSubject, Encoding), + format(Out, ' rdf:about="~w"', [QSubject]), !. +save_about(_, _, _) :- + assertion(fail). + +%% save_attributes(+List, +DefNS, +Out, +NodeIDs, Element, +Indent, +Anon) +% +% Save the attributes. Short literal attributes are saved in the +% tag. Others as the content of the description element. The +% begin tag has already been filled. + +save_attributes(Triples, DefNS, Out, NodeIDs, Element, Indent, Anon) :- + split_attributes(Triples, InTag, InBody), + SubIndent is Indent + 2, + save_attributes2(InTag, DefNS, tag, Out, NodeIDs, SubIndent, Anon), + ( InBody == [] + -> format(Out, '/>~n', []) + ; format(Out, '>~n', []), + save_attributes2(InBody, _, body, Out, NodeIDs, SubIndent, Anon), + format(Out, '~N~*|~n', [Indent, Element]) + ). + +% split_attributes(+Triples, -HeadAttrs, -BodyAttr) +% +% Split attribute (Name=Value) list into attributes for the head +% and body. Attributes can only be in the head if they are literal +% and appear only one time in the attribute list. + +split_attributes(Triples, HeadAttr, BodyAttr) :- + duplicate_attributes(Triples, Dupls, Singles), + simple_literal_attributes(Singles, HeadAttr, Rest), + append(Dupls, Rest, BodyAttr). + +% duplicate_attributes(+Attrs, -Duplicates, -Singles) +% +% Extract attributes that appear more than onces as we cannot +% dublicate an attribute in the head according to the XML rules. + +duplicate_attributes([], [], []). +duplicate_attributes([H|T], Dupls, Singles) :- + arg(2, H, Name), + named_attributes(Name, T, D, R), + D \== [], + append([H|D], Dupls2, Dupls), !, + duplicate_attributes(R, Dupls2, Singles). +duplicate_attributes([H|T], Dupls2, [H|Singles]) :- + duplicate_attributes(T, Dupls2, Singles). + +named_attributes(_, [], [], []) :- !. +named_attributes(Name, [H|T], D, R) :- + ( arg(2, H, Name) + -> D = [H|DT], + named_attributes(Name, T, DT, R) + ; R = [H|RT], + named_attributes(Name, T, D, RT) + ). + +% simple_literal_attributes(+Attributes, -Inline, -Body) +% +% Split attributes for (literal) attributes to be used in the +% begin-tag and ones that have to go into the body of the description. + +simple_literal_attributes([], [], []). +simple_literal_attributes([H|TA], [H|TI], B) :- + in_tag_attribute(H), !, + simple_literal_attributes(TA, TI, B). +simple_literal_attributes([H|TA], I, [H|TB]) :- + simple_literal_attributes(TA, I, TB). + +in_tag_attribute(rdf(_,P,literal(Text))) :- + atom(Text), % may not have lang qualifier + atom_length(Text, Len), + Len < 60, + \+ is_bag_li_predicate(P). + + +% save_attributes(+List, +DefNS, +TagOrBody, +Out, +NodeIDs, +Indent, +Anon) +% +% Save a list of attributes. + +save_attributes2([], _, _, _, _, _, _). +save_attributes2([H|T], DefNS, Where, Out, NodeIDs, Indent, Anon) :- + save_attribute(Where, H, DefNS, Out, NodeIDs, Indent, Anon), + save_attributes2(T, DefNS, Where, Out, NodeIDs, Indent, Anon). + +%% save_attribute(+Where, +Triple, +DefNS, +Out, +NodeIDs, +Indent, +Anon) + +save_attribute(tag, rdf(_, Name, literal(Value)), DefNS, Out, _, Indent, _Anon) :- + AttIndent is Indent + 2, + rdf_att_id(Name, DefNS, NameText), + stream_property(Out, encoding(Encoding)), + xml_quote_attribute(Value, QVal, Encoding), + format(Out, '~N~*|', [AttIndent]), + rdf_write_id(Out, NameText), + format(Out, '="~w"', [QVal]). +save_attribute(body, rdf(_,Name,literal(Literal)), DefNS, Out, _, Indent, _) :- !, + rdf_p_id(Name, DefNS, NameText), + format(Out, '~N~*|<', [Indent]), + rdf_write_id(Out, NameText), + ( Literal = lang(Lang, Value) + -> rdf_id(Lang, DefNS, LangText), + format(Out, ' xml:lang="~w">', [LangText]) + ; Literal = type(Type, Value) + -> ( rdf_equal(Type, rdf:'XMLLiteral') + -> write(Out, ' rdf:parseType="Literal">'), + Value = Literal + ; stream_property(Out, encoding(Encoding)), + rdf_value(Type, QVal, Encoding), + format(Out, ' rdf:datatype="~w">', [QVal]) + ) + ; atomic(Literal) + -> write(Out, '>'), + Value = Literal + ; write(Out, ' rdf:parseType="Literal">'), + Value = Literal + ), + save_attribute_value(Value, Out, Indent), + write(Out, ''). +save_attribute(body, rdf(_, Name, Value), DefNS, Out, NodeIDs, Indent, Anon) :- + rdf_is_bnode(Value), + memberchk(anon(Value, Done, ValueTriples), Anon), !, + rdf_p_id(Name, DefNS, NameText), + format(Out, '~N~*|<', [Indent]), + rdf_write_id(Out, NameText), + ( var(Done) + -> Done = true, + SubIndent is Indent + 2, + ( rdf_equal(RdfType, rdf:type), + rdf_equal(ListClass, rdf:'List'), + memberchk(rdf(_, RdfType, ListClass), ValueTriples) + -> format(Out, ' rdf:parseType="Collection">~n', []), + rdf_save_list(ValueTriples, Out, Value, NodeIDs, DefNS, SubIndent, Anon) + ; format(Out, '>~n', []), + rdf_write_subject(ValueTriples, Out, Value, NodeIDs, DefNS, SubIndent, Anon) + ), + format(Out, '~N~*|~n', []) + ; get_assoc(Value, NodeIDs, NodeID) + -> format(Out, ' rdf:nodeID="~w"/>', [NodeID]) + ; assertion(fail) + ). +save_attribute(body, rdf(_, Name, Value), DefNS, Out, _, Indent, _Anon) :- + stream_property(Out, encoding(Encoding)), + rdf_value(Value, QVal, Encoding), + rdf_p_id(Name, DefNS, NameText), + format(Out, '~N~*|<', [Indent]), + rdf_write_id(Out, NameText), + format(Out, ' rdf:resource="~w"/>', [QVal]). + +save_attribute_value(Value, Out, _) :- % strings + atom(Value), !, + stream_property(Out, encoding(Encoding)), + xml_quote_cdata(Value, QVal, Encoding), + write(Out, QVal). +save_attribute_value(Value, Out, _) :- % numbers + number(Value), !, + writeq(Out, Value). % quoted: preserve floats +save_attribute_value(Value, Out, Indent) :- + xml_is_dom(Value), !, + XMLIndent is Indent+2, + xml_write(Out, Value, + [ header(false), + indent(XMLIndent) + ]). +save_attribute_value(Value, _Out, _) :- + throw(error(save_attribute_value(Value), _)). + +rdf_save_list(_, _, List, _, _, _, _) :- + rdf_equal(List, rdf:nil), !. +rdf_save_list(ListTriples, Out, List, NodeIDs, DefNS, Indent, Anon) :- + rdf_equal(RdfFirst, rdf:first), + memberchk(rdf(List, RdfFirst, First), ListTriples), + ( rdf_is_bnode(First), + memberchk(anon(First, true, FirstTriples), Anon) + -> nl(Out), + rdf_write_subject(FirstTriples, Out, First, NodeIDs, DefNS, Indent, Anon) + ; stream_property(Out, encoding(Encoding)), + rdf_value(First, QVal, Encoding), + format(Out, '~N~*|', + [Indent, QVal]) + ), + ( rdf_equal(RdfRest, rdf:rest), + memberchk(rdf(List, RdfRest, List2), ListTriples), + \+ rdf_equal(List2, rdf:nil), + memberchk(anon(List2, true, List2Triples), Anon) + -> rdf_save_list(List2Triples, Out, List2, NodeIDs, DefNS, Indent, Anon) + ; true + ). + +%% rdf_p_id(+Resource, +DefNS, -NSLocal) +% +% As rdf_id/3 for predicate names. Maps _: to rdf:li. +% +% @tbd Ensure we are talking about an rdf:Bag + +rdf_p_id(LI, _, 'rdf:li') :- + is_bag_li_predicate(LI), !. +rdf_p_id(Resource, DefNS, NSLocal) :- + rdf_id(Resource, DefNS, NSLocal). + +%% is_bag_li_predicate(+Pred) is semidet. +% +% True if Pred is _:N, as used for members of an rdf:Bag, rdf:Seq +% or rdf:Alt. + +is_bag_li_predicate(Pred) :- + atom_concat('_:', AN, Pred), + catch(atom_number(AN, N), _, true), integer(N), N >= 0, !. + + +%% rdf_id(+Resource, +DefNS, -NSLocal) +% +% Generate a NS:Local name for Resource given the indicated +% default namespace. This call is used for elements. + +rdf_id(Id, NS, NS:Local) :- + ns(NS, Full), + Full \== '', + atom_concat(Full, Local, Id), !. +rdf_id(Id, _, NS:Local) :- + ns(NS, Full), + Full \== '', + atom_concat(Full, Local, Id), !. +rdf_id(Id, _, Id). + + +%% rdf_write_id(+Out, +NSLocal) is det. +% +% Write an identifier. We cannot use native write on it as both NS +% and Local can be operators. + +rdf_write_id(Out, NS:Local) :- !, + format(Out, '~w:~w', [NS, Local]). +rdf_write_id(Out, Atom) :- + write(Out, Atom). + + +rdf_att_id(Id, _, NS:Local) :- + ns(NS, Full), + Full \== '', + atom_concat(Full, Local, Id), !. +rdf_att_id(Id, _, Id). + + +%% rdf_value(+Resource, -Text, +Encoding) +% +% According to "6.4 RDF URI References" of the RDF Syntax +% specification, a URI reference is UNICODE string not containing +% control sequences, represented as UTF-8 and then as escaped +% US-ASCII. +% +% NOTE: the to_be_described/1 trick ensures entity rewrite in +% resources that start with 'http://t-d-b.org?'. This is a of a +% hack to save the artchive data in the MultimediaN project. We +% should use a more general mechanism. + +rdf_value(V, Text, Encoding) :- + to_be_described(Prefix), + atom_concat(Prefix, V1, V), + ns(NS, Full), + atom_concat(Full, Local, V1), !, + rdf_quote_uri(Local, QLocal0), + xml_quote_attribute(QLocal0, QLocal, Encoding), + concat_atom([Prefix, '&', NS, (';'), QLocal], Text). +rdf_value(V, Text, Encoding) :- + ns(NS, Full), + atom_concat(Full, Local, V), !, + rdf_quote_uri(Local, QLocal0), + xml_quote_attribute(QLocal0, QLocal, Encoding), + concat_atom(['&', NS, (';'), QLocal], Text). +rdf_value(V, Q, Encoding) :- + rdf_quote_uri(V, Q0), + xml_quote_attribute(Q0, Q, Encoding). + +to_be_described('http://t-d-b.org?'). + + + /******************************* + * UTIL * + *******************************/ + +ns(Id, Full) :- + rdf_db:ns(Id, Full). diff --git a/packages/sgml/RDF/rdfs.rdfs b/packages/sgml/RDF/rdfs.rdfs new file mode 100644 index 000000000..45260cf2a --- /dev/null +++ b/packages/sgml/RDF/rdfs.rdfs @@ -0,0 +1,237 @@ + + + + +]> + + + + + + Resource + The class resource, everything. + + + + + type + Indicates membership of a class + + + + + + + Class + The concept of Class + + + + + + subClassOf + Indicates membership of a class + + + + + + + subPropertyOf + Indicates specialization of properties + + + + + + + Property + The concept of a property. + + + + + + comment + Use this for descriptions + + + + + + + label + Provides a human-readable version of a resource name. + + + + + + + domain + A domain class for a property type + + + + + + + range + A range class for a property type + + + + + + + seeAlso + A resource that provides information about the subject resource + + + + + + + + + isDefinedBy + Indicates the namespace of a resource + + + + + + + Literal + This represents the set of atomic values, eg. textual strings. + + + + + Statement + + The class of RDF statements. + + + + + subject + The subject of an RDF statement. + + + + + + + predicate + the predicate of an RDF statement. + + + + + + + object + The object of an RDF statement. + + + + + + Container + + This represents the set Containers. + + + + + Bag + An unordered collection. + + + + + + Seq + An ordered collection. + + + + + + Alt + A collection of alternatives. + + + + + + ContainerMembershipProperty + The container membership properties, rdf:1, rdf:2, ..., all of which are sub-properties of 'member'. + + + + + + member + a member of a container + + + + + + value + Identifies the principal value (usually a string) of a property when the property value is a structured resource + + + + + + + + List + The class of RDF Lists + + + + + nil + The empty list, with no items in it. If the rest of a list is nil then the list has no more items in it. + + + + + first + The first item in an RDF list. Also often called the head. + + + + + + rest + The rest of an RDF list after the first item. Also often called the tail. + + + + + + + Datatype + The class of datatypes. + + + + + XMLLiteral + The class of XML literals. + + + + + + + + diff --git a/packages/sgml/RDF/rewrite.pl b/packages/sgml/RDF/rewrite.pl new file mode 100644 index 000000000..6c10ca531 --- /dev/null +++ b/packages/sgml/RDF/rewrite.pl @@ -0,0 +1,144 @@ +/* $Id$ + + Part of XPCE + Designed and implemented by Anjo Anjewierden and Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + + Copyright (C) 2000 University of Amsterdam. All rights reserved. +*/ + +:- module(rewrite, + [ rewrite/2, % +Rule, +Input + rew_term_expansion/2, + rew_goal_expansion/2 + ]). +:- use_module(library(quintus)). + +:- meta_predicate + rewrite(:, +). +:- op(1200, xfx, user:(::=)). + + + /******************************* + * COMPILATION * + *******************************/ + +rew_term_expansion((Rule ::= RuleBody), (Head :- Body)) :- + translate(RuleBody, Term, Body0), + simplify(Body0, Body), + Rule =.. List, + append(List, [Term], L2), + Head =.. L2. + +rew_goal_expansion(rewrite(To, From), Goal) :- + nonvar(To), + To = \Rule, + compound(Rule), + Rule =.. List, + append(List, [From], List2), + Goal =.. List2. + + + /******************************* + * TOPLEVEL * + *******************************/ + +%% rewrite(?To, +From) +% +% Invoke the term-rewriting system + +rewrite(To, From) :- + strip_module(To, M, T), + ( var(T) + -> From = T + ; T = \Rule + -> call(M:Rule, From) + ; match(To, M, From) + ). + +match(Rule, M, From) :- + translate(Rule, From, Code), + M:Code. + +translate(Var, Var, true) :- + var(Var), !. +translate((\Command, !), Var, (Goal, !)) :- !, + ( callable(Command), + Command =.. List + -> append(List, [Var], L2), + Goal =.. L2 + ; Goal = rewrite(\Command, Var) + ). +translate(\Command, Var, Goal) :- !, + ( callable(Command), + Command =.. List + -> append(List, [Var], L2), + Goal =.. L2 + ; Goal = rewrite(\Command, Var) + ). +translate(Atomic, Atomic, true) :- + atomic(Atomic), !. +translate(C, _, Cmd) :- + command(C, Cmd), !. +translate((A, B), T, Code) :- + ( command(A, Cmd) + -> !, translate(B, T, C), + Code = (Cmd, C) + ; command(B, Cmd) + -> !, translate(A, T, C), + Code = (C, Cmd) + ). +translate(Term0, Term, Command) :- + functor(Term0, Name, Arity), + functor(Term, Name, Arity), + translate_args(0, Arity, Term0, Term, Command). + +translate_args(N, N, _, _, true) :- !. +translate_args(I0, Arity, T0, T1, (C0,C)) :- + I is I0 + 1, + arg(I, T0, A0), + arg(I, T1, A1), + translate(A0, A1, C0), + translate_args(I, Arity, T0, T1, C). + +command(0, _) :- !, % catch variables + fail. +command({A}, A). +command(!, !). + + /******************************* + * SIMPLIFY * + *******************************/ + +%% simplify(+Raw, -Simplified) +% +% Get rid of redundant `true' goals generated by translate/3. + +simplify(V, V) :- + var(V), !. +simplify((A0,B), A) :- + B == true, !, + simplify(A0, A). +simplify((A,B0), B) :- + A == true, !, + simplify(B0, B). +simplify((A0, B0), C) :- !, + simplify(A0, A), + simplify(B0, B), + ( ( A \== A0 + ; B \== B0 + ) + -> simplify((A,B), C) + ; C = (A,B) + ). +simplify(X, X). + + /******************************* + * XREF * + *******************************/ + +:- multifile + prolog:called_by/2. + +prolog:called_by(rewrite(Spec, _Term), Called) :- + findall(G+1, sub_term(\G, Spec), Called). diff --git a/packages/sgml/RDF/suite/ex_19.rdf b/packages/sgml/RDF/suite/ex_19.rdf new file mode 100644 index 000000000..2c67bb089 --- /dev/null +++ b/packages/sgml/RDF/suite/ex_19.rdf @@ -0,0 +1,12 @@ + + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/ok/ex_19.ok b/packages/sgml/RDF/suite/ok/ex_19.ok new file mode 100644 index 000000000..2086c8b9a --- /dev/null +++ b/packages/sgml/RDF/suite/ok/ex_19.ok @@ -0,0 +1,14 @@ +rdf('http://example.org/basket', 'http://example.org/stuff/1.0/':hasFruit, '__List1'). +rdf(l1, rdf:type, rdf:'Statement'). +rdf(l1, rdf:subject, 'http://example.org/basket'). +rdf(l1, rdf:predicate, 'http://example.org/stuff/1.0/':hasFruit). +rdf(l1, rdf:object, '__List1'). +rdf('__List1', rdf:type, rdf:'List'). +rdf('__List1', rdf:first, 'http://example.org/banana'). +rdf('__List1', rdf:rest, '__List2'). +rdf('__List2', rdf:type, rdf:'List'). +rdf('__List2', rdf:first, 'http://example.org/apple'). +rdf('__List2', rdf:rest, '__List3'). +rdf('__List3', rdf:type, rdf:'List'). +rdf('__List3', rdf:first, 'http://example.org/pear'). +rdf('__List3', rdf:rest, rdf:nil). diff --git a/packages/sgml/RDF/suite/ok/t1.ok b/packages/sgml/RDF/suite/ok/t1.ok new file mode 100644 index 000000000..4234741be --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t1.ok @@ -0,0 +1 @@ +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', literal('Ora Lassila')). diff --git a/packages/sgml/RDF/suite/ok/t10.ok b/packages/sgml/RDF/suite/ok/t10.ok new file mode 100644 index 000000000..b27554c1c --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t10.ok @@ -0,0 +1,4 @@ +rdf(pages, rdf:type, rdf:'Bag'). +rdf(pages, rdf:'_1', 'http://foo.org/foo.html'). +rdf(pages, rdf:'_2', 'http://bar.org/bar.html'). +rdf(each(pages), 'http://purl.org/metadata/dublin_core/':'Creator', literal('Ora Lassila')). diff --git a/packages/sgml/RDF/suite/ok/t11.ok b/packages/sgml/RDF/suite/ok/t11.ok new file mode 100644 index 000000000..54903a32c --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t11.ok @@ -0,0 +1 @@ +rdf('http://www.swi.psy.uva.nl/projects/SWI-Prolog/', 'http://description.org/schema/':'Creator', literal('Jan Wielemaker')). diff --git a/packages/sgml/RDF/suite/ok/t12.ok b/packages/sgml/RDF/suite/ok/t12.ok new file mode 100644 index 000000000..4234741be --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t12.ok @@ -0,0 +1 @@ +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', literal('Ora Lassila')). diff --git a/packages/sgml/RDF/suite/ok/t13.ok b/packages/sgml/RDF/suite/ok/t13.ok new file mode 100644 index 000000000..4234741be --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t13.ok @@ -0,0 +1 @@ +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', literal('Ora Lassila')). diff --git a/packages/sgml/RDF/suite/ok/t14.ok b/packages/sgml/RDF/suite/ok/t14.ok new file mode 100644 index 000000000..e842e0cf9 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t14.ok @@ -0,0 +1,3 @@ +rdf('http://www.w3.org', 'http://description.org/schema/':'Publisher', literal('World Wide Web Consortium')). +rdf('http://www.w3.org', 'http://description.org/schema/':'Title', literal('W3C Home Page')). +rdf('http://www.w3.org', 'http://description.org/schema/':'Date', literal('1998-10-03T02:27')). diff --git a/packages/sgml/RDF/suite/ok/t15.ok b/packages/sgml/RDF/suite/ok/t15.ok new file mode 100644 index 000000000..e842e0cf9 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t15.ok @@ -0,0 +1,3 @@ +rdf('http://www.w3.org', 'http://description.org/schema/':'Publisher', literal('World Wide Web Consortium')). +rdf('http://www.w3.org', 'http://description.org/schema/':'Title', literal('W3C Home Page')). +rdf('http://www.w3.org', 'http://description.org/schema/':'Date', literal('1998-10-03T02:27')). diff --git a/packages/sgml/RDF/suite/ok/t16.ok b/packages/sgml/RDF/suite/ok/t16.ok new file mode 100644 index 000000000..607ee12ea --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t16.ok @@ -0,0 +1,3 @@ +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', 'http://www.w3.org/staffId/85740'). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Name', literal('Ora Lassila')). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Email', literal('lassila@w3.org')). diff --git a/packages/sgml/RDF/suite/ok/t17.ok b/packages/sgml/RDF/suite/ok/t17.ok new file mode 100644 index 000000000..c1976913a --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t17.ok @@ -0,0 +1,3 @@ +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Name', literal('Ora Lassila')). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Email', literal('lassila@w3.org')). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', 'http://www.w3.org/staffId/85740'). diff --git a/packages/sgml/RDF/suite/ok/t18.ok b/packages/sgml/RDF/suite/ok/t18.ok new file mode 100644 index 000000000..c1976913a --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t18.ok @@ -0,0 +1,3 @@ +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Name', literal('Ora Lassila')). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/schema/':'Email', literal('lassila@w3.org')). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', 'http://www.w3.org/staffId/85740'). diff --git a/packages/sgml/RDF/suite/ok/t19.ok b/packages/sgml/RDF/suite/ok/t19.ok new file mode 100644 index 000000000..de0eccc11 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t19.ok @@ -0,0 +1,4 @@ +rdf('http://www.w3.org/staffId/85740', rdf:type, 'http://description.org/schema/Person'). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/view/':'Name', literal('Ora Lassila')). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/view/':'Email', literal('lassila@w3.org')). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', 'http://www.w3.org/staffId/85740'). diff --git a/packages/sgml/RDF/suite/ok/t2.ok b/packages/sgml/RDF/suite/ok/t2.ok new file mode 100644 index 000000000..288860a45 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t2.ok @@ -0,0 +1,3 @@ +rdf('JW', sex, literal(male)). +rdf('JW', employed_at, literal('SWI')). +rdf('SWI-prolog', 'http://description.org/schema/':'Creator', 'JW'). diff --git a/packages/sgml/RDF/suite/ok/t20.ok b/packages/sgml/RDF/suite/ok/t20.ok new file mode 100644 index 000000000..de0eccc11 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t20.ok @@ -0,0 +1,4 @@ +rdf('http://www.w3.org/staffId/85740', rdf:type, 'http://description.org/schema/Person'). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/view/':'Name', literal('Ora Lassila')). +rdf('http://www.w3.org/staffId/85740', 'http://description.org/view/':'Email', literal('lassila@w3.org')). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', 'http://www.w3.org/staffId/85740'). diff --git a/packages/sgml/RDF/suite/ok/t21.ok b/packages/sgml/RDF/suite/ok/t21.ok new file mode 100644 index 000000000..1fa5f0a49 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t21.ok @@ -0,0 +1,7 @@ +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Bag1', rdf:'_1', 'http://mycollege.edu/students/Amy'). +rdf('__Bag1', rdf:'_2', 'http://mycollege.edu/students/Tim'). +rdf('__Bag1', rdf:'_3', 'http://mycollege.edu/students/John'). +rdf('__Bag1', rdf:'_4', 'http://mycollege.edu/students/Mary'). +rdf('__Bag1', rdf:'_5', 'http://mycollege.edu/students/Sue'). +rdf('http://mycollege.edu/courses/6.001', 'http://description.org/schema/':students, '__Bag1'). diff --git a/packages/sgml/RDF/suite/ok/t22.ok b/packages/sgml/RDF/suite/ok/t22.ok new file mode 100644 index 000000000..a342189e8 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t22.ok @@ -0,0 +1,5 @@ +rdf('__Alt1', rdf:type, rdf:'Alt'). +rdf('__Alt1', rdf:'_1', 'ftp://ftp.x.org'). +rdf('__Alt1', rdf:'_2', 'ftp://ftp.cs.purdue.edu'). +rdf('__Alt1', rdf:'_3', 'ftp://ftp.eu.net'). +rdf('http://x.org/packages/X11', 'http://description.org/schema/':'DistributionSite', '__Alt1'). diff --git a/packages/sgml/RDF/suite/ok/t23.ok b/packages/sgml/RDF/suite/ok/t23.ok new file mode 100644 index 000000000..6003221ec --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t23.ok @@ -0,0 +1 @@ +rdf(prefix('http://foo.org/doc'), 'http://description.org/schema/Copyright', literal('© 1998, The Foo Organization')). diff --git a/packages/sgml/RDF/suite/ok/t24.ok b/packages/sgml/RDF/suite/ok/t24.ok new file mode 100644 index 000000000..ea31d774b --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t24.ok @@ -0,0 +1,5 @@ +rdf('__Description1', rdf:subject, 'http://www.w3.org/Home/Lassila'). +rdf('__Description1', rdf:predicate, 'http://description.org/schema/Creator'). +rdf('__Description1', rdf:object, literal('Ora Lassila')). +rdf('__Description1', rdf:type, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'). +rdf('__Description1', 'http://description.org/schema/':attributedTo, literal('Ralph Swick')). diff --git a/packages/sgml/RDF/suite/ok/t25.ok b/packages/sgml/RDF/suite/ok/t25.ok new file mode 100644 index 000000000..e9edf3956 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t25.ok @@ -0,0 +1,13 @@ +rdf('D_001', rdf:type, rdf:'Bag'). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', literal('Ora Lassila')). +rdf('__Statement1', rdf:type, rdf:'Statement'). +rdf('__Statement1', rdf:subject, 'http://www.w3.org/Home/Lassila'). +rdf('__Statement1', rdf:predicate, 'http://description.org/schema/':'Creator'). +rdf('__Statement1', rdf:object, literal('Ora Lassila')). +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Title', literal('Ora\'s Home Page')). +rdf('__Statement2', rdf:type, rdf:'Statement'). +rdf('__Statement2', rdf:subject, 'http://www.w3.org/Home/Lassila'). +rdf('__Statement2', rdf:predicate, 'http://description.org/schema/':'Title'). +rdf('__Statement2', rdf:object, literal('Ora\'s Home Page')). +rdf('D_001', rdf:'_1', '__Statement1'). +rdf('D_001', rdf:'_2', '__Statement2'). diff --git a/packages/sgml/RDF/suite/ok/t26.ok b/packages/sgml/RDF/suite/ok/t26.ok new file mode 100644 index 000000000..2b9857745 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t26.ok @@ -0,0 +1,8 @@ +rdf('JSPapersByDate', rdf:type, rdf:'Seq'). +rdf('JSPapersByDate', rdf:'_1', 'http://www.dogworld.com/Aug96.doc'). +rdf('JSPapersByDate', rdf:'_2', 'http://www.webnuts.net/Jan97.html'). +rdf('JSPapersByDate', rdf:'_3', 'http://www.carchat.com/Sept97.html'). +rdf('JSPapersBySubj', rdf:type, rdf:'Seq'). +rdf('JSPapersBySubj', rdf:'_1', 'http://www.carchat.com/Sept97.html'). +rdf('JSPapersBySubj', rdf:'_2', 'http://www.dogworld.com/Aug96.doc'). +rdf('JSPapersBySubj', rdf:'_3', 'http://www.webnuts.net/Jan97.html'). diff --git a/packages/sgml/RDF/suite/ok/t27.ok b/packages/sgml/RDF/suite/ok/t27.ok new file mode 100644 index 000000000..1cbc29893 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t27.ok @@ -0,0 +1,12 @@ +rdf('CreatorsAlphabeticalBySurname', rdf:type, rdf:'Seq'). +rdf('CreatorsAlphabeticalBySurname', rdf:'_1', literal('Mary Andrew')). +rdf('CreatorsAlphabeticalBySurname', rdf:'_2', literal('Jacky Crystal')). +rdf('http://www.foo.com/cool.html', 'http://purl.org/metadata/dublin_core#':'Creator', 'CreatorsAlphabeticalBySurname'). +rdf('MirroredSites', rdf:type, rdf:'Bag'). +rdf('MirroredSites', rdf:'_1', 'http://www.foo.com.au/cool.html'). +rdf('MirroredSites', rdf:'_2', 'http://www.foo.com.it/cool.html'). +rdf('http://www.foo.com/cool.html', 'http://purl.org/metadata/dublin_core#':'Identifier', 'MirroredSites'). +rdf('__Alt1', rdf:type, rdf:'Alt'). +rdf('__Alt1', rdf:'_1', literal(lang(en, 'The Coolest Web Page'))). +rdf('__Alt1', rdf:'_2', literal(lang(it, 'Il Pagio di Web Fuba'))). +rdf('http://www.foo.com/cool.html', 'http://purl.org/metadata/dublin_core#':'Title', '__Alt1'). diff --git a/packages/sgml/RDF/suite/ok/t28.ok b/packages/sgml/RDF/suite/ok/t28.ok new file mode 100644 index 000000000..d07bf3ca0 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t28.ok @@ -0,0 +1,3 @@ +rdf('__Description1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#':value, literal('020 - Library Science')). +rdf('__Description1', 'http://mycorp.com/schemas/my-schema#':'Classification', literal('Dewey Decimal Code')). +rdf('http://www.webnuts.net/Jan97.html', 'http://purl.org/metadata/dublin_core#':'Subject', '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t29.ok b/packages/sgml/RDF/suite/ok/t29.ok new file mode 100644 index 000000000..3954da907 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t29.ok @@ -0,0 +1,3 @@ +rdf('__Description1', rdf:value, literal('200')). +rdf('__Description1', 'http://www.nist.gov/units/':units, 'http://www.nist.gov/units/Pounds'). +rdf('John_Smith', 'http://www.nist.gov/units/':weight, '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t3.ok b/packages/sgml/RDF/suite/ok/t3.ok new file mode 100644 index 000000000..9e41fbc44 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t3.ok @@ -0,0 +1,3 @@ +rdf('JW', name, literal('Jan Wielemaker')). +rdf('JW', works_at, literal('SWI')). +rdf('http://www.swi.psy.uva.nl/projects/SWI-Prolog/', 'http://description.org/schema/':'Creator', 'JW'). diff --git a/packages/sgml/RDF/suite/ok/t30.ok b/packages/sgml/RDF/suite/ok/t30.ok new file mode 100644 index 000000000..19601571c --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t30.ok @@ -0,0 +1,12 @@ +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Title', literal('D-Lib Program - Research in Digital Libraries')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Description', literal('The D-Lib program supports the community of people\n with research interests in digital libraries and electronic\n publishing.')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Publisher', literal('Corporation For National Research Initiatives')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Date', literal('1995-01-07')). +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Bag1', rdf:'_1', literal('Research; statistical methods')). +rdf('__Bag1', rdf:'_2', literal('Education, research, related topics')). +rdf('__Bag1', rdf:'_3', literal('Library use Studies')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Subject', '__Bag1'). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Type', literal('World Wide Web Home Page')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Format', literal('text/html')). +rdf('http://www.dlib.org', 'http://purl.org/metadata/dublin_core#':'Language', literal(en)). diff --git a/packages/sgml/RDF/suite/ok/t31.ok b/packages/sgml/RDF/suite/ok/t31.ok new file mode 100644 index 000000000..9ebb6a357 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t31.ok @@ -0,0 +1,17 @@ +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Title', literal('DLIB Magazine - The Magazine for Digital Library Research\n - May 1998')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Description', literal('D-LIB magazine is a monthly compilation of\n contributed stories, commentary, and briefings.')). +rdf('__Description1', 'http://purl.org/metadata/dublin_core_qualifiers#':'AgentType', 'http://purl.org/metadata/dublin_core_qualifiers#Editor'). +rdf('__Description1', rdf:value, literal('Amy Friedlander')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Contributor', '__Description1'). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Publisher', literal('Corporation for National Research Initiatives')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Date', literal('1998-01-05')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Type', literal('electronic journal')). +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Bag1', rdf:'_1', literal('library use studies')). +rdf('__Bag1', rdf:'_2', literal('magazines and newspapers')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Subject', '__Bag1'). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Format', literal('text/html')). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Identifier', literal('urn:issn:1082-9873')). +rdf('__Description2', 'http://purl.org/metadata/dublin_core_qualifiers#':'RelationType', 'http://purl.org/metadata/dublin_core_qualifiers#IsPartOf'). +rdf('__Description2', rdf:value, 'http://www.dlib.org'). +rdf('http://www.dlib.org/dlib/may98/05contents.html', 'http://purl.org/metadata/dublin_core#':'Relation', '__Description2'). diff --git a/packages/sgml/RDF/suite/ok/t32.ok b/packages/sgml/RDF/suite/ok/t32.ok new file mode 100644 index 000000000..4ad411708 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t32.ok @@ -0,0 +1,15 @@ +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Title', literal('An Introduction to the Resource Description Framework')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Creator', literal('Eric J. Miller')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Description', literal('The Resource Description Framework (RDF) is an\n infrastructure that enables the encoding, exchange and reuse of\n structured metadata. rdf is an application of xml that imposes needed\n structural constraints to provide unambiguous methods of expressing\n semantics. rdf additionally provides a means for publishing both\n human-readable and machine-processable vocabularies designed to\n encourage the reuse and extension of metadata semantics among\n disparate information communities. the structural constraints rdf\n imposes to support the consistent encoding and exchange of\n standardized metadata provides for the interchangeability of separate\n packages of metadata defined by different resource description\n communities. ')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Publisher', literal('Corporation for National Research Initiatives')). +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Bag1', rdf:'_1', literal('machine-readable catalog record formats')). +rdf('__Bag1', rdf:'_2', literal('applications of computer file organization and\n\t access methods')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Subject', '__Bag1'). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Rights', literal('Copyright @ 1998 Eric Miller')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Type', literal('Electronic Document')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Format', literal('text/html')). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Language', literal(en)). +rdf('__Description1', 'http://purl.org/metadata/dublin_core_qualifiers#':'RelationType', 'http://purl.org/metadata/dublin_core_qualifiers#IsPartOf'). +rdf('__Description1', rdf:value, 'http://www.dlib.org/dlib/may98/05contents.html'). +rdf('http://www.dlib.org/dlib/may98/miller/05miller.html', 'http://purl.org/metadata/dublin_core#':'Relation', '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t33.ok b/packages/sgml/RDF/suite/ok/t33.ok new file mode 100644 index 000000000..01924299f --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t33.ok @@ -0,0 +1,2 @@ +rdf('http://mycorp.com/papers/NobelPaper1', 'http://purl.org/metadata/dublin_core#Title', literal(type('http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral', ['Ramifications of ', element('http://www.w3.org/TR/REC-mathml':apply, [], [element('http://www.w3.org/TR/REC-mathml':power, [], []), element('http://www.w3.org/TR/REC-mathml':apply, [], [element('http://www.w3.org/TR/REC-mathml':plus, [], []), element('http://www.w3.org/TR/REC-mathml':ci, [], [a]), element('http://www.w3.org/TR/REC-mathml':ci, [], [b])]), element('http://www.w3.org/TR/REC-mathml':cn, [], ['2'])]), ' to World Peace\n ']))). +rdf('http://mycorp.com/papers/NobelPaper1', 'http://purl.org/metadata/dublin_core#Creator', literal('David Hume')). diff --git a/packages/sgml/RDF/suite/ok/t34.ok b/packages/sgml/RDF/suite/ok/t34.ok new file mode 100644 index 000000000..29ea46ab4 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t34.ok @@ -0,0 +1,44 @@ +rdf('L01', rdf:type, rdf:'Bag'). +rdf('http://www.w3.org/PICS/Overview.html', 'http://www.gcf.org/v2.5':suds, literal('0.5')). +rdf('__Statement1', rdf:type, rdf:'Statement'). +rdf('__Statement1', rdf:subject, 'http://www.w3.org/PICS/Overview.html'). +rdf('__Statement1', rdf:predicate, 'http://www.gcf.org/v2.5':suds). +rdf('__Statement1', rdf:object, literal('0.5')). +rdf('http://www.w3.org/PICS/Overview.html', 'http://www.gcf.org/v2.5':density, literal('0')). +rdf('__Statement2', rdf:type, rdf:'Statement'). +rdf('__Statement2', rdf:subject, 'http://www.w3.org/PICS/Overview.html'). +rdf('__Statement2', rdf:predicate, 'http://www.gcf.org/v2.5':density). +rdf('__Statement2', rdf:object, literal('0')). +rdf('http://www.w3.org/PICS/Overview.html', 'http://www.gcf.org/v2.5':'color.hue', literal('1')). +rdf('__Statement3', rdf:type, rdf:'Statement'). +rdf('__Statement3', rdf:subject, 'http://www.w3.org/PICS/Overview.html'). +rdf('__Statement3', rdf:predicate, 'http://www.gcf.org/v2.5':'color.hue'). +rdf('__Statement3', rdf:object, literal('1')). +rdf('L01', rdf:'_1', '__Statement1'). +rdf('L01', rdf:'_2', '__Statement2'). +rdf('L01', rdf:'_3', '__Statement3'). +rdf('L02', rdf:type, rdf:'Bag'). +rdf('http://www.w3.org/PICS/Underview.html', 'http://www.gcf.org/v2.5':subject, literal('2')). +rdf('__Statement4', rdf:type, rdf:'Statement'). +rdf('__Statement4', rdf:subject, 'http://www.w3.org/PICS/Underview.html'). +rdf('__Statement4', rdf:predicate, 'http://www.gcf.org/v2.5':subject). +rdf('__Statement4', rdf:object, literal('2')). +rdf('http://www.w3.org/PICS/Underview.html', 'http://www.gcf.org/v2.5':density, literal('1')). +rdf('__Statement5', rdf:type, rdf:'Statement'). +rdf('__Statement5', rdf:subject, 'http://www.w3.org/PICS/Underview.html'). +rdf('__Statement5', rdf:predicate, 'http://www.gcf.org/v2.5':density). +rdf('__Statement5', rdf:object, literal('1')). +rdf('http://www.w3.org/PICS/Underview.html', 'http://www.gcf.org/v2.5':'color.hue', literal('1')). +rdf('__Statement6', rdf:type, rdf:'Statement'). +rdf('__Statement6', rdf:subject, 'http://www.w3.org/PICS/Underview.html'). +rdf('__Statement6', rdf:predicate, 'http://www.gcf.org/v2.5':'color.hue'). +rdf('__Statement6', rdf:object, literal('1')). +rdf('L02', rdf:'_1', '__Statement4'). +rdf('L02', rdf:'_2', '__Statement5'). +rdf('L02', rdf:'_3', '__Statement6'). +rdf(each('L01'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':by, literal('John Doe')). +rdf(each('L01'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':on, literal('1994.11.05T08:15-0500')). +rdf(each('L01'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':until, literal('1995.12.31T23:59-0000')). +rdf(each('L02'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':by, literal('Jane Doe')). +rdf(each('L02'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':on, literal('1994.11.05T08:15-0500')). +rdf(each('L02'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':until, literal('1995.12.31T23:59-0000')). diff --git a/packages/sgml/RDF/suite/ok/t35.ok b/packages/sgml/RDF/suite/ok/t35.ok new file mode 100644 index 000000000..30e595469 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t35.ok @@ -0,0 +1,8 @@ +rdf('L03', rdf:type, rdf:'Bag'). +rdf(prefix('http://www.w3.org/WWW/'), 'http://www.ages.org/our-service/v1.0/':age, literal('11')). +rdf('__Statement1', rdf:type, rdf:'Statement'). +rdf('__Statement1', rdf:subject, prefix('http://www.w3.org/WWW/')). +rdf('__Statement1', rdf:predicate, 'http://www.ages.org/our-service/v1.0/':age). +rdf('__Statement1', rdf:object, literal('11')). +rdf('L03', rdf:'_1', '__Statement1'). +rdf(each('L03'), 'http://www.w3.org/TR/xxxx/WD-PICS-labels#':by, literal('abaird@w3.org')). diff --git a/packages/sgml/RDF/suite/ok/t36.ok b/packages/sgml/RDF/suite/ok/t36.ok new file mode 100644 index 000000000..30f016790 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t36.ok @@ -0,0 +1,4 @@ +rdf('CreatorsAlphabeticalBySurname', rdf:type, rdf:'Seq'). +rdf('CreatorsAlphabeticalBySurname', rdf:'_1', 'Mary Andrew'). +rdf('CreatorsAlphabeticalBySurname', rdf:'_2', 'Jacky Crystal'). +rdf('http://www.foo.com/cool.html', 'http://purl.org/metadata/dublin_core#':'Creator', 'CreatorsAlphabeticalBySurname'). diff --git a/packages/sgml/RDF/suite/ok/t37.ok b/packages/sgml/RDF/suite/ok/t37.ok new file mode 100644 index 000000000..f5aab84b5 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t37.ok @@ -0,0 +1,7 @@ +rdf('JW', name, literal('Jan Wielemaker')). +rdf('JW', works_at, literal('SWI')). +rdf('http://www.swi.psy.uva.nl/projects/SWI-Prolog/', 'http://description.org/schema/':'Creator', 'JW'). +rdf('pl-creator', rdf:type, rdf:'Statement'). +rdf('pl-creator', rdf:subject, 'http://www.swi.psy.uva.nl/projects/SWI-Prolog/'). +rdf('pl-creator', rdf:predicate, 'http://description.org/schema/':'Creator'). +rdf('pl-creator', rdf:object, 'JW'). diff --git a/packages/sgml/RDF/suite/ok/t38.ok b/packages/sgml/RDF/suite/ok/t38.ok new file mode 100644 index 000000000..9d1858949 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t38.ok @@ -0,0 +1,128 @@ +rdf('Resource', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('Resource', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Resource'))). +rdf('Resource', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Ressource'))). +rdf('Resource', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('The most general class')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, type))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, type))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Indicates membership of a class')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#range', 'Class'). +rdf(comment, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(comment, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, comment))). +rdf(comment, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, commentaire))). +rdf(comment, 'http://www.w3.org/2000/01/rdf-schema#domain', 'Resource'). +rdf(comment, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Use this for descriptions')). +rdf(comment, 'http://www.w3.org/2000/01/rdf-schema#range', 'Literal'). +rdf(label, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(label, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(label, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, label))). +rdf(label, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, label))). +rdf(label, 'http://www.w3.org/2000/01/rdf-schema#domain', 'Resource'). +rdf(label, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Provides a human-readable version of a resource name.')). +rdf(label, 'http://www.w3.org/2000/01/rdf-schema#range', 'Literal'). +rdf('Class', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('Class', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Class'))). +rdf('Class', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Classe'))). +rdf('Class', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('The concept of Class')). +rdf('Class', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Resource'). +rdf(subClassOf, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(subClassOf, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, subClassOf))). +rdf(subClassOf, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, sousClasseDe))). +rdf(subClassOf, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Indicates membership of a class')). +rdf(subClassOf, 'http://www.w3.org/2000/01/rdf-schema#range', 'Class'). +rdf(subClassOf, 'http://www.w3.org/2000/01/rdf-schema#domain', 'Class'). +rdf(subPropertyOf, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(subPropertyOf, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, subPropertyOf))). +rdf(subPropertyOf, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, sousPropriétéDe))). +rdf(subPropertyOf, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Indicates specialization of properties')). +rdf(subPropertyOf, 'http://www.w3.org/2000/01/rdf-schema#range', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(subPropertyOf, 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(seeAlso, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(seeAlso, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, seeAlso))). +rdf(seeAlso, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, voirAussi))). +rdf(seeAlso, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Indicates a resource that provides information about the subject resource.')). +rdf(seeAlso, 'http://www.w3.org/2000/01/rdf-schema#range', 'http://www.w3.org/2000/01/rdf-schema#Resource'). +rdf(seeAlso, 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/2000/01/rdf-schema#Resource'). +rdf(isDefinedBy, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(isDefinedBy, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#subPropertyOf', seeAlso). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, isDefinedBy))). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, esDéfiniPar))). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Indicates a resource containing and defining the subject resource.')). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#range', 'http://www.w3.org/2000/01/rdf-schema#Resource'). +rdf(isDefinedBy, 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/2000/01/rdf-schema#Resource'). +rdf('ConstraintResource', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('ConstraintResource', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'ConstraintResource'))). +rdf('ConstraintResource', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'RessourceContrainte'))). +rdf('ConstraintResource', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'Class'). +rdf('ConstraintResource', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Resource'). +rdf('ConstraintResource', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Resources used to express RDF Schema constraints.')). +rdf('ConstraintProperty', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('ConstraintProperty', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'ConstraintProperty'))). +rdf('ConstraintProperty', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'PropriétéContrainte'))). +rdf('ConstraintProperty', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('ConstraintProperty', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'ConstraintResource'). +rdf('ConstraintProperty', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Properties used to express RDF Schema constraints.')). +rdf(domain, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#ConstraintProperty'). +rdf(domain, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, domain))). +rdf(domain, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, domaine))). +rdf(domain, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('This is how we associate a class with\n properties that its instances can have')). +rdf(range, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#ConstraintProperty'). +rdf(range, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, range))). +rdf(range, 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, étendue))). +rdf(range, 'http://www.w3.org/2000/01/rdf-schema#comment', literal('Properties that can be used in a\n schema to provide constraints')). +rdf(range, 'http://www.w3.org/2000/01/rdf-schema#range', 'Class'). +rdf(range, 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Property'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Propriété'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('The concept of a property.')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Property', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Resource'). +rdf('Literal', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('Literal', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Literal'))). +rdf('Literal', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Littéral'))). +rdf('Literal', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'Class'). +rdf('Literal', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('This represents the set of atomic values, eg. textual strings.')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Statement'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Déclaration'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Resource'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('This represents the set of reified statements.')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#subject', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#subject', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, subject))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#subject', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, sujet))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#subject', 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#subject', 'http://www.w3.org/2000/01/rdf-schema#range', 'Resource'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, predicate))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, prédicat))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate', 'http://www.w3.org/2000/01/rdf-schema#range', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#object', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#object', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, object))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#object', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, objet))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#object', 'http://www.w3.org/2000/01/rdf-schema#domain', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'). +rdf('Container', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('Container', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Container'))). +rdf('Container', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Enveloppe'))). +rdf('Container', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Resource'). +rdf('Container', 'http://www.w3.org/2000/01/rdf-schema#comment', literal('This represents the set Containers.')). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Bag'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Ensemble'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Container'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Sequence'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Séquence'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Container'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'Alt'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, 'Choix'))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'Container'). +rdf('ContainerMembershipProperty', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#Class'). +rdf('ContainerMembershipProperty', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, 'ContainerMembershipProperty'))). +rdf('ContainerMembershipProperty', 'http://www.w3.org/2000/01/rdf-schema#subClassOf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#value', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property'). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#value', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(en, object))). +rdf('http://www.w3.org/1999/02/22-rdf-syntax-ns#value', 'http://www.w3.org/2000/01/rdf-schema#label', literal(lang(fr, value))). diff --git a/packages/sgml/RDF/suite/ok/t39.ok b/packages/sgml/RDF/suite/ok/t39.ok new file mode 100644 index 000000000..b0599f6e0 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t39.ok @@ -0,0 +1,9 @@ +rdf('OntologyObjectMetaClass', rdf:type, 'http://www.w3.org/TR/1999/PR-rdf-schema-19990303#Class'). +rdf('OntologyObjectMetaClass', 'http://www.w3.org/TR/1999/PR-rdf-schema-19990303#':subClassOf, 'http://www.w3.org/TR/1999/PR-rdf-schema-19990303#Class'). +rdf('OntologyObjectMetaClass', 'http://smi-web.stanford.edu/projects/protege/protege-rdf/protege-19992012#':abstractProperty, literal(concrete)). +rdf(identifier, rdf:type, 'http://www.w3.org/TR/1999/PR-rdf-schema-19990303#Property'). +rdf('__Description1', rdf:type, 'http://www.w3.org/rdfutil#facetResource'). +rdf('__Description1', 'http://www.w3.org/rdfutil#':domain, 'OntologyObjectMetaClass'). +rdf('__Description1', 'http://www.w3.org/rdfutil#':range, 'http://www.w3.org/TR/xmlschema-2/#string'). +rdf('__Description1', 'http://www.w3.org/rdfutil#':cardinality, literal('1')). +rdf(identifier, 'http://www.w3.org/rdfutil#':facets, '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t4.ok b/packages/sgml/RDF/suite/ok/t4.ok new file mode 100644 index 000000000..a5ff63035 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t4.ok @@ -0,0 +1,6 @@ +rdf('http://www.w3.org/Home/Lassila', 'http://description.org/schema/':'Creator', literal('Ora Lassila')). +rdf(statement1, rdf:type, rdf:'Statement'). +rdf(statement1, rdf:subject, 'http://www.w3.org/Home/Lassila'). +rdf(statement1, rdf:predicate, 'http://description.org/schema/':'Creator'). +rdf(statement1, rdf:object, literal('Ora Lassila')). +rdf(statement1, 'http://description.org/schema/':believedBy, literal('Stefan Decker')). diff --git a/packages/sgml/RDF/suite/ok/t40.ok b/packages/sgml/RDF/suite/ok/t40.ok new file mode 100644 index 000000000..71c6ab496 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t40.ok @@ -0,0 +1,4 @@ +rdf('__Description1', rdf:type, 'http://www.mytypes.org/schema/Bicycle'). +rdf('__Description1', wheels, literal('2')). +rdf('__Description2', rdf:type, 'http://www.mytypes.org/schema/Bicycle'). +rdf('__Description2', 'http://www.mytypes.org/schema/':wheels, literal('2')). diff --git a/packages/sgml/RDF/suite/ok/t41.ok b/packages/sgml/RDF/suite/ok/t41.ok new file mode 100644 index 000000000..ce476fcc6 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t41.ok @@ -0,0 +1,3 @@ +rdf('JohnWeight', rdf:value, literal('200')). +rdf('JohnWeight', 'http://www.nist.gov/units/':units, 'http://www.nist.gov/units/Pounds'). +rdf('John_Smith', 'http://www.nist.gov/units/':weight, 'JohnWeight'). diff --git a/packages/sgml/RDF/suite/ok/t42.ok b/packages/sgml/RDF/suite/ok/t42.ok new file mode 100644 index 000000000..326e0a56d --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t42.ok @@ -0,0 +1,4 @@ +rdf('John_Smith', a1, literal('John')). +rdf('__Description1', v, literal('200')). +rdf('__Description1', t, literal(pounds)). +rdf('John_Smith', a1, '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t5.ok b/packages/sgml/RDF/suite/ok/t5.ok new file mode 100644 index 000000000..f5dd815e4 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t5.ok @@ -0,0 +1,7 @@ +rdf('__Description1', name, literal('Jan Wielemaker')). +rdf('__Description1', works_at, literal('SWI')). +rdf('http://www.swi.psy.uva.nl/projects/SWI-Prolog/', 'http://description.org/schema/':'Creator', '__Description1'). +rdf('JW', rdf:type, rdf:'Statement'). +rdf('JW', rdf:subject, 'http://www.swi.psy.uva.nl/projects/SWI-Prolog/'). +rdf('JW', rdf:predicate, 'http://description.org/schema/':'Creator'). +rdf('JW', rdf:object, '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/t6.ok b/packages/sgml/RDF/suite/ok/t6.ok new file mode 100644 index 000000000..281850d72 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t6.ok @@ -0,0 +1,4 @@ +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Bag1', rdf:'_1', literal('Jan Wielemaker')). +rdf('__Bag1', rdf:'_2', literal('Anjo Anjewierden')). +rdf('http://www.swi.psy.uva.nl/projects/xpce/', 'http://description.org/schema/':'Creator', '__Bag1'). diff --git a/packages/sgml/RDF/suite/ok/t7.ok b/packages/sgml/RDF/suite/ok/t7.ok new file mode 100644 index 000000000..f200bb6f8 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t7.ok @@ -0,0 +1,5 @@ +rdf('__Bag1', rdf:type, rdf:'Bag'). +rdf('__Description1', name, literal('Jan Wielemaker')). +rdf('__Description1', employed_at, literal('SWI')). +rdf('__Bag1', rdf:'_1', '__Description1'). +rdf('http://www.swi.psy.uva.nl/projects/xpce/', 'http://description.org/schema/':'Creator', '__Bag1'). diff --git a/packages/sgml/RDF/suite/ok/t8.ok b/packages/sgml/RDF/suite/ok/t8.ok new file mode 100644 index 000000000..737ac05c0 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t8.ok @@ -0,0 +1,3 @@ +rdf(weight_001, 'http://www.nist.gov/RDFschema/':'Units', 'http://www.nist.gov/units/pounds'). +rdf(weight_001, value, literal('200')). +rdf('John_Smith', 'http://www.nist.gov/RDFschema/':'Weight', weight_001). diff --git a/packages/sgml/RDF/suite/ok/t9.ok b/packages/sgml/RDF/suite/ok/t9.ok new file mode 100644 index 000000000..e9867c898 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/t9.ok @@ -0,0 +1,9 @@ +rdf('Statement_001', rdf:type, rdf:'Bag'). +rdf('http://www.bar.com/some.doc', 'http://purl.org/metadata/dublin_core/':'Creator', literal('John Smith')). +rdf('__Statement1', rdf:type, rdf:'Statement'). +rdf('__Statement1', rdf:subject, 'http://www.bar.com/some.doc'). +rdf('__Statement1', rdf:predicate, 'http://purl.org/metadata/dublin_core/':'Creator'). +rdf('__Statement1', rdf:object, literal('John Smith')). +rdf('Statement_001', rdf:'_1', '__Statement1'). +rdf(each('Statement_001'), 'http://www.w3.org/Schemas/DS-Schema/':'CreatedOn', literal('1998-02-06T14:00Z')). +rdf(each('Statement_001'), 'http://www.w3.org/Schemas/DS-Schema/':'CreatedBy', literal('Jane Cooper')). diff --git a/packages/sgml/RDF/suite/ok/types.ok b/packages/sgml/RDF/suite/ok/types.ok new file mode 100644 index 000000000..047e7390b --- /dev/null +++ b/packages/sgml/RDF/suite/ok/types.ok @@ -0,0 +1,6 @@ +rdf('__Description1', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2002/07/owl#Restriction'). +rdf('http://www.w3.org/2002/03owlt/I5.8/inconsistent001#p', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2002/07/owl#DatatypeProperty'). +rdf('http://www.w3.org/2002/03owlt/I5.8/inconsistent001#p', 'http://www.w3.org/2000/01/rdf-schema#range', 'http://www.w3.org/2001/XMLSchema#byte'). +rdf('__Description1', 'http://www.w3.org/2002/07/owl#onProperty', 'http://www.w3.org/2002/03owlt/I5.8/inconsistent001#p'). +rdf('__Description1', 'http://www.w3.org/2002/07/owl#cardinality', literal(type('http://www.w3.org/2001/XMLSchema#nonNegativeInteger', '257'))). +rdf('http://www.w3.org/2002/03owlt/I5.8/inconsistent001#john', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', '__Description1'). diff --git a/packages/sgml/RDF/suite/ok/xmllit.ok b/packages/sgml/RDF/suite/ok/xmllit.ok new file mode 100644 index 000000000..b6f7c8f38 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/xmllit.ok @@ -0,0 +1,2 @@ +rdf(id1, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.swi.psy.uva.nl/test#test'). +rdf(id1, 'http://www.swi.psy.uva.nl/test#a', literal(type('http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral', [element(b, [], [strong])]))). diff --git a/packages/sgml/RDF/suite/ok/xsdtypes.ok b/packages/sgml/RDF/suite/ok/xsdtypes.ok new file mode 100644 index 000000000..fd42f66d6 --- /dev/null +++ b/packages/sgml/RDF/suite/ok/xsdtypes.ok @@ -0,0 +1,2 @@ +rdf(id1, rdf:type, 'http://www.swi.psy.uva.nl/test#test'). +rdf(id1, 'http://www.swi.psy.uva.nl/test#':int, literal(type('http://www.w3.org/2000/10/XMLSchema#int', '42'))). diff --git a/packages/sgml/RDF/suite/t1.rdf b/packages/sgml/RDF/suite/t1.rdf new file mode 100644 index 000000000..56a753b4e --- /dev/null +++ b/packages/sgml/RDF/suite/t1.rdf @@ -0,0 +1,10 @@ + + + + + + + Ora Lassila + + diff --git a/packages/sgml/RDF/suite/t10.rdf b/packages/sgml/RDF/suite/t10.rdf new file mode 100644 index 000000000..faaf907af --- /dev/null +++ b/packages/sgml/RDF/suite/t10.rdf @@ -0,0 +1,12 @@ + + + + +
  • +
  • + + + Ora Lassila + + diff --git a/packages/sgml/RDF/suite/t11.rdf b/packages/sgml/RDF/suite/t11.rdf new file mode 100644 index 000000000..a9c760111 --- /dev/null +++ b/packages/sgml/RDF/suite/t11.rdf @@ -0,0 +1,7 @@ + + + + + diff --git a/packages/sgml/RDF/suite/t12.rdf b/packages/sgml/RDF/suite/t12.rdf new file mode 100644 index 000000000..9491f7f75 --- /dev/null +++ b/packages/sgml/RDF/suite/t12.rdf @@ -0,0 +1,11 @@ + + + + + + + Ora Lassila + + diff --git a/packages/sgml/RDF/suite/t13.rdf b/packages/sgml/RDF/suite/t13.rdf new file mode 100644 index 000000000..42d6b2e0c --- /dev/null +++ b/packages/sgml/RDF/suite/t13.rdf @@ -0,0 +1,9 @@ + + + + + + + Ora Lassila + + diff --git a/packages/sgml/RDF/suite/t14.rdf b/packages/sgml/RDF/suite/t14.rdf new file mode 100644 index 000000000..ce3d083ce --- /dev/null +++ b/packages/sgml/RDF/suite/t14.rdf @@ -0,0 +1,12 @@ + + + + + + + World Wide Web Consortium + W3C Home Page + 1998-10-03T02:27 + + diff --git a/packages/sgml/RDF/suite/t15.rdf b/packages/sgml/RDF/suite/t15.rdf new file mode 100644 index 000000000..5c6b60332 --- /dev/null +++ b/packages/sgml/RDF/suite/t15.rdf @@ -0,0 +1,11 @@ + + + + + + + diff --git a/packages/sgml/RDF/suite/t16.rdf b/packages/sgml/RDF/suite/t16.rdf new file mode 100644 index 000000000..236add31b --- /dev/null +++ b/packages/sgml/RDF/suite/t16.rdf @@ -0,0 +1,15 @@ + + + + + + + + + + + Ora Lassila + lassila@w3.org + + diff --git a/packages/sgml/RDF/suite/t17.rdf b/packages/sgml/RDF/suite/t17.rdf new file mode 100644 index 000000000..933dc074a --- /dev/null +++ b/packages/sgml/RDF/suite/t17.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Ora Lassila + lassila@w3.org + + + + diff --git a/packages/sgml/RDF/suite/t18.rdf b/packages/sgml/RDF/suite/t18.rdf new file mode 100644 index 000000000..61bde093c --- /dev/null +++ b/packages/sgml/RDF/suite/t18.rdf @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t19.rdf b/packages/sgml/RDF/suite/t19.rdf new file mode 100644 index 000000000..fde4d032a --- /dev/null +++ b/packages/sgml/RDF/suite/t19.rdf @@ -0,0 +1,17 @@ + + + + + + + + + + Ora Lassila + lassila@w3.org + + + + diff --git a/packages/sgml/RDF/suite/t2.rdf b/packages/sgml/RDF/suite/t2.rdf new file mode 100644 index 000000000..1c079512b --- /dev/null +++ b/packages/sgml/RDF/suite/t2.rdf @@ -0,0 +1,8 @@ + + + + + + + diff --git a/packages/sgml/RDF/suite/t20.rdf b/packages/sgml/RDF/suite/t20.rdf new file mode 100644 index 000000000..03109be63 --- /dev/null +++ b/packages/sgml/RDF/suite/t20.rdf @@ -0,0 +1,17 @@ + + + + + + + + + + Ora Lassila + lassila@w3.org + + + + diff --git a/packages/sgml/RDF/suite/t21.rdf b/packages/sgml/RDF/suite/t21.rdf new file mode 100644 index 000000000..09abd8f67 --- /dev/null +++ b/packages/sgml/RDF/suite/t21.rdf @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t22.rdf b/packages/sgml/RDF/suite/t22.rdf new file mode 100644 index 000000000..84ec1aa82 --- /dev/null +++ b/packages/sgml/RDF/suite/t22.rdf @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t23.rdf b/packages/sgml/RDF/suite/t23.rdf new file mode 100644 index 000000000..9d826e056 --- /dev/null +++ b/packages/sgml/RDF/suite/t23.rdf @@ -0,0 +1,11 @@ + + + + + + + © 1998, The Foo Organization + + diff --git a/packages/sgml/RDF/suite/t24.rdf b/packages/sgml/RDF/suite/t24.rdf new file mode 100644 index 000000000..60283abcc --- /dev/null +++ b/packages/sgml/RDF/suite/t24.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Ora Lassila + + Ralph Swick + + diff --git a/packages/sgml/RDF/suite/t25.rdf b/packages/sgml/RDF/suite/t25.rdf new file mode 100644 index 000000000..bfd1696a8 --- /dev/null +++ b/packages/sgml/RDF/suite/t25.rdf @@ -0,0 +1,12 @@ + + + + + + + Ora Lassila + Ora's Home Page + + diff --git a/packages/sgml/RDF/suite/t26.rdf b/packages/sgml/RDF/suite/t26.rdf new file mode 100644 index 000000000..366f69191 --- /dev/null +++ b/packages/sgml/RDF/suite/t26.rdf @@ -0,0 +1,16 @@ + + + + + + +
  • +
  • +
  • + + +
  • +
  • +
  • + + diff --git a/packages/sgml/RDF/suite/t27.rdf b/packages/sgml/RDF/suite/t27.rdf new file mode 100644 index 000000000..0d297ffd3 --- /dev/null +++ b/packages/sgml/RDF/suite/t27.rdf @@ -0,0 +1,30 @@ + + + + + + + + + Mary Andrew + Jacky Crystal + + + + + + + + + + + + + The Coolest Web Page + Il Pagio di Web Fuba + + + + diff --git a/packages/sgml/RDF/suite/t28.rdf b/packages/sgml/RDF/suite/t28.rdf new file mode 100644 index 000000000..15e866dff --- /dev/null +++ b/packages/sgml/RDF/suite/t28.rdf @@ -0,0 +1,15 @@ + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t29.rdf b/packages/sgml/RDF/suite/t29.rdf new file mode 100644 index 000000000..229c804e8 --- /dev/null +++ b/packages/sgml/RDF/suite/t29.rdf @@ -0,0 +1,15 @@ + + + + + + + + 200 + + + + diff --git a/packages/sgml/RDF/suite/t3.rdf b/packages/sgml/RDF/suite/t3.rdf new file mode 100644 index 000000000..d61cdde38 --- /dev/null +++ b/packages/sgml/RDF/suite/t3.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Jan Wielemaker + SWI + + + + diff --git a/packages/sgml/RDF/suite/t30.rdf b/packages/sgml/RDF/suite/t30.rdf new file mode 100644 index 000000000..2540d9f6a --- /dev/null +++ b/packages/sgml/RDF/suite/t30.rdf @@ -0,0 +1,26 @@ + + + + + + + D-Lib Program - Research in Digital Libraries + The D-Lib program supports the community of people + with research interests in digital libraries and electronic + publishing. + Corporation For National Research Initiatives + 1995-01-07 + + + Research; statistical methods + Education, research, related topics + Library use Studies + + + World Wide Web Home Page + text/html + en + + diff --git a/packages/sgml/RDF/suite/t31.rdf b/packages/sgml/RDF/suite/t31.rdf new file mode 100644 index 000000000..69eed39ed --- /dev/null +++ b/packages/sgml/RDF/suite/t31.rdf @@ -0,0 +1,36 @@ + + + + + + + DLIB Magazine - The Magazine for Digital Library Research + - May 1998 + D-LIB magazine is a monthly compilation of + contributed stories, commentary, and briefings. + + + Amy Friedlander + + Corporation for National Research Initiatives + 1998-01-05 + electronic journal + + + library use studies + magazines and newspapers + + + text/html + urn:issn:1082-9873 + + + + + + diff --git a/packages/sgml/RDF/suite/t32.rdf b/packages/sgml/RDF/suite/t32.rdf new file mode 100644 index 000000000..47b4aa432 --- /dev/null +++ b/packages/sgml/RDF/suite/t32.rdf @@ -0,0 +1,43 @@ + + + + + + + An Introduction to the Resource Description Framework + Eric J. Miller + The Resource Description Framework (RDF) is an + infrastructure that enables the encoding, exchange and reuse of + structured metadata. rdf is an application of xml that imposes needed + structural constraints to provide unambiguous methods of expressing + semantics. rdf additionally provides a means for publishing both + human-readable and machine-processable vocabularies designed to + encourage the reuse and extension of metadata semantics among + disparate information communities. the structural constraints rdf + imposes to support the consistent encoding and exchange of + standardized metadata provides for the interchangeability of separate + packages of metadata defined by different resource description + communities. + Corporation for National Research Initiatives + + + machine-readable catalog record formats + applications of computer file organization and + access methods + + + Copyright @ 1998 Eric Miller + Electronic Document + text/html + en + + + + + + diff --git a/packages/sgml/RDF/suite/t33.rdf b/packages/sgml/RDF/suite/t33.rdf new file mode 100644 index 000000000..1154b4ae0 --- /dev/null +++ b/packages/sgml/RDF/suite/t33.rdf @@ -0,0 +1,18 @@ + + + + + + + + +Ramifications of ab +2 to World Peace + + David Hume + + diff --git a/packages/sgml/RDF/suite/t34.rdf b/packages/sgml/RDF/suite/t34.rdf new file mode 100644 index 000000000..9919885a8 --- /dev/null +++ b/packages/sgml/RDF/suite/t34.rdf @@ -0,0 +1,28 @@ + + + + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t35.rdf b/packages/sgml/RDF/suite/t35.rdf new file mode 100644 index 000000000..70a55743a --- /dev/null +++ b/packages/sgml/RDF/suite/t35.rdf @@ -0,0 +1,15 @@ + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t36.rdf b/packages/sgml/RDF/suite/t36.rdf new file mode 100644 index 000000000..af677d04e --- /dev/null +++ b/packages/sgml/RDF/suite/t36.rdf @@ -0,0 +1,15 @@ + + + + + + + + + + + diff --git a/packages/sgml/RDF/suite/t37.rdf b/packages/sgml/RDF/suite/t37.rdf new file mode 100644 index 000000000..d8484601f --- /dev/null +++ b/packages/sgml/RDF/suite/t37.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Jan Wielemaker + SWI + + + + diff --git a/packages/sgml/RDF/suite/t38.rdf b/packages/sgml/RDF/suite/t38.rdf new file mode 100644 index 000000000..fa7126490 --- /dev/null +++ b/packages/sgml/RDF/suite/t38.rdf @@ -0,0 +1,190 @@ + + + + + + + + + Resource + Ressource + The most general class + + + + type + type + Indicates membership of a class + + + + + comment + commentaire + + Use this for descriptions + + + + + + label + label + + Provides a human-readable version of a resource name. + + + + + Class + Classe + The concept of Class + + + + + subClassOf + sousClasseDe + Indicates membership of a class + + + + + + subPropertyOf + sousPropriétéDe + Indicates specialization of properties + + + + + + seeAlso + voirAussi + Indicates a resource that provides information about the subject resource. + + + + + + + + isDefinedBy + esDéfiniPar + Indicates a resource containing and defining the subject resource. + + + + + + ConstraintResource + RessourceContrainte + + + Resources used to express RDF Schema constraints. + + + + ConstraintProperty + PropriétéContrainte + + + Properties used to express RDF Schema constraints. + + + + domain + domaine + This is how we associate a class with + properties that its instances can have + + + + range + étendue + Properties that can be used in a + schema to provide constraints + + + + + + Property + Propriété + The concept of a property. + + + + + Literal + Littéral + + This represents the set of atomic values, eg. textual strings. + + + + Statement + Déclaration + + This represents the set of reified statements. + + + + subject + sujet + + + + + + predicate + prédicat + + + + + + + object + objet + + + + + Container + Enveloppe + + This represents the set Containers. + + + + Bag + Ensemble + + + + + Sequence + Séquence + + + + + Alt + Choix + + + + + ContainerMembershipProperty + + + + + object + value + + + diff --git a/packages/sgml/RDF/suite/t39.rdf b/packages/sgml/RDF/suite/t39.rdf new file mode 100644 index 000000000..516e65f49 --- /dev/null +++ b/packages/sgml/RDF/suite/t39.rdf @@ -0,0 +1,25 @@ + + + + + + + + concrete + + + + + + + + + 1 + + + + diff --git a/packages/sgml/RDF/suite/t4.rdf b/packages/sgml/RDF/suite/t4.rdf new file mode 100644 index 000000000..f721024cd --- /dev/null +++ b/packages/sgml/RDF/suite/t4.rdf @@ -0,0 +1,14 @@ + + + + + Ora Lassila + + + + + + Stefan Decker + + diff --git a/packages/sgml/RDF/suite/t40.rdf b/packages/sgml/RDF/suite/t40.rdf new file mode 100644 index 000000000..38c7aff0d --- /dev/null +++ b/packages/sgml/RDF/suite/t40.rdf @@ -0,0 +1,11 @@ + + + + + + + 2 + + + diff --git a/packages/sgml/RDF/suite/t41.rdf b/packages/sgml/RDF/suite/t41.rdf new file mode 100644 index 000000000..9c072a883 --- /dev/null +++ b/packages/sgml/RDF/suite/t41.rdf @@ -0,0 +1,15 @@ + + + + + + + + 200 + + + + diff --git a/packages/sgml/RDF/suite/t42.rdf b/packages/sgml/RDF/suite/t42.rdf new file mode 100644 index 000000000..cb08c584d --- /dev/null +++ b/packages/sgml/RDF/suite/t42.rdf @@ -0,0 +1,16 @@ + + + + + + + John + + + 200 + pounds + + + + diff --git a/packages/sgml/RDF/suite/t5.rdf b/packages/sgml/RDF/suite/t5.rdf new file mode 100644 index 000000000..a56717b7b --- /dev/null +++ b/packages/sgml/RDF/suite/t5.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Jan Wielemaker + SWI + + + + diff --git a/packages/sgml/RDF/suite/t6.rdf b/packages/sgml/RDF/suite/t6.rdf new file mode 100644 index 000000000..61193ca24 --- /dev/null +++ b/packages/sgml/RDF/suite/t6.rdf @@ -0,0 +1,15 @@ + + + + + + + + + Jan Wielemaker + Anjo Anjewierden + + + + diff --git a/packages/sgml/RDF/suite/t7.rdf b/packages/sgml/RDF/suite/t7.rdf new file mode 100644 index 000000000..55f35ec38 --- /dev/null +++ b/packages/sgml/RDF/suite/t7.rdf @@ -0,0 +1,17 @@ + + + + + + + + + + Jan Wielemaker + SWI + + + + + diff --git a/packages/sgml/RDF/suite/t8.rdf b/packages/sgml/RDF/suite/t8.rdf new file mode 100644 index 000000000..bc79b4579 --- /dev/null +++ b/packages/sgml/RDF/suite/t8.rdf @@ -0,0 +1,16 @@ + + + + + + + + + + 200 + + + + diff --git a/packages/sgml/RDF/suite/t9.rdf b/packages/sgml/RDF/suite/t9.rdf new file mode 100644 index 000000000..de0441974 --- /dev/null +++ b/packages/sgml/RDF/suite/t9.rdf @@ -0,0 +1,12 @@ + + + + John Smith + + + 1998-02-06T14:00Z + Jane Cooper + + diff --git a/packages/sgml/RDF/suite/types.rdf b/packages/sgml/RDF/suite/types.rdf new file mode 100644 index 000000000..992e9cfb4 --- /dev/null +++ b/packages/sgml/RDF/suite/types.rdf @@ -0,0 +1,22 @@ + + + + + + + + + + 257 + + + + + diff --git a/packages/sgml/RDF/suite/xmllit.rdf b/packages/sgml/RDF/suite/xmllit.rdf new file mode 100644 index 000000000..7a0220847 --- /dev/null +++ b/packages/sgml/RDF/suite/xmllit.rdf @@ -0,0 +1,19 @@ + + + + + +]> + + + + + strong + + + diff --git a/packages/sgml/RDF/suite/xsdtypes.rdf b/packages/sgml/RDF/suite/xsdtypes.rdf new file mode 100644 index 000000000..9bfbb85c1 --- /dev/null +++ b/packages/sgml/RDF/suite/xsdtypes.rdf @@ -0,0 +1,19 @@ + + + + + +]> + + + + + 42 + + + diff --git a/packages/sgml/RDF/w3c_test.pl b/packages/sgml/RDF/w3c_test.pl new file mode 100644 index 000000000..3d27759e7 --- /dev/null +++ b/packages/sgml/RDF/w3c_test.pl @@ -0,0 +1,467 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2002 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(rdf_w3c_test, + [ process_manifest/0, + process_manifest/1, + run_tests/0, % run all tests + run/0, % run selected test + show/1, % RDF diagram for File + run_test/1 % run a single test + ]). + + % get libraries locally +:- asserta(user:file_search_path(library, '.')). + +:- use_module(rdf). % our RDF parser +:- use_module(rdf_ntriples). % read .nt files +:- load_files([ library(pce), + library(toolbar), + library(pce_report), + rdf_diagram, + library('emacs/emacs') + ], + [ silent(true) + ]). + +:- dynamic + verbose/0. +%verbose. + +set_verbose :- + verbose, !. +set_verbose :- + assert(verbose). + +:- dynamic + rdf/3. + +ns(test, + 'http://www.w3.org/2000/10/rdf-tests/rdfcore/testSchema#'). + +local('http://www.w3.org/2000/10/rdf-tests/rdfcore/', + 'W3Ctests/'). + +process_manifest :- + process_manifest('W3Ctests/Manifest.rdf'). +process_manifest(Manifest) :- + retractall(rdf(_,_,_)), + load_rdf(Manifest, Triples), + assert_triples(Triples). + +assert_triples([]). +assert_triples([rdf(S, P, O)|T]) :- + canonise(S, Subject), + canonise(P, Predicate), + canonise(O, Object), + assert(rdf(Subject, Predicate, Object)), + assert_triples(T). + +canonise(NS:Name, N:Name) :- + ns(N, NS), !. +canonise(Absolute, N:Name) :- + atom(Absolute), + ns(N, NS), + atom_concat(NS, Name, Absolute), !. +canonise(X, X). + + +run_tests :- + process_manifest, + start_tests, + ( rdf(About, rdf:type, test:Type), + \+ rdf(About, test:status, literal('OBSOLETE')), + test_type(Type), +% once(run_test(About)), % Should not be needed + run_test(About), + fail + ; true + ), !, + report_results. + +test_type('PositiveParserTest'). +%test_type('NegativeParserTest'). + +run_test(Test) :- + rdf(Test, test:inputDocument, In), + local_file(In, InFile), + exists_file(InFile), + ( load_rdf(InFile, RDF, + [ base_uri(In), + expand_foreach(true) + ]) + -> true + ; RDF = [] + ), + Data = [ source(InFile), + result(RDF), + norm(NT), + substitutions(Substitions) + ], + % there may be alternative output + % documents + ( rdf(Test, test:outputDocument, Out), + local_file(Out, NTFile), + load_rdf_ntriples(NTFile, NT), + feedback('Comparing to ~w~n', [NTFile]), + compare_triples(RDF, NT, Substitions) + -> test_result(pass, Test, Data) + % if all fails, display the first + ; rdf(Test, test:outputDocument, Out), + local_file(Out, NTFile), + load_rdf_ntriples(NTFile, NT), + Substitions = [], + test_result(fail, Test, Data) + ). + + +local_file(URL, File) :- + local(URLPrefix, FilePrefix), + atom_concat(URLPrefix, Base, URL), !, + atom_concat(FilePrefix, Base, File). + + + /******************************* + * GUI * + *******************************/ + +:- pce_begin_class(w3c_rdf_test_gui, frame). + +initialise(F, Show:chain) :-> + send_super(F, initialise, 'W3C RDF test suite results'), + send(F, append, new(B, browser)), + send(B, hor_stretch, 100), + send(B, hor_shrink, 100), + ( send(Show, member, source) + -> new(V, emacs_view(height := 3)), + send(V, name, text) + ; true + ), + ( send(Show, member, result) + -> new(R, rdf_diagram), + send(R, name, result), + send(R, label, 'Result') + ; true + ), + ( send(Show, member, norm) + -> new(N, rdf_diagram), + send(N, name, norm), + send(N, label, 'Norm') + ; true + ), + stack_windows([V,R,N], _, W), + ( nonvar(W) + -> send(W, right, B) + ; true + ), + send(new(D, tool_dialog(F)), above, B), + send(new(report_dialog), below, B), + send(F, fill_menu, D), + send(F, fill_browser, B). + +stack_windows([], L, L). +stack_windows([H|T], W0, W) :- + var(H), !, + stack_windows(T, W0, W). +stack_windows([H|T], W0, W) :- + var(W0), !, + stack_windows(T, H, W). +stack_windows([H|T], WL, W) :- + send(H, below, WL), + stack_windows(T, H, W). + +fill_menu(F, D:tool_dialog) :-> + send_list(D, + [ append(menu_item(exit, message(F, destroy)), + file) + ]). + +fill_browser(_F, B:browser) :-> + send(B, style, pass, style(colour := dark_green)), + send(B, style, fail, style(colour := red)), + send(B?image, recogniser, + handler(ms_right_down, + and(message(B, selection, + ?(B, dict_item, @event)), + new(or)))), + send(B, popup, new(P, popup)), + send(B, select_message, message(@arg1, run)), + send_list(P, append, + [ menu_item(run, + message(@arg1, run)), + menu_item(edit, + message(@arg1, edit_test)), + gap, + menu_item(show_result, + message(@arg1, show_triples, result)), + menu_item(show_norm, + message(@arg1, show_triples, norm)), + gap, + menu_item(discussion, + message(@arg1, open_url, discussion), + condition := + message(@arg1, has_url, discussion)), + menu_item(approval, + message(@arg1, open_url, approval), + condition := + message(@arg1, has_url, approval)), + gap, + menu_item(copy_test_uri, + message(@arg1, copy_test_uri)) + ]). + + +test_result(F, Result:{pass,fail}, Test:name, Data:prolog) :-> + "Test failed":: + get(F, member, browser, B), + ( get(B, member, Test, Item) + -> send(Item, object, prolog(Data)), + send(Item, style, Result) + ; send(B, append, + rdf_test_item(Test, @default, prolog(Data), Result)) + ). + +clear(F) :-> + get(F, member, browser, B), + send(B, clear). + +summarise(F) :-> + get(F, member, browser, Browser), + new(Pass, number(0)), + new(Fail, number(0)), + send(Browser?members, for_all, + if(@arg1?style == pass, + message(Pass, plus, 1), + message(Fail, plus, 1))), + send(F, report, status, '%d tests succeeded; %d failed', + Pass, Fail). + +:- pce_end_class(w3c_rdf_test_gui). + +:- pce_begin_class(rdf_test_item, dict_item). + + +edit_test(Item) :-> + "Edit input document of test":: + get(Item, object, List), + member(source(InFile), List), + edit(file(InFile)). + +show_triples(Item, Set:{result,norm}) :-> + "Show result of our parser":: + get(Item, key, Test), + get(Item, object, List), + Term =.. [Set,Triples], + member(Term, List), + send(Item, show_diagram(Triples, + string('%s for %s', Set?label_name, Test))). + +show_diagram(_Item, Triples:prolog, Label:name) :-> + "Show diagram for triples":: + new(D, rdf_diagram(Label)), + send(new(report_dialog), below, D), + send(D, triples, Triples), + send(D, open). + +open_url(Item, Which:name) :-> + "Open associated URL in browser":: + get(Item, key, Test), + rdf(Test, test:Which, URL), + www_open_url(URL). + +has_url(Item, Which:name) :-> + "Test if item has URL":: + get(Item, key, Test), + rdf(Test, test:Which, _URL). + +run(Item) :-> + "Re-run the test":: + get(Item, key, Test), + run_test(Test), + send(Item, show). + +copy_test_uri(Item) :-> + "Copy URI of test to clipboard":: + get(Item, key, Test), + send(@display, copy, Test). + +show(Item) :-> + "Show source, result and norm diagrams":: + get(Item?image, frame, Frame), + get(Item, object, List), + ( get(Frame, member, result, Result) + -> member(result(RTriples), List), + send(Result, triples, RTriples) + ; true + ), + ( get(Frame, member, norm, Norm) + -> member(norm(NTriples), List), + send(Norm, triples, NTriples) + ; true + ), + ( get(Frame, member, text, View) + -> member(source(File), List), + send(View, text_buffer, new(TB, emacs_buffer(File))), + % scroll to RDF text + ( member(Pattern, [':RDF', 'RDF']), + get(TB, find, 0, Pattern, Start), + get(TB, scan, Start, line, 0, start, BOL) + -> send(View, scroll_to, BOL, 1) + ; true + ) + ; true + ). +% member(substitutions(Substitutions), List), +% send(Result, copy_layout, Norm, Substitutions), + +:- pce_end_class(rdf_test_item). + + +:- pce_global(@rdf_test_gui, make_rdf_test_gui). + +make_rdf_test_gui(Ref) :- + send(new(Ref, w3c_rdf_test_gui(chain(source,result))), open). + + +test_result(Result, Test, Data) :- + send(@rdf_test_gui, test_result, Result, Test, Data), + ( Result == fail, verbose + -> member(result(Our), Data), + length(Our, OurLength), + format('~N** Our Triples (~w)~n', OurLength), + pp(Our), + member(norm(Norm), Data), + length(Norm, NormLength), + format('~N** Normative Triples (~w)~n', NormLength), + pp(Norm) + ; true + ). + + + +start_tests :- + send(@rdf_test_gui, clear). + +report_results :- + send(@rdf_test_gui, summarise). + +run :- + set_verbose, + get(@rdf_test_gui, member, browser, B), + get(B, selection, DI), + get(DI, key, Test), + run_test(Test). + + + /******************************* + * SHOW A FILE * + *******************************/ + + +show(File) :- + rdf_diagram_from_file(File). + + + /******************************* + * COMPARING * + *******************************/ + +% compare_triples(+PlRDF, +NTRDF, -Substitions) +% +% Compare two models and if they are equal, return a list of +% PlID = NTID, mapping NodeID elements. + + +compare_triples(A, B, Substitutions) :- + compare_list(A, B, [], Substitutions). + +compare_list([], [], S, S). +compare_list(L1, L2, S0, S) :- + take_bag(L1, B1, E1, R1), !, + take_bag(L2, B2, E2, R2), + compare_field(B1, B2, S0, S1), + compare_bags(E1, E2, S1, S2), + compare_list(R1, R2, S2, S). +compare_list([H1|T1], In2, S0, S) :- + select(H2, In2, T2), + compare_triple(H1, H2, S0, S1), % put(.), flush_output, + compare_list(T1, T2, S1, S). + +compare_triple(rdf(Subj1,P1,O1), rdf(Subj2, P2, O2), S0, S) :- + compare_field(Subj1, Subj2, S0, S1), + compare_field(P1, P2, S1, S2), + compare_field(O1, O2, S2, S). + +compare_field(X, X, S, S) :- !. +compare_field(literal(X), xml(X), S, S) :- !. % TBD +compare_field(rdf:Name, Atom, S, S) :- + atom(Atom), + rdf_parser:rdf_name_space(NS), + atom_concat(NS, Name, Atom), !. +compare_field(NS:Name, Atom, S, S) :- + atom(Atom), + atom_concat(NS, Name, Atom), !. +compare_field(X, node(Id), S, S) :- + memberchk(X=Id, S), !. +compare_field(X, node(Id), S, [X=Id|S]) :- + \+ memberchk(X=_, S), + atom(X), + generated_prefix(Prefix), + sub_atom(X, 0, _, _, Prefix), !, + feedback('Assume ~w = ~w~n', [X, node(Id)]). + +generated_prefix(Prefix) :- + rdf_truple:anon_base(Prefix). + +% compare_bags(+Members1, +Members2, +S0, -S) +% +% Order of _1, _2, etc. are not relevant in BadID reification. Are +% they in general? Anyway, we'll normalise the order of the bags + +compare_bags([], [], S, S). +compare_bags([E1|T1], M, S0, S) :- + select(E2, M, T2), + compare_field(E1, E2, S0, S1), + compare_bags(T1, T2, S1, S). + +take_bag(Triples, Bag, Elems, RestTriples) :- + select(rdf(Bag, Type, BagClass), Triples, T1), + compare_field(rdf:type, Type, [], []), + compare_field(rdf:'Bag', BagClass, [], []), + bag_members(T1, Bag, Elems, RestTriples). + +bag_members([], _, [], []). +bag_members([rdf(Bag, IsElm, E)|T], Bag, [E|ET], Rest) :- + member_prop(IsElm), !, + bag_members(T, Bag, ET, Rest). +bag_members([T0|T], Bag, Elems, [T0|R]) :- + bag_members(T, Bag, Elems, R). + +member_prop(rdf:Name) :- + atom_codes(Name, [0'_|Codes]), + number_codes(_N, Codes), !. +member_prop(Prop) :- + atom(Prop), + rdf_parser:rdf_name_space(NS), + atom_concat(NS, Name, Prop), + atom_codes(Name, [0'_|Codes]), + number_codes(_N, Codes), !. + + +% feedback(+Format, +Args) +% +% Print if verbose + +feedback(Fmt, Args) :- + verbose, !, + format(user_error, Fmt, Args). +feedback(_, _). diff --git a/packages/sgml/RDF/write_test.pl b/packages/sgml/RDF/write_test.pl new file mode 100644 index 000000000..1b6e06029 --- /dev/null +++ b/packages/sgml/RDF/write_test.pl @@ -0,0 +1,155 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2007, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(test_rdf_write, + [ run_tests/0, + run_tests/1 + ]). + +:- asserta(user:file_search_path(foreign, '..')). +:- asserta(user:file_search_path(foreign, '../../semweb')). +:- asserta(user:file_search_path(library, '../..')). +:- asserta(user:file_search_path(library, '..')). +:- asserta(user:file_search_path(library, '.')). +:- asserta(user:file_search_path(library, '../../plunit')). + +:- use_module(library(plunit)). +:- use_module(library(rdf_write)). +:- use_module(library(sgml)). +:- use_module(library(lists)). +:- use_module(library(debug)). +:- use_module(library(semweb/rdf_db)). +:- use_module(rdf). + + + /******************************* + * ROUND TRIP * + *******************************/ + +test_graph(Triples) :- + tmp_file(rdf, Tmp), + open(Tmp, write, Out, [encoding(utf8)]), + rdf_write_xml(Out, Triples), + close(Out), + load_rdf(Tmp, ReadTriples), + delete_file(Tmp), + compare_triples(Triples, ReadTriples, _). + + + /******************************* + * COMPARING * + *******************************/ + +% compare_triples(+PlRDF, +NTRDF, -Substitions) +% +% Compare two models and if they are equal, return a list of +% PlID = NTID, mapping NodeID elements. + + +compare_triples(A, B, Substitutions) :- + compare_list(A, B, [], Substitutions), !. + +compare_list([], [], S, S). +compare_list([H1|T1], In2, S0, S) :- + select(H2, In2, T2), + compare_triple(H1, H2, S0, S1), + compare_list(T1, T2, S1, S). + +compare_triple(rdf(Subj1,P1,O1), rdf(Subj2, P2, O2), S0, S) :- + compare_field(Subj1, Subj2, S0, S1), + compare_field(P1, P2, S1, S2), + compare_field(O1, O2, S2, S). + +compare_field(X, X, S, S) :- !. +compare_field(literal(X), xml(X), S, S) :- !. % TBD +compare_field(rdf:Name, Atom, S, S) :- + atom(Atom), + rdf_parser:rdf_name_space(NS), + atom_concat(NS, Name, Atom), !. +compare_field(NS:Name, Atom, S, S) :- + atom(Atom), + atom_concat(NS, Name, Atom), !. +compare_field(X, Id, S, S) :- + memberchk(X=Id, S), !. +compare_field(X, Y, S, [X=Y|S]) :- + \+ memberchk(X=_, S), + rdf_is_bnode(X), + rdf_is_bnode(Y), + debug(bnode, 'Assume ~w = ~w~n', [X, Y]). + + + /******************************* + * TESTS * + *******************************/ + +:- begin_tests(rdf_write). + +test(1, true) :- + test_graph([ rdf(s, p, o) + ]). +test(anon_s, true) :- + test_graph([ rdf('__s', p, o) + ]). +test(anon_o, true) :- + test_graph([ rdf(s, p, '__o') + ]). +test(anon_loop, blocked('NodeID map must check for cycles')) :- + test_graph([ rdf('__r1', p1, '__r2'), + rdf('__r2', p1, '__r1') + ]). +test(anon_loop, true) :- + test_graph([ rdf('__r1', p1, '__r2'), + rdf('__r1', p2, '__r2'), + rdf('__r2', p1, '__r1'), + rdf('__r2', p2, '__r1') + ]). +test(anon_reuse, true) :- + test_graph([ rdf('__s1', p1, '__o1'), + rdf('__s2', p1, '__o1') + ]). +test(anon_reuse, true) :- + test_graph([ rdf('__s1', p1, '__o1'), + rdf('__s2', p1, '__o1'), + rdf('__o1', name, literal(foo)) + ]). +test(literal, true) :- + test_graph([ rdf(s, p, literal(hello)) + ]). +test(lang, true) :- + test_graph([ rdf(s, p, literal(lang(en, hello))) + ]). +test(type, true) :- + test_graph([ rdf(s, p, literal(type(t, hello))) + ]). + +:- end_tests(rdf_write). + + diff --git a/packages/sgml/TODO b/packages/sgml/TODO new file mode 100644 index 000000000..b1932803e --- /dev/null +++ b/packages/sgml/TODO @@ -0,0 +1,26 @@ +TODO LIST: + + * Handling of external entities (both param and normal) in Prolog + * Check ID/IDREF + * Donot use quoted values for tag identifications. Right now the + following leads to bad error handling: + + +]> + +& diff --git a/packages/sgml/Test/att.xml b/packages/sgml/Test/att.xml new file mode 100644 index 000000000..740795cfd --- /dev/null +++ b/packages/sgml/Test/att.xml @@ -0,0 +1,3 @@ + + + diff --git a/packages/sgml/Test/badxmlent.xml b/packages/sgml/Test/badxmlent.xml new file mode 100644 index 000000000..c25d3fc04 --- /dev/null +++ b/packages/sgml/Test/badxmlent.xml @@ -0,0 +1,5 @@ + + + + John & Mary + diff --git a/packages/sgml/Test/bar.sgml b/packages/sgml/Test/bar.sgml new file mode 100644 index 000000000..82ec34364 --- /dev/null +++ b/packages/sgml/Test/bar.sgml @@ -0,0 +1,8 @@ + + +]> + + diff --git a/packages/sgml/Test/bat.sgml b/packages/sgml/Test/bat.sgml new file mode 100644 index 000000000..14ad1d490 --- /dev/null +++ b/packages/sgml/Test/bat.sgml @@ -0,0 +1,68 @@ + + + + + + +]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/sgml/Test/cdata.sgml b/packages/sgml/Test/cdata.sgml new file mode 100644 index 000000000..6bad9ae57 --- /dev/null +++ b/packages/sgml/Test/cdata.sgml @@ -0,0 +1,8 @@ + +]> + + + + + diff --git a/packages/sgml/Test/ce.sgml b/packages/sgml/Test/ce.sgml new file mode 100644 index 000000000..68e5b7ea0 --- /dev/null +++ b/packages/sgml/Test/ce.sgml @@ -0,0 +1,7 @@ + +]> + + +Test for handling character entities: A &#RS; + diff --git a/packages/sgml/Test/cent-nul.xml b/packages/sgml/Test/cent-nul.xml new file mode 100644 index 000000000..06a795751 --- /dev/null +++ b/packages/sgml/Test/cent-nul.xml @@ -0,0 +1 @@ +This content holds a � byte that should be skipped diff --git a/packages/sgml/Test/cent-utf8.xml b/packages/sgml/Test/cent-utf8.xml new file mode 100644 index 000000000..80f4939ad --- /dev/null +++ b/packages/sgml/Test/cent-utf8.xml @@ -0,0 +1,19 @@ + + + + + + + + +]> + +From Española -- a ‘test’ for you. +From Española -- a ‘test’ for you. +From &townname; -- &scarequote1;. +From &townname; -- &scarequote2;. + diff --git a/packages/sgml/Test/cmt.sgml b/packages/sgml/Test/cmt.sgml new file mode 100644 index 000000000..3ae403c46 --- /dev/null +++ b/packages/sgml/Test/cmt.sgml @@ -0,0 +1,13 @@ + + + + + ] +> + + + + diff --git a/packages/sgml/Test/comment.xml b/packages/sgml/Test/comment.xml new file mode 100644 index 000000000..050b8d277 --- /dev/null +++ b/packages/sgml/Test/comment.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/packages/sgml/Test/conref.sgml b/packages/sgml/Test/conref.sgml new file mode 100644 index 000000000..19b75e4fa --- /dev/null +++ b/packages/sgml/Test/conref.sgml @@ -0,0 +1,8 @@ + + + +]> + + + diff --git a/packages/sgml/Test/conref2.sgml b/packages/sgml/Test/conref2.sgml new file mode 100644 index 000000000..5684709d8 --- /dev/null +++ b/packages/sgml/Test/conref2.sgml @@ -0,0 +1,10 @@ + + + +]> +1. + +3. + +5. diff --git a/packages/sgml/Test/crlf.sgml b/packages/sgml/Test/crlf.sgml new file mode 100644 index 000000000..fe99410ad --- /dev/null +++ b/packages/sgml/Test/crlf.sgml @@ -0,0 +1,11 @@ + + + + + +]> +par 1 + +par 2 diff --git a/packages/sgml/Test/defent.sgml b/packages/sgml/Test/defent.sgml new file mode 100644 index 000000000..bb4a9c719 --- /dev/null +++ b/packages/sgml/Test/defent.sgml @@ -0,0 +1,10 @@ + + +]> + +A +B C@ D +&b +B +&c diff --git a/packages/sgml/Test/entent.sgml b/packages/sgml/Test/entent.sgml new file mode 100644 index 000000000..8c3c06830 --- /dev/null +++ b/packages/sgml/Test/entent.sgml @@ -0,0 +1,10 @@ + + + + + +]> + + +&o; diff --git a/packages/sgml/Test/estag.sgml b/packages/sgml/Test/estag.sgml new file mode 100644 index 000000000..d746fbeea --- /dev/null +++ b/packages/sgml/Test/estag.sgml @@ -0,0 +1,10 @@ + + + + +]> + + +&o; + diff --git a/packages/sgml/Test/foo.sgml b/packages/sgml/Test/foo.sgml new file mode 100644 index 000000000..7e770176b --- /dev/null +++ b/packages/sgml/Test/foo.sgml @@ -0,0 +1,7 @@ + + + +]> +Neddie Seagoon +Minnie Bannister diff --git a/packages/sgml/Test/i.sgml b/packages/sgml/Test/i.sgml new file mode 100644 index 000000000..e17d51799 --- /dev/null +++ b/packages/sgml/Test/i.sgml @@ -0,0 +1,12 @@ +"> + + "> + + + +]> + + +&Q2;! Hello &QS;you there&QE;, I'm &S; + diff --git a/packages/sgml/Test/layout.xml b/packages/sgml/Test/layout.xml new file mode 100644 index 000000000..53b10a4a4 --- /dev/null +++ b/packages/sgml/Test/layout.xml @@ -0,0 +1,11 @@ + + + +
      +
    • Line one
    • +
    • Line with emphasised text
    • +
    + +

    This is a nice paragraph with some bold text

    + +
    diff --git a/packages/sgml/Test/mapbug.sgml b/packages/sgml/Test/mapbug.sgml new file mode 100644 index 000000000..6bc931790 --- /dev/null +++ b/packages/sgml/Test/mapbug.sgml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + +]> +The "first" paragraph. + +The "%2%nd" paragraph. + +The %3%rd paragraph. + diff --git a/packages/sgml/Test/ment.sgml b/packages/sgml/Test/ment.sgml new file mode 100644 index 000000000..636d80042 --- /dev/null +++ b/packages/sgml/Test/ment.sgml @@ -0,0 +1,7 @@ + + +]> +One: &one;. +Two: &two;. +Three: &three;. diff --git a/packages/sgml/Test/minus2.xml b/packages/sgml/Test/minus2.xml new file mode 100644 index 000000000..5af6f644e --- /dev/null +++ b/packages/sgml/Test/minus2.xml @@ -0,0 +1,3 @@ + + + diff --git a/packages/sgml/Test/netc.sgml b/packages/sgml/Test/netc.sgml new file mode 100644 index 000000000..b7e3b0c32 --- /dev/null +++ b/packages/sgml/Test/netc.sgml @@ -0,0 +1,12 @@ + + + + + +]> + +This slash should end netc:/ + diff --git a/packages/sgml/Test/ng.sgml b/packages/sgml/Test/ng.sgml new file mode 100644 index 000000000..e4c2d08e0 --- /dev/null +++ b/packages/sgml/Test/ng.sgml @@ -0,0 +1,10 @@ + + +]> + + + diff --git a/packages/sgml/Test/noent.sgml b/packages/sgml/Test/noent.sgml new file mode 100644 index 000000000..c543689d0 --- /dev/null +++ b/packages/sgml/Test/noent.sgml @@ -0,0 +1,14 @@ + + + + + ]> + +

    This demonstrates the problem: +

    +  status = DOM_operation, &result);
    +
    +

    I hope this helps. + diff --git a/packages/sgml/Test/not.sgml b/packages/sgml/Test/not.sgml new file mode 100644 index 000000000..9018c2064 --- /dev/null +++ b/packages/sgml/Test/not.sgml @@ -0,0 +1,13 @@ + + + + + +]> + + + +$$E = MC^2$$ + + diff --git a/packages/sgml/Test/ok/amp.ok b/packages/sgml/Test/ok/amp.ok new file mode 100644 index 000000000..4672c0ca5 --- /dev/null +++ b/packages/sgml/Test/ok/amp.ok @@ -0,0 +1 @@ +[element(test, [], [element(p, [], [&])])]. diff --git a/packages/sgml/Test/ok/att.ok b/packages/sgml/Test/ok/att.ok new file mode 100644 index 000000000..181b871bb --- /dev/null +++ b/packages/sgml/Test/ok/att.ok @@ -0,0 +1,2 @@ +[element(foo, [bar='10'], [])]. +[]. diff --git a/packages/sgml/Test/ok/badxmlent.ok b/packages/sgml/Test/ok/badxmlent.ok new file mode 100644 index 000000000..66de8fb72 --- /dev/null +++ b/packages/sgml/Test/ok/badxmlent.ok @@ -0,0 +1,2 @@ +[element(test, [a='John & Mary'], ['\n John & Mary\n'])]. +[sgml(sgml_parser(1949540), 'badxmlent.xml', 3, 'Syntax error: Illegal entity, found "& Mary""'), sgml(sgml_parser(1949540), 'badxmlent.xml', 4, 'Syntax error: Illegal entity, found "& "')]. diff --git a/packages/sgml/Test/ok/bar.ok b/packages/sgml/Test/ok/bar.ok new file mode 100644 index 000000000..6764b91e4 --- /dev/null +++ b/packages/sgml/Test/ok/bar.ok @@ -0,0 +1 @@ +[element(bar, [a='Major Mynah', b=abc, c='12', d='foo bar ugh'], [])]. diff --git a/packages/sgml/Test/ok/bat.ok b/packages/sgml/Test/ok/bat.ok new file mode 100644 index 000000000..2b020e170 --- /dev/null +++ b/packages/sgml/Test/ok/bat.ok @@ -0,0 +1,2 @@ +[element(bat, [], [element(x, [a=foo], []), element(x, [a=bar], []), element(x, [a='foo&bar'], []), element(x, [a='file.cgi?y=1'], []), element(x, [b=en], []), element(x, [b=en], []), element(x, [b='en en'], []), element(x, [c=[en]], []), element(x, [c=[en]], []), element(x, [c=[en, en]], []), element(x, [c=[un]], []), element(x, [c=['12']], []), element(x, [d='an-id'], []), element(x, [d='an*id'], []), element(x, [d='*id*'], []), element(x, [d='an id'], []), element(x, [e='an-id'], []), element(x, [e='un-id'], []), element(x, [f=['']], []), element(x, [f=['an-id']], []), element(x, [f=['an-id', 'an-id']], []), element(x, [g='1'], []), element(x, [g=''], []), element(x, [g='a-rather-long-name'], []), element(x, [g='a%name%with%percents'], []), element(x, [g='a name'], []), element(x, [g='a-name'], []), element(x, [h=['']], []), element(x, [h=[a]], []), element(x, [h=[name]], []), element(x, [h=[a, name]], []), element(x, [k='1'], []), element(x, [k='999999999999999999999999999999999999999999999'], []), element(x, [k=0], []), element(x, [k=0], []), element(x, [k=0], []), element(x, [n=[one, two]], []), element(x, [n=['1a', '2a']], []), element(x, [n=['1*ft', '2*in']], []), element(x, [o=no], []), element(x, [o=un], []), element(x, [p='--a--'], []), element(x, [p='--b--'], []), element(x, [p=' --a-- '], [])])]. +[sgml(sgml_parser(423746), 'bat.sgml', 27, 'Syntax error: Attribute value requires quotes, found "foo&bar"'), sgml(sgml_parser(423746), 'bat.sgml', 28, 'Syntax error: Attribute value requires quotes, found "file.cgi?y=1"'), sgml(sgml_parser(423746), 'bat.sgml', 30, 'Element "x" has no attribute with value "en"'), sgml(sgml_parser(423746), 'bat.sgml', 30, 'Syntax error: Bad attribute list, found "b=en en"'), sgml(sgml_parser(423746), 'bat.sgml', 33, 'Element "x" has no attribute with value "en"'), sgml(sgml_parser(423746), 'bat.sgml', 33, 'Syntax error: Bad attribute list, found "c=en en"'), sgml(sgml_parser(423746), 'bat.sgml', 36, 'Syntax error: entity NAMES expected, found "12"'), sgml(sgml_parser(423746), 'bat.sgml', 38, 'Syntax error: Attribute value requires quotes, found "an*id"'), sgml(sgml_parser(423746), 'bat.sgml', 38, 'Syntax error: NAME expected, found "an*id"'), sgml(sgml_parser(423746), 'bat.sgml', 39, 'Syntax error: Attribute value requires quotes, found "*id*"'), sgml(sgml_parser(423746), 'bat.sgml', 39, 'Syntax error: NAME expected, found "*id*"'), sgml(sgml_parser(423746), 'bat.sgml', 43, 'Syntax error: NAMES expected, found """"'), sgml(sgml_parser(423746), 'bat.sgml', 46, 'Syntax error: NAME expected, found "1"'), sgml(sgml_parser(423746), 'bat.sgml', 47, 'Syntax error: NAME expected, found "\'\'"'), sgml(sgml_parser(423746), 'bat.sgml', 49, 'Syntax error: Attribute value requires quotes, found "a%name%with%percents"'), sgml(sgml_parser(423746), 'bat.sgml', 49, 'Syntax error: NAME expected, found "a%name%with%percents"'), sgml(sgml_parser(423746), 'bat.sgml', 52, 'Syntax error: NAMES expected, found """"'), sgml(sgml_parser(423746), 'bat.sgml', 58, 'Syntax error: NUMBER expected, found "1.2"'), sgml(sgml_parser(423746), 'bat.sgml', 59, 'Syntax error: NUMBER expected, found ""1.2""'), sgml(sgml_parser(423746), 'bat.sgml', 60, 'Syntax error: NUMBER expected, found ""-1.2""'), sgml(sgml_parser(423746), 'bat.sgml', 61, 'Syntax error: NUTOKENS expected, found ""one two""'), sgml(sgml_parser(423746), 'bat.sgml', 63, 'Syntax error: NUTOKENS expected, found ""1*ft 2*in""'), sgml(sgml_parser(423746), 'bat.sgml', 66, 'Element "x" has no attribute "p"')]. diff --git a/packages/sgml/Test/ok/cdata.ok b/packages/sgml/Test/ok/cdata.ok new file mode 100644 index 000000000..0c88d88e8 --- /dev/null +++ b/packages/sgml/Test/ok/cdata.ok @@ -0,0 +1,2 @@ +[element(test, [], ['[Ora Lassila]'])]. +[]. diff --git a/packages/sgml/Test/ok/ce.ok b/packages/sgml/Test/ok/ce.ok new file mode 100644 index 000000000..fa7340c2c --- /dev/null +++ b/packages/sgml/Test/ok/ce.ok @@ -0,0 +1 @@ +[element(test, [], ['Test for handling character entities: A \n'])]. diff --git a/packages/sgml/Test/ok/cent-nul.ok b/packages/sgml/Test/ok/cent-nul.ok new file mode 100644 index 000000000..113b25eb1 --- /dev/null +++ b/packages/sgml/Test/ok/cent-nul.ok @@ -0,0 +1,2 @@ +[element(test, [], ['This content holds a byte that should be skipped'])]. +[sgml(sgml_parser(482992), 'cent-nul.xml', 1, 'Syntax error: Bad character entity, found "#0"')]. diff --git a/packages/sgml/Test/ok/cent-utf8.ok b/packages/sgml/Test/ok/cent-utf8.ok new file mode 100644 index 000000000..19a4c6b3c --- /dev/null +++ b/packages/sgml/Test/ok/cent-utf8.ok @@ -0,0 +1,2 @@ +[element(testdoc, [id='t7-20020923', resp='MSM'], ['\n', element(names, [], ['From Española -- a ‘test’ for you.']), '\n', element(nums, [], ['From Española -- a ‘test’ for you.']), '\n', element(names, [], ['From Española -- a ‘test’ for you.']), '\n', element(nums, [], ['From Española -- a ‘test’ for you.']), '\n'])]. +[]. diff --git a/packages/sgml/Test/ok/cmt.ok b/packages/sgml/Test/ok/cmt.ok new file mode 100644 index 000000000..7b6ac92ae --- /dev/null +++ b/packages/sgml/Test/ok/cmt.ok @@ -0,0 +1 @@ +[element(a, [], [])]. diff --git a/packages/sgml/Test/ok/comment.ok b/packages/sgml/Test/ok/comment.ok new file mode 100644 index 000000000..5149c333e --- /dev/null +++ b/packages/sgml/Test/ok/comment.ok @@ -0,0 +1,2 @@ +[element(test, [], ['\n \n \n \n \n \n'])]. +[sgml(sgml_parser(1951880), 'comment.xml', 5, 'Syntax error: Illegal comment, found " + + + +This is rcdata &hello; & &world + + +This is an a + + diff --git a/packages/sgml/Test/rdefent.sgml b/packages/sgml/Test/rdefent.sgml new file mode 100644 index 000000000..158dd815f --- /dev/null +++ b/packages/sgml/Test/rdefent.sgml @@ -0,0 +1,9 @@ + + + + %latin; + +]> + +  diff --git a/packages/sgml/Test/rsre.sgml b/packages/sgml/Test/rsre.sgml new file mode 100644 index 000000000..ffdbcd6cb --- /dev/null +++ b/packages/sgml/Test/rsre.sgml @@ -0,0 +1,18 @@ + + + + + + + + +]> +SGML does NOT add record start characters at the beginning or +record end characters at the end of entities unless they are +already there. In this document, there IS a record start +character for the first line of text and there IS a record end +character for the last line of text. The #RS character reference +should match at the beginning of each line except possibly the +first one, so we expect SEVEN (LINE) elements for this file. diff --git a/packages/sgml/Test/sdata.sgml b/packages/sgml/Test/sdata.sgml new file mode 100644 index 000000000..734d66051 --- /dev/null +++ b/packages/sgml/Test/sdata.sgml @@ -0,0 +1,8 @@ + + +]> + + +Van Alpha tot Ω + diff --git a/packages/sgml/Test/shortval.sgml b/packages/sgml/Test/shortval.sgml new file mode 100644 index 000000000..48c210b3d --- /dev/null +++ b/packages/sgml/Test/shortval.sgml @@ -0,0 +1,6 @@ + + +]> + + diff --git a/packages/sgml/Test/simple.xml b/packages/sgml/Test/simple.xml new file mode 100644 index 000000000..06b1da17e --- /dev/null +++ b/packages/sgml/Test/simple.xml @@ -0,0 +1 @@ +Some content diff --git a/packages/sgml/Test/sr.sgml b/packages/sgml/Test/sr.sgml new file mode 100644 index 000000000..26b351fe1 --- /dev/null +++ b/packages/sgml/Test/sr.sgml @@ -0,0 +1,20 @@ + + + + + "> + + + + + + +]> + + +Peter said: "He, this is a nice program". + +Bob said: "Yes, it is" + diff --git a/packages/sgml/Test/sr2.sgml b/packages/sgml/Test/sr2.sgml new file mode 100644 index 000000000..4881fc8e8 --- /dev/null +++ b/packages/sgml/Test/sr2.sgml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + ]> + + +n(a) + diff --git a/packages/sgml/Test/test.pl b/packages/sgml/Test/test.pl new file mode 100644 index 000000000..4731cc409 --- /dev/null +++ b/packages/sgml/Test/test.pl @@ -0,0 +1,163 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + +:- module(sgml_test, + [ test/1, % +File + testdir/1, % +Dir + pass/1, % +File + show/1, % +File + test/0 + ]). + +:- prolog_load_context(directory, CWD), + working_directory(_, CWD). + +:- asserta(user:file_search_path(library, '..')). +:- asserta(user:file_search_path(foreign, '..')). +:- use_module(library(sgml)). + + +test :- + testdir(.). + +testdir(Dir) :- + atom_concat(Dir, '/*', Pattern), + expand_file_name(Pattern, Files), + maplist(dotest, Files). + +dotest(File) :- + file_name_extension(_, Ext, File), + memberchk(Ext, [sgml, xml, html]), !, + test(File). +dotest(_). + +test(File) :- + format('~NTest ~w ... ', [File]), + flush_output, + load_file(File, Term), + ground(Term), % make sure + okfile(File, OkFile), + ( exists_file(OkFile) + -> load_prolog_file(OkFile, TermOk, ErrorsOk), + ( compare_dom(Term, TermOk) + -> format('ok') + ; format('WRONG'), + format('~NOK:~n'), + pp(TermOk), + format('~NANSWER:~n'), + pp(Term) + ), + error_terms(Errors), + ( compare_errors(Errors, ErrorsOk) + -> true + ; format(' [Different errors]~nOK:~n'), + pp(ErrorsOk), + format('~NANSWER:~n'), + pp(Errors) + ), + nl + ; show_errors, + format('Loaded, no validating data~n'), + pp(Term) + ). + +show(File) :- + load_file(File, Term), + pp(Term). + +pass(File) :- + load_file(File, Term), + okfile(File, OkFile), + open(OkFile, write, Fd), + format(Fd, '~q.~n', [Term]), + ( error_terms(Errors) + -> format(Fd, '~q.~n', [Errors]) + ; true + ), + close(Fd). + +:- dynamic + error/3. +:- multifile + user:message_hook/3. + +user:message_hook(Term, Kind, Lines) :- + Term = sgml(_,_,_,_), + assert(error(Term, Kind, Lines)). + +show_errors :- + ( error(_Term, Kind, Lines), + atom_concat(Kind, ': ', Prefix), + print_message_lines(user_error, Prefix, Lines), + fail + ; true + ). + +error_terms(Errors) :- + findall(Term, error(Term, _, _), Errors). + +compare_errors([], []). +compare_errors([sgml(_Parser1, _File1, Line, Msg)|T0], + [sgml(_Parser2, _File2, Line, Msg)|T]) :- + compare_errors(T0, T). + +load_file(File, Term) :- + load_pred(Ext, Pred), + file_name_extension(_, Ext, File), !, + retractall(error(_,_,_)), + call(Pred, File, Term). +load_file(Base, Term) :- + load_pred(Ext, Pred), + file_name_extension(Base, Ext, File), + exists_file(File), !, + retractall(error(_,_,_)), + call(Pred, File, Term). + + +load_pred(sgml, load_sgml_file). +load_pred(xml, load_xml_file). +load_pred(html, load_html_file). + +okfile(File, OkFile) :- + file_name_extension(Base, _, File), + file_directory_name(Base, Dir), + concat_atom([Dir, '/ok/', Base, '.ok'], OkFile). + +load_prolog_file(File, Term, Errors) :- + open(File, read, Fd, + [ encoding(utf8) + ]), + read(Fd, Term), + ( read(Fd, Errors), + Errors \== end_of_file + -> true + ; Errors = [] + ), + close(Fd). + +compare_dom([], []) :- !. +compare_dom([H1|T1], [H2|T2]) :- !, + compare_dom(H1, H2), + compare_dom(T1, T2). +compare_dom(X, X) :- !. +compare_dom(element(Name, A1, Content1), + element(Name, A2, Content2)) :- + compare_attributes(A1, A2), + compare_dom(Content1, Content2). + +compare_attributes(A1, A2) :- + sort(A1, L1), + sort(A2, L2), + L1 == L2. + + + diff --git a/packages/sgml/Test/ugh.sgml b/packages/sgml/Test/ugh.sgml new file mode 100644 index 000000000..33f54cb28 --- /dev/null +++ b/packages/sgml/Test/ugh.sgml @@ -0,0 +1,9 @@ + + + +]> + +8494 +1234 +1234 diff --git a/packages/sgml/Test/utf8-cent.xml b/packages/sgml/Test/utf8-cent.xml new file mode 100644 index 000000000..830711692 --- /dev/null +++ b/packages/sgml/Test/utf8-cent.xml @@ -0,0 +1,7 @@ + + +From Española -- a ‘test’ for you. +From Española -- a ‘test’ for you. +From Española -- a ‘test’ for you. +From Española -- a ‘test’ for you. + diff --git a/packages/sgml/Test/utf8-ru.xml b/packages/sgml/Test/utf8-ru.xml new file mode 100644 index 000000000..5059daa49 --- /dev/null +++ b/packages/sgml/Test/utf8-ru.xml @@ -0,0 +1,3 @@ + + +<Человек Ñзык="мова">Borys diff --git a/packages/sgml/Test/utf8.xml b/packages/sgml/Test/utf8.xml new file mode 100644 index 000000000..c5f3baf19 --- /dev/null +++ b/packages/sgml/Test/utf8.xml @@ -0,0 +1,10 @@ + + + + + +Dürst + + + diff --git a/packages/sgml/Test/wchar.xml b/packages/sgml/Test/wchar.xml new file mode 100644 index 000000000..4db60a540 --- /dev/null +++ b/packages/sgml/Test/wchar.xml @@ -0,0 +1,11 @@ + + +]> + +

  • Some cyrillic chars: ий
  • +
  • This a an sizzors symbol: ✄
  • +
  • OK, entered via entity: &ok;
  • + + + diff --git a/packages/sgml/Test/wcharlong.xml b/packages/sgml/Test/wcharlong.xml new file mode 100644 index 000000000..cb0317690 --- /dev/null +++ b/packages/sgml/Test/wcharlong.xml @@ -0,0 +1,16 @@ + + + +

    +This is a long test holding, with the intention to switch to wide character +encoding from the local buffer to malloc'ed buffer. This (✌) is a +piece symbol. +

    +

    +This is a long test holding, with the intention to switch to wide character +encoding after the output buffer has been switched to malloc'ed mode. For this +reason, our buffer should should hold more than 256 character, as defined in +util.h in the structure ocharbuf. This (➽) is a fat arrow. +

    +
    + diff --git a/packages/sgml/Test/wrtest.pl b/packages/sgml/Test/wrtest.pl new file mode 100644 index 000000000..44dc38e10 --- /dev/null +++ b/packages/sgml/Test/wrtest.pl @@ -0,0 +1,241 @@ +/* $Id$ + + Part of SWI-Prolog SGML/XML parser + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi.psy.uva.nl/projects/SWI-Prolog/ + Copying: LGPL-2. See the file COPYING or http://www.gnu.org + + Copyright (C) 1990-2000 SWI, University of Amsterdam. All rights reserved. +*/ + +:- prolog_load_context(directory, CWD), + working_directory(_, CWD). + +:- asserta(file_search_path(foreign, '..')). +:- asserta(file_search_path(library, '..')). + +:- use_module(library(sgml)). +:- use_module(library(sgml_write)). + +test :- % default test + fp('.'). + +test(File) :- + file_name_extension(_, xml, File), !, + load_xml_file(File, Term), + xml_write(user_output, Term, []). +test(File) :- + file_name_extension(_, sgml, File), !, + load_sgml_file(File, Term), + sgml_write(user_output, Term, []). +test(File) :- + file_name_extension(_, html, File), !, + load_html_file(File, Term), + html_write(user_output, Term, []). + +test(File, Into, Encoding) :- + file_name_extension(_, xml, File), !, + load_xml_file(File, Term), + open(Into, write, Out, [encoding(Encoding)]), + xml_write(Out, Term, []), + close(Out). + +fp(Dir) :- + atom_concat(Dir, '/*', Pattern), + expand_file_name(Pattern, Files), + ( member(File, Files), + file_name_extension(_, Ext, File), + ml_file(Ext), + file_base_name(File, Base), + \+ blocked(Base), + format(user_error, '~w ... ', [Base]), + ( \+ utf8(Base) + -> format(user_error, ' (ISO Latin-1) ... ', []), + fixed_point(File, iso_latin_1) + ; true + ), + format(user_error, ' (UTF-8) ... ', []), + fixed_point(File, utf8), + format(user_error, ' done~n', []), + fail + ; true + ). + +ml_file(xml). +ml_file(sgml). +ml_file(html). + +%% blocked(+File) +% +% List of test-files that are blocked. These are either negative +% tests or tests involving SDATA. + +blocked('bat.sgml'). +blocked('i.sgml'). +blocked('sdata.sgml'). +blocked('cent-nul.xml'). +blocked('defent.sgml'). +blocked('comment.xml'). +blocked('badxmlent.xml'). + + +%% utf8(+File) +% +% File requires UTF-8. These are files that have UTF-8 characters +% in element or attribute names. + +utf8('utf8-ru.xml'). + + +%% fixed_point(+File, +Encoding) +% +% Perform write/read round-trip and validate the data has not +% changed. + +fixed_point(File, Encoding) :- + file_name_extension(_, xml, File), !, + fp(File, Encoding, load_xml_file, xml_write). +fixed_point(File, Encoding) :- + file_name_extension(_, sgml, File), !, + fp(File, Encoding, load_sgml_file, sgml_write). +fixed_point(File, Encoding) :- + file_name_extension(_, html, File), !, + fp(File, Encoding, load_html_file, html_write). + +fp(File, Encoding, Load, Write) :- + put_char(user_error, r), + call(Load, File, Term), + tmp_file(xml, TmpFile), + open(TmpFile, write, TmpOut, [encoding(Encoding)]), + put_char(user_error, w), + call(Write, TmpOut, Term, []), + close(TmpOut), +% cat(TmpFile, Encoding), + put_char(user_error, r), + call(Load, TmpFile, Term2), + delete_file(TmpFile), + ( eq(Term, Term2) + -> true + ; format(user_error, 'First file:~n', []), + %pp(Term), + save_in_file(f1, Term), + format(user_error, 'Second file:~n', []), + %pp(Term2), + save_in_file(f2, Term2), + fail + ). + +save_in_file(File, Term) :- + open(File, write, Out, [encoding(iso_latin_1)]), + current_output(C0), + set_output(Out), + pp(Term), + set_output(C0), + close(Out). + + +cat(File, Encoding) :- + open(File, read, In, [encoding(Encoding)]), + copy_stream_data(In, current_output), + close(In). + +% eq(M1, M2) +% +% Test two terms for equivalence. The following mismatches are +% allowed: +% +% * Order of attributes +% * Layout in `element-only' content + +eq(X, X) :- !. +eq([], []) :- !. +eq([B|T], L) :- % delete blanks + blank_atom(B), !, + eq(T, L). +eq(L, [B|T]) :- + blank_atom(B), !, + eq(T, L). +eq([H1|T1], [H2|T2]) :- !, + eq(H1, H2), + eq(T1, T2). +eq(element(Name, A1, C1), element(Name, A2, C2)) :- + att_eq(A1, A2), + ceq(C1, C2). +eq(A1, A2) :- + atom(A1), + atom(A2), !, + normalise_blanks(A1, B1), + normalise_blanks(A2, B2), + ( B1 == B2 + -> true + ; format(user_error, + 'ERROR: CDATA differs:~n\ + \t~p~n\ + \t~p~n', + [B1, B2]) + ). +eq(X, Y) :- + format(user_error, + 'ERROR: Content differs:~n\ + \t~p~n\ + \t~p~n', + [X, Y]). + +att_eq(A1, A2) :- % ordering is unimportant + sort(A1, S), + sort(A2, S), !. +att_eq(A1, A2) :- + format(user_error, + 'ERROR: Attribute lists differ:~n\ + \t~p~n\ + \t~p~n', + [A1, A2]). + +ceq(C1, C2) :- + element_content(C1, E1), + element_content(C2, E2), !, + eq(E1, E2). +ceq(C1, C2) :- + eq(C1, C2). + +element_content([], []). +element_content([element(Name,Atts,C)|T0], [element(Name,Atts,C)|T]) :- !, + element_content(T0, T). +element_content([Blank|T0], T) :- + blank_atom(Blank), + element_content(T0, T). + +blank_atom(Atom) :- + atom(Atom), + atom_codes(Atom, Codes), + all_blanks(Codes). + +all_blanks([]). +all_blanks([H|T]) :- + code_type(H, space), + all_blanks(T). + +normalise_blanks(Atom, Normalised) :- + atom_codes(Atom, Codes), + eat_blanks(Codes, Codes1), + normalise_blanks2(Codes1, N), + atom_codes(Normalised, N). + +normalise_blanks2([], []). +normalise_blanks2([H|T0], T) :- + code_type(H, space), !, + eat_blanks(T0, T1), + ( T1 == [] + -> T = [] + ; T = [32|T2], + normalise_blanks2(T1, T2) + ). +normalise_blanks2([H|T0], [H|T]) :- + normalise_blanks2(T0, T). + +eat_blanks([H|T0], T) :- + code_type(H, space), !, + eat_blanks(T0, T). +eat_blanks(L, L). diff --git a/packages/sgml/VERSION b/packages/sgml/VERSION new file mode 100644 index 000000000..2165f8f9b --- /dev/null +++ b/packages/sgml/VERSION @@ -0,0 +1 @@ +2.0.4 diff --git a/packages/sgml/catalog.c b/packages/sgml/catalog.c new file mode 100644 index 000000000..80e9dcebb --- /dev/null +++ b/packages/sgml/catalog.c @@ -0,0 +1,672 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker and Richard O'Keefe + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ +#include "util.h" +#include "catalog.h" +#include +#include +#include +#include +#define DTD_MINOR_ERRORS 1 +#include /* error codes */ + +#ifdef __WINDOWS__ +#define swprintf _snwprintf +#endif + +#ifdef _REENTRANT +#include + +static pthread_mutex_t catalog_mutex = PTHREAD_MUTEX_INITIALIZER; +#define LOCK() pthread_mutex_lock(&catalog_mutex) +#define UNLOCK() pthread_mutex_unlock(&catalog_mutex) +#else +#define LOCK() +#define UNLOCK() +#endif + +#ifndef MAXPATHLEN +#define MAXPATHLEN 1024 +#endif +#ifndef MAXLINE +#define MAXLINE 1024 +#endif +#ifndef EOS +#define EOS '\0' +#endif +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +#define streq(s1, s2) istreq(s1, s2) +#define uc(p) (*(p)) + +typedef struct catalogue_item *catalogue_item_ptr; +struct catalogue_item +{ catalogue_item_ptr next; + int kind; + ichar const *target; + ichar const *replacement; +}; + +static catalogue_item_ptr first_item = 0, last_item = 0; + +typedef struct _catalog_file +{ ichar *file; + struct _catalog_file *next; + int loaded; /* did we parse this file? */ + catalogue_item_ptr first_item; /* List of items in the file */ + catalogue_item_ptr last_item; +} catalog_file; + +static catalog_file *catalog; + +#ifdef __WINDOWS__ +#define isDirSep(c) ((c) == '/' || (c) == '\\') +#define DIRSEPSTR L"\\" +#else +#define isDirSep(c) ((c) == '/') +#define DIRSEPSTR L"/" +#endif + +static ichar * +DirName(const ichar *f, ichar *dir) +{ const ichar *base, *p; + + for (base = p = f; *p; p++) + { if (isDirSep(*p) && p[1] != EOS) + base = p; + } + if (base == f) + { if (isDirSep(*f)) + istrcpy(dir, DIRSEPSTR); + else + istrcpy(dir, L"."); + } else + { istrncpy(dir, f, base - f); + dir[base - f] = EOS; + } + + return dir; +} + + +int +is_absolute_path(const ichar *name) +{ if (isDirSep(name[0]) +#ifdef __WINDOWS__ + || (iswalpha(uc(name)) && name[1] == ':') +#endif + ) + return TRUE; + + return FALSE; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +localpath() creates an absolute path for name relative to ref. The +returned path must be freed using sgml_free() when done. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +ichar * +localpath(const ichar *ref, const ichar *name) +{ ichar *local; + + if (!ref || is_absolute_path(name)) + local = istrdup(name); + else + { ichar buf[MAXPATHLEN]; + + DirName(ref, buf); + istrcat(buf, DIRSEPSTR); + istrcat(buf, name); + + local = istrdup(buf); + } + + if (!local) + sgml_nomem(); + + return local; +} + + +static int +register_catalog_file_unlocked(const ichar *file, catalog_location where) +{ catalog_file **f = &catalog; + catalog_file *cf; + + for (; *f; f = &(*f)->next) + { cf = *f; + + if (istreq(cf->file, file)) + return TRUE; /* existing, move? */ + } + + cf = sgml_malloc(sizeof(*cf)); + memset(cf, 0, sizeof(*cf)); + cf->file = istrdup(file); + if (!cf->file) + sgml_nomem(); + + if (where == CTL_END) + { cf->next = NULL; + *f = cf; + } else + { cf->next = catalog; + catalog = cf; + } + + return TRUE; +} + + +static wchar_t * +wgetenv(const char *name) +{ const char *vs; + + if ( (vs = getenv(name)) ) + { size_t wl = mbstowcs(NULL, vs, 0); + + if ( wl > 0 ) + { wchar_t *ws = sgml_malloc((wl+1)*sizeof(wchar_t)); + mbstowcs(ws, vs, wl+1); + + return ws; + } + } + + return NULL; +} + + +static void +init_catalog(void) +{ static int done = FALSE; + + LOCK(); + if ( !done++ ) + { ichar *path = wgetenv("SGML_CATALOG_FILES"); + + if (!path) + { UNLOCK(); + return; + } + + while (*path) + { ichar buf[MAXPATHLEN]; + ichar *s; + + if ((s = istrchr(path, L':'))) + { istrncpy(buf, path, s - path); + buf[s - path] = '\0'; + path = s + 1; + if ( buf[0] ) /* skip empty entries */ + register_catalog_file_unlocked(buf, CTL_START); + } else + { if ( path[0] ) /* skip empty entries */ + register_catalog_file_unlocked(path, CTL_START); + break; + } + } + } + UNLOCK(); +} + + +int +register_catalog_file(const ichar *file, catalog_location where) +{ int rc; + + init_catalog(); + + LOCK(); + rc = register_catalog_file_unlocked(file, where); + UNLOCK(); + + return rc; +} + + + /******************************* + * CATALOG FILE PARSING * + *******************************/ + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +The code from here to the end of this file was written by Richard +O'Keefe and modified by Jan Wielemaker to fit in with the rest of the +parser. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#include +#include +#include +#include + +/* OVERRIDE YES/NO + sets a boolean flag initialised to NO. + The value of this flag is stored as part of each entry. + (PUBLIC|DOCTYPE|ENTITY)&YES will match whether a system identifier + was provided in the source document or not; + (PUBLIC|DOCTYPE|ENTITY)&NO will only match if a system identifier + was not provided. +*/ + +/* catalogue = + ( PUBLIC pubid filename + | SYSTEM sysid filename + | DOCTYPE name filename + | ENTITY name filename + | OVERRIDE YES + | OVERRIDE NO + | BASE filename + | junk + )* +*/ + + +/* Keywords are matched ignoring case. */ + +static int +ci_streql(ichar const *a, ichar const *b) +{ return istrcaseeq(a, b); +} + +/* Names may be matched heading case in XML. */ + +static int +cs_streql(ichar const *a, ichar const *b) +{ return istreq(a, b); +} + +/* Any other word or any quoted string is reported as CAT_OTHER. + When we are not looking for the beginning of an entry, the only + positive outcome is CAT_OTHER. +*/ + +static int +scan_overflow(size_t buflen) +{ gripe(ERC_REPRESENTATION, L"token length"); + + return EOF; +} + +static int +scan(FILE* src, ichar *buffer, size_t buflen, int kw_expected) +{ int c, q; + ichar *p = buffer, *e = p + buflen - 1; + + for (;;) + { c = getc(src); + if (c <= ' ') + { if (c < 0) + return EOF; + continue; + } + if (c == '-') + { c = getc(src); + if (c != '-') + { *p++ = '-'; + break; + } + for (;;) + { c = getc(src); + if (c < 0) + return EOF; + if (c == '-') + { c = getc(src); + if (c < 0) + return EOF; + if (c == '-') + break; + } + } + continue; + } + if (c == '"' || c == '\'') + { q = c; + for (;;) + { c = getc(src); + if (c < 0) + return EOF; + if (c == q) + { *p = '\0'; + return CAT_OTHER; + } + if (p == e) + return scan_overflow(buflen); + *p++ = c; + } + } + break; + } + /* We reach here if there is an unquoted token. */ + /* Don't try "PUBLIC--well/sortof--'foo' 'bar'" */ + /* because hyphens are allowed in unquoted words */ + /* and so are slashes and a bunch of other stuff. */ + /* To keep this code simple, an unquoted token */ + /* ends at EOF, ', ", or layout. */ + while (c > ' ' && c != '"' && c != '\'') + { if (p == e) + return scan_overflow(buflen); + *p++ = c; + c = getc(src); + } + *p = '\0'; + if (kw_expected) + { if (ci_streql(buffer, L"public")) + return CAT_PUBLIC; + if (ci_streql(buffer, L"system")) + return CAT_SYSTEM; + if (ci_streql(buffer, L"entity")) + return CAT_ENTITY; + if (ci_streql(buffer, L"doctype")) + return CAT_DOCTYPE; + if (ci_streql(buffer, L"override")) + return CAT_OVERRIDE; + if (ci_streql(buffer, L"base")) + return CAT_BASE; + } + return CAT_OTHER; +} + +/* The strings can represent names (taken verbatim), + system identifiers (ditto), or public identifiers (squished). + We need to squish, and we need to copy. When it comes to + squishing, we don't need to worry about Unicode spaces, + because public identifiers aren't allow to have any characters + that aren't in ASCII. +*/ + +static void +squish(ichar *pubid) +{ ichar const *s = (ichar const *) pubid; + ichar *d = (ichar *) pubid; + ichar c; + int w; + + w = 1; + while ((c = *s++) != '\0') + { if (c <= ' ') + { if (!w) + *d++ = ' ', w = 1; + } else + { *d++ = c, w = 0; + } + } + if (w && d != (ichar *) pubid) + d--; + *d = '\0'; +} + +/* We represent a catalogue internally by a list of + (CAT_xxx, string, string) + triples. +*/ + +static void +load_one_catalogue(catalog_file * file) +{ FILE *src = wfopen(file->file, "r"); + ichar buffer[2 * FILENAME_MAX]; + ichar base[2 * FILENAME_MAX]; + ichar *p; + int t; + catalogue_item_ptr this_item; + int override = 0; + + if ( !src ) + { gripe(ERC_NO_CATALOGUE, file->file); + return; + } + + (void) istrcpy(base, file->file); + p = base + istrlen(base); + while (p != base && !isDirSep(p[-1])) + p--; + + for (;;) + { t = scan(src, buffer, sizeof(buffer), 1); + switch (t) + { case CAT_BASE: + if (scan(src, buffer, sizeof(buffer), 0) == EOF) + break; + (void) istrcpy(base, buffer); + p = base + istrlen(base); + if (p != base && !isDirSep(p[-1])) + *p++ = '/'; + continue; + case CAT_OVERRIDE: + if (scan(src, buffer, sizeof(buffer), 0) == EOF) + break; + override = towlower(buffer[0]) == 'y' ? CAT_OVERRIDE : 0; + continue; + case CAT_PUBLIC: + case CAT_SYSTEM: + case CAT_ENTITY: + case CAT_DOCTYPE: + this_item = sgml_malloc(sizeof *this_item); + if (scan(src, buffer, sizeof buffer, 0) == EOF) + break; + if (t == CAT_PUBLIC) + squish(buffer); + this_item->next = 0; + this_item->kind = t == CAT_SYSTEM ? t : t + override; + this_item->target = istrdup(buffer); + + if (scan(src, buffer, sizeof buffer, 0) == EOF) + break; + + if (is_absolute_path(buffer) || p == base) + { this_item->replacement = istrdup(buffer); + } else + { (void) istrcpy(p, buffer); + this_item->replacement = istrdup(base); + } + + if (file->first_item == 0) + { file->first_item = this_item; + } else + { file->last_item->next = this_item; + } + + file->last_item = this_item; + continue; + case EOF: + break; + default: + continue; + } + break; + } + + fclose(src); +} + + +/* To look up a DTD: + f = find_in_catalogue(CAT_DOCTYPE, name, pubid, sysid, ci); + If it cannot otherwise be found and name is not null, + ${name}.dtd will be returned. + + To look up a parameter entity: + f = find_in_catalogue(CAT_PENTITY, name, pubid, sysid, ci); + The name may begin with a % but need not; if it doesn't + a % will be prefixed for the search. + If it cannot otherwise be found ${name}.pen will be returned. + + To look up an ordinary entity: + f = find_in_catalogue(CAT_ENTITY, name, pubid, sysid, ci); + If the name begins with a % this is just like a CAT_PENTITY search. + If it cannot otherwise be found %{name}.ent will be returned. + + The full catalogue format allows for NOTATION (which we still need + for XML), SGMLDECL, DTDDECL, and LINKTYPE. At the moment, only + notation is plausible. To handle such things, + f = find_in_catalogue(CAT_OTHER, name, pubid, sysid, ci); + If it cannot be found, NULL is returned. + + The name, pubid, and sysid may each be NULL. It doesn't really + make sense for them all to be NULL. + + For SGML, name matching (DOCTYPE, ENTITY) should normally ignore + alphabetic case. Pass ci=1 to make this happen. For XML, name + matching must heed alphabetic case. Pass ci=0 to make that happen. + + A CAT_DOCTYPE, CAT_ENTITY, or CAT_PENTITY search doesn't really make + sense withint a name, so if the name should happen to be 0, the search + kind is converted to CAT_OTHER. +*/ + +ichar const * +find_in_catalogue(int kind, + ichar const *name, + ichar const *pubid, ichar const *sysid, int ci) +{ ichar penname[FILENAME_MAX]; + const size_t penlen = sizeof(penname)/sizeof(ichar); + catalogue_item_ptr item; + ichar const *result; + catalog_file *catfile; + + init_catalog(); + + if ( name == 0 ) + { kind = CAT_OTHER; + } else + { switch (kind) + { case CAT_OTHER: + case CAT_DOCTYPE: + break; + case CAT_PENTITY: + if (name[0] != '%') + { penname[0] = '%'; + (void) istrcpy(penname + 1, name); + name = penname; + } + break; + case CAT_ENTITY: + if (name[0] == '%') + { kind = CAT_PENTITY; + } + break; + default: + return 0; + } + } + + result = 0; + for (catfile = catalog;; catfile = catfile->next) + { if (catfile) + { if (!catfile->loaded) + { load_one_catalogue(catfile); + catfile->loaded = TRUE; + } + item = catfile->first_item; + } else + item = first_item; + + for (; item != 0; item = item->next) + { switch (item->kind) + { case CAT_PUBLIC: + if (sysid != 0) + break; + /*FALLTHROUGH*/ + case OVR_PUBLIC: + if (pubid != 0 && result == 0 && cs_streql(pubid, item->target)) + result = item->replacement; + break; + case CAT_SYSTEM: + if (sysid != 0 && cs_streql(sysid, item->target)) + return item->replacement; + break; + case CAT_DOCTYPE: + if (sysid != 0) + break; + /*FALLTHROUGH*/ + case OVR_DOCTYPE: + if (name != 0 && kind == CAT_DOCTYPE && result == 0 + && (ci ? ci_streql : cs_streql) (name, item->target)) + result = item->replacement; + break; + case CAT_ENTITY: + if (sysid != 0) + break; + /*FALLTHROUGH*/ case OVR_ENTITY: + if (name != 0 && kind >= CAT_ENTITY && result == 0 + && (ci ? ci_streql : cs_streql) (name, item->target)) + result = item->replacement; + break; + default: + break; + } + } + + if (!catfile) + break; + } + if ( result != 0 ) + return result; + if ( sysid != 0 ) + return sysid; + if ( kind == CAT_OTHER || kind == CAT_DOCTYPE ) + return 0; + + if ( istrlen(name)+4+1 > penlen ) + { gripe(ERC_REPRESENTATION, L"entity name"); + return NULL; + } + + item = sgml_malloc(sizeof(*item)); + item->next = 0; + item->kind = kind; + item->target = istrdup(name); + + switch (kind) + { case CAT_DOCTYPE: + (void) swprintf(penname, penlen, L"%ls.dtd", name); + break; + case CAT_PENTITY: + item->kind = CAT_ENTITY; + (void) swprintf(penname, penlen, L"%ls.pen", name + 1); + break; + case CAT_ENTITY: + (void) swprintf(penname, penlen, L"%ls.ent", name); + break; + default: + abort(); + } + + item->replacement = istrdup(penname); + if (first_item == 0) + { first_item = item; + } else + { last_item->next = item; + } + last_item = item; + + return item->replacement; +} + diff --git a/packages/sgml/catalog.h b/packages/sgml/catalog.h new file mode 100644 index 000000000..f85deb184 --- /dev/null +++ b/packages/sgml/catalog.h @@ -0,0 +1,64 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef CATALOG_H_INCLUDED +#define CATALOG_H_INCLUDED +#include "util.h" + +/* When we look for a token, we skip layout characters and comments. + There there is nothing left, we return EOF. + If we are looking for the beginning of an entry, the possibilities + are then +*/ + +#define CAT_OTHER (0) /* token + parameter of find... */ +#define CAT_SYSTEM (1) /* token only */ +#define CAT_PUBLIC (2) /* token only */ +#define CAT_DOCTYPE (3) /* token + parameter of find... */ +#define CAT_ENTITY (4) /* token + parameter of find... */ +#define CAT_PENTITY (5) /* parameter of find... only */ +#define CAT_OVERRIDE (5) /* token only */ +#define CAT_BASE (6) /* token only */ +#define OVR_PUBLIC (CAT_OVERRIDE + CAT_PUBLIC) +#define OVR_DOCTYPE (CAT_OVERRIDE + CAT_DOCTYPE) +#define OVR_ENTITY (CAT_OVERRIDE + CAT_ENTITY) + + +typedef enum +{ CTL_START, + CTL_END +} catalog_location; + +int register_catalog_file(const ichar *file, catalog_location where); +int is_absolute_path(const ichar *iname); +ichar *localpath(const ichar *ref, const ichar *name); +ichar const *find_in_catalogue( + int kind, + ichar const *name, + ichar const *pubid, + ichar const *sysid, + int ci +); + +#endif /*CATALOG_H_INCLUDED*/ diff --git a/packages/sgml/charmap.c b/packages/sgml/charmap.c new file mode 100644 index 000000000..af6392e59 --- /dev/null +++ b/packages/sgml/charmap.c @@ -0,0 +1,104 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include "dtd.h" + +static void +char_range(dtd_charclass *map, int from, int to, int msk) +{ unsigned char *ca; + int i; + + for(i=from, ca=map->class+i; i++<=to; ) + *ca++ |= msk; +} + + +dtd_charclass * +new_charclass() +{ dtd_charclass *map = sgml_calloc(1, sizeof(*map)); + unsigned char *ca = map->class; + + char_range(map, 'a', 'z', CH_LCLETTER); + char_range(map, 'A', 'Z', CH_LCLETTER); + char_range(map, '0', '9', CH_DIGIT); + + ca['.'] |= CH_CNM; + ca['-'] |= CH_CNM; + ca[183] |= CH_CNM; /* XML */ + ca[':'] |= CH_CNMSTRT; /* HTML and XML */ + ca['_'] |= CH_CNMSTRT; /* HTML and XML */ + + char_range(map, 192, 214, CH_CNMSTRT); /* XML ISO-LATIN-1 accented chars */ + char_range(map, 216, 246, CH_CNMSTRT); + char_range(map, 248, 255, CH_CNMSTRT); + + ca['\t'] |= CH_WHITE; + ca[' '] |= CH_WHITE; + ca['\r'] |= CH_RE; + ca['\n'] |= CH_RS; + + return map; +} + + +dtd_charfunc * +new_charfunc() +{ dtd_charfunc *f = sgml_calloc(1, sizeof(*f)); + ichar *cf = f->func; + + cf[CF_STAGO] = '<'; + cf[CF_STAGC] = '>'; + cf[CF_ETAGO1] = '<'; + cf[CF_ETAGO2] = '/'; + cf[CF_VI] = '='; + cf[CF_NS] = ':'; + cf[CF_LIT] = '"'; + cf[CF_LITA] = '\''; + cf[CF_PERO] = '%'; + cf[CF_ERO] = '&'; + cf[CF_ERC] = ';'; + cf[CF_MDO1] = '<'; + cf[CF_MDO2] = '!'; + cf[CF_MDC] = '>'; + cf[CF_PRO1] = '<'; + cf[CF_PRO2] = '?'; + cf[CF_PRC] = '>'; + cf[CF_GRPO] = '('; + cf[CF_GRPC] = ')'; + cf[CF_SEQ] = ','; + cf[CF_AND] = '&'; + cf[CF_OR] = '|'; + cf[CF_OPT] = '?'; + cf[CF_PLUS] = '+'; + cf[CF_DSO] = '['; + cf[CF_DSC] = ']'; + cf[CF_REP] = '*'; + cf[CF_RS] = '\n'; + cf[CF_RE] = '\r'; + cf[CF_CMT] = '-'; + + return f; +} diff --git a/packages/sgml/configure.in b/packages/sgml/configure.in new file mode 100644 index 000000000..c383e3835 --- /dev/null +++ b/packages/sgml/configure.in @@ -0,0 +1,114 @@ +dnl Process this file with autoconf to produce a configure script. + +AC_INIT(install-sh) +AC_PREREQ([2.50]) +AC_CONFIG_HEADER(config.h) + +AC_ARG_WITH(prolog, + [ --with-prolog=PLBINARY use given SWI Prolog binary]) + +AC_SUBST(PL) +AC_SUBST(LD) +AC_SUBST(PLLD) +AC_SUBST(PLLIBS) +AC_SUBST(PLBASE) +AC_SUBST(PLARCH) +AC_SUBST(PLINCL) +AC_SUBST(COFLAGS) +AC_SUBST(CWFLAGS) +AC_SUBST(CMFLAGS) +AC_SUBST(ETAGS) +AC_SUBST(SO) +AC_SUBST(LDSOFLAGS) +AC_SUBST(RUNTEX) + +AC_ARG_ENABLE(mt, [ --enable-mt Enable Multi-threading], + [case "$enableval" in + yes) MT=yes + ;; + *) ;; + esac]) + +AC_PROG_CC +LD=$CC + +# Do not cache this, it changes too often in many configurations +unset ac_cv_prog_PL + +if test -z "$PLINCL"; then +plcandidates="$with_prolog swi-prolog swipl pl" +AC_CHECK_PROGS(PL, $plcandidates, "none") +AC_CHECK_PROGS(PLLD, plld, "none") +if test $PLLD = "none"; then + AC_ERROR("Cannot find SWI-Prolog plld utility. SWI-Prolog must be installed first") +fi +if test $PL = "none"; then + AC_ERROR("Cannot find SWI-Prolog. SWI-Prolog must be installed first") +else + AC_CHECKING("Running $PL -dump-runtime-variables") + eval `$PL -dump-runtime-variables` +fi +PLINCL="$PLBASE/include" +AC_MSG_RESULT(" PLBASE=$PLBASE") +AC_MSG_RESULT(" PLARCH=$PLARCH") +AC_MSG_RESULT(" PLLIBS=$PLLIBS") +AC_MSG_RESULT(" PLLDFLAGS=$PLLDFLAGS") +AC_MSG_RESULT(" PLSHARED=$PLSHARED") +if test "$PLTHREADS" = yes; then MT=yes; fi +else +PLLD=../plld.sh +PL=../pl.sh +fi + +if test "$MT" = yes; then + AC_DEFINE([_REENTRANT], 1, + [Define for multi-threaded version]) +fi + +CC=$PLLD +LD=$PLLD +LDSOFLAGS=-shared +CMFLAGS=-fpic +SO="$PLSOEXT" + +if test ! -z "$GCC"; then + COFLAGS="${COFLAGS--O2 -fno-strict-aliasing}" + CWFLAGS="${CWFLAGS--Wall}" +else + COFLAGS="${COFLAGS--O}" +fi + +case "$PLARCH" in + *irix*) if test -z "$GCC"; then + CWFLAGS="$CWFLAGS -woff 1164" + fi + ;; + *darwin*) CMFLAGS="$CMFLAGS -cc-options,-no-cpp-precomp" + ;; + *) ;; +esac + +AC_CHECK_PROGS(MAKE, gmake make, "make") +AC_MSG_CHECKING("whether make is GNU-make") +if ($MAKE -v 2>&1) | grep GNU > /dev/null; then + AC_MSG_RESULT(yes) + gmake=yes +else + VPATH="VPATH = " + gmake=no +fi +AC_CHECK_PROGS(ETAGS, etags ctags, ":") +AC_CHECK_PROGS(RUNTEX, runtex, ":") +AC_PROG_INSTALL +AC_PROG_CPP +AC_ISC_POSIX +AC_HEADER_STDC +CFLAGS="$CMFLAGS" +AC_C_BIGENDIAN +AC_C_INLINE +AC_CHECK_SIZEOF(long, 4) + +AC_CHECK_HEADERS(malloc.h unistd.h sys/time.h fcntl.h floatingpoint.h) +AC_CHECK_FUNCS(snprintf strerror strtoll) + +AC_OUTPUT(Makefile) diff --git a/packages/sgml/dtd.h b/packages/sgml/dtd.h new file mode 100644 index 000000000..3495543cf --- /dev/null +++ b/packages/sgml/dtd.h @@ -0,0 +1,481 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef DTD_H_INCLUDED +#define DTD_H_INCLUDED +#include "sgmldefs.h" + +#define CH_WHITE 0x0001 +#define CH_LCLETTER 0x0002 +#define CH_UCLETTER 0x0004 +#define CH_CNMSTRT 0x0008 /* may start a name */ +#define CH_CNM 0x0010 /* may be in a name */ +#define CH_DIGIT 0x0020 +#define CH_RE 0x0040 +#define CH_RS 0x0080 + +#define CH_LETTER (CH_LCLETTER|CH_UCLETTER) +#define CH_NMSTART (CH_LCLETTER|CH_UCLETTER|CH_CNMSTRT) +#define CH_NAME (CH_NMSTART|CH_DIGIT|CH_CNM) +#define CH_BLANK (CH_WHITE|CH_RE|CH_RS) + +#define CHR_BLANK 0x1 /* SHORTREF 'B' */ +#define CHR_DBLANK 0x2 /* SHORTREF 'BB' */ + +#define SGML_DTD_MAGIC 0x7364573 + +typedef enum +{ CF_STAGO = 0, /* < */ + CF_STAGC, /* > */ + CF_ETAGO1, /* < */ + CF_ETAGO2, /* / */ + CF_VI, /* = */ + CF_NS, /* : (XMLNS) */ + CF_LIT, /* " */ + CF_LITA, /* ' */ + CF_PERO, /* % */ + CF_ERO, /* & */ + CF_ERC, /* ; */ + CF_MDO1, /* < */ + CF_MDO2, /* ! (MDO= */ + CF_PRO1, /* < */ + CF_PRO2, /* ? (PRO= */ + CF_GRPO, /* ( */ + CF_GRPC, /* ) */ + CF_SEQ, /* , */ + CF_AND, /* & */ + CF_OR, /* | */ + CF_OPT, /* ? */ + CF_PLUS, /* + */ + CF_DSO, /* [ */ + CF_DSC, /* ] */ + CF_REP, /* * */ + CF_RS, /* \n */ + CF_RE, /* \r */ + CF_CMT, /* - */ + CF_NG, /* , or & or | */ + CF_ENDTABLE /* to find size */ +} charfunc; /* function of characters */ + +typedef enum +{ SGML_ENC_ISO_LATIN1 = 0, /* ISO Latin-1 */ + SGML_ENC_UTF8 /* Multi-byte UTF-8 encoding */ +} dtd_char_encoding; + +typedef enum +{ C_CDATA, /* pure cdata */ + C_PCDATA, /* parsed character data */ + C_RCDATA, /* pure cdata + entities */ + C_EMPTY, /* empy element */ + C_ANY /* element may contain anything */ +} contenttype; + +typedef enum +{ MC_ONE, /* one time */ + MC_OPT, /* optional element (?) */ + MC_REP, /* any times (*) */ + MC_PLUS /* one-or-more (+) */ +} modelcard; + +typedef enum +{ MT_UNDEF = 0, /* undefined */ + MT_PCDATA, /* Contains PCDATA */ + MT_ELEMENT, /* refers to element */ + MT_SEQ, /* Sequence (,) */ + MT_AND, /* Ony order (&) */ + MT_OR /* Disjunction (|) */ +} modeltype; + +typedef enum +{ AT_CDATA, /* CDATA attribute */ + AT_ENTITY, /* entity-name */ + AT_ENTITIES, /* entity-name list */ + AT_ID, /* identifier */ + AT_IDREF, /* identifier reference */ + AT_IDREFS, /* list of identifier references */ + AT_NAME, /* name token */ + AT_NAMES, /* list of names */ + AT_NAMEOF, /* one of these names */ + AT_NMTOKEN, /* name-token */ + AT_NMTOKENS, /* name-token list */ + AT_NOTATION, /* notation-name */ + AT_NUMBER, /* number */ + AT_NUMBERS, /* number list */ + AT_NUTOKEN, /* number token */ + AT_NUTOKENS /* number token list */ +} attrtype; + +typedef enum +{ AT_FIXED, /* fixed value */ + AT_REQUIRED, /* Required attribute */ + AT_CURRENT, /* most recent value */ + AT_CONREF, /* cross-reference */ + AT_IMPLIED, /* Implied attribute */ + AT_DEFAULT /* has default */ +} attrdef; + + +typedef enum +{ ET_SYSTEM, /* System (file) entity */ + ET_PUBLIC, /* Public (external) entity */ + ET_LITERAL /* Literal text */ +} entity_type; + + +typedef enum +{ EC_SGML, /* SGML data */ + EC_STARTTAG, /* SGML start-tag */ + EC_ENDTAG, /* SGML end-tag */ + EC_CDATA, /* CDATA entity */ + EC_SDATA, /* SDATA entity */ + EC_NDATA, /* non-sgml data */ + EC_PI /* Programming instruction */ +} data_type; + + +typedef enum +{ DL_SGML, /* Use SGML */ + DL_XML, /* Use XML */ + DL_XMLNS /* Use XML + Namespaces */ +} dtd_dialect; + + +typedef enum +{ OPT_SHORTTAG /* do/don't accept shorttag */ +} dtd_option; + + +typedef enum +{ SP_PRESERVE = 0, /* Preserve all white-space */ + SP_DEFAULT, /* Default space handling */ + SP_REMOVE, /* Remove all blank CDATA elements */ + SP_SGML, /* Compliant SGML mode */ + SP_INHERIT /* DTD: inherit from environment */ +} dtd_space_mode; + + +typedef enum +{ NU_TOKEN, /* Treat numbers as tokens */ + NU_INTEGER /* Convert to integer */ +} dtd_number_mode; + + + /******************************* + * ERRORS * + *******************************/ + +#ifdef DTD_IMPLEMENTATION +#define DTD_MINOR_ERRORS 1 +#endif + +typedef enum +{ ERS_WARNING, /* probably correct result */ + ERS_ERROR, /* probably incrorrect result */ + ERS_STYLE /* dubious/bad style; correct result */ +} dtd_error_severity; + + +typedef enum +{ ERC_REPRESENTATION, /* Internal limit */ + /* id */ + ERC_RESOURCE, /* external limit */ + /* id */ + ERC_LIMIT, /* Exceeded SGML limit */ + /* id */ + ERC_VALIDATE, /* DTD Validation */ + /* Message */ + ERC_SYNTAX_ERROR, /* Syntax error */ + /* Message, found */ + ERC_EXISTENCE, /* Existence error */ + /* Type, name */ + ERC_REDEFINED /* Redefined object */ + /* Type, name */ +#ifdef DTD_MINOR_ERRORS + , /* reopen list */ + ERC_SYNTAX_WARNING, /* Syntax warning (i.e. fixed) */ + /* Message, found */ + ERC_DOMAIN, /* Relative to declared type */ + /* Type, found */ + ERC_OMITTED_CLOSE, + /* Element */ + ERC_OMITTED_OPEN, + /* Element */ + ERC_NOT_OPEN, + /* Element */ + ERC_NOT_ALLOWED, + /* Element */ + ERC_NOT_ALLOWED_PCDATA, + /* Text */ + ERC_NO_ATTRIBUTE, + /* Element, Attribute */ + ERC_NO_ATTRIBUTE_VALUE, + /* Element, Value */ + ERC_NO_VALUE, + /* Entity */ + ERC_NO_DOCTYPE, + /* Implicit, file */ + ERC_NO_CATALOGUE + /* file */ +#endif +} dtd_error_id; + + +typedef enum +{ IN_NONE, /* unspecified input */ + IN_FILE, /* input from file */ + IN_ENTITY /* input from entity */ +} input_type; + + +typedef struct _dtd_srcloc +{ input_type type; /* type of input */ + union + { const ichar *file; /* name of the file */ + const ichar *entity; /* name of entity */ + } name; + int line; /* 1-based Line no */ + int linepos; /* 1-based char */ + long charpos; /* 0-based file char */ + struct _dtd_srcloc *parent; /* parent location */ +} dtd_srcloc; + + +typedef struct _dtd_error +{ dtd_error_id id; /* ERC_* identifier */ + dtd_error_id minor; /* Minor code */ + dtd_error_severity severity; /* ERS_* severity */ + dtd_srcloc *location; /* location of the error */ + wchar_t *plain_message; /* Clean message */ + wchar_t *message; /* complete message */ + /* (Warning: file:line: ) */ + wchar_t *argv[2]; /* context arguments */ +} dtd_error; + + + /******************************* + * DTD TYPES * + *******************************/ + +typedef struct _dtd_symbol +{ const ichar *name; /* name of the atom */ + struct _dtd_symbol *next; /* next in atom list */ + struct _dtd_element *element; /* connected element (if any) */ + struct _dtd_entity *entity; /* connected entity (if any) */ +} dtd_symbol; + + +typedef struct _dtd_symbol_table +{ int size; /* Allocated size */ + dtd_symbol **entries; /* Entries */ +} dtd_symbol_table; + + +typedef struct _dtd_entity +{ dtd_symbol *name; /* its name */ + entity_type type; /* ET_* */ + data_type content; /* EC_* */ + int catalog_location; /* what catalog to use for lookup */ + int length; /* size of literal value */ + ichar *value; /* literal value */ + ichar *extid; /* external identifier */ + ichar *exturl; /* url to fetch from */ + ichar *baseurl; /* base url for exturl */ + struct _dtd_entity *next; /* list-link */ +} dtd_entity; + + +typedef struct _dtd_notation +{ dtd_symbol *name; /* name of the notation */ + entity_type type; /* ET_{PUBLIC|SYSTEM} */ + ichar *public; /* public id */ + ichar *system; /* file with info */ + struct _dtd_notation *next; /* list-link */ +} dtd_notation; + + +typedef struct _dtd_element_list +{ struct _dtd_element *value; /* element */ + struct _dtd_element_list *next; /* next in list */ +} dtd_element_list; + + +typedef struct _dtd_name_list +{ dtd_symbol *value; + struct _dtd_name_list *next; +} dtd_name_list; + + +typedef struct _dtd_attr +{ dtd_symbol *name; /* name of attribute */ + attrtype type; /* type (AT_*) */ + attrdef def; /* AT_REQUIRED/AT_IMPLIED */ + int islist; /* attribute is a list */ + union + { dtd_name_list *nameof; /* (name1|name2|...) */ + } typeex; + union + { ichar *cdata; /* default for CDATA */ + ichar *list; /* text for list-data */ + dtd_symbol *name; /* AT_NAME or AT_NAMEOF */ + long number; /* AT_NUMBER */ + } att_def; + int references; /* reference count */ +} dtd_attr; + + +typedef struct _dtd_attr_list +{ dtd_attr *attribute; + struct _dtd_attr_list *next; +} dtd_attr_list; + + +typedef struct _dtd_model +{ modeltype type; /* MT_* */ + modelcard cardinality; /* MC_* */ + + union + { struct _dtd_model *group; /* ,/|/& group */ + struct _dtd_element *element; /* element */ + } content; + struct _dtd_model *next; /* next in list (for groups) */ +} dtd_model; + + +typedef struct _dtd_edef +{ contenttype type; /* EMPTY, MIXED, ... */ + int omit_open; /* allow omitted open tag? */ + int omit_close; /* allow omitted close tag? */ + dtd_model *content; /* the content model */ + dtd_element_list *included; /* +(namegroup) */ + dtd_element_list *excluded; /* -(namegroup) */ + struct _dtd_state *initial_state; /* Initial state in state engine */ + struct _dtd_state *final_state; /* Final state in state engine */ + int references; /* #elements using this def */ +} dtd_edef; + + +typedef struct _dtd_map +{ ichar *from; /* mapped text */ + int len; /* length of mapped text */ + dtd_symbol *to; /* name of symbol mapped onto */ + struct _dtd_map *next; /* next in shortref map */ +} dtd_map; + + +typedef struct _dtd_shortref +{ dtd_symbol *name; /* name of SHORTREF map */ + dtd_map *map; /* implemented map */ + char ends[SHORTMAP_SIZE]; /* ending-characters in map */ + int defined; /* has been defined */ + struct _dtd_shortref *next; /* next declared shortref */ +} dtd_shortref; + + +typedef struct _dtd_element +{ dtd_symbol *name; /* its name */ + dtd_edef *structure; /* content structure of the element */ + dtd_attr_list *attributes; /* defined attributes */ + dtd_space_mode space_mode; /* How to handle white-space (SP_*) */ + dtd_shortref *map; /* SHORTREF map */ + int undefined; /* Only implicitely defined */ + struct _dtd_element *next; /* in DTD'e element list */ +} dtd_element; + + +typedef struct _dtd_charclass +{ unsigned char class[INPUT_CHARSET_SIZE]; /* ichar --> class-mask */ +} dtd_charclass; + + +typedef struct _dtd_charfunc +{ ichar func[(int)CF_ENDTABLE]; /* CF_ --> ichar */ +} dtd_charfunc; + + +typedef struct _dtd +{ int magic; /* SGML_DTD_MAGIC */ + int implicit; /* There is no DTD */ + dtd_dialect dialect; /* DL_* */ + int case_sensitive; /* Tags are case-sensitive */ + int ent_case_sensitive; /* Entities are case-sensitive */ + ichar *doctype; /* defined document type */ + dtd_symbol_table *symbols; /* symbol-table */ + dtd_entity *pentities; /* defined parameter entities */ + dtd_entity *entities; /* defined entities */ + dtd_entity *default_entity; /* default-entity (if any) */ + dtd_notation *notations; /* Declared notations */ + dtd_shortref *shortrefs; /* SHORTREF declarations */ + dtd_element *elements; /* defined elements */ + dtd_charfunc *charfunc; /* CF_ --> ichar */ + dtd_charclass *charclass; /* ichar -> CH_-mask */ + dtd_char_encoding encoding; /* document encoding */ + dtd_space_mode space_mode; /* Default for handling white-space */ + dtd_number_mode number_mode; /* How to treat number attributes */ + int shorttag; /* support SHORTTAG */ + int references; /* destruction reference count */ +} dtd; + +extern dtd_charfunc *new_charfunc(void); /* default classification */ +extern dtd_charclass *new_charclass(void); /* default classification */ + +extern dtd_symbol* dtd_find_symbol(dtd *dtd, const ichar *name); +extern dtd_symbol* dtd_add_symbol(dtd *dtd, const ichar *name); + + + /******************************* + * PUBLIC * + *******************************/ + +#include "parser.h" + +dtd * file_to_dtd(const ichar *file, const ichar *doctype, + dtd_dialect dialect); +int sgml_process_file(dtd_parser *p, + const ichar *file, unsigned flags); +int sgml_process_stream(dtd_parser *p, FILE *in, + unsigned flags); +dtd_parser * new_dtd_parser(dtd *dtd); +void free_dtd_parser(dtd_parser *p); + +void free_dtd(dtd *dtd); +int load_dtd_from_file(dtd_parser *p, const ichar *file); +dtd * new_dtd(const ichar *doctype); +int set_dialect_dtd(dtd *dtd, dtd_dialect dialect); +int set_option_dtd(dtd *dtd, dtd_option option, int set); + +void putchar_dtd_parser(dtd_parser *p, int chr); +int begin_document_dtd_parser(dtd_parser *p); +int end_document_dtd_parser(dtd_parser *p); +void reset_document_dtd_parser(dtd_parser *p); +void set_file_dtd_parser(dtd_parser *p, + input_type in, const ichar *file); +void set_mode_dtd_parser(dtd_parser *p, data_mode mode); +void sgml_cplocation(dtd_srcloc *dst, dtd_srcloc *src); +int xml_set_encoding(dtd_parser *p, const char *enc); + +#endif /*DTD_H_INCLUDED*/ + + diff --git a/packages/sgml/dtd2pl.1 b/packages/sgml/dtd2pl.1 new file mode 100644 index 000000000..9e18f966f --- /dev/null +++ b/packages/sgml/dtd2pl.1 @@ -0,0 +1,51 @@ +.TH SWI 1 "March 8, 2000" +.SH NAME +dtd2pl \- Convert SGML DTD files to Prolog +.SH SYNOPSIS +.BR dtd2pl +.I "dtd-file" +.br +.SH DESCRIPTION +The Program +.BI dtd2pl +Provides a crude translation of an SGML DTD files into a Prolog database. +Curently a DTD is translated into a Prolog +.B module +named +.IR "_dtd" "." +This module does no export any predicates. The DTD is represented using +the following predicates. + +.SH Predicates +.TP +.BI "element(" "Name," " omit(" "Open, Close" ")," " Content" ")" +.TP +.BI "include(" "Element, Included" ")" +.TP +.BI "exclude(" "Element, Excluded" ")" +.TP +.BI "attribute(" "Element, Name, Type, Default" ")" + +.SH ENVIRONMENT + +.TP +.BI "SGML_CATALOG_FILES " "file" ":" "file ..." +Catalog files are used to resolve +.B PUBLIC +identifiers of external +.B entities. + +.SH BUGS +Lacks many options to tailor the output to a specific needs. + +.SH AUTHOR +Jan Wielemaker, SWI, University of Amsterdam + +.SH "SEE ALSO" +.I pl(1) plld(1) +.I SWI-Prolog 3.1 Reference Manual, +University of Amsterdam, Dept. of Social Science and Informatics (SWI). +.SH COPYRIGHT +Copyright (C) 1991-1998, Jan Wielemaker +.SH AUTHOR +Jan Wielemaker diff --git a/packages/sgml/dtd2pl.c b/packages/sgml/dtd2pl.c new file mode 100644 index 000000000..8750c3edf --- /dev/null +++ b/packages/sgml/dtd2pl.c @@ -0,0 +1,91 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include +#include "dtd.h" +#include "prolog.h" + +#define streq(s,q) strcmp((s), (q)) == 0 + +char *program; + +static void +usage() +{ fprintf(stderr, "Usage: %s [-xml|sgml] file.dtd\n", program); +} + +int +main(int argc, char **argv) +{ dtd_dialect dialect = DL_SGML; + + program = argv[0]; + argv++; + argc--; + + while(argc > 0 && argv[0][0] == '-') + { if ( streq(argv[0], "-xml") ) + { dialect = DL_XML; + argc--; + argv++; + } else if ( streq(argv[0], "-sgml") ) + { dialect = DL_SGML; + argc--; + argv++; + } else + { usage(); + exit(1); + } + } + + if ( argc == 1 ) + { int wl = mbstowcs(NULL, argv[0], 0); + + if ( wl > 0 ) + { wchar_t *ws = malloc((wl+1)*sizeof(wchar_t)); + dtd *dtd; + + mbstowcs(ws, argv[0], wl+1); + dtd = file_to_dtd(ws, L"test", dialect); + + if ( dtd ) + { prolog_print_dtd(dtd, PL_PRINT_ALL & ~PL_PRINT_PENTITIES); + return 0; + } + } else + { perror("mbstowcs"); + exit(1); + } + } + + usage(); + return 1; +} + + + + + diff --git a/packages/sgml/error.c b/packages/sgml/error.c new file mode 100644 index 000000000..799c999cb --- /dev/null +++ b/packages/sgml/error.c @@ -0,0 +1,178 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "error.h" +#include +#include +#include + +int +sgml2pl_error(plerrorid id, ...) +{ term_t except = PL_new_term_ref(); + term_t formal = PL_new_term_ref(); + term_t swi = PL_new_term_ref(); + va_list args; + char msgbuf[1024]; + char *msg = NULL; + + va_start(args, id); + switch(id) + { case ERR_ERRNO: + { int err = va_arg(args, int); + + msg = strerror(err); + + switch(err) + { case ENOMEM: + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "resource_error", 1, + PL_CHARS, "no_memory"); + break; + case EACCES: + { const char *file = va_arg(args, const char *); + const char *action = va_arg(args, const char *); + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "permission_error", 3, + PL_CHARS, action, + PL_CHARS, "file", + PL_CHARS, file); + break; + } + case ENOENT: + { const char *file = va_arg(args, const char *); + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "existence_error", 2, + PL_CHARS, "file", + PL_CHARS, file); + break; + } + default: + PL_unify_atom_chars(formal, "system_error"); + break; + } + break; + } + case ERR_TYPE: + { const char *expected = va_arg(args, const char*); + term_t actual = va_arg(args, term_t); + + if ( PL_is_variable(actual) && + strcmp(expected, "variable") != 0 ) + PL_unify_atom_chars(formal, "instantiation_error"); + else + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "type_error", 2, + PL_CHARS, expected, + PL_TERM, actual); + break; + } + case ERR_DOMAIN: + { const char *expected = va_arg(args, const char*); + term_t actual = va_arg(args, term_t); + + if ( PL_is_variable(actual) ) + PL_unify_atom_chars(formal, "instantiation_error"); + else + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "domain_error", 2, + PL_CHARS, expected, + PL_TERM, actual); + break; + } + case ERR_EXISTENCE: + { const char *type = va_arg(args, const char *); + term_t obj = va_arg(args, term_t); + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "existence_error", 2, + PL_CHARS, type, + PL_TERM, obj); + + break; + } + case ERR_FAIL: + { term_t goal = va_arg(args, term_t); + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "goal_failed", 1, + PL_TERM, goal); + + break; + } + case ERR_LIMIT: + { const char *limit = va_arg(args, const char *); + long maxval = va_arg(args, long); + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "limit_exceeded", 2, + PL_CHARS, limit, + PL_LONG, maxval); + + break; + } + case ERR_MISC: + { const char *id = va_arg(args, const char *); + const char *fmt = va_arg(args, const char *); + + vsprintf(msgbuf, fmt, args); + msg = msgbuf; + + PL_unify_term(formal, + PL_FUNCTOR_CHARS, "miscellaneous", 1, + PL_CHARS, id); + break; + } + default: + assert(0); + } + va_end(args); + + if ( msg ) + { term_t predterm = PL_new_term_ref(); + term_t msgterm = PL_new_term_ref(); + + if ( msg ) + { PL_put_atom_chars(msgterm, msg); + } + + PL_unify_term(swi, + PL_FUNCTOR_CHARS, "context", 2, + PL_TERM, predterm, + PL_TERM, msgterm); + } + + PL_unify_term(except, + PL_FUNCTOR_CHARS, "error", 2, + PL_TERM, formal, + PL_TERM, swi); + + + return PL_raise_exception(except); +} + diff --git a/packages/sgml/error.h b/packages/sgml/error.h new file mode 100644 index 000000000..e3c2d23f9 --- /dev/null +++ b/packages/sgml/error.h @@ -0,0 +1,46 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef H_ERROR_INCLUDED +#define H_ERROR_INCLUDED +#include + +typedef enum +{ ERR_ERRNO, /* , int */ + /* ENOMEM */ + /* EACCES --> file, action */ + /* ENOENT --> file */ + ERR_TYPE, /* char *expected, term_t actual */ + ERR_DOMAIN, /* char *expected, term_t actual */ + ERR_EXISTENCE, /* char *expected, term_t actual */ + + ERR_FAIL, /* term_t goal */ + + ERR_LIMIT, /* char *limit, long max */ + ERR_MISC /* char *fmt, ... */ +} plerrorid; + +int sgml2pl_error(plerrorid, ...); + +#endif /*H_ERROR_INCLUDED*/ diff --git a/packages/sgml/install-sh b/packages/sgml/install-sh new file mode 100755 index 000000000..ab74c882e --- /dev/null +++ b/packages/sgml/install-sh @@ -0,0 +1,238 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. +# + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +tranformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/packages/sgml/iso_639.pl b/packages/sgml/iso_639.pl new file mode 100644 index 000000000..09c91a9d1 --- /dev/null +++ b/packages/sgml/iso_639.pl @@ -0,0 +1,628 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2004, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(iso_639, + [ iso_639_2/2, % Code, Language + iso_639_3/2, % Code, Language + iso_639/2 % Code, Language + ]). + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +ISO 639 language codes. This material is based on + + http://www.wwp.brown.edu/encoding/training/ISO/iso639.html + +It would be nice to know a bit more about these languages, such as the +applicable character sets. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +iso_639_2(Code, Lang) :- + l2(Code, Lang). +iso_639_3(Code, Lang) :- + l3(Code, Lang). +iso_639(Code, Lang) :- + ( l2(Code, Lang) + ; l3(Code, Lang) + ). + + +% l3(?Code, ?Lang) +% +% ISO-639 3-letter codes + +l3(abk, 'Abkhazian'). +l3(ace, 'Achinese'). +l3(ach, 'Acoli'). +l3(ada, 'Adangme'). +l3(aar, 'Afar'). +l3(afh, 'Afrihili'). +l3(afr, 'Afrikaans'). +l3(afa, 'Afro-Asiatic (Other)'). +l3(aka, 'Akan'). +l3(akk, 'Akkadian'). +l3(alb, 'Albanian'). +l3(sqi, 'Albanian'). +l3(ale, 'Aleut'). +l3(alg, 'Algonquian languages'). +l3(tut, 'Altaic (Other)'). +l3(amh, 'Amharic'). +l3(apa, 'Apache languages'). +l3(ara, 'Arabic'). +l3(arc, 'Aramaic'). +l3(arp, 'Arapaho'). +l3(arn, 'Araucanian'). +l3(arw, 'Arawak'). +l3(arm, 'Armenian'). +l3(hye, 'Armenian'). +l3(art, 'Artificial (Other)'). +l3(asm, 'Assamese'). +l3(ath, 'Athapascan languages'). +l3(map, 'Austronesian (Other)'). +l3(ava, 'Avaric'). +l3(ave, 'Avestan'). +l3(awa, 'Awadhi'). +l3(aym, 'Aymara'). +l3(aze, 'Azerbaijani'). +l3(nah, 'Aztec'). +l3(ban, 'Balinese'). +l3(bat, 'Baltic (Other)'). +l3(bal, 'Baluchi'). +l3(bam, 'Bambara'). +l3(bai, 'Bamileke languages'). +l3(bad, 'Banda'). +l3(bnt, 'Bantu (Other)'). +l3(bas, 'Basa'). +l3(bak, 'Bashkir'). +l3(baq, 'Basque'). +l3(eus, 'Basque'). +l3(bej, 'Beja'). +l3(bem, 'Bemba'). +l3(ben, 'Bengali'). +l3(ber, 'Berber (Other)'). +l3(bho, 'Bhojpuri'). +l3(bih, 'Bihari'). +l3(bik, 'Bikol'). +l3(bin, 'Bini'). +l3(bis, 'Bislama'). +l3(bra, 'Braj'). +l3(bre, 'Breton'). +l3(bug, 'Buginese'). +l3(bul, 'Bulgarian'). +l3(bua, 'Buriat'). +l3(bur, 'Burmese'). +l3(mya, 'Burmese'). +l3(bel, 'Byelorussian'). +l3(cad, 'Caddo'). +l3(car, 'Carib'). +l3(cat, 'Catalan'). +l3(cau, 'Caucasian (Other)'). +l3(ceb, 'Cebuano'). +l3(cel, 'Celtic (Other)'). +l3(cai, 'Central American Indian (Other)'). +l3(chg, 'Chagatai'). +l3(cha, 'Chamorro'). +l3(che, 'Chechen'). +l3(chr, 'Cherokee'). +l3(chy, 'Cheyenne'). +l3(chb, 'Chibcha'). +l3(chi, 'Chinese'). +l3(zho, 'Chinese'). +l3(chn, 'Chinook jargon'). +l3(cho, 'Choctaw'). +l3(chu, 'Church Slavic'). +l3(chv, 'Chuvash'). +l3(cop, 'Coptic'). +l3(cor, 'Cornish'). +l3(cos, 'Corsican'). +l3(cre, 'Cree'). +l3(mus, 'Creek'). +l3(crp, 'Creoles and Pidgins (Other)'). +l3(cpe, 'Creoles and Pidgins, English-based (Other)'). +l3(cpf, 'Creoles and Pidgins, French-based (Other)'). +l3(cpp, 'Creoles and Pidgins, Portuguese-based (Other)'). +l3(cus, 'Cushitic (Other)'). +l3(ces, 'Czech'). +l3(cze, 'Czech'). +l3(dak, 'Dakota'). +l3(dan, 'Danish'). +l3(del, 'Delaware'). +l3(din, 'Dinka'). +l3(div, 'Divehi'). +l3(doi, 'Dogri'). +l3(dra, 'Dravidian (Other)'). +l3(dua, 'Duala'). +l3(dut, 'Dutch'). +l3(nla, 'Dutch'). +l3(dum, 'Dutch, Middle (ca. 1050-1350)'). +l3(dyu, 'Dyula'). +l3(dzo, 'Dzongkha'). +l3(efi, 'Efik'). +l3(egy, 'Egyptian (Ancient)'). +l3(eka, 'Ekajuk'). +l3(elx, 'Elamite'). +l3(eng, 'English'). +l3(enm, 'English, Middle (ca. 1100-1500)'). +l3(ang, 'English, Old (ca. 450-1100)'). +l3(esk, 'Eskimo (Other)'). +l3(epo, 'Esperanto'). +l3(est, 'Estonian'). +l3(ewe, 'Ewe'). +l3(ewo, 'Ewondo'). +l3(fan, 'Fang'). +l3(fat, 'Fanti'). +l3(fao, 'Faroese'). +l3(fij, 'Fijian'). +l3(fin, 'Finnish'). +l3(fiu, 'Finno-Ugrian (Other)'). +l3(fon, 'Fon'). +l3(fra, 'French'). +l3(fre, 'French'). +l3(frm, 'French, Middle (ca. 1400-1600)'). +l3(fro, 'French, Old (842- ca. 1400)'). +l3(fry, 'Frisian'). +l3(ful, 'Fulah'). +l3(gaa, 'Ga'). +l3(gae, 'Gaelic (Scots)'). +l3(gdh, 'Gaelic (Scots)'). +l3(glg, 'Gallegan'). +l3(lug, 'Ganda'). +l3(gay, 'Gayo'). +l3(gez, 'Geez'). +l3(geo, 'Georgian'). +l3(kat, 'Georgian'). +l3(deu, 'German'). +l3(ger, 'German'). +l3(gmh, 'German, Middle High (ca. 1050-1500)'). +l3(goh, 'German, Old High (ca. 750-1050)'). +l3(gem, 'Germanic (Other)'). +l3(gil, 'Gilbertese'). +l3(gon, 'Gondi'). +l3(got, 'Gothic'). +l3(grb, 'Grebo'). +l3(grc, 'Greek, Ancient (to 1453)'). +l3(ell, 'Greek, Modern (1453-)'). +l3(gre, 'Greek, Modern (1453-)'). +l3(kal, 'Greenlandic'). +l3(grn, 'Guarani'). +l3(guj, 'Gujarati'). +l3(hai, 'Haida'). +l3(hau, 'Hausa'). +l3(haw, 'Hawaiian'). +l3(heb, 'Hebrew'). +l3(her, 'Herero'). +l3(hil, 'Hiligaynon'). +l3(him, 'Himachali'). +l3(hin, 'Hindi'). +l3(hmo, 'Hiri Motu'). +l3(hun, 'Hungarian'). +l3(hup, 'Hupa'). +l3(iba, 'Iban'). +l3(ice, 'Icelandic'). +l3(isl, 'Icelandic'). +l3(ibo, 'Igbo'). +l3(ijo, 'Ijo'). +l3(ilo, 'Iloko'). +l3(inc, 'Indic (Other)'). +l3(ine, 'Indo-European (Other)'). +l3(ind, 'Indonesian'). +l3(ina, 'Interlingua (International Auxiliary language Association)'). +l3(ine, 'Interlingue'). +l3(iku, 'Inuktitut'). +l3(ipk, 'Inupiak'). +l3(ira, 'Iranian (Other)'). +l3(gai, 'Irish'). +l3(iri, 'Irish'). +l3(sga, 'Irish, Old (to 900)'). +l3(mga, 'Irish, Middle (900 - 1200)'). +l3(iro, 'Iroquoian languages'). +l3(ita, 'Italian'). +l3(jpn, 'Japanese'). +l3(jav, 'Javanese'). +l3(jaw, 'Javanese'). +l3(jrb, 'Judeo-Arabic'). +l3(jpr, 'Judeo-Persian'). +l3(kab, 'Kabyle'). +l3(kac, 'Kachin'). +l3(kam, 'Kamba'). +l3(kan, 'Kannada'). +l3(kau, 'Kanuri'). +l3(kaa, 'Kara-Kalpak'). +l3(kar, 'Karen'). +l3(kas, 'Kashmiri'). +l3(kaw, 'Kawi'). +l3(kaz, 'Kazakh'). +l3(kha, 'Khasi'). +l3(khm, 'Khmer'). +l3(khi, 'Khoisan (Other)'). +l3(kho, 'Khotanese'). +l3(kik, 'Kikuyu'). +l3(kin, 'Kinyarwanda'). +l3(kir, 'Kirghiz'). +l3(kom, 'Komi'). +l3(kon, 'Kongo'). +l3(kok, 'Konkani'). +l3(kor, 'Korean'). +l3(kpe, 'Kpelle'). +l3(kro, 'Kru'). +l3(kua, 'Kuanyama'). +l3(kum, 'Kumyk'). +l3(kur, 'Kurdish'). +l3(kru, 'Kurukh'). +l3(kus, 'Kusaie'). +l3(kut, 'Kutenai'). +l3(lad, 'Ladino'). +l3(lah, 'Lahnda'). +l3(lam, 'Lamba'). +l3(oci, 'Langue d\'Oc (post 1500)'). +l3(lao, 'Lao'). +l3(lat, 'Latin'). +l3(lav, 'Latvian'). +l3(ltz, 'Letzeburgesch'). +l3(lez, 'Lezghian'). +l3(lin, 'Lingala'). +l3(lit, 'Lithuanian'). +l3(loz, 'Lozi'). +l3(lub, 'Luba-Katanga'). +l3(lui, 'Luiseno'). +l3(lun, 'Lunda'). +l3(luo, 'Luo (Kenya and Tanzania)'). +l3(mac, 'Macedonian'). +l3(mak, 'Macedonian'). +l3(mad, 'Madurese'). +l3(mag, 'Magahi'). +l3(mai, 'Maithili'). +l3(mak, 'Makasar'). +l3(mlg, 'Malagasy'). +l3(may, 'Malay'). +l3(msa, 'Malay'). +l3(mal, 'Malayalam'). +l3(mlt, 'Maltese'). +l3(man, 'Mandingo'). +l3(mni, 'Manipuri'). +l3(mno, 'Manobo languages'). +l3(max, 'Manx'). +l3(mao, 'Maori'). +l3(mri, 'Maori'). +l3(mar, 'Marathi'). +l3(chm, 'Mari'). +l3(mah, 'Marshall'). +l3(mwr, 'Marwari'). +l3(mas, 'Masai'). +l3(myn, 'Mayan languages'). +l3(men, 'Mende'). +l3(mic, 'Micmac'). +l3(min, 'Minangkabau'). +l3(mis, 'Miscellaneous (Other)'). +l3(moh, 'Mohawk'). +l3(mol, 'Moldavian'). +l3(mkh, 'Mon-Kmer (Other)'). +l3(lol, 'Mongo'). +l3(mon, 'Mongolian'). +l3(mos, 'Mossi'). +l3(mul, 'Multiple languages'). +l3(mun, 'Munda languages'). +l3(nau, 'Nauru'). +l3(nav, 'Navajo'). +l3(nde, 'Ndebele, North'). +l3(nbl, 'Ndebele, South'). +l3(ndo, 'Ndongo'). +l3(nep, 'Nepali'). +l3(new, 'Newari'). +l3(nic, 'Niger-Kordofanian (Other)'). +l3(ssa, 'Nilo-Saharan (Other)'). +l3(niu, 'Niuean'). +l3(non, 'Norse, Old'). +l3(nai, 'North American Indian (Other)'). +l3(nor, 'Norwegian'). +l3(nno, 'Norwegian (Nynorsk)'). +l3(nub, 'Nubian languages'). +l3(nym, 'Nyamwezi'). +l3(nya, 'Nyanja'). +l3(nyn, 'Nyankole'). +l3(nyo, 'Nyoro'). +l3(nzi, 'Nzima'). +l3(oji, 'Ojibwa'). +l3(ori, 'Oriya'). +l3(orm, 'Oromo'). +l3(osa, 'Osage'). +l3(oss, 'Ossetic'). +l3(oto, 'Otomian languages'). +l3(pal, 'Pahlavi'). +l3(pau, 'Palauan'). +l3(pli, 'Pali'). +l3(pam, 'Pampanga'). +l3(pag, 'Pangasinan'). +l3(pan, 'Panjabi'). +l3(pap, 'Papiamento'). +l3(paa, 'Papuan-Australian (Other)'). +l3(fas, 'Persian'). +l3(per, 'Persian'). +l3(peo, 'Persian, Old (ca 600 - 400 B.C.)'). +l3(phn, 'Phoenician'). +l3(pol, 'Polish'). +l3(pon, 'Ponape'). +l3(por, 'Portuguese'). +l3(pra, 'Prakrit languages'). +l3(pro, 'Provencal, Old (to 1500)'). +l3(pus, 'Pushto'). +l3(que, 'Quechua'). +l3(roh, 'Rhaeto-Romance'). +l3(raj, 'Rajasthani'). +l3(rar, 'Rarotongan'). +l3(roa, 'Romance (Other)'). +l3(ron, 'Romanian'). +l3(rum, 'Romanian'). +l3(rom, 'Romany'). +l3(run, 'Rundi'). +l3(rus, 'Russian'). +l3(sal, 'Salishan languages'). +l3(sam, 'Samaritan Aramaic'). +l3(smi, 'Sami languages'). +l3(smo, 'Samoan'). +l3(sad, 'Sandawe'). +l3(sag, 'Sango'). +l3(san, 'Sanskrit'). +l3(srd, 'Sardinian'). +l3(sco, 'Scots'). +l3(sel, 'Selkup'). +l3(sem, 'Semitic (Other)'). +l3(scr, 'Serbo-Croatian'). +l3(srr, 'Serer'). +l3(shn, 'Shan'). +l3(sna, 'Shona'). +l3(sid, 'Sidamo'). +l3(bla, 'Siksika'). +l3(snd, 'Sindhi'). +l3(sin, 'Singhalese'). +l3(sit, 'Sino-Tibetan (Other)'). +l3(sio, 'Siouan languages'). +l3(sla, 'Slavic (Other)'). +l3(ssw, 'Siswant'). +l3(slk, 'Slovak'). +l3(slo, 'Slovak'). +l3(slv, 'Slovenian'). +l3(sog, 'Sogdian'). +l3(som, 'Somali'). +l3(son, 'Songhai'). +l3(wen, 'Sorbian languages'). +l3(nso, 'Sotho, Northern'). +l3(sot, 'Sotho, Southern'). +l3(sai, 'South American Indian (Other)'). +l3(esl, 'Spanish'). +l3(spa, 'Spanish'). +l3(suk, 'Sukuma'). +l3(sux, 'Sumerian'). +l3(sun, 'Sudanese'). +l3(sus, 'Susu'). +l3(swa, 'Swahili'). +l3(ssw, 'Swazi'). +l3(sve, 'Swedish'). +l3(swe, 'Swedish'). +l3(syr, 'Syriac'). +l3(tgl, 'Tagalog'). +l3(tah, 'Tahitian'). +l3(tgk, 'Tajik'). +l3(tmh, 'Tamashek'). +l3(tam, 'Tamil'). +l3(tat, 'Tatar'). +l3(tel, 'Telugu'). +l3(ter, 'Tereno'). +l3(tha, 'Thai'). +l3(bod, 'Tibetan'). +l3(tib, 'Tibetan'). +l3(tig, 'Tigre'). +l3(tir, 'Tigrinya'). +l3(tem, 'Timne'). +l3(tiv, 'Tivi'). +l3(tli, 'Tlingit'). +l3(tog, 'Tonga (Nyasa)'). +l3(ton, 'Tonga (Tonga Islands)'). +l3(tru, 'Truk'). +l3(tsi, 'Tsimshian'). +l3(tso, 'Tsonga'). +l3(tsn, 'Tswana'). +l3(tum, 'Tumbuka'). +l3(tur, 'Turkish'). +l3(ota, 'Turkish, Ottoman (1500 - 1928)'). +l3(tuk, 'Turkmen'). +l3(tyv, 'Tuvinian'). +l3(twi, 'Twi'). +l3(uga, 'Ugaritic'). +l3(uig, 'Uighur'). +l3(ukr, 'Ukrainian'). +l3(umb, 'Umbundu'). +l3(und, 'Undetermined'). +l3(urd, 'Urdu'). +l3(uzb, 'Uzbek'). +l3(vai, 'Vai'). +l3(ven, 'Venda'). +l3(vie, 'Vietnamese'). +l3(vol, 'Volapük'). +l3(vot, 'Votic'). +l3(wak, 'Wakashan languages'). +l3(wal, 'Walamo'). +l3(war, 'Waray'). +l3(was, 'Washo'). +l3(cym, 'Welsh'). +l3(wel, 'Welsh'). +l3(wol, 'Wolof'). +l3(xho, 'Xhosa'). +l3(sah, 'Yakut'). +l3(yao, 'Yao'). +l3(yap, 'Yap'). +l3(yid, 'Yiddish'). +l3(yor, 'Yoruba'). +l3(zap, 'Zapotec'). +l3(zen, 'Zenaga'). +l3(zha, 'Zhuang'). +l3(zul, 'Zulu'). +l3(zun, 'Zuni'). + +% l2(?Code, ?Lang) +% +% ISO-639 2 letter codes + +l2(aa, 'Afar'). +l2(ab, 'Abkhazian'). +l2(af, 'Afrikaans'). +l2(am, 'Amharic'). +l2(ar, 'Arabic'). +l2(as, 'Assamese'). +l2(ay, 'Aymara'). +l2(az, 'Azerbaijani'). +l2(ba, 'Bashkir'). +l2(be, 'Byelorussian'). +l2(bg, 'Bulgarian'). +l2(bh, 'Bihari'). +l2(bi, 'Bislama'). +l2(bn, 'Bengali, Bangla'). +l2(bo, 'Tibetan'). +l2(br, 'Breton'). +l2(ca, 'Catalan'). +l2(co, 'Corsican'). +l2(cs, 'Czech'). +l2(cy, 'Welsh'). +l2(da, 'Danish'). +l2(de, 'German'). +l2(dz, 'Bhutani'). +l2(el, 'Greek'). +l2(en, 'English, American'). +l2(eo, 'Esperanto'). +l2(es, 'Spanish'). +l2(et, 'Estonian'). +l2(eu, 'Basque'). +l2(fa, 'Persian'). +l2(fi, 'Finnish'). +l2(fj, 'Fiji'). +l2(fo, 'Faeroese'). +l2(fr, 'French'). +l2(fy, 'Frisian'). +l2(ga, 'Irish'). +l2(gd, 'Gaelic, Scots Gaelic'). +l2(gl, 'Galician'). +l2(gn, 'Guarani'). +l2(gu, 'Gujarati'). +l2(ha, 'Hausa'). +l2(hi, 'Hindi'). +l2(hr, 'Croatian'). +l2(hu, 'Hungarian'). +l2(hy, 'Armenian'). +l2(ia, 'Interlingua'). +l2(ie, 'Interlingue'). +l2(ik, 'Inupiak'). +l2(in, 'Indonesian'). +l2(is, 'Icelandic'). +l2(it, 'Italian'). +l2(iw, 'Hebrew'). +l2(ja, 'Japanese'). +l2(ji, 'Yiddish'). +l2(jw, 'Javanese'). +l2(ka, 'Georgian'). +l2(kk, 'Kazakh'). +l2(kl, 'Greenlandic'). +l2(km, 'Cambodian'). +l2(kn, 'Kannada'). +l2(ko, 'Korean'). +l2(ks, 'Kashmiri'). +l2(ku, 'Kurdish'). +l2(ky, 'Kirghiz'). +l2(la, 'Latin'). +l2(ln, 'Lingala'). +l2(lo, 'Laothian'). +l2(lt, 'Lithuanian'). +l2(lv, 'Latvian, Lettish'). +l2(mg, 'Malagasy'). +l2(mi, 'Maori'). +l2(mk, 'Macedonian'). +l2(ml, 'Malayalam'). +l2(mn, 'Mongolian'). +l2(mo, 'Moldavian'). +l2(mr, 'Marathi'). +l2(ms, 'Malay'). +l2(mt, 'Maltese'). +l2(my, 'Burmese'). +l2(na, 'Nauru'). +l2(ne, 'Nepali'). +l2(nl, 'Dutch'). +l2(no, 'Norwegian'). +l2(oc, 'Occitan'). +l2(om, 'Oromo, Afan'). +l2(or, 'Oriya'). +l2(pa, 'Punjabi'). +l2(pl, 'Polish'). +l2(ps, 'Pashto, Pushto'). +l2(pt, 'Portuguese'). +l2(qu, 'Quechua'). +l2(rm, 'Rhaeto-Romance'). +l2(rn, 'Kirundi'). +l2(ro, 'Romanian'). +l2(ru, 'Russian'). +l2(rw, 'Kinyarwanda'). +l2(sa, 'Sanskrit'). +l2(sd, 'Sindhi'). +l2(sg, 'Sangro'). +l2(sh, 'Serbo-Croatian'). +l2(si, 'Singhalese'). +l2(sk, 'Slovak'). +l2(sl, 'Slovenian'). +l2(sm, 'Samoan'). +l2(sn, 'Shona'). +l2(so, 'Somali'). +l2(sq, 'Albanian'). +l2(sr, 'Serbian'). +l2(ss, 'Siswati'). +l2(st, 'Sesotho'). +l2(su, 'Sudanese'). +l2(sv, 'Swedish'). +l2(sw, 'Swahili'). +l2(ta, 'Tamil'). +l2(te, 'Tegulu'). +l2(tg, 'Tajik'). +l2(th, 'Thai'). +l2(ti, 'Tigrinya'). +l2(tk, 'Turkmen'). +l2(tl, 'Tagalog'). +l2(tn, 'Setswana'). +l2(to, 'Tonga'). +l2(tr, 'Turkish'). +l2(ts, 'Tsonga'). +l2(tt, 'Tatar'). +l2(tw, 'Twi'). +l2(uk, 'Ukrainian'). +l2(ur, 'Urdu'). +l2(uz, 'Uzbek'). +l2(vi, 'Vietnamese'). +l2(vo, 'Volapuk'). +l2(wo, 'Wolof'). +l2(xh, 'Xhosa'). +l2(yo, 'Yoruba'). +l2(zh, 'Chinese'). +l2(zu, 'Zulu'). diff --git a/packages/sgml/make.bat b/packages/sgml/make.bat new file mode 100755 index 000000000..70b43e3f1 --- /dev/null +++ b/packages/sgml/make.bat @@ -0,0 +1,3 @@ +@echo off + +nmake /f makefile.mak %* diff --git a/packages/sgml/model.c b/packages/sgml/model.c new file mode 100644 index 000000000..7c3e6fb9f --- /dev/null +++ b/packages/sgml/model.c @@ -0,0 +1,524 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "dtd.h" +#include "model.h" + +#define MAX_VISITED 256 +#define MAX_ALLOWED 64 + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +This module implements a finite state engine for validating the content +model of elements. A state machine is the only feasible approach for +realising an event-driven SGML parser. + +The public functions are: + +dtd_state *new_dtd_state(void) + Create an anonymous new state. Normally an element creates two of + these for it ->initial_state and ->final_state attributes. + +dtd_state *make_state_engine(dtd_element *e) + Associate a state engine to this element and return the initial + state of the engine. If the element has an engine, simply return + the initial state. + +dtd_state *make_dtd_transition(dtd_state *here, dtd_element *e) + Given the current state, see whether we can accept e and return + the resulting state. If no transition is possible return NULL. + +int same_state(dtd_state *final, dtd_state *here) + See whether two states are the same, or the final state can be + reached only traversing equivalence links. + +The A&B&... model + +Models of the type a&b&c are hard to translate, as the resulting state +machine is of size order N! In practice only a little of this will be +used however and we `fix' this problem using a `lazy state-engine', that +expands to the next level only after reaching some level. See the +function state_transitions(). The design takes more lazy generation into +consideration. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +typedef struct _state_transition +{ dtd_element *element; /* element on transition */ + dtd_state *state; /* state to go to */ + struct _state_transition *next; /* next possible transition */ +} transition; + +typedef struct _dtd_model_list /* list (set) of models */ +{ dtd_model *model; + struct _dtd_model_list *next; +} dtd_model_list; + +typedef enum +{ EX_AND /* expand (a&b&...) */ +} expand_type; + +typedef struct _state_expander +{ dtd_state *target; /* Target state to expand to */ + expand_type type; /* EX_* */ + union + { struct + { dtd_model_list *set; /* Models we should still see */ + } and; /* Expand (a&b&...) */ + } kind; +} expander; + +typedef struct _visited +{ int size; /* set-size */ + dtd_state *states[MAX_VISITED]; /* The set */ +} visited; + + +static void translate_model(dtd_model *m, dtd_state *from, dtd_state *to); +static transition *state_transitions(dtd_state *state); + +static int +visit(dtd_state *state, visited *visited) +{ int i; + + for(i=0; isize; i++) + { if ( visited->states[i] == state ) + return FALSE; + } + + if ( visited->size >= MAX_VISITED ) + { fprintf(stderr, "Reached MAX_VISITED!\n"); + return FALSE; + } + + visited->states[visited->size++] = state; + + return TRUE; +} + + +static dtd_state * +do_make_dtd_transition(dtd_state *here, dtd_element *e, visited *visited) +{ transition *tset = state_transitions(here); + transition *t; + + for(t=tset; t; t=t->next) + { if ( t->element == e ) + return t->state; + } + + for(t=tset; t; t=t->next) + { if ( t->element == NULL && visit(t->state, visited) ) + { dtd_state *new; + + if ( (new=do_make_dtd_transition(t->state, e, visited)) ) + return new; + } + } + + return NULL; +} + + +dtd_state * +make_dtd_transition(dtd_state *here, dtd_element *e) +{ visited visited; + visited.size = 0; + + if ( !here ) /* from nowhere to nowhere */ + return NULL; + + return do_make_dtd_transition(here, e, &visited); +} + + +static int +find_same_state(dtd_state *final, dtd_state *here, visited *visited) +{ transition *t; + + if ( final == here ) + return TRUE; + + for(t=state_transitions(here); t; t=t->next) + { if ( t->element == NULL && visit(t->state, visited) ) + { if ( find_same_state(final, t->state, visited) ) + return TRUE; + } + } + + return FALSE; +} + + +int +same_state(dtd_state *final, dtd_state *here) +{ visited visited; + visited.size = 0; + + return find_same_state(final, here, &visited); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +state_allows_for(dtd_state *state, dtd_element **allow, int *n) + See what elements are allowed if we are in this state. This is + currently not used, but might prove handly for error messages or + syntax-directed editors. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static void +do_state_allows_for(dtd_state *here, dtd_element **allow, int *n, + visited *visited) +{ transition *t; + + for(t=state_transitions(here); t; t=t->next) + { int i; + + if ( t->element == NULL ) + { if ( visit(t->state, visited) ) + do_state_allows_for(t->state, allow, n, visited); + } else + { for(i=0; i<*n; i++) + { if ( allow[i] == t->element ) + goto next; + } + allow[(*n)++] = t->element; + } + next: + ; + } +} + + +void +state_allows_for(dtd_state *state, dtd_element **allow, int *n) +{ visited visited; + visited.size = 0; + + *n = 0; + if ( state ) + do_state_allows_for(state, allow, n, &visited); +} + + +static int +do_find_omitted_path(dtd_state *state, dtd_element *e, + dtd_element **path, int *pl, + visited *visited) +{ transition *tset = state_transitions(state); + transition *t; + int pathlen = *pl; + + for(t=tset; t; t=t->next) + { if ( t->element == e ) + return TRUE; + + if ( t->element && + t->element != CDATA_ELEMENT && + t->element->structure && + t->element->structure->omit_open && + visit(t->state, visited) ) + { dtd_state *initial = make_state_engine(t->element); + + path[pathlen] = t->element; + *pl = pathlen+1; + if ( do_find_omitted_path(initial, e, path, pl, visited) ) + return TRUE; + *pl = pathlen; + } + } + + for(t=tset; t; t=t->next) + { if ( !t->element && + visit(t->state, visited) ) + { if ( do_find_omitted_path(t->state, e, path, pl, visited) ) + return TRUE; + } + } + + return FALSE; +} + + +int +find_omitted_path(dtd_state *state, dtd_element *e, dtd_element **path) +{ int pl = 0; + visited visited; + visited.size = 0; + + if ( state && do_find_omitted_path(state, e, path, &pl, &visited) ) + return pl; + + return -1; +} + + +dtd_state * +new_dtd_state() +{ dtd_state *s = sgml_calloc(1, sizeof(*s)); + + return s; +} + + +static void +link(dtd_state *from, dtd_state *to, dtd_element *e) +{ transition *t = sgml_calloc(1, sizeof(*t)); + + t->state = to; + t->element = e; + t->next = from->transitions; + from->transitions = t; +} + + + /******************************* + * EXPANSION * + *******************************/ + +static void +add_model_list(dtd_model_list **list, dtd_model *m) +{ dtd_model_list *l = sgml_calloc(1, sizeof(*l)); + + l->model = m; + + for( ; *list; list = &(*list)->next) + ; + *list = l; +} + + +static transition * +state_transitions(dtd_state *state) +{ if ( !state->transitions && state->expander ) + { expander *ex = state->expander; + + switch(ex->type) + { case EX_AND: + { dtd_model_list *left = ex->kind.and.set; + + if ( !left ) /* empty AND (should not happen) */ + { link(state, ex->target, NULL); + } else if ( !left->next ) /* only one left */ + { translate_model(left->model, state, ex->target); + } else + { for( ; left; left = left->next ) + { dtd_state *tmp = new_dtd_state(); + expander *nex = sgml_calloc(1, sizeof(*nex)); + dtd_model_list *l; + + translate_model(left->model, state, tmp); + tmp->expander = nex; + nex->target = ex->target; + nex->type = EX_AND; + for(l=ex->kind.and.set; l; l=l->next) + { if ( l != left ) + add_model_list(&nex->kind.and.set, l->model); + } + } + } + } + } + } + + return state->transitions; +} + + + /******************************* + * TRANSLATION * + *******************************/ + + +static void +translate_one(dtd_model *m, dtd_state *from, dtd_state *to) +{ switch(m->type) + { case MT_ELEMENT: + { dtd_element *e = m->content.element; + + link(from, to, e); + return; + } + case MT_SEQ: /* a,b,... */ + { dtd_model *sub; + + for( sub = m->content.group; sub->next; sub = sub->next ) + { dtd_state *tmp = new_dtd_state(); + translate_model(sub, from, tmp); + from = tmp; + } + translate_model(sub, from, to); + return; + } + case MT_AND: /* a&b&... */ + { expander *ex = sgml_calloc(1, sizeof(*ex)); + dtd_model *sub; + + ex->target = to; + ex->type = EX_AND; + + for( sub = m->content.group; sub; sub = sub->next ) + add_model_list(&ex->kind.and.set, sub); + + from->expander = ex; + return; + } + case MT_OR: /* a|b|... */ + { dtd_model *sub; + + for( sub = m->content.group; sub; sub = sub->next ) + translate_model(sub, from, to); + return; + } + case MT_PCDATA: + case MT_UNDEF: + assert(0); + } + +} + + +static void +translate_model(dtd_model *m, dtd_state *from, dtd_state *to) +{ if ( m->type == MT_PCDATA ) + { link(from, from, CDATA_ELEMENT); + link(from, to, NULL); + return; + } + + switch(m->cardinality) + { case MC_OPT: /* ? */ + link(from, to, NULL); + /*FALLTHROUGH*/ + case MC_ONE: + translate_one(m, from, to); + return; + case MC_REP: /* * */ + translate_one(m, from, from); + link(from, to, NULL); + return; + case MC_PLUS: /* + */ + translate_one(m, from, to); + translate_one(m, to, to); + return; + } +} + + +dtd_state * +make_state_engine(dtd_element *e) +{ if ( e->structure ) + { dtd_edef *def = e->structure; + + if ( !def->initial_state ) + { if ( def->content ) + { def->initial_state = new_dtd_state(); + def->final_state = new_dtd_state(); + + translate_model(def->content, def->initial_state, def->final_state); + } else if ( def->type == C_CDATA || def->type == C_RCDATA ) + { def->initial_state = new_dtd_state(); + def->final_state = new_dtd_state(); + + link(def->initial_state, def->initial_state, CDATA_ELEMENT); + link(def->initial_state, def->final_state, NULL); + } else + return NULL; + } + + return def->initial_state; + } + + return NULL; +} + + + /******************************* + * FREE * + *******************************/ + +static void do_free_state_engine(dtd_state *state, visited *visited); + +static void +free_model_list(dtd_model_list *l) +{ dtd_model_list *next; + + for( ; l; l=next) + { next = l->next; + + sgml_free(l); + } +} + + +static void +free_expander(expander *e, visited *visited) +{ if ( visit(e->target, visited) ) + do_free_state_engine(e->target, visited); + + switch(e->type) + { case EX_AND: + free_model_list(e->kind.and.set); + default: + ; + } + + sgml_free(e); +} + + +static void +do_free_state_engine(dtd_state *state, visited *visited) +{ transition *t, *next; + + for(t=state->transitions; t; t=next) + { next = t->next; + + if ( visit(t->state, visited) ) + do_free_state_engine(t->state, visited); + + sgml_free(t); + } + + if ( state->expander ) + free_expander(state->expander, visited); + + sgml_free(state); +} + + +void +free_state_engine(dtd_state *state) +{ if ( state ) + { visited visited; + visited.size = 0; + + visit(state, &visited); + do_free_state_engine(state, &visited); + } +} + + + diff --git a/packages/sgml/model.h b/packages/sgml/model.h new file mode 100644 index 000000000..c3beccbe3 --- /dev/null +++ b/packages/sgml/model.h @@ -0,0 +1,47 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef MODEL_H_INCLUDED +#define MODEL_H_INCLUDED + +#define MAXOMITTED 32 + +#define CDATA_ELEMENT ((dtd_element *)1) + +typedef struct _dtd_state +{ struct _state_transition *transitions; + struct _state_expander *expander; +} dtd_state; + +dtd_state *new_dtd_state(void); +dtd_state * make_dtd_transition(dtd_state *here, dtd_element *e); +int same_state(dtd_state *final, dtd_state *here); +int find_omitted_path(dtd_state *state, dtd_element *e, + dtd_element **path); +dtd_state * make_state_engine(dtd_element *e); +void free_state_engine(dtd_state *state); +void state_allows_for(dtd_state *state, + dtd_element **allow, int *n); + +#endif /*MODEL_H_INCLUDED*/ diff --git a/packages/sgml/parser.c b/packages/sgml/parser.c new file mode 100644 index 000000000..faf49a4bb --- /dev/null +++ b/packages/sgml/parser.c @@ -0,0 +1,5602 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ + +#define DTD_IMPLEMENTATION 1 +#include +#include +#include "dtd.h" +#include "model.h" +#include "util.h" +#include "catalog.h" +#include "parser.h" +#include +#include +#include +#include +#include +#include "utf8.h" +#include +#include +#include "xml_unicode.h" + +#define DEBUG(g) ((void)0) +#define ZERO_TERM_LEN (-1) /* terminated by nul */ + +#ifdef __WINDOWS__ +#define inline __inline +#define swprintf _snwprintf +#endif + + + /******************************* + * LOCAL TYPES * + *******************************/ + +typedef struct locbuf +{ dtd_srcloc start; /* p->startloc */ + dtd_srcloc here; /* p->location */ +} locbuf; + + + /******************************* + * PROTOYPES * + *******************************/ + +static const ichar * itake_name(dtd *dtd, const ichar *in, dtd_symbol **id); +static const ichar * itake_entity_name(dtd *dtd, const ichar *in, + dtd_symbol **id); +static const ichar * itake_namegroup(dtd *dtd, const ichar *decl, + dtd_symbol **names, int *n); +static const ichar * iskip_layout(dtd *dtd, const ichar *in); +static dtd_parser * clone_dtd_parser(dtd_parser *p); +static void free_model(dtd_model *m); +static int process_entity_declaration(dtd_parser *p, + const ichar *decl); +static void free_notations(dtd_notation *n); +static void free_shortrefs(dtd_shortref *sr); +static int process_cdata(dtd_parser *p, int last); +static int process_entity(dtd_parser *p, const ichar *name); +static int emit_cdata(dtd_parser *p, int last); +static dtd_space_mode istr_to_space_mode(const ichar *val); +static void update_space_mode(dtd_parser *p, dtd_element *e, + int natts, sgml_attribute *atts); +static dtd_model * make_model(dtd *dtd, const ichar *decl, + const ichar **end); +static void for_elements_in_model(dtd_model *m, + void (*f)(dtd_element *e, + void *closure), + void *closure); +void putchar_dtd_parser(dtd_parser *p, int chr); +void free_dtd_parser(dtd_parser *p); +static const ichar * isee_character_entity(dtd *dtd, const ichar *in, + int *chr); +static int add_default_attributes(dtd_parser *p, dtd_element *e, + int natts, + sgml_attribute *atts); +static int prepare_cdata(dtd_parser *p); + + + /******************************* + * MACROS * + *******************************/ + +#define WITH_CLASS(p, c, g) \ + { sgml_event_class _oc = p->event_class; \ + p->event_class = c; \ + g; \ + p->event_class = _oc; \ + } + +#define WITH_PARSER(p, g) \ + { dtd_parser *_old = p; \ + current_parser = p; \ + g; \ + current_parser = _old; \ + } + + /******************************* + * STATISTICS * + *******************************/ + +#ifdef O_STATISTICS + +int edefs_created = 0; +int edefs_freed = 0; +int edefs_implicit = 0; +int edefs_atts = 0; +int edefs_decl = 0; +int dtd_created = 0; +int dtd_freed = 0; + +void +sgml_statistics(void) +{ fprintf(stderr, "EDEFS: created %d; freed %d\n", edefs_created, edefs_freed); + fprintf(stderr, "EDEFS: implicit %d; atts %d; decl %d\n", + edefs_implicit, edefs_atts, edefs_decl); + fprintf(stderr, "DTDs: created: %d; freed: %d\n", dtd_created, dtd_freed); +} + +#define STAT(g) g + +#else + +#define STAT(g) ((void)0) + +#endif + + + /******************************* + * SRC LOCATION * + *******************************/ + + +static void /* TBD: also handle startloc */ +push_location(dtd_parser *p, locbuf *save) +{ save->here = p->location; + save->start = p->startloc; + + p->location.parent = &save->here; + p->startloc.parent = &save->start; +} + + +static void +pop_location(dtd_parser *p, locbuf *saved) +{ p->location = saved->here; + p->startloc = saved->start; +} + + +static inline void +_sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) +{ d->type = loc->type; + d->name.file = loc->name.file; + d->line = loc->line; + d->linepos = loc->linepos; + d->charpos = loc->charpos; + /* but not the parent! */ +} + +void +sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) +{ _sgml_cplocation(d, loc); +} + +#define sgml_cplocation(d,s) _sgml_cplocation(d, s) + +static void +inc_location(dtd_srcloc *l, int chr) +{ if ( chr == '\n' ) + { l->linepos = 0; + l->line++; + } + + l->linepos++; + l->charpos++; +} + + +static void +dec_location(dtd_srcloc *l, int chr) +{ if ( chr == '\n' ) + { l->linepos = 2; /* not good! */ + l->line--; + } + l->linepos--; + l->charpos--; +} + + /******************************* + * CLASSIFICATION PRIMITIVES * + *******************************/ + +static inline int +HasClass(dtd *dtd, wint_t chr, int mask) +{ if ( chr <= 0xff ) + return (dtd->charclass->class[(chr)] & (mask)); + else + { switch(mask) + { case CH_NAME: + return ( xml_basechar(chr) || + xml_digit(chr) || + xml_ideographic(chr) || + xml_combining_char(chr) || + xml_extender(chr) + ); + case CH_NMSTART: + return ( xml_basechar(chr) || + xml_ideographic(chr) ); + case CH_WHITE: + return FALSE; /* only ' ' and '\t' */ + case CH_BLANK: + return iswspace(chr); + case CH_DIGIT: + return xml_digit(chr); + case CH_RS: + case CH_RE: + return FALSE; + default: + assert(0); + return FALSE; + } + } +} + + +static const ichar * +isee_func(dtd *dtd, const ichar *in, charfunc func) +{ if ( dtd->charfunc->func[func] == *in ) + return ++in; + + return NULL; +} + + /******************************* + * SYMBOLS * + *******************************/ + +static dtd_symbol_table * +new_symbol_table(void) +{ dtd_symbol_table *t = sgml_calloc(1, sizeof(*t)); + t->size = SYMBOLHASHSIZE; + t->entries = sgml_calloc(t->size, sizeof(dtd_symbol*)); + + return t; +} + + +static void +free_symbol_table(dtd_symbol_table *t) +{ int i; + + for(i=0; isize; i++) + { dtd_symbol *s, *next; + + for(s=t->entries[i]; s; s=next) + { next = s->next; + + sgml_free((ichar*)s->name); + sgml_free(s); + } + } + + sgml_free(t->entries); + sgml_free(t); +} + + +dtd_symbol * +dtd_find_symbol(dtd *dtd, const ichar *name) +{ dtd_symbol_table *t = dtd->symbols; + + if ( dtd->case_sensitive ) + { int k = istrhash(name, t->size); + dtd_symbol *s; + + for(s=t->entries[k]; s; s = s->next) + { if ( istreq(s->name, name) ) + return s; + } + } else + { int k = istrcasehash(name, t->size); + dtd_symbol *s; + + for(s=t->entries[k]; s; s = s->next) + { if ( istrcaseeq(s->name, name) ) + return s; + } + } + + return NULL; +} + + +static dtd_symbol * +dtd_find_entity_symbol(dtd *dtd, const ichar *name) +{ dtd_symbol_table *t = dtd->symbols; + + if ( dtd->ent_case_sensitive ) + { int k = istrhash(name, t->size); + dtd_symbol *s; + + for(s=t->entries[k]; s; s = s->next) + { if ( istreq(s->name, name) ) + return s; + } + } else + { int k = istrcasehash(name, t->size); + dtd_symbol *s; + + for(s=t->entries[k]; s; s = s->next) + { if ( istrcaseeq(s->name, name) ) + return s; + } + } + + return NULL; +} + + +dtd_symbol * +dtd_add_symbol(dtd *dtd, const ichar *name) +{ dtd_symbol_table *t = dtd->symbols; + int k = istrhash(name, t->size); + dtd_symbol *s; + + for(s=t->entries[k]; s; s = s->next) + { if ( istreq(s->name, name) ) + return s; + } + + s = sgml_calloc(1, sizeof(*s)); + s->name = istrdup(name); + s->next = t->entries[k]; + t->entries[k] = s; + + return s; +} + + + /******************************* + * ENTITIES * + *******************************/ + +static void +free_entity_list(dtd_entity *e) +{ dtd_entity *next; + + for( ; e; e=next) + { next = e->next; + + if ( e->value ) sgml_free(e->value); + if ( e->extid ) sgml_free(e->extid); + if ( e->exturl ) sgml_free(e->exturl); + if ( e->baseurl ) sgml_free(e->baseurl); + + sgml_free(e); + } +} + + +static dtd_entity * +find_pentity(dtd *dtd, dtd_symbol *id) +{ dtd_entity *e; + + for(e = dtd->pentities; e; e=e->next) + { if ( e->name == id ) + return e; + } + + return NULL; +} + + +/* returned path must be freed when done */ + +static ichar * +entity_file(dtd *dtd, dtd_entity *e) +{ switch(e->type) + { case ET_SYSTEM: + case ET_PUBLIC: + { const ichar *f; + + f = find_in_catalogue(e->catalog_location, + e->name->name, + e->extid, + e->exturl, + dtd->dialect != DL_SGML); + + if ( f ) /* owned by catalog */ + { ichar *file; + + if ( is_absolute_path(f) || !e->baseurl ) + file = istrdup(f); + else + file = localpath(e->baseurl, f); + + return file; + } + } + default: + return NULL; + } +} + + +static const ichar * +entity_value(dtd_parser *p, dtd_entity *e, int *len) +{ ichar *file; + + if ( !e->value && (file=entity_file(p->dtd, e)) ) + { int normalise = (e->content == EC_SGML || e->content == EC_CDATA); + size_t l; + + e->value = load_sgml_file_to_charp(file, normalise, &l); + e->length = (long)l; + sgml_free(file); + } + + if ( len ) + *len = e->length; + + return e->value; +} + + +static int +expand_pentities(dtd_parser *p, const ichar *in, int ilen, ichar *out, int len) +{ dtd *dtd = p->dtd; + int pero = dtd->charfunc->func[CF_PERO]; /* % */ + int ero = dtd->charfunc->func[CF_ERO]; /* & */ + const ichar *s; + const ichar *end; + + if ( ilen == ZERO_TERM_LEN ) + { end = in + wcslen(in); + } else + { end = &in[ilen]; + } + + while(in < end) + { if ( *in == pero ) + { dtd_symbol *id; + + if ( (s = itake_entity_name(dtd, in+1, &id)) ) + { dtd_entity *e = find_pentity(dtd, id); + const ichar *eval; + int l; + + in = s; + if ( (s=isee_func(dtd, s, CF_ERC)) ) /* ; is not obligatory? */ + in = s; + + if ( !e ) + return gripe(ERC_EXISTENCE, L"parameter entity", id->name); + + if ( !(eval = entity_value(p, e, NULL)) ) + return FALSE; + + if ( !expand_pentities(p, eval, ZERO_TERM_LEN, out, len) ) + return FALSE; + l = (int)istrlen(out); /* could be better */ + out += l; + len -= l; + + continue; + } + } + + if ( --len <= 0 ) + { gripe(ERC_REPRESENTATION, L"Declaration too long"); + return FALSE; + } + + if ( *in == ero && in[1] == '#' ) /* &# */ + { int chr; + + if ( (s=isee_character_entity(dtd, in, &chr)) ) + { if ( chr == 0 ) + { gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); + } else + { *out++ = chr; + in = s; + continue; + } + } + } + + *out++ = *in++; + } + + *out = '\0'; + + return TRUE; +} + + +static int +char_entity_value(const ichar *decl) +{ if ( *decl == '#' ) + { const ichar *s = decl+1; + ichar *end; + long v; + + /* do octal too? */ + if ( s[0] == 'x' || s[0] == 'X' ) + v = wcstoul(s+1, &end, 16); + else + v = wcstoul(s, &end, 10); + + if ( *end == '\0' ) + { return (int)v; + } else if ( istreq(s, L"RS") ) + { return '\n'; + } else if ( istreq(s, L"RE") ) + { return '\r'; + } else if ( istreq(s, L"TAB") ) + { return '\t'; + } else if ( istreq(s, L"SPACE") ) + { return ' '; + } + } + + return -1; +} + + +static const ichar * +isee_character_entity(dtd *dtd, const ichar *in, int *chr) +{ const ichar *s; + + if ( (s=isee_func(dtd, in, CF_ERO)) && *s == '#' ) + { ichar e[32]; + ichar *o = e; + int v; + + *o++ = *s++; + while(o < e+sizeof(e)/sizeof(ichar)-1 && HasClass(dtd, *s, CH_NAME)) + *o++ = *s++; + if ( isee_func(dtd, s, CF_ERC)) /* skip ; */ + s++; + + *o = '\0'; + if ( (v=char_entity_value(e)) >= 0 ) + { *chr = v; + return s; + } + } + + return NULL; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Expand entities in a string. Used to expand CDATA attribute values. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +expand_entities(dtd_parser *p, const ichar *in, int len, ocharbuf *out) +{ const ichar *s; + const ichar *end = &in[len]; + dtd *dtd = p->dtd; + int ero = dtd->charfunc->func[CF_ERO]; /* & */ + + while(in < end) + { if ( *in == ero ) + { const ichar *estart = in; /* for recovery */ + int chr; + + if ( (s=isee_character_entity(dtd, in, &chr)) ) + { if ( chr == 0 ) + gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); + + add_ocharbuf(out, chr); + in = s; + continue; + } + + if ( HasClass(dtd, in[1], CH_NMSTART) ) + { dtd_symbol *id; + dtd_entity *e; + const ichar *eval; + + if ( !(in = itake_name(dtd, in+1, &id)) ) + { in = estart; + goto recover; + } + if ( isee_func(dtd, in, CF_ERC) || *in == '\n' ) + in++; + + if ( !(e = id->entity) && !(e=dtd->default_entity) ) + { gripe(ERC_EXISTENCE, L"entity", id->name); + in = estart; + goto recover; + } + + if ( !(eval = entity_value(p, e, NULL)) ) + { gripe(ERC_NO_VALUE, e->name->name); + in = estart; + goto recover; + } + + if ( e->content == EC_SGML ) + { if ( !expand_entities(p, eval, (int)istrlen(eval), out) ) + return FALSE; + } else + { const ichar *s; + + for(s=eval; *s; s++) + add_ocharbuf(out, *s); + } + + continue; + } + + if ( dtd->dialect != DL_SGML ) + gripe(ERC_SYNTAX_ERROR, L"Illegal entity", estart); + } + + recover: + + if ( *in == CR && in[1] == LF ) + in++; + + if ( HasClass(dtd, *in, CH_BLANK) ) + { add_ocharbuf(out, ' '); + in++; + } else + { add_ocharbuf(out, *in++); + } + } + + terminate_ocharbuf(out); + + return TRUE; +} + + + + /******************************* + * ELEMENTS * + *******************************/ + +static dtd_element * +find_element(dtd *dtd, dtd_symbol *id) +{ dtd_element *e; + + if ( id->element ) + return id->element; /* must check */ + + e = sgml_calloc(1, sizeof(*e)); + e->space_mode = SP_INHERIT; + e->undefined = TRUE; + e->name = id; + id->element = e; + + e->next = dtd->elements; + dtd->elements = e; + + return e; +} + + +static dtd_edef * +new_element_definition(dtd *dtd) +{ dtd_edef *def = sgml_calloc(1, sizeof(*def)); + + STAT(edefs_created++); + + return def; +} + + +static dtd_element * +def_element(dtd *dtd, dtd_symbol *id) +{ dtd_element *e = find_element(dtd, id); + + if ( !e->structure ) + { e->structure = new_element_definition(dtd); + e->structure->references = 1; + e->structure->type = C_EMPTY; + } + + return e; +} + + +static void +free_name_list(dtd_name_list *nl) +{ dtd_name_list *next; + + for( ; nl; nl=next) + { next = nl->next; + + sgml_free(nl); + } +} + + +#define REFS_VIRGIN (-42) + +static void +free_attribute(dtd_attr *a) +{ if ( a->references == REFS_VIRGIN || --a->references == 0 ) + { switch(a->type) + { case AT_NAMEOF: + case AT_NOTATION: + free_name_list(a->typeex.nameof); + default: + ; + } + switch(a->def) + { case AT_DEFAULT: + case AT_FIXED: + { if ( a->islist ) + sgml_free(a->att_def.list); + else if ( a->type == AT_CDATA && a->att_def.cdata ) + sgml_free(a->att_def.cdata); + } + default: + ; + } + + sgml_free(a); + } +} + + +static void +free_attribute_list(dtd_attr_list *l) +{ dtd_attr_list *next; + + for(; l; l=next) + { next = l->next; + + free_attribute(l->attribute); + sgml_free(l); + } +} + + +static void +free_element_list(dtd_element_list *l) +{ dtd_element_list *next; + + for( ; l; l=next) + { next = l->next; + + sgml_free(l); + } +} + +static void +free_element_definition(dtd_edef *def) +{ if ( --def->references == 0 ) + { STAT(edefs_freed++); + if ( def->content ) + free_model(def->content); + free_element_list(def->included); + free_element_list(def->excluded); + free_state_engine(def->initial_state); + + sgml_free(def); + } +} + + +static void +free_elements(dtd_element *e) +{ dtd_element *next; + + for( ; e; e=next) + { next = e->next; + + if ( e->structure ) + free_element_definition(e->structure); + free_attribute_list(e->attributes); + + sgml_free(e); + } +} + + + /******************************* + * ATTRIBUTES * + *******************************/ + +static dtd_attr * +find_attribute(dtd_element *e, dtd_symbol *name) +{ dtd_attr_list *a; + + for(a=e->attributes; a; a=a->next) + { if ( a->attribute->name == name ) + return a->attribute; + } + + return NULL; +} + + + /******************************* + * PARSE PRIMITIVES * + *******************************/ + +static const ichar * +iskip_layout(dtd *dtd, const ichar *in) +{ ichar cmt = dtd->charfunc->func[CF_CMT]; /* also skips comment */ + + for( ; *in; in++ ) + { if ( HasClass(dtd, *in, CH_BLANK) ) + continue; + + if ( in[0] == cmt && in[1] == cmt ) + { in += 2; + + for( ; *in; in++ ) + { if ( in[0] == cmt && in[1] == cmt ) + break; + } + in++; + continue; + } + + return in; + } + + return in; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +See whether we are looking at identifier "id". "id" must be lowercase! +This is only used for reserved words, and parsed case-insentive in both +XML and SGML modes. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static const ichar * +isee_identifier(dtd *dtd, const ichar *in, char *id) +{ in = iskip_layout(dtd, in); + + /* match */ + while (*id && (wint_t)*id == towlower(*in) ) + id++, in++; + if ( *id == 0 && !HasClass(dtd, *in, CH_NAME) ) + return iskip_layout(dtd, in); + + return NULL; +} + + +static const ichar * +itake_name(dtd *dtd, const ichar *in, dtd_symbol **id) +{ ichar buf[MAXNMLEN]; + ichar *o = buf; + ichar *e = &buf[MAXNMLEN]-1; + + in = iskip_layout(dtd, in); + if ( !HasClass(dtd, *in, CH_NMSTART) ) + return NULL; + + if ( dtd->case_sensitive ) + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = *in++; + } else + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = towlower(*in++); + } + + if ( o == e ) + { gripe(ERC_REPRESENTATION, L"NAME too long"); + return NULL; + } + + *o++ = '\0'; + + *id = dtd_add_symbol(dtd, buf); + + return iskip_layout(dtd, in); +} + + +static const ichar * +itake_entity_name(dtd *dtd, const ichar *in, dtd_symbol **id) +{ ichar buf[MAXNMLEN]; + ichar *o = buf; + ichar *e = &buf[MAXNMLEN]-1; + + in = iskip_layout(dtd, in); + if ( !HasClass(dtd, *in, CH_NMSTART) ) + return NULL; + + if ( dtd->ent_case_sensitive ) + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = *in++; + } else + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = towlower(*in++); + } + if ( o == e ) + { gripe(ERC_REPRESENTATION, L"Entity NAME too long"); + return NULL; + } + + *o++ = '\0'; + + *id = dtd_add_symbol(dtd, buf); + + return in; +} + + +static const ichar * +itake_nmtoken(dtd *dtd, const ichar *in, dtd_symbol **id) +{ ichar buf[MAXNMLEN]; + ichar *o = buf; + ichar *e = &buf[MAXNMLEN]-1; + + in = iskip_layout(dtd, in); + if ( !HasClass(dtd, *in, CH_NAME) ) + return NULL; + if ( dtd->case_sensitive ) + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = *in++; + } else + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = towlower(*in++); + } + if ( o == e ) + { gripe(ERC_REPRESENTATION, L"NMTOKEN too long"); + return NULL; + } + + *o = '\0'; + + *id = dtd_add_symbol(dtd, buf); + + return iskip_layout(dtd, in); +} + + +static const ichar * +itake_nutoken(dtd *dtd, const ichar *in, dtd_symbol **id) +{ ichar buf[MAXNMLEN]; + ichar *o = buf; + ichar *e = &buf[MAXNMLEN]-1; + + in = iskip_layout(dtd, in); + if ( !HasClass(dtd, *in, CH_DIGIT) ) + return NULL; + + if ( dtd->case_sensitive ) + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = *in++; + } else + { while( HasClass(dtd, *in, CH_NAME) && o < e ) + *o++ = towlower(*in++); + } + + if ( o == e ) + { gripe(ERC_REPRESENTATION, L"NUTOKEN too long"); + return NULL; + } + + *o = '\0'; + if ( o - buf > 8 ) + gripe(ERC_LIMIT, L"nutoken length"); + + *id = dtd_add_symbol(dtd, buf); + + return iskip_layout(dtd, in); +} + + +static const ichar * +itake_number(dtd *dtd, const ichar *in, dtd_attr *at) +{ in = iskip_layout(dtd, in); + + switch(dtd->number_mode) + { case NU_TOKEN: + { ichar buf[MAXNMLEN]; + ichar *o = buf; + + while( HasClass(dtd, *in, CH_DIGIT) ) + *o++ = *in++; + if ( o == buf ) + return NULL; /* empty */ + *o = '\0'; + at->att_def.name = dtd_add_symbol(dtd, buf); + + return iskip_layout(dtd, (const ichar *)in); + } + case NU_INTEGER: + { ichar *end; + + at->att_def.number = wcstol(in, &end, 10); + if ( end > in && errno != ERANGE ) + return iskip_layout(dtd, end); + } + } + + return NULL; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Get a quoted value. After successful return, *start points to the start +of the string in the input and *len to the length. The data is *not* +nul terminated. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static const ichar * +itake_string(dtd *dtd, const ichar *in, ichar **start, int *len) +{ in = iskip_layout(dtd, in); + + if ( isee_func(dtd, in, CF_LIT) || + isee_func(dtd, in, CF_LITA) ) + { ichar q = *in++; + + *start = (ichar *)in; + while( *in && *in != q ) + in++; + if ( *in ) + { *len = (int)(in - (*start)); + + return iskip_layout(dtd, ++in); + } + } + + return NULL; +} + + +static const ichar * +itake_dubbed_string(dtd *dtd, const ichar *in, ichar **out) +{ ichar *start; + int len; + const ichar *end; + + if ( (end=itake_string(dtd, in, &start, &len)) ) + *out = istrndup(start, len); + + return end; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +itake_url() is used to get the argument of a SYSTEM or 2nd argument of a +PUBLIC reference. Once upon a time it tried to tag the argument as +file:, but this job cannot be before lookup in the catalogue. It +is now the same as itake_dubbed_string(), so we simply call this one. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static const ichar * +itake_url(dtd *dtd, const ichar *in, ichar **out) +{ return itake_dubbed_string(dtd, in, out); +} + + +static const ichar * +itake_nmtoken_chars(dtd *dtd, const ichar *in, ichar *out, int len) +{ in = iskip_layout(dtd, in); + if ( !HasClass(dtd, *in, CH_NAME) ) + return NULL; + while( HasClass(dtd, *in, CH_NAME) ) + { if ( --len <= 0 ) + gripe(ERC_REPRESENTATION, L"Name token too long"); + *out++ = (dtd->case_sensitive ? *in++ : (ichar)towlower(*in++)); + } + *out++ = '\0'; + + return iskip_layout(dtd, in); +} + + +/* There used to be a function + + itake_nonblank_chars(dtd, in, out, len) -> new end + + which + - skipped layout, + - copied characters from in[] to out[] until layout or \0 was found, + - added a terminating \0 to out[], + - skipped any following layout, and + - returned the new position. + + That function was only called by get_attribute_value(), which used + it to parse an unquoted attribute value. According to SGML, that's + not right: unquoted attribute values must look like NMTOKENs (but + have a different length bound). In particular, elements like + zoo + '. According to HTML practice, pretty much any + old junk will be accepted, and some HTML parsers will allow bare + slashes in such an attribute. + + Typical HTML is *so* bad that it doesn't agree with *any* part of + the HTML specifications (e.g., is commonly wrapped around + block-level elements, which has never been legal). It's not clear + that there is much point in trying to accomodate bad HTML; if you + really need to do that, use the free program HTML Tidy (from the + http://www.w3c.org/ site) to clean up, and parse its output instead. + + However, in order to break as little as possible, the new (sgml-1.0.14) + function accepts anything except > / \0 and blanks. + +JW: I decided to accept / as part of an unquoted in SGML-mode if + shorttag is disabled as well as in XML mode if it is not the + end of the begin-element +*/ + +static ichar const * +itake_unquoted(dtd *dtd, ichar const *in, ichar *out, int len) +{ ichar const end2 = dtd->charfunc->func[CF_ETAGO2]; /* / */ + ichar c; + + /* skip leading layout. Do NOT skip comments! --x-- is a value! */ + while (c = *in, HasClass(dtd, c, CH_BLANK)) + in++; + + /* copy the attribute to out[] */ + while ( !HasClass(dtd, c, CH_BLANK) && + c != '\0' ) + { if ( c == end2 && (dtd->shorttag || + (in[1] == '\0' && dtd->dialect != DL_SGML)) ) + break; + + if ( --len > 0 ) + *out++ = c; + else if ( len == 0 ) + gripe(ERC_REPRESENTATION, L"Attribute too long"); + c = *++in; + } + *out = '\0'; + + /* skip trailing layout. While it is kind to skip comments here, + it is technically wrong to do so. Tags may not contain comments. + */ + + return iskip_layout(dtd, in); +} + + + /******************************* + * DTD * + *******************************/ + +dtd * +new_dtd(const ichar *doctype) +{ dtd *dtd = sgml_calloc(1, sizeof(*dtd)); + + STAT(dtd_created++); + dtd->magic = SGML_DTD_MAGIC; + dtd->implicit = TRUE; + dtd->dialect = DL_SGML; + if ( doctype ) + dtd->doctype = istrdup(doctype); + dtd->symbols = new_symbol_table(); + dtd->charclass = new_charclass(); + dtd->charfunc = new_charfunc(); + dtd->space_mode = SP_SGML; + dtd->ent_case_sensitive = TRUE; /* case-sensitive entities */ + dtd->shorttag = TRUE; /* allow for number_mode = NU_TOKEN; + + return dtd; +} + + +void +free_dtd(dtd *dtd) +{ if ( --dtd->references == 0 ) + { STAT(dtd_freed++); + + if ( dtd->doctype ) + sgml_free(dtd->doctype); + + free_entity_list(dtd->entities); + free_entity_list(dtd->pentities); + free_notations(dtd->notations); + free_shortrefs(dtd->shortrefs); + free_elements(dtd->elements); + free_symbol_table(dtd->symbols); + sgml_free(dtd->charfunc); + sgml_free(dtd->charclass); + dtd->magic = 0; + + sgml_free(dtd); + } +} + + +static const wchar_t *xml_entities[] = +{ L"lt CDATA \"<\"", /* < */ + L"gt CDATA \">\"", /* > */ + L"amp CDATA \"&\"", /* & */ + L"apos CDATA \"'\"", /* ' */ + L"quot CDATA \""\"", /* " */ + NULL +}; + + +int +set_dialect_dtd(dtd *dtd, dtd_dialect dialect) +{ if ( dtd->dialect != dialect ) + { dtd->dialect = dialect; + + switch(dialect) + { case DL_SGML: + { dtd->case_sensitive = FALSE; + dtd->space_mode = SP_SGML; + dtd->shorttag = TRUE; + break; + } + case DL_XML: + case DL_XMLNS: + { const ichar **el; + dtd_parser p; + + dtd->case_sensitive = TRUE; + dtd->encoding = SGML_ENC_UTF8; + dtd->space_mode = SP_PRESERVE; + dtd->shorttag = FALSE; + + memset(&p, 0, sizeof(p)); + p.dtd = dtd; + for(el = xml_entities; *el; el++) + process_entity_declaration(&p, *el); + + break; + } + } + } + + return TRUE; +} + + +int +set_option_dtd(dtd *dtd, dtd_option option, int set) +{ switch(option) + { case OPT_SHORTTAG: + dtd->shorttag = set; + break; + } + + return TRUE; +} + + +static const ichar * +baseurl(dtd_parser *p) +{ if ( p->location.type == IN_FILE && p->location.name.file ) + { return p->location.name.file; + } + + return NULL; +} + + +static const ichar * +process_entity_value_declaration(dtd_parser *p, + const ichar *decl, dtd_entity *e) +{ dtd *dtd = p->dtd; + const ichar *s; + + if ( e->type == ET_SYSTEM ) + { if ( (s=itake_url(dtd, decl, &e->exturl)) ) + { e->baseurl = istrdup(baseurl(p)); + return s; + } + + goto string_expected; + } else + { ichar *start; int len; + ichar val[MAXSTRINGLEN]; + + if ( !(s = itake_string(dtd, decl, &start, &len)) ) + goto string_expected; + decl = s; + + expand_pentities(p, start, len, val, sizeof(val)/sizeof(ichar)); + + switch ( e->type ) + { case ET_PUBLIC: + { e->extid = istrdup(val); + if ( isee_func(dtd, decl, CF_LIT) || + isee_func(dtd, decl, CF_LITA) ) + { if ( (s=itake_url(dtd, decl, &e->exturl)) ) + { e->baseurl = istrdup(baseurl(p)); + decl = s; + } + } + return decl; + } + case ET_LITERAL: + { e->value = istrdup(val); + e->length = (int)wcslen(e->value); + return decl; + } + default: + assert(0); + return NULL; + } + } + +string_expected: + gripe(ERC_SYNTAX_ERROR, L"String expected", decl); + return NULL; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +The sgml-standard tells us to accept the first definition of an entity, +silently suppressing any further attempt to redefine the entity. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +process_entity_declaration(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + const ichar *s; + dtd_symbol *id; + dtd_entity *e; + int isparam; + int isdef = FALSE; + /* parameter entity */ + if ( (s=isee_func(dtd, decl, CF_PERO)) ) + { isparam = TRUE; + decl = s; + } else + isparam = FALSE; + + if ( !(s = itake_entity_name(dtd, decl, &id)) ) + { if ( !(s = isee_identifier(dtd, decl, "#default")) ) + return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + id = dtd_add_symbol(dtd, (ichar*)"#DEFAULT"); + isdef = TRUE; + } + + if ( isparam && find_pentity(dtd, id) ) + { gripe(ERC_REDEFINED, L"parameter entity", id); + return TRUE; /* already defined parameter entity */ + } + if ( id->entity ) + { gripe(ERC_REDEFINED, L"entity", id); + return TRUE; /* already defined normal entity */ + } + + decl = iskip_layout(dtd, s); + e = sgml_calloc(1, sizeof(*e)); + e->name = id; + e->catalog_location = (isparam ? CAT_PENTITY : CAT_ENTITY); + + if ( (s = isee_identifier(dtd, decl, "system")) ) + { e->type = ET_SYSTEM; + e->content = EC_SGML; + decl = s; + } else if ( (s = isee_identifier(dtd, decl, "public")) ) + { e->type = ET_PUBLIC; + e->content = EC_SGML; + decl = s; + } else + { e->type = ET_LITERAL; + + if ( !isparam ) + { if ( (s=isee_identifier(dtd, decl, "cdata")) ) + { decl = s; + e->content = EC_CDATA; + } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) + { decl = s; + e->content = EC_SDATA; + } else if ( (s=isee_identifier(dtd, decl, "pi")) ) + { decl = s; + e->content = EC_PI; + } else if ( (s=isee_identifier(dtd, decl, "starttag")) ) + { decl = s; + e->content = EC_STARTTAG; + } else if ( (s=isee_identifier(dtd, decl, "endtag")) ) + { decl = s; + e->content = EC_ENDTAG; + } else + e->content = EC_SGML; + } + } + + if ( (decl=process_entity_value_declaration(p, decl, e)) ) + { if ( e->type == ET_LITERAL ) + { switch(e->content) + { case EC_STARTTAG: + { ichar *buf = sgml_malloc((e->length + 3)*sizeof(ichar)); + + buf[0] = dtd->charfunc->func[CF_STAGO]; + istrcpy(&buf[1], e->value); + buf[++e->length] = dtd->charfunc->func[CF_STAGC]; + buf[++e->length] = 0; + + sgml_free(e->value); + e->value = buf; + e->content = EC_SGML; + + break; + } + case EC_ENDTAG: + { ichar *buf = sgml_malloc((e->length + 4)*sizeof(ichar)); + + buf[0] = dtd->charfunc->func[CF_ETAGO1]; + buf[1] = dtd->charfunc->func[CF_ETAGO2]; + istrcpy(&buf[2], e->value); + e->length++; + buf[++e->length] = dtd->charfunc->func[CF_STAGC]; + buf[++e->length] = 0; + + sgml_free(e->value); + e->value = buf; + e->content = EC_SGML; + + break; + } + default: + break; + } + } else + { if ( *decl ) + { dtd_symbol *nname; + + if ( (s=isee_identifier(dtd, decl, "cdata")) ) + { decl = s; + e->content = EC_CDATA; + } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) + { decl = s; + e->content = EC_SDATA; + } else if ( (s=isee_identifier(dtd, decl, "ndata")) ) + { decl = s; + e->content = EC_NDATA; + } else + return gripe(ERC_SYNTAX_ERROR, L"Bad datatype declaration", decl); + + if ( (s=itake_name(dtd, decl, &nname)) ) /* what is this? */ + { decl = s; + } else + return gripe(ERC_SYNTAX_ERROR, L"Bad notation declaration", decl); + } + } + + if ( *decl ) + return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); + } + + if ( isparam ) + { e->next = dtd->pentities; + dtd->pentities = e; + } else + { e->name->entity = e; + e->next = dtd->entities; + dtd->entities = e; + } + + if ( isdef ) + dtd->default_entity = e; + + return TRUE; +} + + + /******************************* + * NOTATIONS * + *******************************/ + +static dtd_notation * +find_notation(dtd *dtd, dtd_symbol *name) +{ dtd_notation *n; + + for(n=dtd->notations; n; n = n->next) + { if ( n->name == name ) + return n; + } + + return NULL; +} + + +static void +add_notation(dtd *dtd, dtd_notation *not) +{ dtd_notation **n = &dtd->notations; + + for( ; *n; n = &(*n)->next) + ; + *n = not; +} + +static int +process_notation_declaration(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + dtd_symbol *nname; + const ichar *s; + ichar *system = NULL, *public = NULL; + dtd_notation *not; + + if ( !(s=itake_name(dtd, decl, &nname)) ) + return gripe(ERC_SYNTAX_ERROR, L"Notation name expected", decl); + decl = s; + + if ( find_notation(dtd, nname) ) + { gripe(ERC_REDEFINED, L"notation", nname); + return TRUE; + } + + if ( (s=isee_identifier(dtd, decl, "system")) ) + { ; + } else if ( (s=isee_identifier(dtd, decl, "public")) ) + { decl = s; + if ( !(s=itake_dubbed_string(dtd, decl, &public)) ) + return gripe(ERC_SYNTAX_ERROR, L"Public identifier expected", decl); + } else + return gripe(ERC_SYNTAX_ERROR, L"SYSTEM or PUBLIC expected", decl); + + decl = s; + if ( (s=itake_dubbed_string(dtd, decl, &system)) ) + decl = s; + + if ( *decl ) + return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); + + not = sgml_calloc(1, sizeof(*not)); + not->name = nname; + not->system = system; + not->public = public; + not->next = NULL; + add_notation(dtd, not); + + return TRUE; +} + + +static void +free_notations(dtd_notation *n) +{ dtd_notation *next; + + for( ; n; n=next) + { next = n->next; + + sgml_free(n->system); + sgml_free(n->public); + + sgml_free(n); + } +} + + /******************************* + * SHORTREF * + *******************************/ + +static void +free_maps(dtd_map *map) +{ dtd_map *next; + + for( ; map; map=next) + { next = map->next; + if ( map->from ) + sgml_free(map->from); + sgml_free(map); + } +} + + +static void +free_shortrefs(dtd_shortref *sr) +{ dtd_shortref *next; + + for( ; sr; sr=next) + { next = sr->next; + free_maps(sr->map); + sgml_free(sr); + } +} + + +static const ichar * +shortref_add_map(dtd *dtd, const ichar *decl, dtd_shortref *sr) +{ ichar *start; int len; + ichar from[MAXMAPLEN]; + ichar *f = from; + dtd_symbol *to; + const ichar *s; + const ichar *end; + dtd_map **p; + dtd_map *m; + + if ( !(s=itake_string(dtd, decl, &start, &len)) ) + { gripe(ERC_SYNTAX_ERROR, L"map-string expected", decl); + return NULL; + } + decl = s; + if ( !(s=itake_entity_name(dtd, decl, &to)) ) + { gripe(ERC_SYNTAX_ERROR, L"map-to name expected", decl); + return NULL; + } + end = s; + + for(decl=start; len > 0;) + { if ( *decl == 'B' ) /* blank */ + { if ( decl[1] == 'B' ) + { *f++ = CHR_DBLANK; + decl += 2; + len -= 2; + continue; + } + *f++ = CHR_BLANK; + decl++; + len--; + } else + { *f++ = *decl++; /* any other character */ + len--; + } + } + *f = 0; + + for(p=&sr->map; *p; p = &(*p)->next) + ; + + m = sgml_calloc(1, sizeof(*m)); + m->from = istrdup(from); + m->len = (int)istrlen(from); + m->to = to; + + *p = m; + + return end; +} + + +static dtd_shortref * +def_shortref(dtd_parser *p, dtd_symbol *name) +{ dtd *dtd = p->dtd; + dtd_shortref *sr, **pr; + + for(pr=&dtd->shortrefs; *pr; pr = &(*pr)->next) + { dtd_shortref *r = *pr; + + if ( r->name == name ) + return r; + } + + sr = sgml_calloc(1, sizeof(*sr)); + sr->name = name; + *pr = sr; + + return sr; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Create an array with TRUE in any character that can be the last of the +shortref map. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static void +compile_map(dtd *dtd, dtd_shortref *sr) +{ dtd_map *map; + + for(map = sr->map; map; map = map->next) + { ichar last = map->from[map->len-1]; + + switch( last ) + { case CHR_BLANK: + case CHR_DBLANK: + { wint_t i; + + for( i=0; i< SHORTMAP_SIZE; i++) + { if ( HasClass(dtd, i, CH_BLANK) ) + sr->ends[i] = TRUE; + } + } + + default: + sr->ends[last] = TRUE; + } + } +} + + +static int +process_shortref_declaration(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + ichar buf[MAXDECL]; + dtd_shortref *sr; + dtd_symbol *name; + const ichar *s; + + if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) + return FALSE; + decl = buf; + + if ( !(s=itake_name(dtd, decl, &name)) ) + return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + decl = s; + + sr = def_shortref(p, name); + if ( sr->defined ) + { gripe(ERC_REDEFINED, L"shortref", name); + + return TRUE; + } + + sr->defined = TRUE; + + while( *(decl = iskip_layout(dtd, decl)) != '\0' + && (s=shortref_add_map(dtd, decl, sr)) ) + decl = s; + compile_map(dtd, sr); + + if ( *decl ) + return gripe(ERC_SYNTAX_ERROR, L"Map expected", decl); + + return TRUE; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Find named name. The name NULL stands for the #empty map + +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static dtd_shortref * +find_map(dtd *dtd, dtd_symbol *name) +{ dtd_shortref *sr; + + if ( !name ) + { static dtd_shortref *empty; + + if ( !empty ) + { empty = sgml_calloc(1, sizeof(*empty)); + empty->name = dtd_add_symbol(dtd, (ichar*)"#EMPTY"); + empty->defined = TRUE; + } + + return empty; + } + + for( sr = dtd->shortrefs; sr; sr = sr->next ) + { if ( sr->name == name ) + { if ( !sr->defined ) + break; + + return sr; + } + } + + return NULL; +} + + +static void +set_map_element(dtd_element *e, void *closure) +{ e->map = closure; +} + + +static int +process_usemap_declaration(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + ichar buf[MAXDECL]; + dtd_symbol *name; + const ichar *s; + dtd_symbol *ename; + dtd_element *e; + dtd_shortref *map; + + if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) + return FALSE; + decl = buf; + + if ( !(s=itake_name(dtd, decl, &name)) ) + { if ( (s=isee_identifier(dtd, decl, "#empty")) ) + name = NULL; + else + return gripe(ERC_SYNTAX_ERROR, L"map-name expected", decl); + } + + decl = s; + if ( !(map = find_map(dtd, name)) ) + map = def_shortref(p, name); /* make undefined map */ + + if ( isee_func(dtd, decl, CF_GRPO) ) /* ( */ + { dtd_model *model; + + if ( (model = make_model(dtd, decl, &s)) ) + { for_elements_in_model(model, set_map_element, map); + free_model(model); + decl = s; + } else + return FALSE; + } else if ( (s=itake_name(dtd, decl, &ename)) ) + { e = find_element(dtd, ename); + e->map = map; + decl = s; + } else if ( p->environments ) + { if ( !map->defined ) + gripe(ERC_EXISTENCE, L"map", name->name); + + p->environments->map = map; + p->map = p->environments->map; + } else + return gripe(ERC_SYNTAX_ERROR, L"element-name expected", decl); + + if ( *decl ) + return gripe(ERC_SYNTAX_ERROR, L"Unparsed", decl); + + return TRUE; +} + + +static int +match_map(dtd *dtd, dtd_map *map, ocharbuf *buf) +{ wchar_t *data = buf->data.w; + wchar_t *e = data+buf->size-1; + ichar *m = map->from+map->len-1; + + while( m >= map->from ) + { if ( e < data ) + return 0; + + if ( *m == *e ) + { m--; + e--; + continue; + } + if ( *m == CHR_DBLANK ) + { if ( e>data && HasClass(dtd, *e, CH_WHITE) ) + e--; + else + return FALSE; + goto wblank; + } + if ( *m == CHR_BLANK ) + { wblank: + while( e>data && HasClass(dtd, *e, CH_WHITE) ) + e--; + m--; + continue; + } + return 0; + } + + return (int)(data+buf->size-1-e); +} + + +static int +match_shortref(dtd_parser *p) +{ dtd_map *map; + + for(map = p->map->map; map; map = map->next) + { int len; + + if ( (len=match_map(p->dtd, map, p->cdata)) ) + { p->cdata->size -= len; + + if ( p->cdata_must_be_empty ) + { int blank = TRUE; + const wchar_t *s; + int i; + + for(s = p->cdata->data.w, i=0; i++ < p->cdata->size; s++) + { if ( !iswspace(*s) ) + { blank = FALSE; + break; + } + } + + p->blank_cdata = blank; + } + + WITH_CLASS(p, EV_SHORTREF, + { sgml_cplocation(&p->startloc, &p->location); + p->startloc.charpos -= len; + p->startloc.linepos -= len; + if ( p->startloc.linepos < 0 ) + { p->startloc.line--; + p->startloc.linepos = 0; /* not correct! */ + } + DEBUG(printf("%d-%d: Matched map '%s' --> %s, len = %d\n", + p->startloc.charpos, + p->location.charpos, + map->from, map->to->name, len)); + + process_entity(p, map->to->name); + }) /* TBD: optimise */ + return TRUE; + } + } + + return FALSE; +} + + + /******************************* + * ELEMENTS * + *******************************/ + +static void +add_submodel(dtd_model *m, dtd_model *sub) +{ dtd_model **d; + + for( d = &m->content.group; *d; d = &(*d)->next ) + ; + *d = sub; +} + + +/* for_elements_in_model() + Walk along the model, calling f(e, closure) for any element found + in the model. Used for +*/ + +static void +for_elements_in_model(dtd_model *m, + void (*f)(dtd_element *e, void *closure), + void *closure) +{ switch(m->type) + { case MT_SEQ: + case MT_AND: + case MT_OR: + { dtd_model *sub = m->content.group; + + for(; sub; sub = sub->next) + for_elements_in_model(sub, f, closure); + break; + } + case MT_ELEMENT: + (*f)(m->content.element, closure); + break; + default: + ; + } +} + + +static void +free_model(dtd_model *m) +{ switch(m->type) + { case MT_SEQ: + case MT_AND: + case MT_OR: + { dtd_model *sub = m->content.group; + dtd_model *next; + + for(; sub; sub = next) + { next = sub->next; + + free_model(sub); + } + } + default: + ; + } + + sgml_free(m); +} + + +static dtd_model * +make_model(dtd *dtd, const ichar *decl, const ichar **end) +{ const ichar *s; + dtd_model *m = sgml_calloc(1, sizeof(*m)); + dtd_symbol *id; + + decl = iskip_layout(dtd, decl); + + if ( (s=isee_identifier(dtd, decl, "#pcdata")) ) + { m->type = MT_PCDATA; + m->cardinality = MC_ONE; /* actually don't care */ + *end = s; + return m; + } + + if ( (s=itake_name(dtd, decl, &id)) ) + { m->type = MT_ELEMENT; + m->content.element = find_element(dtd, id); + decl = s; + } else + { if ( !(s=isee_func(dtd, decl, CF_GRPO)) ) + { gripe(ERC_SYNTAX_ERROR, L"Name group expected", decl); + free_model(m); + return NULL; + } + decl = s; + + for(;;) + { dtd_model *sub; + modeltype mt; + + if ( !(sub = make_model(dtd, decl, &s)) ) + { free_model(sub); + return NULL; + } + decl = s; + add_submodel(m, sub); + + if ( (s = isee_func(dtd, decl, CF_OR)) ) + { decl = s; + mt = MT_OR; + } else if ( (s = isee_func(dtd, decl, CF_SEQ)) ) + { decl = s; + mt = MT_SEQ; + } else if ( (s = isee_func(dtd, decl, CF_AND)) ) + { decl = s; + mt = MT_AND; + } else if ( (s = isee_func(dtd, decl, CF_GRPC)) ) + { decl = s; + break; + } else + { gripe(ERC_SYNTAX_ERROR, L"Connector ('|', ',' or '&') expected", decl); + free_model(m); + return NULL; + } + decl = iskip_layout(dtd, decl); + + if ( m->type != mt ) + { if ( !m->type ) + m->type = mt; + else + { gripe(ERC_SYNTAX_ERROR, L"Different connector types in model", decl); + free_model(m); + return NULL; + } + } + } + } + + if ( (s = isee_func(dtd, decl, CF_OPT)) ) + { decl = s; + m->cardinality = MC_OPT; + } else if ( (s=isee_func(dtd, decl, CF_REP)) ) + { decl = s; + m->cardinality = MC_REP; + } else if ( (s=isee_func(dtd, decl, CF_PLUS)) ) + { /* ROK: watch out for (x) +(y) */ + if ( isee_func(dtd, iskip_layout(dtd, s), CF_GRPO) == NULL ) + { decl = s; + m->cardinality = MC_PLUS; + } + } else + m->cardinality = MC_ONE; + + if ( m->type == MT_UNDEF ) /* simplify (e+), etc. */ + { dtd_model *sub = m->content.group; + modelcard card; + + assert(!sub->next); + if ( sub->cardinality == MC_ONE ) + card = m->cardinality; + else if ( m->cardinality == MC_ONE ) + card = sub->cardinality; + else + { m->type = MT_OR; + goto out; + } + + *m = *sub; + m->cardinality = card; + sgml_free(sub); + } + +out: + *end = iskip_layout(dtd, decl); + return m; +} + + +static const ichar * +process_model(dtd *dtd, dtd_edef *e, const ichar *decl) +{ const ichar *s; + + decl = iskip_layout(dtd, decl); + if ( (s = isee_identifier(dtd, decl, "empty")) ) + { e->type = C_EMPTY; + return s; + } + if ( (s = isee_identifier(dtd, decl, "cdata")) ) + { e->type = C_CDATA; + return s; + } + if ( (s = isee_identifier(dtd, decl, "rcdata")) ) + { e->type = C_RCDATA; + return s; + } + if ( (s = isee_identifier(dtd, decl, "any")) ) + { e->type = C_ANY; + return s; + } + + e->type = C_PCDATA; + if ( !(e->content = make_model(dtd, decl, &decl)) ) + return FALSE; + + return decl; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +See a name-group separator. As long as we haven't decided, this can be +CF_NG. If we have decided they must all be the same. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static const ichar * +isee_ngsep(dtd *dtd, const ichar *decl, charfunc *sep) +{ const ichar *s; + + if ( (s=isee_func(dtd, decl, *sep)) ) + return iskip_layout(dtd, s); + if ( *sep == CF_NG ) /* undecided */ + { static const charfunc ng[] = { CF_SEQ, CF_OR, CF_AND }; + int n; + + for(n=0; n<3; n++) + { if ( (s=isee_func(dtd, decl, ng[n])) ) + { *sep = ng[n]; + return iskip_layout(dtd, s); + } + } + } + + return NULL; +} + + + +static const ichar * +itake_namegroup(dtd *dtd, const ichar *decl, + dtd_symbol **names, int *n) +{ const ichar *s; + int en = 0; + + if ( (s=isee_func(dtd, decl, CF_GRPO)) ) + { charfunc ngs = CF_NG; + + for(;;) + { if ( !(decl=itake_name(dtd, s, &names[en++])) ) + { gripe(ERC_SYNTAX_ERROR, L"Name expected", s); + return NULL; + } + if ( (s=isee_ngsep(dtd, decl, &ngs)) ) + { decl = iskip_layout(dtd, s); + continue; + } + if ( (s=isee_func(dtd, decl, CF_GRPC)) ) + { *n = en; + decl = s; + return iskip_layout(dtd, decl); + } + gripe(ERC_SYNTAX_ERROR, L"Bad name-group", decl); + return NULL; + } + } + + return NULL; +} + + +typedef struct +{ dtd_symbol **list; + int size; +} namelist; + + +static void +add_list_element(dtd_element *e, void *closure) +{ namelist *nl = closure; + + nl->list[nl->size++] = e->name; +} + + +static const ichar * +itake_el_or_model_element_list(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n) +{ const ichar *s; + + if ( isee_func(dtd, decl, CF_GRPO) ) + { dtd_model *model; + + if ( (model = make_model(dtd, decl, &s)) ) + { namelist nl; + + nl.list = names; + nl.size = 0; + for_elements_in_model(model, add_list_element, &nl); + free_model(model); + + *n = nl.size; + return s; + } else + return NULL; + } else + { if ( !(s = itake_name(dtd, decl, &names[0])) ) + { gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + return NULL; + } + *n = 1; + return s; + } +} + + +static void +add_element_list(dtd_element_list **l, dtd_element *e) +{ dtd_element_list *n = sgml_calloc(1, sizeof(*n)); + + n->value = e; + + for( ; *l; l = &(*l)->next ) + ; + *l = n; +} + + +static int +process_element_declaraction(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + ichar buf[MAXDECL]; + const ichar *s; + dtd_symbol *eid[MAXATTELEM]; + dtd_edef *def; + int en; + int i; + + /* expand parameter entities */ + if ( !expand_pentities(p, decl, ZERO_TERM_LEN, + buf, sizeof(buf)/sizeof(ichar)) ) + return FALSE; + decl = buf; + + if ( !(s=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) + return gripe(ERC_SYNTAX_ERROR, L"Name or name-group expected", decl); + decl = s; + if ( en == 0 ) + return TRUE; /* 0 elements */ + + STAT(edefs_decl++); + def = new_element_definition(dtd); + for(i=0; ielement->structure == NULL); + eid[i]->element->structure = def; + eid[i]->element->undefined = FALSE; + } + def->references = en; /* for GC */ + + /* omitted tag declarations (opt) */ + if ( (s = isee_identifier(dtd, decl, "-")) ) + { def->omit_close = FALSE; + goto seeclose; + } else if ( (s = isee_identifier(dtd, decl, "o")) ) + { def->omit_open = TRUE; + + seeclose: + decl = s; + if ( (s = isee_identifier(dtd, decl, "-")) ) + { def->omit_close = FALSE; + } else if ( (s = isee_identifier(dtd, decl, "o")) ) + { for(i=0; iomit_close = TRUE; + } else + return gripe(ERC_SYNTAX_ERROR, L"Bad omit-tag declaration", decl); + + decl = s; + } + + /* content model */ + if ( !(decl=process_model(dtd, def, decl)) ) + return FALSE; + + /* in/excluded elements */ + if ( decl[0] == '-' || decl[0] == '+' ) + { dtd_symbol *ng[MAXNAMEGROUP]; + int ns; + dtd_element_list **l; + + if ( decl[0] == '-' ) + l = &def->excluded; + else + l = &def->included; + + decl++; + if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) + { int i; + + decl = s; + + for(i=0; ivalue = s; + + for( ; *nl; nl = &(*nl)->next ) + ; + + *nl = n; +} + + +static void +set_element_properties(dtd_element *e, dtd_attr *a) +{ if ( istreq(a->name->name, L"xml:space") ) + { switch(a->def) + { case AT_FIXED: + case AT_DEFAULT: + break; + default: + return; + } + + switch (a->type ) + { case AT_NAMEOF: + case AT_NAME: + case AT_NMTOKEN: + e->space_mode = istr_to_space_mode(a->att_def.name->name); + break; + case AT_CDATA: + e->space_mode = istr_to_space_mode((ichar *)a->att_def.cdata); + break; + default: + break; + } + } +} + + +static void +add_attribute(dtd *dtd, dtd_element *e, dtd_attr *a) +{ dtd_attr_list **l; + dtd_attr_list *n; + + for(l = &e->attributes; *l; l = &(*l)->next) + { if ( (*l)->attribute->name == a->name ) + { gripe(ERC_REDEFINED, L"attribute", a->name); + a->references++; /* attempt to redefine attribute: */ + free_attribute(a); /* first wins according to standard */ + + return; + } + } + + n = sgml_calloc(1, sizeof(*n)); + + n->attribute = a; + a->references++; + *l = n; + set_element_properties(e, a); +} + + +static int +process_attlist_declaraction(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + dtd_symbol *eid[MAXATTELEM]; + int i, en; + ichar buf[MAXDECL]; + const ichar *s; + + /* expand parameter entities */ + if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) + return FALSE; + decl = iskip_layout(dtd, buf); + DEBUG(printf("Expanded to %s\n", decl)); + + if ( !(decl=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) + return FALSE; + + /* fetch attributes */ + while(*decl) + { dtd_attr *at = sgml_calloc(1, sizeof(*at)); + at->references = REFS_VIRGIN; + + /* name of attribute */ + if ( !(s = itake_name(dtd, decl, &at->name)) ) + { free_attribute(at); + return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + } + decl = s; + + /* (name1|name2|...) type */ + if ( (s=isee_func(dtd, decl, CF_GRPO)) ) + { charfunc ngs = CF_NG; + + at->type = AT_NAMEOF; + decl=s; + + for(;;) + { dtd_symbol *nm; + + if ( !(s = itake_nmtoken(dtd, decl, &nm)) ) + { free_attribute(at); + return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + } + decl = s; + add_name_list(&at->typeex.nameof, nm); + if ( (s=isee_ngsep(dtd, decl, &ngs)) ) + { decl = s; + continue; + } + if ( (s = isee_func(dtd, decl, CF_GRPC)) ) + { decl=s; + decl = iskip_layout(dtd, decl); + break; + } + free_attribute(at); + return gripe(ERC_SYNTAX_ERROR, L"Illegal name-group", decl); + } + } else if ( (s=isee_identifier(dtd, decl, "cdata")) ) + { decl = s; + at->type = AT_CDATA; + } else if ( (s=isee_identifier(dtd, decl, "entity")) ) + { decl = s; + at->type = AT_ENTITY; + } else if ( (s=isee_identifier(dtd, decl, "entities")) ) + { decl = s; + at->type = AT_ENTITIES; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "id")) ) + { decl = s; + at->type = AT_ID; + } else if ( (s=isee_identifier(dtd, decl, "idref")) ) + { decl = s; + at->type = AT_IDREF; + } else if ( (s=isee_identifier(dtd, decl, "idrefs")) ) + { decl = s; + at->type = AT_IDREFS; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "name")) ) + { decl = s; + at->type = AT_NAME; + } else if ( (s=isee_identifier(dtd, decl, "names")) ) + { decl = s; + at->type = AT_NAMES; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "nmtoken")) ) + { decl = s; + at->type = AT_NMTOKEN; + } else if ( (s=isee_identifier(dtd, decl, "nmtokens")) ) + { decl = s; + at->type = AT_NMTOKENS; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "number")) ) + { decl = s; + at->type = AT_NUMBER; + } else if ( (s=isee_identifier(dtd, decl, "numbers")) ) + { decl = s; + at->type = AT_NUMBERS; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "nutoken")) ) + { decl = s; + at->type = AT_NUTOKEN; + } else if ( (s=isee_identifier(dtd, decl, "nutokens")) ) + { decl = s; + at->type = AT_NUTOKENS; + at->islist = TRUE; + } else if ( (s=isee_identifier(dtd, decl, "notation")) ) + { dtd_symbol *ng[MAXNAMEGROUP]; + int ns; + + at->type = AT_NOTATION; + decl=s; + if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) + { decl = s; + + for(i=0; itypeex.nameof, ng[i]); + } else + { free_attribute(at); + return gripe(ERC_SYNTAX_ERROR, L"name-group expected", decl); + } + } else + { free_attribute(at); + return gripe(ERC_SYNTAX_ERROR, L"Attribute-type expected", decl); + } + + /* Attribute Defaults */ + if ( (s=isee_identifier(dtd, decl, "#fixed")) ) + { decl = s; + at->def = AT_FIXED; + } else if ( (s=isee_identifier(dtd, decl, "#required")) ) + { decl = s; + at->def = AT_REQUIRED; + } else if ( (s=isee_identifier(dtd, decl, "#current")) ) + { decl = s; + at->def = AT_CURRENT; + } else if ( (s=isee_identifier(dtd, decl, "#conref")) ) + { decl = s; + at->def = AT_CONREF; + } else if ( (s=isee_identifier(dtd, decl, "#implied")) ) + { decl = s; + at->def = AT_IMPLIED; + } else /* real default */ + at->def = AT_DEFAULT; + + if ( at->def == AT_DEFAULT || at->def == AT_FIXED ) + { ichar buf[MAXSTRINGLEN]; + ichar *start; int len; + const ichar *end; + + if ( !(end=itake_string(dtd, decl, &start, &len)) ) + { end=itake_nmtoken_chars(dtd, decl, buf, sizeof(buf)/sizeof(ichar)); + start = buf; + len = (int)istrlen(buf); + } + if ( !end ) + return gripe(ERC_SYNTAX_ERROR, L"Bad attribute default", decl); + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Note: itake_name(), etc. work on nul-terminated strings. The result of +itake_string() is a pointer in a nul-terminated string and these +functions will stop scanning at the quote anyway, so we can use the +length of the parsed data to verify we parsed all of it. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + + switch(at->type) + { case AT_CDATA: + { at->att_def.cdata = istrndup(start, len); + break; + } + case AT_ENTITY: + case AT_NOTATION: + case AT_NAME: + { if ( !(s=itake_name(dtd, start, &at->att_def.name)) || + (s-start) != len ) + return gripe(ERC_DOMAIN, L"name", decl); + break; + } + case AT_NMTOKEN: + case AT_NAMEOF: + { if ( !(s=itake_nmtoken(dtd, start, &at->att_def.name)) || + (s-start) != len ) + return gripe(ERC_DOMAIN, L"nmtoken", decl); + break; + } + case AT_NUTOKEN: + { if ( !(s=itake_nutoken(dtd, start, &at->att_def.name)) || + (s-start) != len ) + return gripe(ERC_DOMAIN, L"nutoken", decl); + break; + } + case AT_NUMBER: + { if ( !(s=itake_number(dtd, start, at)) || + (s-start) != len ) + return gripe(ERC_DOMAIN, L"number", decl); + break; + } + case AT_NAMES: + case AT_ENTITIES: + case AT_IDREFS: + case AT_NMTOKENS: + case AT_NUMBERS: + case AT_NUTOKENS: + { at->att_def.list = istrndup(buf, len); + break; + } + default: + { free_attribute(at); + return gripe(ERC_REPRESENTATION, L"No default for type"); + } + } + + decl = end; + } + + /* add to list */ + at->references = 0; + for(i=0; iparent) + { if ( env->element->structure ) + { dtd_edef *def = env->element->structure; + dtd_element_list *el; + + for(el=def->excluded; el; el=el->next) + { if ( el->value == e ) + return IE_EXCLUDED; + } + for(el=def->included; el; el=el->next) + { if ( el->value == e ) + return IE_INCLUDED; + } + } + } + + return IE_NORMAL; +} + + +static int +complete(sgml_environment *env) +{ if ( env->element->structure && + !env->element->undefined && + env->element->structure->type != C_ANY ) + { dtd_edef *def = env->element->structure; + + if ( !same_state(def->final_state, env->state) ) + return FALSE; + } + + return TRUE; +} + + +static void +validate_completeness(sgml_environment *env) +{ if ( !complete(env) ) + { wchar_t buf[MAXNMLEN+50]; + + swprintf(buf, MAXNMLEN+50, L"Incomplete element: <%s>", + env->element->name->name); + + gripe(ERC_VALIDATE, buf); /* TBD: expected */ + } +} + + +static sgml_environment * +push_element(dtd_parser *p, dtd_element *e, int callback) +{ if ( e != CDATA_ELEMENT ) + { sgml_environment *env = sgml_calloc(1, sizeof(*env)); + + emit_cdata(p, FALSE); + + env->element = e; + env->state = make_state_engine(e); + env->space_mode = (p->environments ? p->environments->space_mode + : p->dtd->space_mode); + env->parent = p->environments; + p->environments = env; + + if ( p->dtd->shorttag ) + { env->saved_waiting_for_net = p->waiting_for_net; + + if ( p->event_class == EV_SHORTTAG ) + { p->waiting_for_net = TRUE; + env->wants_net = TRUE; + } else + { env->wants_net = FALSE; + if ( e->structure && e->structure->omit_close == FALSE ) + p->waiting_for_net = FALSE; + } + } + + if ( e->map ) + p->map = env->map = e->map; + else if ( env->parent ) + p->map = env->map = env->parent->map; + + p->first = TRUE; + if ( callback && p->on_begin_element ) + { sgml_attribute atts[MAXATTRIBUTES]; + int natts = 0; + + if ( !(p->flags & SGML_PARSER_NODEFS) ) + natts = add_default_attributes(p, e, natts, atts); + + (*p->on_begin_element)(p, e, natts, atts); + } + + if ( e->structure ) + { if ( e->structure->type == C_CDATA || + e->structure->type == C_RCDATA ) + { p->state = (e->structure->type == C_CDATA ? S_CDATA : S_RCDATA); + p->cdata_state = p->state; + p->etag = e->name->name; + p->etaglen = (int)istrlen(p->etag); + sgml_cplocation(&p->startcdata, &p->location); + } else + p->cdata_state = S_PCDATA; + } + } + + return p->environments; +} + + +static void +free_environment(sgml_environment *env) +{ +#ifdef XMLNS + if ( env->xmlns ) + xmlns_free(env); +#endif + + sgml_free(env); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Pop the stack, closing all environment uptil `to'. The close was +initiated by pushing the element `e'. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +pop_to(dtd_parser *p, sgml_environment *to, dtd_element *e0) +{ sgml_environment *env, *parent; + + for(env = p->environments; env != to; env=parent) + { dtd_element *e = env->element; + + validate_completeness(env); + parent = env->parent; + + if ( e->structure && !e->structure->omit_close ) + gripe(ERC_OMITTED_CLOSE, e->name->name); + + if ( e0 != CDATA_ELEMENT ) + emit_cdata(p, TRUE); + + p->first = FALSE; + p->environments = env; + if ( p->dtd->shorttag ) + p->waiting_for_net = env->saved_waiting_for_net; + + WITH_CLASS(p, EV_OMITTED, + if ( p->on_end_element ) + (*p->on_end_element)(p, e)); + free_environment(env); + } + p->environments = to; + p->map = to->map; + + return TRUE; +} + + +static void +allow_for(dtd_element *in, dtd_element *e) +{ dtd_edef *def = in->structure; + dtd_model *g; + + if ( def->type == C_EMPTY ) + { def->type = C_PCDATA; + def->content = sgml_calloc(1, sizeof(*def->content)); + def->content->type = MT_OR; + def->content->cardinality = MC_REP; + } + assert(def->content->type == MT_OR); + + g = def->content->content.group; + + if ( e == CDATA_ELEMENT ) + { dtd_model *m; + + for(; g; g = g->next) + { if ( g->type == MT_PCDATA ) + return; + } + m = sgml_calloc(1, sizeof(*m)); + m->type = MT_PCDATA; + m->cardinality = MC_ONE; /* ignored */ + add_submodel(def->content, m); + } else + { dtd_model *m; + + for(; g; g = g->next) + { if ( g->type == MT_ELEMENT && g->content.element == e ) + return; + } + m = sgml_calloc(1, sizeof(*m)); + m->type = MT_ELEMENT; + m->cardinality = MC_ONE; /* ignored */ + m->content.element = e; + add_submodel(def->content, m); + } +} + + + +static int +open_element(dtd_parser *p, dtd_element *e, int warn) +{ if ( !p->environments && p->enforce_outer_element ) + { dtd_element *f = p->enforce_outer_element->element; + + if ( f && f != e ) + { if ( !f->structure || + !f->structure->omit_open ) + gripe(ERC_OMITTED_OPEN, f->name->name); + + WITH_CLASS(p, EV_OMITTED, + { open_element(p, f, TRUE); + if ( p->on_begin_element ) + { sgml_attribute atts[MAXATTRIBUTES]; + int natts = 0; + + if ( !(p->flags & SGML_PARSER_NODEFS) ) + natts = add_default_attributes(p, f, natts, atts); + + (*p->on_begin_element)(p, f, natts, atts); + } + }); + } + } + + /* no DTD available yet */ + if ( !p->environments && !p->dtd->doctype && e != CDATA_ELEMENT ) + { const ichar *file; + + file = find_in_catalogue(CAT_DOCTYPE, e->name->name, NULL, NULL, + p->dtd->dialect != DL_SGML); + if ( file ) + { dtd_parser *clone = clone_dtd_parser(p); + + gripe(ERC_NO_DOCTYPE, e->name->name, file); + + if ( load_dtd_from_file(clone, file) ) + p->dtd->doctype = istrdup(e->name->name); + else + gripe(ERC_EXISTENCE, L"file", file); + + free_dtd_parser(clone); + } + } + + if ( p->environments ) + { sgml_environment *env = p->environments; + + if ( env->element->undefined ) + { allow_for(env->element, e); /* */ + push_element(p, e, FALSE); + return TRUE; + } + + if ( env->element->structure && + env->element->structure->type == C_ANY ) + { if ( e != CDATA_ELEMENT && e->undefined ) + gripe(ERC_EXISTENCE, L"Element", e->name->name); + push_element(p, e, FALSE); + return TRUE; + } + + switch(in_or_excluded(env, e)) + { case IE_INCLUDED: + push_element(p, e, FALSE); + return TRUE; + case IE_EXCLUDED: + if ( warn ) + gripe(ERC_NOT_ALLOWED, e->name->name); + /*FALLTHROUGH*/ + case IE_NORMAL: + for(; env; env=env->parent) + { dtd_state *new; + + if ( (new = make_dtd_transition(env->state, e)) ) + { env->state = new; + pop_to(p, env, e); + push_element(p, e, FALSE); + return TRUE; + } else + { dtd_element *oe[MAXOMITTED]; /* omitted open */ + int olen; + int i; + + if ( (olen=find_omitted_path(env->state, e, oe)) > 0 ) + { pop_to(p, env, e); + WITH_CLASS(p, EV_OMITTED, + for(i=0; istate = make_dtd_transition(env->state, oe[i]); + env = push_element(p, oe[i], TRUE); + }) + env->state = make_dtd_transition(env->state, e); + push_element(p, e, FALSE); + return TRUE; + } + } + + if ( !env->element->structure || + !env->element->structure->omit_close ) + break; + } + } + + if ( warn ) + { if ( e == CDATA_ELEMENT ) + gripe(ERC_VALIDATE, L"#PCDATA not allowed here"); + else if ( e->undefined ) + gripe(ERC_EXISTENCE, L"Element", e->name->name); + else + gripe(ERC_NOT_ALLOWED, e->name->name); + } + } + + if ( warn ) + { push_element(p, e, FALSE); + return TRUE; + } else + return FALSE; +} + + +static int +close_element(dtd_parser *p, dtd_element *e, int conref) +{ sgml_environment *env; + + for(env = p->environments; env; env=env->parent) + { if ( env->element == e ) /* element is open */ + { sgml_environment *parent; + + for(env = p->environments; ; env=parent) + { dtd_element *ce = env->element; + + if ( !(conref && env == p->environments) ) + validate_completeness(env); + parent = env->parent; + + p->first = FALSE; + if ( p->on_end_element ) + (*p->on_end_element)(p, env->element); + free_environment(env); + p->environments = parent; + + if ( ce == e ) /* closing current element */ + { p->map = (parent ? parent->map : NULL); + return TRUE; + } else /* omited close */ + { if ( ce->structure && !ce->structure->omit_close ) + gripe(ERC_OMITTED_CLOSE, ce->name->name); + } + } + } + } + + return gripe(ERC_NOT_OPEN, e->name->name); +} + + +static int +close_current_element(dtd_parser *p) +{ if ( p->environments ) + { dtd_element *e = p->environments->element; + + emit_cdata(p, TRUE); + return close_element(p, e, FALSE); + } + + return gripe(ERC_SYNTAX_ERROR, L"No element to close", ""); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +get_attribute_value() + +Get the value for an attribute. Once I thought this was simple, but +Richard O'Keefe pointed to the complex handling of white-space in SGML +attributes. Basically, if the attribute is quoted, we need: + + * If CDATA, map all blank to space characters, then expand + entities + + * If !CDATA expand all entities, canonise white space by + deleting leading and trailing space and squishing multiple + space characters to a single (lower for us) case. + +This almost, but not completely matches the XML definition. This however +is so complex we will ignore it for now. + +[Rewritten by Richard O'Keefe with these addional comments] +Reads a value, the attribute name and value indicator having been +processed already. It calls itake_string() to read quoted values, and +itake_unquoted() to read unquoted values. + +itake_string(dtd, in, buf, size) + - skips layout INCLUDING comments, + - returns NULL if the next character is not ' or ", + - copies characters from in to buf until a matching ' or " is found, + - adds a terminating \0, + - skips more layout INCLUDING comments, and + - returns the new input position. +It is quite wrong to skip leading comments here. In the tag + + + +the characters "--ugh--" *are the value*. They are not a comment. +Comments are not in fact allowed inside tags, unfortunately. +This tag is equivalent to + + + +where something is an attribute that has zoo as one of its enumerals. + +Because itake_string() is called in many other places, this bug has +not yet been fixed. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static ichar const * +get_attribute_value(dtd_parser *p, ichar const *decl, sgml_attribute *att) +{ ichar tmp[MAXSTRINGLEN]; + ichar *buf = tmp; + ichar const *s; + ichar c; + dtd *dtd = p->dtd; + ichar const *end; + ichar *start; int len; + + enum + { DIG_FIRST = 8, /* any token start with digit? */ + NAM_FIRST = 4, /* any token start with non-digit name char? */ + NAM_LATER = 2, /* any token have non-digit name char later? */ + ANY_OTHER = 1, /* any token have illegal character? */ + YET_EMPTY = 0 + } + token = YET_EMPTY; + + att->value.textW = NULL; /* UCS text */ + att->value.number = 0; + att->flags = 0; + + end = itake_string(dtd, decl, &start, &len); + + if ( end != NULL ) + { ocharbuf out; + + init_ocharbuf(&out); + expand_entities(p, start, len, &out); + + if ( att->definition->type == AT_CDATA ) + { malloc_ocharbuf(&out); + + att->value.number = out.size; + att->value.textW = out.data.w; + + return end; + } else + { ichar *d; + + buf = out.data.w; + + /* canonicalise blanks */ + s = buf; + while ((c = *s++) != '\0' && HasClass(dtd, c, CH_BLANK)) + ; + d = buf; + while ( c != '\0' ) + { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST + : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER; + if ( d != buf ) + *d++ = ' '; + if ( dtd->case_sensitive ) + { *d++ = c; + while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) + { token |= HasClass(dtd, c, CH_DIGIT) ? 0 + : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; + *d++ = c; + } + } else + { *d++ = towlower(c); + while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) + { token |= HasClass(dtd, c, CH_DIGIT) ? 0 + : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; + *d++ = towlower(c); + } + } + while (c != '\0' && HasClass(dtd, c, CH_BLANK)) + c = *s++; + } + *d = '\0'; + } + } else + { end = itake_unquoted(dtd, decl, tmp, sizeof(tmp)/sizeof(ichar)); + if (end == NULL) + return NULL; + + s = buf; + c = *s++; + if (c != '\0') + { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST + : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER; + while ((c = *s++) != 0) + { token |= HasClass(dtd, c, CH_DIGIT) ? 0 + : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; + } + } + if ( token == YET_EMPTY || (token & ANY_OTHER) != 0) + gripe(ERC_SYNTAX_WARNING, L"Attribute value requires quotes", buf); + + if (!dtd->case_sensitive && att->definition->type != AT_CDATA) + istrlower(buf); + } + + switch (att->definition->type) + { case AT_NUMBER: /* number */ + if (token != DIG_FIRST) + { gripe(ERC_SYNTAX_WARNING, L"NUMBER expected", decl); + } else if (dtd->number_mode == NU_INTEGER) + { (void) istrtol(buf, &att->value.number); + } else + { att->value.textW = istrdup(buf); + att->value.number = (long)istrlen(buf); + } + return end; + case AT_CDATA: /* CDATA attribute */ + att->value.textW = istrdup(buf); + att->value.number = (long)istrlen(buf); + return end; + case AT_ID: /* identifier */ + case AT_IDREF: /* identifier reference */ + case AT_NAME: /* name token */ + case AT_NOTATION: /* notation-name */ + if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"NAME expected", decl); + break; + case AT_NAMEOF: /* one of these names */ + case AT_NMTOKEN: /* name-token */ + if (token == YET_EMPTY || (token & ANY_OTHER) != 0) + gripe(ERC_SYNTAX_WARNING, L"NMTOKEN expected", decl); + if ( att->definition->type == AT_NAMEOF ) + { dtd_name_list *nl; + + for(nl=att->definition->typeex.nameof; nl; nl = nl->next) + { if ( istreq(nl->value->name, buf) ) + goto passed; + } + gripe(ERC_SYNTAX_WARNING, L"unexpected value", decl); + } + break; + case AT_NUTOKEN: /* number token */ + if ((token & (NAM_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"NUTOKEN expected", decl); + break; + case AT_ENTITY: /* entity-name */ + if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"entity NAME expected", decl); + break; + case AT_NAMES: /* list of names */ + case AT_IDREFS: /* list of identifier references */ + if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"NAMES expected", decl); + break; + case AT_ENTITIES: /* entity-name list */ + if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"entity NAMES expected", decl); + break; + case AT_NMTOKENS: /* name-token list */ + if (token == YET_EMPTY || (token & ANY_OTHER) != 0) + gripe(ERC_SYNTAX_WARNING, L"NMTOKENS expected", decl); + break; + case AT_NUMBERS: /* number list */ + if (token != DIG_FIRST) + gripe(ERC_SYNTAX_WARNING, L"NUMBERS expected", decl); + break; + case AT_NUTOKENS: + if ((token & (NAM_FIRST | ANY_OTHER)) != 0) + gripe(ERC_SYNTAX_WARNING, L"NUTOKENS expected", decl); + break; + default: + assert(0); + return NULL; + } + +passed: + att->value.textW = istrdup(buf); /* TBD: more validation */ + att->value.number = (long)istrlen(buf); + return end; +} + + +static const ichar * +process_attributes(dtd_parser *p, dtd_element *e, const ichar *decl, + sgml_attribute *atts, int *argc) +{ int attn = 0; + dtd *dtd = p->dtd; + + decl = iskip_layout(dtd, decl); + while(decl && *decl) + { dtd_symbol *nm; + const ichar *s; + + if ( (s=itake_nmtoken(dtd, decl, &nm)) ) + { decl = s; + + if ( (s=isee_func(dtd, decl, CF_VI)) ) /* name= */ + { dtd_attr *a; + + if ( !HasClass(dtd, nm->name[0], CH_NMSTART) ) + gripe(ERC_SYNTAX_WARNING, + "Illegal start of attribute-name", decl); + + decl = s; + if ( !(a=find_attribute(e, nm)) ) + { a = sgml_calloc(1, sizeof(*a)); + + a->name = nm; + a->type = AT_CDATA; + a->def = AT_IMPLIED; + add_attribute(dtd, e, a); + + if ( !e->undefined && + !(dtd->dialect != DL_SGML && + (istreq(L"xmlns", nm->name) || + istrprefix(L"xmlns:", nm->name))) ) + gripe(ERC_NO_ATTRIBUTE, e->name->name, nm->name); + } + atts[attn].definition = a; + if ( (decl=get_attribute_value(p, decl, atts+attn)) ) + { attn++; + continue; + } + } else if ( e->structure ) + { dtd_attr_list *al; /* value shorthand */ + + for(al=e->attributes; al; al=al->next) + { dtd_attr *a = al->attribute; + + if ( a->type == AT_NAMEOF || a->type == AT_NOTATION ) + { dtd_name_list *nl; + + for(nl=a->typeex.nameof; nl; nl = nl->next) + { if ( nl->value == nm ) + { if ( dtd->dialect != DL_SGML ) + gripe(ERC_SYNTAX_WARNING, + "Value short-hand in XML mode", decl); + atts[attn].flags = 0; + atts[attn].definition = a; + atts[attn].value.textW = istrdup(nm->name); + atts[attn].value.number = (long)istrlen(nm->name); + attn++; + goto next; + } + } + } + } + gripe(ERC_NO_ATTRIBUTE_VALUE, e->name->name, nm->name); + decl = s; + } else + { gripe(ERC_SYNTAX_ERROR, L"Bad attribute", decl); + decl = s; + } + } else + { *argc = attn; + return decl; + } + + next: + ; + } + + *argc = attn; + return decl; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +sgml_add_default_attributes() + +This function adds attributes for omitted default and fixed attributes. +These attributes are added to the end of the attribute list. This +function returns the new number of attributes. The `atts' array is +assumed to be MAXATTRIBUTES long, normally passed from +process_begin_element. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +add_default_attributes(dtd_parser *p, dtd_element *e, + int natts, sgml_attribute *atts) +{ dtd_attr_list *al; + + if ( e == CDATA_ELEMENT ) + return natts; + + for(al=e->attributes; al; al=al->next) + { dtd_attr *a = al->attribute; + + switch(a->def) + { case AT_REQUIRED: /* TBD: check if present */ + case AT_CURRENT: /* TBD: register in DTD and reuse */ + case AT_CONREF: + case AT_IMPLIED: + goto next; + case AT_FIXED: + case AT_DEFAULT: + { int i; + sgml_attribute *ap; + + for(i=0, ap=atts; idefinition == a ) + goto next; + } + + ap->definition = a; + ap->value.textW = NULL; + ap->value.number = 0; + ap->flags = SGML_AT_DEFAULT; + + switch(a->type) + { case AT_CDATA: + ap->value.textW = a->att_def.cdata; + ap->value.number = (long)istrlen(ap->value.textW); + break; + case AT_NUMBER: + if ( p->dtd->number_mode == NU_TOKEN ) + { ap->value.textW = (ichar*)a->att_def.name->name; + ap->value.number = (long)istrlen(ap->value.textW); + } else + { ap->value.number = a->att_def.number; + } + break; + default: + if ( a->islist ) + { ap->value.textW = a->att_def.list; + } else + { ap->value.textW = (ichar*)a->att_def.name->name; + } + ap->value.number = (long)istrlen(ap->value.textW); + } + + natts++; + } + } + next:; + } + + return natts; +} + + +static void +free_attribute_values(int argc, sgml_attribute *argv) +{ int i; + + for(i=0; iflags & SGML_AT_DEFAULT) ) + continue; /* shared with the DTD */ + + if ( argv->value.textW ) + sgml_free(argv->value.textW); + } +} + + +static int +process_begin_element(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + dtd_symbol *id; + const ichar *s; + + if ( (s=itake_name(dtd, decl, &id)) ) + { sgml_attribute atts[MAXATTRIBUTES]; + int natts; + dtd_element *e = find_element(dtd, id); + int empty = FALSE; + int conref = FALSE; + + if ( !e->structure ) + { dtd_edef *def; + e->undefined = TRUE; + STAT(edefs_implicit++); + def_element(dtd, id); + def = e->structure; + def->type = C_EMPTY; + } + + open_element(p, e, TRUE); + + decl=s; + if ( (s=process_attributes(p, e, decl, atts, &natts)) ) + decl=s; + + if ( dtd->dialect != DL_SGML ) + { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) + { empty = TRUE; /* XML */ + decl = s; + } +#ifdef XMLNS + if ( dtd->dialect == DL_XMLNS ) + update_xmlns(p, e, natts, atts); +#endif + if ( dtd->dialect != DL_SGML ) + update_space_mode(p, e, natts, atts); + } else + { int i; + + for(i=0; idef == AT_CONREF ) + { empty = TRUE; + conref = TRUE; + } + } + } + if ( *decl ) + gripe(ERC_SYNTAX_ERROR, L"Bad attribute list", decl); + + if ( !(p->flags & SGML_PARSER_NODEFS) ) + natts = add_default_attributes(p, e, natts, atts); + + if ( empty || + (dtd->dialect == DL_SGML && + e->structure && + e->structure->type == C_EMPTY && + !e->undefined) ) + p->empty_element = e; + else + p->empty_element = NULL; + + if ( p->on_begin_element ) + (*p->on_begin_element)(p, e, natts, atts); + + free_attribute_values(natts, atts); + + if ( p->empty_element ) + { p->empty_element = NULL; + close_element(p, e, conref); + if ( conref ) /* might be S_CDATA due to declared content */ + p->cdata_state = p->state = S_PCDATA; + } + + return TRUE; + } + + return gripe(ERC_SYNTAX_ERROR, L"Bad open-element tag", decl); +} + + +static int +process_end_element(dtd_parser *p, const ichar *decl) +{ dtd *dtd = p->dtd; + dtd_symbol *id; + const ichar *s; + + emit_cdata(p, TRUE); + if ( (s=itake_name(dtd, decl, &id)) && *s == '\0' ) + return close_element(p, find_element(dtd, id), FALSE); + + if ( p->dtd->shorttag && *decl == '\0' ) /* : close current element */ + return close_current_element(p); + + return gripe(ERC_SYNTAX_ERROR, L"Bad close-element tag", decl); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +process_net(dtd_parser *p) + We've seen a / of a shorttag element. Close this one. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +process_net(dtd_parser *p) +{ sgml_environment *env; + + prepare_cdata(p); + for(env = p->environments; env; env=env->parent) + { if ( env->wants_net ) + { sgml_environment *parent; + + pop_to(p, env, NULL); /* close parents */ + validate_completeness(env); + parent = env->parent; + + emit_cdata(p, TRUE); + p->first = FALSE; + + if ( p->on_end_element ) + { WITH_CLASS(p, EV_SHORTTAG, + (*p->on_end_element)(p, env->element)); + } + + free_environment(env); + p->environments = parent; + p->map = (parent ? parent->map : NULL); + + return TRUE; + } + } + + return FALSE; +} + + +static int /* */ +process_doctype(dtd_parser *p, const ichar *decl, const ichar *decl0) +{ dtd *dtd = p->dtd; + dtd_symbol *id; + const ichar *s; + dtd_entity *et = NULL; + + if ( !(s=itake_name(dtd, decl, &id)) ) + return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); + decl = s; + + if ( (s=isee_identifier(dtd, decl, "system")) ) + { et = sgml_calloc(1, sizeof(*et)); + et->type = ET_SYSTEM; + decl = s; + } else if ( (s=isee_identifier(dtd, decl, "public")) ) + { et = sgml_calloc(1, sizeof(*et)); + et->type = ET_PUBLIC; + decl = s; + } else if ( isee_func(dtd, decl, CF_DSO) ) + goto local; + + if ( et ) + { et->name = id; + et->catalog_location = CAT_DOCTYPE; + if ( !(s=process_entity_value_declaration(p, decl, et)) ) + return FALSE; + decl = s; + } + + if ( !dtd->doctype ) /* i.e. anonymous DTD */ + { ichar *file; + dtd_parser *clone; + + dtd->doctype = istrdup(id->name); /* Fill it */ + if ( et ) + file = entity_file(dtd, et); + else + file = istrdup(find_in_catalogue(CAT_DOCTYPE, + dtd->doctype, NULL, NULL, + dtd->dialect != DL_SGML)); + + if ( !file ) + { gripe(ERC_EXISTENCE, L"DTD", dtd->doctype); + } else + { clone = clone_dtd_parser(p); + if ( !load_dtd_from_file(clone, file) ) + gripe(ERC_EXISTENCE, L"file", file); + free_dtd_parser(clone); + sgml_free(file); + } + } + + if ( et ) + free_entity_list(et); + +local: + if ( (s=isee_func(dtd, decl, CF_DSO)) ) /* [...] */ + { int grouplevel = 1; + data_mode oldmode = p->dmode; + dtdstate oldstate = p->state; + locbuf oldloc; + const ichar *q; + icharbuf *saved_ibuf = p->buffer; + + push_location(p, &oldloc); + /* try to find start-location. */ + /* fails if there is comment before */ + /* the []! */ + sgml_cplocation(&p->location, &p->startloc); + inc_location(&p->location, '<'); + for(q=decl0; q < s; q++) + inc_location(&p->location, *q); + p->dmode = DM_DTD; + p->state = S_PCDATA; + p->buffer = new_icharbuf(); + + for( ; *s; s++ ) + { if ( isee_func(dtd, s, CF_LIT) || /* skip quoted strings */ + isee_func(dtd, s, CF_LITA) ) + { ichar q = *s; + + putchar_dtd_parser(p, *s++); /* pass open quote */ + + for( ; *s && *s != q; s++ ) + putchar_dtd_parser(p, *s); + + if ( *s == q ) /* pass closing quote */ + putchar_dtd_parser(p, *s); + continue; + } + + if ( isee_func(dtd, s, CF_DSO) ) + grouplevel++; + else if ( isee_func(dtd, s, CF_DSC) && --grouplevel == 0 ) + break; + putchar_dtd_parser(p, *s); + } + p->dtd->implicit = FALSE; + + p->state = oldstate; + p->dmode = oldmode; + free_icharbuf(p->buffer); + p->buffer = saved_ibuf; + pop_location(p, &oldloc); + } + + p->enforce_outer_element = id; /* make this the outer element */ + + return TRUE; +} + + +static void +init_decoding(dtd_parser *p) +{ +#ifdef UTF8 + int decode; + dtd *dtd = p->dtd; + + if ( dtd->encoding == SGML_ENC_UTF8 && + p->encoded == TRUE ) + decode = TRUE; + else + decode = FALSE; + + if ( p->utf8_decode != decode ) + { DEBUG(fprintf(stderr, "%s UTF-8 decoding on %p\n", + decode ? "Enable" : "Disable", + p)); + + p->utf8_decode = decode; + } +#endif +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +xml_set_encoding() is the public interface to set the encoding for the +parser. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int /* strcasecmp() with C locale */ +posix_strcasecmp(const char *s1, const char *s2) +{ for(; *s1 && *s2; s1++, s2++) + { int c1 = *s1&0xff; + int c2 = *s2&0xff; + + if ( c1 >= 'A' && c1 <= 'Z' ) c1 += 'a'-'A'; + if ( c2 >= 'A' && c2 <= 'Z' ) c2 += 'a'-'A'; + + if ( c1 != c2 ) + return c1-c2; + } + + return *s1 - *s2; +} + + +int +xml_set_encoding(dtd_parser *p, const char *enc) +{ dtd *dtd = p->dtd; + + if ( posix_strcasecmp(enc, "iso-8859-1") == 0 ) + { dtd->encoding = SGML_ENC_ISO_LATIN1; + } else if ( posix_strcasecmp(enc, "us-ascii") == 0 ) + { dtd->encoding = SGML_ENC_ISO_LATIN1; /* doesn't make a difference */ + } else if ( posix_strcasecmp(enc, "utf-8") == 0 ) + { dtd->encoding = SGML_ENC_UTF8; + } else + return FALSE; + + init_decoding(p); + return TRUE; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +set_encoding() sets the encoding from the encoding="..." field of the +XML header. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static void +set_encoding(dtd_parser *p, const ichar *enc) +{ char buf[32]; + char *e = buf+sizeof(buf)-1; + char *o; + const ichar *i; + + for(i=enc, o=buf; *i; ) + { if ( *i < 128 && o < e ) + { *o++ = (char)*i++; + } else + { goto error; + } + } + *o = '\0'; + + if ( !xml_set_encoding(p, buf) ) + { error: + gripe(ERC_EXISTENCE, L"character encoding", enc); + } +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Process + +Should deal with character encoding for XML documents. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +process_pi(dtd_parser *p, const ichar *decl) +{ const ichar *s; + dtd *dtd = p->dtd; + + if ( (s=isee_identifier(dtd, decl, "xml")) ) /* */ + { decl = s; + + switch(dtd->dialect) + { case DL_SGML: + set_dialect_dtd(dtd, DL_XML); + break; + case DL_XML: + case DL_XMLNS: + break; + } + + while(*decl) + { dtd_symbol *nm; + + if ( (s=itake_name(dtd, decl, &nm)) && + (s=isee_func(dtd, s, CF_VI)) ) /* = */ + { ichar *start; + int len; + ichar buf[MAXSTRINGLEN]; + const ichar *end; + + if ( !(end=itake_string(dtd, s, &start, &len)) ) + { end=itake_nmtoken_chars(dtd, s, buf, sizeof(buf)/sizeof(ichar)); + start = buf; + len = (int)istrlen(buf); + } + + if ( end ) + { decl = end; + + if ( istrcaseeq(nm->name, L"encoding") ) + { ichar tmp[32]; + + if ( len < (int)(sizeof(tmp)/sizeof(ichar)-1) ) + { istrncpy(tmp, start, len); + tmp[len] = 0; + + set_encoding(p, tmp); + } else + { gripe(ERC_SYNTAX_ERROR, L"Unterminated encoding?", decl); + } + } + + /* fprintf(stderr, "XML %s = %s\n", nm->name, buf); */ + + continue; + } + } + + gripe(ERC_SYNTAX_ERROR, L"Illegal XML parameter", decl); + break; + } + + return TRUE; + } + + if ( p->on_pi ) + (*p->on_pi)(p, decl); + + return FALSE; /* Warn? */ +} + + +static int +process_sgml_declaration(dtd_parser *p, const ichar *decl) +{ return gripe(ERC_SYNTAX_WARNING, L"Ignored declaration", NULL); +} + + +static int +process_declaration(dtd_parser *p, const ichar *decl) +{ const ichar *s; + dtd *dtd = p->dtd; + + if ( p->dmode != DM_DTD ) + { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) /* */ + { return process_end_element(p, s); + } else if ( HasClass(dtd, *decl, CH_NAME) ) /* */ + { decl = s; + + if ( p->on_decl ) + (*p->on_decl)(p, decl); + + if ( (s = isee_identifier(dtd, decl, "entity")) ) + process_entity_declaration(p, s); + else if ( (s = isee_identifier(dtd, decl, "element")) ) + process_element_declaraction(p, s); + else if ( (s = isee_identifier(dtd, decl, "attlist")) ) + process_attlist_declaraction(p, s); + else if ( (s = isee_identifier(dtd, decl, "notation")) ) + process_notation_declaration(p, s); + else if ( (s = isee_identifier(dtd, decl, "shortref")) ) + process_shortref_declaration(p, s); + else if ( (s = isee_identifier(dtd, decl, "usemap")) ) + process_usemap_declaration(p, s); + else if ( (s = isee_identifier(dtd, decl, "sgml")) ) + process_sgml_declaration(p, s); + else if ( (s = isee_identifier(dtd, decl, "doctype")) ) + { if ( p->dmode != DM_DTD ) + process_doctype(p, s, decl-1); + } else + { s = iskip_layout(dtd, decl); + + if ( *s ) + gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", s); + } + + return TRUE; + } + + return gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", decl); +} + + /******************************* + * STREAM BINDING * + *******************************/ + +static dtd_parser *current_parser; /* For gripes */ + +void +set_file_dtd_parser(dtd_parser *p, input_type type, const ichar *name) +{ p->location.type = type; + p->location.name.file = name; + p->location.line = 1; + p->location.linepos = 0; + p->location.charpos = 0; +} + + +static void +set_src_dtd_parser(dtd_parser *p, input_type type, const ichar *name) +{ p->location.type = type; + p->location.name.entity = name; + p->location.line = 1; + p->location.linepos = 0; + p->location.charpos = 0; +} + + +void +set_mode_dtd_parser(dtd_parser *p, data_mode m) +{ p->dmode = m; /* DM_DTD or DM_DATA */ + p->state = S_PCDATA; + p->blank_cdata = TRUE; +} + + +dtd_parser * +new_dtd_parser(dtd *dtd) +{ dtd_parser *p = sgml_calloc(1, sizeof(*p)); + + if ( !dtd ) + dtd = new_dtd(NULL); + dtd->references++; + + p->magic = SGML_PARSER_MAGIC; + p->dtd = dtd; + p->state = S_PCDATA; + p->mark_state = MS_INCLUDE; + p->dmode = DM_DTD; + p->encoded = TRUE; /* encoded octet stream */ + p->buffer = new_icharbuf(); + p->cdata = new_ocharbuf(); + p->event_class = EV_EXPLICIT; + set_src_dtd_parser(p, IN_NONE, NULL); + + return p; +} + + +static dtd_parser * +clone_dtd_parser(dtd_parser *p) +{ dtd_parser *clone = sgml_calloc(1, sizeof(*p)); + + *clone = *p; + clone->dtd->references++; + clone->environments = NULL; + clone->marked = NULL; + clone->etag = NULL; + clone->grouplevel = 0; + clone->state = S_PCDATA; + clone->mark_state = MS_INCLUDE; + clone->dmode = DM_DTD; + clone->buffer = new_icharbuf(); + clone->cdata = new_ocharbuf(); + + return clone; +} + + +void +free_dtd_parser(dtd_parser *p) +{ free_icharbuf(p->buffer); + free_ocharbuf(p->cdata); + + free_dtd(p->dtd); + + sgml_free(p); +} + + +static int +process_chars(dtd_parser *p, input_type in, const ichar *name, const ichar *s) +{ locbuf old; + + push_location(p, &old); + set_src_dtd_parser(p, in, name); + empty_icharbuf(p->buffer); /* dubious */ + for(; *s; s++) + putchar_dtd_parser(p, *s); + pop_location(p, &old); + + return TRUE; +} + + +static int +process_include(dtd_parser *p, const ichar *entity_name) +{ dtd_symbol *id; + dtd_entity *pe; + dtd *dtd = p->dtd; + + if ( (id=dtd_find_entity_symbol(dtd, entity_name)) && + (pe=find_pentity(p->dtd, id)) ) + { ichar *file; + + if ( (file = entity_file(dtd, pe)) ) + { int rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); + sgml_free(file); + + return rc; + } else + { const ichar *text = entity_value(p, pe, NULL); + + if ( !text ) + return gripe(ERC_NO_VALUE, pe->name->name); + + return process_chars(p, IN_ENTITY, entity_name, text); + } + } + + return gripe(ERC_EXISTENCE, L"parameter entity", entity_name); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Process mark_state according to KEYWORD. Processes the rest in normal +S_PCDATA style, which pops the mark-stack on seeing ]]> + +For the purpose of we switch to S_GROUP if +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static void +process_marked_section(dtd_parser *p) +{ ichar buf[MAXDECL]; + dtd *dtd = p->dtd; + const ichar *decl = p->buffer->data; + const ichar *s; + + if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */ + (decl=isee_func(dtd, decl, CF_DSO)) && /* [ */ + expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) + { dtd_symbol *kwd; + + decl = buf; + if ( (s=itake_name(dtd, decl, &kwd)) && + isee_func(dtd, s, CF_DSO) ) /* [ */ + { dtd_marked *m = sgml_calloc(1, sizeof(*m)); + + m->keyword = kwd; /* push on the stack */ + m->parent = p->marked; + p->marked = m; + + if ( istrcaseeq(kwd->name, L"IGNORE") ) + m->type = MS_IGNORE; + else if ( istrcaseeq(kwd->name, L"INCLUDE") ) + m->type = MS_INCLUDE; + else if ( istrcaseeq(kwd->name, L"TEMP") ) + m->type = MS_INCLUDE; + else if ( istrcaseeq(kwd->name, L"CDATA") ) + m->type = MS_CDATA; + else if ( istrcaseeq(kwd->name, L"RCDATA") ) + m->type = MS_RCDATA; + else + m->type = MS_INCLUDE; /* default */ + + empty_icharbuf(p->buffer); + if ( m->type == MS_CDATA ) + p->state = S_MSCDATA; + else + p->state = S_PCDATA; + if ( p->mark_state != MS_IGNORE ) + p->mark_state = m->type; + } + } else + { decl = p->buffer->data; + + if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */ + !isee_func(dtd, decl, CF_DSO) ) /* [ */ + { p->state = S_GROUP; + p->grouplevel = 1; + } + } +} + + +static void +pop_marked_section(dtd_parser *p) +{ dtd_marked *m = p->marked; + + if ( m ) + { p->marked = m->parent; + sgml_free(m); + p->mark_state = (p->marked ? p->marked->type : MS_INCLUDE); + } +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Update the space-mode for the current element. The space mode defines +how spaces are handled in the CDATA output. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static dtd_space_mode +istr_to_space_mode(const ichar *val) +{ if ( istreq(val, L"default") ) + return SP_DEFAULT; + if ( istreq(val, L"preserve") ) + return SP_PRESERVE; + if ( istreq(val, L"sgml") ) + return SP_SGML; + if ( istreq(val, L"remove") ) + return SP_REMOVE; + + return SP_INHERIT; /* interpret as error */ +} + + +static void +update_space_mode(dtd_parser *p, dtd_element *e, + int natts, sgml_attribute *atts) +{ for( ; natts-- > 0; atts++ ) + { const ichar *name = atts->definition->name->name; + + if ( istreq(name, L"xml:space") && + atts->definition->type == AT_CDATA && + atts->value.textW ) + { dtd_space_mode m = istr_to_space_mode(atts->value.textW); + + if ( m != SP_INHERIT ) + p->environments->space_mode = m; + else + gripe(ERC_EXISTENCE, L"xml:space-mode", atts->value.textW); + + return; + } + } + + if ( e->space_mode != SP_INHERIT ) + p->environments->space_mode = e->space_mode; +} + + +static void +empty_cdata(dtd_parser *p) +{ if ( p->dmode == DM_DATA ) + { empty_ocharbuf(p->cdata); + p->blank_cdata = TRUE; + p->cdata_must_be_empty = FALSE; + } +} + + +static void +cb_cdata(dtd_parser *p, ocharbuf *buf, int offset, int size) +{ if ( p->on_data ) + (*p->on_data)(p, EC_CDATA, size, buf->data.w+offset); +} + + +static int +emit_cdata(dtd_parser *p, int last) +{ dtd *dtd = p->dtd; + locbuf locsafe; + ocharbuf *cdata = p->cdata; + int offset = 0; + int size = cdata->size; + + if ( size == 0 ) + return TRUE; /* empty or done */ + + push_location(p, &locsafe); + sgml_cplocation(&p->location, &p->startloc); /* start of markup */ + sgml_cplocation(&p->startloc, &p->startcdata); /* real start of CDATA */ + + if ( p->environments ) + { switch(p->environments->space_mode) + { case SP_SGML: + case SP_DEFAULT: + if ( p->first ) + { wint_t c = fetch_ocharbuf(cdata, offset); + + if ( HasClass(dtd, c, CH_RE) ) + { inc_location(&p->startloc, c); + offset++; + size--; + c = fetch_ocharbuf(cdata, offset); + } + + if ( HasClass(dtd, c, CH_RS) ) + { inc_location(&p->startloc, c); + offset++; + size--; + } + } + if ( last && size > 0 ) + { wint_t c = fetch_ocharbuf(cdata, offset+size-1); + + if ( HasClass(dtd, c, CH_RS) ) + { dec_location(&p->location, c); + size--; + poke_ocharbuf(cdata, offset+size, '\0'); + if ( size > 0 ) + c = fetch_ocharbuf(cdata, offset+size-1); + else + c = 0; /* HasClass(CH_RE) must fail */ + } + if ( HasClass(dtd, c, CH_RE) ) + { dec_location(&p->location, c); + size--; + poke_ocharbuf(cdata, offset+size, '\0'); + } + } + if ( p->environments->space_mode == SP_DEFAULT ) + { int o = 0; + int i; + + for(i=0; istartloc, c); + else + break; + } + + if ( i 0); + + if ( !p->blank_cdata ) + { if ( p->cdata_must_be_empty ) + { gripe(ERC_NOT_ALLOWED_PCDATA, p->cdata); /* TBD: now passes buffer! */ + } + cb_cdata(p, cdata, offset, size); + } else if ( p->environments ) + { sgml_environment *env = p->environments; + dtd_state *new; + + /* If an element is not in the DTD we must */ + /* assume mixed content and emit spaces */ + + if ( (new=make_dtd_transition(env->state, CDATA_ELEMENT)) ) + { env->state = new; + cb_cdata(p, cdata, offset, size); + } else if ( env->element->undefined && + p->environments->space_mode == SP_PRESERVE ) + { cb_cdata(p, cdata, offset, size); + } + } + + pop_location(p, &locsafe); + + empty_cdata(p); + + return TRUE; +} + + +static int +prepare_cdata(dtd_parser *p) +{ if ( p->cdata->size == 0 ) + return TRUE; + + terminate_ocharbuf(p->cdata); + + if ( p->mark_state == MS_INCLUDE ) + { dtd *dtd = p->dtd; + + if ( p->environments ) /* needed for */ + { dtd_element *e = p->environments->element; + + if ( e->structure && e->structure->type == C_EMPTY && !e->undefined ) + close_element(p, e, FALSE); + } + + if ( p->blank_cdata == TRUE ) + { int blank = TRUE; + int i; + + for(i=0; icdata->size; i++) + { wint_t c = fetch_ocharbuf(p->cdata, i); + + if ( !HasClass(dtd, c, CH_BLANK) ) + { blank = FALSE; + break; + } + } + + p->blank_cdata = blank; + if ( !blank ) + { if ( p->dmode == DM_DTD ) + gripe(ERC_SYNTAX_ERROR, L"CDATA in DTD", p->cdata->data); + else + open_element(p, CDATA_ELEMENT, TRUE); + } + } + } + + return TRUE; +} + + +static int +process_cdata(dtd_parser *p, int last) +{ int rc; + + WITH_PARSER(p, (prepare_cdata(p), rc=emit_cdata(p, last))); + + return rc; +} + + +static int +process_entity(dtd_parser *p, const ichar *name) +{ if ( name[0] == '#' ) /* #charcode: character entity */ + { int v = char_entity_value(name); + + if ( v <= 0 ) + return gripe(ERC_SYNTAX_ERROR, L"Bad character entity", name); + + add_ocharbuf(p->cdata, v); + } else + { dtd_symbol *id; + dtd_entity *e; + dtd *dtd = p->dtd; + int len; + const ichar *text; + const ichar *s; + int chr; + ichar *file; + + if ( !(id=dtd_find_entity_symbol(dtd, name)) || + !(e=id->entity) ) + { if ( dtd->default_entity ) + e = dtd->default_entity; + else + return gripe(ERC_EXISTENCE, L"entity", name); + } + + if ( !e->value && + e->content == EC_SGML && + (file=entity_file(p->dtd, e)) ) + { int rc; + + empty_icharbuf(p->buffer); /* dubious */ + rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); + sgml_free(file); + return rc; + } + + if ( !(text = entity_value(p, e, &len)) ) + return gripe(ERC_NO_VALUE, e->name->name); + + switch ( e->content ) + { case EC_SGML: + case EC_CDATA: + if ( (s=isee_character_entity(dtd, text, &chr)) && *s == '\0' ) + { if ( chr == 0 ) + return gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", text); + + if ( p->blank_cdata == TRUE && + !HasClass(dtd, (wint_t)chr, CH_BLANK) ) + { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); + p->blank_cdata = FALSE; + } + + add_ocharbuf(p->cdata, chr); + return TRUE; + } + if ( e->content == EC_SGML ) + { locbuf oldloc; + int decode = p->utf8_decode; + + push_location(p, &oldloc); + p->utf8_decode = FALSE; + set_src_dtd_parser(p, IN_ENTITY, e->name->name); + empty_icharbuf(p->buffer); /* dubious */ + for(s=text; *s; s++) + putchar_dtd_parser(p, *s); + p->utf8_decode = decode; + pop_location(p, &oldloc); + } else if ( *text ) + { const ichar *o; + + if ( p->blank_cdata == TRUE ) + { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); + p->blank_cdata = FALSE; + } + + for(o=text; *o; o++) + add_ocharbuf(p->cdata, *o); + } + break; + case EC_SDATA: + case EC_NDATA: + process_cdata(p, FALSE); + if ( p->on_data ) + (*p->on_data)(p, e->content, len, text); + break; + case EC_PI: + process_cdata(p, FALSE); + if ( p->on_pi ) + (*p->on_pi)(p, text); + case EC_STARTTAG: +#if 0 + prepare_cdata(p); + process_begin_element(p, text); +#endif + break; + case EC_ENDTAG: +#if 0 + prepare_cdata(p); + process_end_element(p, text); +#endif + break; + } + + return TRUE; + } + + return TRUE; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Deal with end of input. We should give a proper error message depending +on the state and the start-location of the error. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +end_document_dtd_parser_(dtd_parser *p) +{ int rval; + + switch(p->state) + { case S_RCDATA: + case S_CDATA: + case S_PCDATA: + rval = TRUE; + break; + case S_CMT: + case S_CMT1: + case S_CMTE0: + case S_CMTE1: + case S_DECLCMT0: + case S_DECLCMT: + case S_DECLCMTE0: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file in comment", L""); + break; + case S_ECDATA1: + case S_ECDATA2: + case S_EMSC1: + case S_EMSC2: + case S_DECL0: + case S_DECL: + case S_MDECL0: + case S_STRING: + case S_CMTO: + case S_GROUP: + case S_PENT: + case S_ENT: + case S_ENT0: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file", L""); + break; +#ifdef UTF8 + case S_UTF8: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file in UTF-8 sequence", L""); + break; +#endif + case S_MSCDATA: + case S_EMSCDATA1: + case S_EMSCDATA2: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file in CDATA marked section", L""); + break; + case S_PI: + case S_PI2: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file in processing instruction", L""); + break; + default: + rval = gripe(ERC_SYNTAX_ERROR, + L"Unexpected end-of-file in ???"); + break; + } + + if ( p->dmode == DM_DATA ) + { sgml_environment *env; + + if ( p->cdata->size > 0 && + fetch_ocharbuf(p->cdata, p->cdata->size-1) == CR ) + del_ocharbuf(p->cdata); + + process_cdata(p, TRUE); + + if ( (env=p->environments) ) + { dtd_element *e; + + while(env->parent) + env = env->parent; + + pop_to(p, env, CDATA_ELEMENT); + e = env->element; + if ( e->structure && !e->structure->omit_close ) + gripe(ERC_OMITTED_CLOSE, e->name->name); + close_element(p, e, FALSE); + } + } + + return rval; +} + + +int +end_document_dtd_parser(dtd_parser *p) +{ int rval; + + WITH_PARSER(p, rval = end_document_dtd_parser_(p)); + + return rval; +} + + +int +begin_document_dtd_parser(dtd_parser *p) +{ init_decoding(p); + + return TRUE; +} + + +void +reset_document_dtd_parser(dtd_parser *p) +{ if ( p->environments ) + { sgml_environment *env, *parent; + + for(env = p->environments; env; env=parent) + { parent = env->parent; + + free_environment(env); + } + + p->environments = NULL; + } + + while(p->marked) + pop_marked_section(p); + + empty_icharbuf(p->buffer); + empty_ocharbuf(p->cdata); + + p->mark_state = MS_INCLUDE; + p->state = S_PCDATA; + p->grouplevel = 0; + p->blank_cdata = TRUE; + p->event_class = EV_EXPLICIT; + p->dmode = DM_DATA; + + begin_document_dtd_parser(p); +} + + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Set the UTF-8 state +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#ifdef UTF8 +static void +process_utf8(dtd_parser *p, int chr) +{ int bytes; + int mask; + + for( bytes=1, mask=0x20; chr&mask; bytes++, mask >>= 1 ) + ; + mask--; /* 0x20 --> 0x1f */ + + p->utf8_saved_state = p->state; /* state to return to */ + p->state = S_UTF8; + p->utf8_char = chr & mask; + p->utf8_left = bytes; +} +#endif + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +add_cdata() adds a character to the output data. It also maps \r\n onto +a single \n for Windows newline conventions. + +There is a problem here in shortref handling. We open the CDATA_ELEMENT +as soon as we find a character as this may open other elements through +omitted tags and thus install a new shortref map. + +If, at a later stage, all CDATA read sofar turns out to be a shortref we +have incorrectly opened the CDATA_ELEMENT. As `undoing' the +open_element() is not an option (it may already have caused `events' on +omitted tags) we are in trouble. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static void +add_cdata(dtd_parser *p, int chr) +{ if ( p->mark_state == MS_INCLUDE ) + { ocharbuf *buf = p->cdata; + + if ( p->blank_cdata == TRUE && + !HasClass(p->dtd, (wint_t)chr, CH_BLANK) ) + { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); + p->blank_cdata = FALSE; + } + + if ( chr == '\n' ) /* insert missing CR */ + { int sz; + + if ( (sz=buf->size) == 0 || + fetch_ocharbuf(buf, sz-1) != CR ) + add_cdata(p, CR); + } + + add_ocharbuf(buf, chr); + + if ( p->map && + chr <= 0xff && p->map->ends[chr] && + match_shortref(p) ) + return; + + if ( chr == '\n' ) /* dubious. Whould we do that */ + { int sz; /* here or in space-handling? */ + + if ( (sz=buf->size) > 1 && + fetch_ocharbuf(buf, sz-1) == LF && + fetch_ocharbuf(buf, sz-2) == CR ) + { poke_ocharbuf(buf, sz-2, LF); + buf->size--; + } + } + } +} + + +static void +add_verbatim_cdata(dtd_parser *p, int chr) +{ if ( p->mark_state != MS_IGNORE ) + { ocharbuf *buf = p->cdata; + + if ( p->blank_cdata == TRUE && + !HasClass(p->dtd, (wint_t)chr, CH_BLANK) ) + { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); + p->blank_cdata = FALSE; + } + + if ( chr == '\n' && buf->size > 0 && + fetch_ocharbuf(buf, buf->size-1) == '\r' ) + buf->size--; + + add_ocharbuf(buf, chr); + } +} + + +/* We discovered illegal markup and now process it as normal CDATA +*/ + +static void +recover_parser(dtd_parser *p) +{ const ichar *s; + + terminate_icharbuf(p->buffer); + add_cdata(p, p->saved); + for(s=p->buffer->data; *s; s++) + add_cdata(p, *s); + p->state = S_PCDATA; +} + + +static inline void +setlocation(dtd_srcloc *d, dtd_srcloc *loc, int line, int lpos) +{ d->line = line; + d->linepos = lpos; + d->charpos = loc->charpos - 1; + d->type = loc->type; + d->name = loc->name; +} + + +void +putchar_dtd_parser(dtd_parser *p, int chr) +{ dtd *dtd = p->dtd; + const ichar *f = dtd->charfunc->func; + int line = p->location.line; + int lpos = p->location.linepos; + + p->location.charpos++; /* TBD: actually `bytepos' */ + +#ifdef UTF8 + if ( p->state == S_UTF8 ) + { if ( (chr & 0xc0) != 0x80 ) /* TBD: recover */ + gripe(ERC_SYNTAX_ERROR, L"Bad UTF-8 sequence", L""); + p->utf8_char <<= 6; + p->utf8_char |= (chr & ~0xc0); + if ( --p->utf8_left == 0 ) + { chr = p->utf8_char; + p->state = p->utf8_saved_state; + } else + { return; + } + } else if ( ISUTF8_MB(chr) && p->utf8_decode ) + { process_utf8(p, chr); + return; + } +#endif + + if ( f[CF_RS] == chr ) + { p->location.line++; + p->location.linepos = 0; + } else + { if ( f[CF_RE] == chr ) + p->location.linepos = 0; + else + p->location.linepos++; + } + +reprocess: + switch(p->state) + { case S_PCDATA: + { if ( f[CF_MDO1] == chr ) /* < */ + { setlocation(&p->startloc, &p->location, line, lpos); + p->state = S_DECL0; + empty_icharbuf(p->buffer); + return; + } + if ( p->dmode == DM_DTD ) + { if ( f[CF_PERO] == chr ) /* % */ + { setlocation(&p->startloc, &p->location, line, lpos); + p->state = S_PENT; + return; + } + } else + { if ( f[CF_ERO] == chr ) /* & */ + { setlocation(&p->startloc, &p->location, line, lpos); + p->state = S_ENT0; + return; + } + } + + if ( p->marked && f[CF_DSC] == chr ) /* ] in marked section */ + { empty_icharbuf(p->buffer); + p->state = S_EMSC1; + p->saved = chr; /* for recovery */ + return; + } + + if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) /* shorttag */ + { setlocation(&p->startloc, &p->location, line, lpos); + WITH_PARSER(p, + process_net(p)); + return; + } + + /* Real character data */ + if ( p->cdata->size == 0 ) + setlocation(&p->startcdata, &p->location, line, lpos); + + add_cdata(p, chr); + return; + } + case S_ECDATA2: /* Seen etaglen == p->buffer->size && + istrncaseeq(p->buffer->data, p->etag, p->etaglen) ) + { p->cdata->size -= p->etaglen+2; /* 2 for cdata); + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, + process_cdata(p, TRUE); + process_end_element(p, p->buffer->data)); + empty_cdata(p); + } + empty_icharbuf(p->buffer); + p->cdata_state = p->state = S_PCDATA; + } else + { add_verbatim_cdata(p, chr); + if ( p->etaglen < p->buffer->size || + !HasClass(dtd, (wint_t)chr, CH_NAME)) + { empty_icharbuf(p->buffer); /* mismatch */ + p->state = p->cdata_state; + } else + add_icharbuf(p->buffer, chr); + } + return; + } + case S_ECDATA1: /* seen < in CDATA */ + { add_verbatim_cdata(p, chr); + if ( f[CF_ETAGO2] == chr ) /* / */ + { empty_icharbuf(p->buffer); + p->state = S_ECDATA2; + } else if ( f[CF_ETAGO1] != chr ) /* <: do not change state */ + p->state = p->cdata_state; + return; + } + case S_RCDATA: + { if ( f[CF_ERO] == chr ) /* & */ + { setlocation(&p->startloc, &p->location, line, lpos); + p->state = S_ENT0; + return; + } + /*FALLTHROUGH*/ + } + case S_CDATA: + { add_verbatim_cdata(p, chr); + + if ( f[CF_MDO1] == chr ) /* < */ + { setlocation(&p->startloc, &p->location, line, lpos); + p->state = S_ECDATA1; + } + + /* / in CDATA shorttag element */ + if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) + { setlocation(&p->startloc, &p->location, line, lpos); + p->cdata->size--; + terminate_ocharbuf(p->cdata); + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, + process_cdata(p, TRUE); + process_net(p)); + empty_cdata(p); + } + empty_icharbuf(p->buffer); + p->cdata_state = p->state = S_PCDATA; + } + + return; + } + case S_MSCDATA: + { add_verbatim_cdata(p, chr); + if ( f[CF_DSC] == chr ) /* ] */ + p->state = S_EMSCDATA1; + return; + } + case S_EMSCDATA1: + { add_verbatim_cdata(p, chr); + if ( f[CF_DSC] == chr ) /* ]] */ + p->state = S_EMSCDATA2; + else + p->state = S_MSCDATA; + return; + } + case S_EMSCDATA2: + { add_verbatim_cdata(p, chr); + if ( f[CF_MDC] == chr ) /* ]]> */ + { p->cdata->size -= 3; /* Delete chars for ]] */ + pop_marked_section(p); + p->state = S_PCDATA; + } else if ( f[CF_DSC] != chr ) /* if ]]], stay in this state */ + p->state = S_MSCDATA; + return; + } + case S_EMSC1: + { if ( f[CF_DSC] == chr ) /* ]] in marked section */ + { p->state = S_EMSC2; + return; + } else + { add_icharbuf(p->buffer, chr); + recover_parser(p); + return; + } + } + case S_EMSC2: + { if ( f[CF_MDC] == chr ) /* ]]> in marked section */ + { pop_marked_section(p); + p->state = S_PCDATA; + return; + } else + { add_icharbuf(p->buffer, chr); + recover_parser(p); + return; + } + } + case S_PENT: /* %parameter entity; */ + { if ( f[CF_ERC] == chr ) + { p->state = S_PCDATA; + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, process_include(p, p->buffer->data)); + } + empty_icharbuf(p->buffer); + return; + } + if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) + { add_icharbuf(p->buffer, chr); + return; + } + + terminate_icharbuf(p->buffer); + gripe(ERC_SYNTAX_ERROR, L"Illegal parameter entity", p->buffer->data); + break; + } + case S_ENT0: /* Seen & */ + { if ( chr == '#' || HasClass(dtd, (wint_t)chr, CH_NAME) ) + { empty_icharbuf(p->buffer); + add_icharbuf(p->buffer, chr); + p->state = S_ENT; + } else + { if ( dtd->dialect != DL_SGML ) + { wchar_t buf[3]; + buf[0] = '&'; + buf[1] = chr; + buf[2] = '\0'; + gripe(ERC_SYNTAX_ERROR, L"Illegal entity", buf); + } + + add_cdata(p, f[CF_ERO]); + p->state = p->cdata_state; + goto reprocess; + } + + return; + } + case S_ENT: /* &entity; */ + { if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) + { add_icharbuf(p->buffer, chr); + return; + } + + terminate_icharbuf(p->buffer); + p->state = p->cdata_state; + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, process_entity(p, p->buffer->data)); + } + empty_icharbuf(p->buffer); + + if ( chr == CR ) + p->state = S_ENTCR; + else if ( f[CF_ERC] != chr && chr != '\n' ) + goto reprocess; + + break; + } + case S_ENTCR: /* seen &entCR, eat the LF */ + { p->state = p->cdata_state; + if ( chr != LF ) + goto reprocess; + + break; + } + case S_DECL0: /* Seen < */ + { if ( f[CF_ETAGO2] == chr ) /* buffer, chr); + p->state = S_DECL; + } else if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) /* buffer, chr); + p->state = S_DECL; + } else if ( f[CF_MDO2] == chr ) /* state = S_MDECL0; + } else if ( f[CF_PRO2] == chr ) /* state = S_PI; + } else /* recover */ + { add_cdata(p, f[CF_MDO1]); + add_cdata(p, chr); + p->state = S_PCDATA; + } + + return; + } + case S_MDECL0: /* Seen state = S_CMTO; + return; + } + add_icharbuf(p->buffer, f[CF_MDO2]); + add_icharbuf(p->buffer, chr); + p->state = S_DECL; + return; + } + case S_DECL: /* <...> */ + { if ( f[CF_MDC] == chr ) /* > */ + { prepare_cdata(p); + p->state = S_PCDATA; + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, process_declaration(p, p->buffer->data)); + } + empty_icharbuf(p->buffer); + return; + } + if ( dtd->shorttag && f[CF_ETAGO2] == chr && p->buffer->size > 0 ) + { prepare_cdata(p); + p->state = S_PCDATA; + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_CLASS(p, EV_SHORTTAG, + WITH_PARSER(p, process_declaration(p, p->buffer->data))); + } + empty_icharbuf(p->buffer); + p->waiting_for_net = TRUE; + return; + } + + add_icharbuf(p->buffer, chr); + + if ( f[CF_LIT] == chr ) /* " */ + { p->state = S_STRING; + p->saved = chr; + p->lit_saved_state = S_DECL; + } else if ( f[CF_LITA] == chr ) /* ' */ + { p->state = S_STRING; + p->saved = chr; + p->lit_saved_state = S_DECL; + return; + } else if ( f[CF_CMT] == chr && /* - */ + p->buffer->data[0] == f[CF_MDO2] ) /* Started state = S_DECLCMT0; + } else if ( f[CF_DSO] == chr ) /* [: marked section */ + { terminate_icharbuf(p->buffer); + + process_marked_section(p); + } + + break; + } + case S_DECLCMT0: /* <...- */ + { if ( f[CF_CMT] == chr ) + { p->buffer->size--; + p->state = S_DECLCMT; + } else + { add_icharbuf(p->buffer, chr); + p->state = S_DECL; + } + break; + } + case S_DECLCMT: /* <...--.. */ + { if ( f[CF_CMT] == chr ) + p->state = S_DECLCMTE0; + break; + } + case S_DECLCMTE0: /* <...--..- */ + { if ( f[CF_CMT] == chr ) + p->state = S_DECL; + else + p->state = S_DECLCMT; + break; + } + case S_PI: + { add_icharbuf(p->buffer, chr); + if ( f[CF_PRO2] == chr ) /* state = S_PI2; + if ( f[CF_PRC] == chr ) /* no ? is ok too (XML/SGML) */ + goto pi; + return; + } + case S_PI2: + { if ( f[CF_PRC] == chr ) + { pi: + process_cdata(p, FALSE); + p->state = S_PCDATA; + p->buffer->size--; + terminate_icharbuf(p->buffer); + if ( p->mark_state == MS_INCLUDE ) + { WITH_PARSER(p, process_pi(p, p->buffer->data)); + } + empty_icharbuf(p->buffer); + return; + } + add_icharbuf(p->buffer, chr); + p->state = S_PI; + return; + } + case S_STRING: + { add_icharbuf(p->buffer, chr); + if ( chr == p->saved ) + p->state = p->lit_saved_state; + break; + } + case S_CMTO: /* Seen state = S_CMT1; + return; + } else + { add_cdata(p, f[CF_MDO1]); + add_cdata(p, f[CF_MDO2]); + add_cdata(p, f[CF_CMT]); + add_cdata(p, chr); + p->state = S_PCDATA; + return; + } + } + case S_CMT1: /* \\ */ + wputc(x, f); + wputc(x, f); + } else if (x == '\t') + { wputc(x, f); /* \t */ + } else if (x == '\n') + { fprintf(f, "\\n"); /* \n */ + } else + { fprintf(f, "\\%03o", x); + } + } +} + + +static void +print_cdata(char c, sgml_attribute *a) +{ wputc(c, stdout); + wprint_escaped(stdout, a->value.textW, a->value.number); + wputc('\n', stdout); +} + + +static int +print_close(dtd_parser * p, dtd_element * e) +{ print_word(p, ')', e->name->name, 0); + putchar('\n'); + + return TRUE; +} + + +typedef struct atdef +{ attrtype type; /* AT_* */ + char const *name; /* name */ + int islist; /* list-type */ +} atdef; + +static atdef attrs[] = { + {AT_CDATA, "CDATA", FALSE}, + {AT_ENTITY, "ENTITY", FALSE}, + {AT_ENTITIES, "ENTITY", TRUE}, + {AT_ID, "ID", FALSE}, + {AT_IDREF, "IDREF", FALSE}, + {AT_IDREFS, "IDREF", TRUE}, + {AT_NAME, "NAME", FALSE}, + {AT_NAMES, "NAME", TRUE}, + {AT_NMTOKEN, "NMTOKEN", FALSE}, + {AT_NMTOKENS, "NMTOKEN", TRUE}, + {AT_NUMBER, "NUMBER", FALSE}, + {AT_NUMBERS, "NUMBER", TRUE}, + {AT_NUTOKEN, "NUTOKEN", FALSE}, + {AT_NUTOKENS, "NUTOKEN", TRUE}, + {AT_NOTATION, "NOTATION", FALSE}, + + {AT_CDATA, (char *) 0, FALSE} +}; + + +static atdef * +find_attrdef(attrtype type) +{ atdef *ad; + + for (ad = attrs; ad->name != (char *) 0; ad++) + { if (ad->type == type) + return ad; + } + assert(0); + return (atdef *) 0; +} + + +static ichar * +istrblank(ichar const *s) +{ for (; *s; s++) + { if (iswspace(*s)) + return (ichar *) s; + } + return (ichar *) 0; +} + + +static int +print_open(dtd_parser * p, dtd_element * e, int argc, sgml_attribute *argv) +{ int i; + + for (i = 0; i < argc; i++) + { print_word(p, 'A', argv[i].definition->name->name, 0); + switch (argv[i].definition->type) + { case AT_CDATA: + printf(" CDATA"); + print_cdata(' ', &argv[i]); + continue; /* so we don't get two line breaks */ + case AT_NUMBER: + printf(" NUMBER "); + if (argv[i].value.textW) + print_word(p, ' ', argv[i].value.textW, 0); + else + printf("%ld", argv[i].value.number); + break; + case AT_NAMEOF: + printf(" NAME"); + print_word(p, ' ', argv[i].value.textW, 0); + break; + default: + { atdef *ad = find_attrdef(argv[i].definition->type); + ichar const *val = argv[i].value.textW; + + printf(" %s", ad->name); + if (ad->islist) + { ichar const *n; + + while ((n = istrblank(val)) != 0) + { if (n != val) + print_word(p, ' ', val, n); + val = n + 1; + } + } + print_word(p, ' ', val, 0); + } + break; + } + putchar('\n'); + } + print_word(p, '(', e->name->name, 0); + putchar('\n'); + return TRUE; +} + + +static int +print_data(dtd_parser * p, data_type type, int len, const wchar_t *data) +{ char c; + + switch (type) + { case EC_CDATA: + c = '-'; + break; + case EC_NDATA: + c = 'N'; + break; + case EC_SDATA: + c = 'S'; + break; + default: + assert(0); + } + wputc(c, stdout); + wprint_escaped(stdout, data, len); + wputc('\n', stdout); + return TRUE; +} + + +static int +on_entity(dtd_parser *p, dtd_entity *e, int chr) +{ if (e == 0) + printf("&#%d;\n", chr); + else + wprintf(L"&%s;\n", e->name->name); + return TRUE; +} + + +static int +on_pi(dtd_parser *p, ichar const *pi) +{ wputc('?', stdout); + wprint_escaped(stdout, pi, wcslen(pi)); + return TRUE; +} + + +static dtd_srcloc * +file_location(dtd_srcloc *l) +{ while (l->parent && l->type != IN_FILE) + l = l->parent; + return l; +} + +static int +on_error(dtd_parser * p, dtd_error * error) +{ char const *severity; + char const *dialect; + dtd_srcloc *l = file_location(error->location); + + switch (p->dtd->dialect) + { case DL_SGML: + dialect = "sgml"; + break; + case DL_XML: + dialect = "xml"; + break; + case DL_XMLNS: + default: /* make compiler happy */ + dialect = "xmlns"; + break; + } + + switch (error->severity) + { case ERS_STYLE: + severity = "Style"; + if ( !style_messages ) + return TRUE; + break; + case ERS_WARNING: + severity = "Warning"; + nwarnings++; + break; + case ERS_ERROR: + default: /* make compiler happy */ + severity = "Error"; + nerrors++; + break; + } + + if ( l->name.file ) + { fwprintf(stderr, L"%s: (%s mode) %s: %ls:%d:%d %ls\n", + program, dialect, severity, + l->name.entity, l->line, l->linepos, + error->plain_message); + } else + { fwprintf(stderr, L"%s: (%s mode) %s: %d:%d %ls\n", + program, dialect, severity, + error->plain_message); + } + + return TRUE; +} + + +static void +set_functions(dtd_parser * p, int output) +{ if (output) + { p->on_end_element = print_close; + p->on_begin_element = print_open; + p->on_data = print_data; + p->on_entity = on_entity; + p->on_pi = on_pi; + } + p->on_error = on_error; +} + + +static wchar_t * +mb2wc(const char *s) +{ int wl = mbstowcs(NULL, s, 0); + + if ( wl > 0 ) + { wchar_t *ws = malloc((wl+1)*sizeof(wchar_t)); + mbstowcs(ws, s, wl+1); + + return ws; + } + + perror("mbstowcs"); + exit(1); +} + + +#define shift (argc--, argv++) + +#define strcaseeq(x, y) istrcaseeq((ichar const *)(x), (ichar const *)(y)) + +static ichar const *no_dtd = (ichar const *) NULL; + +int +main(int argc, char **argv) +{ dtd_parser *p = NULL; + char *s; + int xml = FALSE; + int output = TRUE; + int nodefs = FALSE; /* include defaulted attributes */ + + setlocale(LC_CTYPE, ""); + + s = strchr(argv[0], '/'); + program = s == NULL ? argv[0] : s + 1; + if (streq(program, "xml")) + xml = TRUE; + + shift; + + while (argc > 0 && argv[0][0] == '-') + { if (streq(argv[0], "-xml")) + { xml = TRUE; + } else if (streq(argv[0], "-s")) + { output = FALSE; + } else if (streq(argv[0], "-nodefs")) + { nodefs = TRUE; + } else if (streq(argv[0], "-style")) + { style_messages = TRUE; + } else + { usage(); + } + shift; + } + + if (argc > 0) + { char *slash = strchr(argv[0], '/'); + char *dot = strchr(argv[0], '.'); + char *ext = dot == 0 || (slash != 0 && slash > dot) ? "." : dot; + + if (strcaseeq(ext, ".dtd")) + { char doctype[256]; + + strncpy(doctype, argv[0], ext - argv[0]); + doctype[ext - argv[0]] = '\0'; + + p = new_dtd_parser(new_dtd(mb2wc(doctype))); + load_dtd_from_file(p, mb2wc(argv[0])); + shift; + } else if (strcaseeq(ext, ".html") || strcaseeq(ext, ".htm")) + { p = new_dtd_parser(new_dtd((ichar const *) "html")); + load_dtd_from_file(p, L"html.dtd"); + } else if (xml || strcaseeq(ext, ".xml")) + { dtd *dtd = new_dtd(no_dtd); + + set_dialect_dtd(dtd, DL_XML); + p = new_dtd_parser(dtd); + } else + { p = new_dtd_parser(new_dtd(no_dtd)); + } + } else + { p = new_dtd_parser(new_dtd(no_dtd)); + } + + if (nodefs) + p->flags |= SGML_PARSER_NODEFS; + + switch (argc) + { case 1: + { set_functions(p, output); + sgml_process_file(p, mb2wc(argv[0]), 0); + free_dtd_parser(p); + if (output && nerrors == 0) + printf("C\n"); + return 0; + } + case 0: + { set_functions(p, output); + set_file_dtd_parser(p, IN_FILE, L"stdin"); + set_mode_dtd_parser(p, DM_DATA); + sgml_process_stream(p, stdin, 0); + free_dtd_parser(p); + if (output && nerrors == 0 && nwarnings == 0) + printf("C\n"); + return 0; + } + default: + { usage(); + return EXIT_FAILURE; + } + } +} diff --git a/packages/sgml/sgml.doc b/packages/sgml/sgml.doc new file mode 100644 index 000000000..999cbec6b --- /dev/null +++ b/packages/sgml/sgml.doc @@ -0,0 +1,1319 @@ +\documentclass[11pt]{article} +\usepackage{times} +\usepackage{pl} +\usepackage{html} +\sloppy +\makeindex + +\onefile +\htmloutput{html} % Output directory +\htmlmainfile{index} % Main document file +\bodycolor{white} % Page colour + +\begin{document} + +\title{SWI-Prolog SGML/XML parser} +\author{Jan Wielemaker \\ + HCS, \\ + University of Amsterdam \\ + The Netherlands \\ + E-mail: \email{J.Wielemaker@uva.nl}} + +\maketitle + +\begin{abstract} +Markup languages are an increasingly important method for +data-representation and exchange. This article documents the package +\pllib{sgml}, a foreign library for SWI-Prolog to parse SGML +and XML documents, returning information on both the document and the +document's DTD. The parser is designed to be small, fast and flexible. +\end{abstract} + +\pagebreak +\tableofcontents + +\vfill +\vfill + +\newpage + +\section{Introduction} + +Markup languages have recently regained popularity for two reasons. One +is document exchange, which is largely based on HTML, an instance of +SGML, and the other is for data exchange between programs, which is +often based on XML, which can be considered a simplified and +rationalised version of SGML. + +James Clark's SP parser is a flexible SGML and XML parser. Unfortunately +it has some drawbacks. It is very big, not very fast, cannot work under +event-driven input and is generally hard to program beyond the scope of +the well designed generic interface. The generic interface however does +not provide access to the DTD, does not allow for flexible handling of +input or parsing the DTD independently of a document instance. + +The parser described in this document is small (less than 100 kBytes +executable on a Pentium), fast (between 2 and 5 times faster than SP), +provides access to the DTD, and provides flexible input handling. + +The document output is equal to the output produced by \jargon{xml2pl}, +an SP interface to SWI-Prolog written by Anjo Anjewierden. + + +\section{Bluffer's Guide} + +This package allows you to parse SGML, XML and HTML data into a Prolog +data structure. The high-level interface defined in \pllib{sgml} +provides access at the file-level, while the low-level interface defined +in the foreign module works with Prolog streams. Please use the source +of \file{sgml.pl} as a starting point for dealing with data from +other sources than files, such as SWI-Prolog resources, network-sockets, +character strings, \emph{etc.} The first example below loads an HTML file. + +\begin{code} + + + + +Demo + + + +

    This is a demo + +Paragraphs in HTML need not be closed. + +This is called `omitted-tag' handling. + + +\end{code} + +\begin{code} +?- load_html_file('test.html', Term), + pretty_print(Term). + +[ element(html, + [], + [ element(head, + [], + [ element(title, + [], + [ 'Demo' + ]) + ]), + element(body, + [], + [ '\n', + element(h1, + [ align = center + ], + [ 'This is a demo' + ]), + '\n\n', + element(p, + [], + [ 'Paragraphs in HTML need not be closed.\n' + ]), + element(p, + [], + [ 'This is called `omitted-tag\' handling.' + ]) + ]) + ]) +]. +\end{code} + +The document is represented as a list, each element being an atom to +represent \const{CDATA} or a term \term{element}{Name, Attributes, Content}. +Entities (e.g. \verb$<$) are expanded and included in the +atom representing the element content or attribute value.% + \footnote{Up to SWI-Prolog 5.4.x, Prolog could not represent + \jargon{wide} characters and entities that did not fit in + the Prolog characters set were emitted as a term + \term{number}{+Code}. With the introduction of wide + characters in the 5.5 branch this is no longer needed.} + + +\subsection{`Goodies' Predicates} + +These predicates are for basic use of the library, converting entire and +self-contained files in SGML, HTML, or XML into a structured term. They +are based on load_structure/3. + +\begin{description} + \predicate{load_sgml_file}{2}{+File, -ListOfContent} +Same as \term{load_structure}{File, ListOfContent, [dialect(sgml)]}. + + \predicate{load_xml_file}{2}{+File, -ListOfContent} +Same as \term{load_structure(File, ListOfContent, [dialect(xml)]}. + + \predicate{load_html_file}{2}{+File, -Content} +Load \arg{File} and parse as HTML. Implemented as below. Note that +load_html_file/2 re-uses a cached DTD object as defined by dtd/2. As DTD +objects may be corrupted while loading errornous documents sharing is +undesirable if the documents are not known to be correct. See dtd/2 for +details. + +\begin{code} +load_html_file(File, Term) :- + dtd(html, DTD), + load_structure(File, Term, + [ dtd(DTD), + dialect(sgml), + shorttag(false) + ]). +\end{code} +\end{description} + + +\section{Predicate Reference} + +\subsection{Loading Structured Documents} + +SGML or XML files are loaded through the common predicate +load_structure/3. This is a predicate with many options. For +simplicity a number of commonly used shorthands are provided: +load_sgml_file/2, load_xml_file/2, and +load_html_file/2. + +\begin{description} + \predicate{load_structure}{3}{+Source, -ListOfContent, +Options} +Parse \arg{Source} and return the resulting structure in +\arg{ListOfContent}. \arg{Source} is either a term of the format +\term{stream}{StreamHandle} or a file-name. \arg{Options} is a list of +options controlling the conversion process. + +A proper XML document contains only a single toplevel element whose name +matches the document type. Nevertheless, a list is returned for +consistency with the representation of element content. The $), $ instruction is handled internally. +\end{description} + + +The \arg{Options} list controls the conversion process. Currently +defined options are: + +\begin{description} + \termitem{dtd}{?DTD} +Reference to a DTD object. If specified, the \verb$$ +declaration is ignored and the document is parsed and validated against +the provided DTD. If provided as a variable, the created DTD is +returned. See \secref{implicitdtd}. + + \termitem{dialect}{+Dialect} +Specify the parsing dialect. Supported are \const{sgml} (default), \const{xml} +and \const{xmlns}. See \secref{xml} for details on the differences. + + \termitem{shorttag}{+Bool} +Define whether SHORTTAG abbreviation is accepted. The default is true +for SGML mode and false for the XML modes. Without SHORTTAG, a / +is accepted with warning as part of an unquoted attribute-value, though +/> still closes the element-tag in XML mode. It may be set to +false for parsing HTML documents to allow for unquoted URLs containing +/. + + \termitem{space}{+SpaceMode} +Sets the `space-handling-mode' for the initial environment. This mode is +inherited by the other environments, which can override the inherited +value using the XML reserved attribute In +addition, newlines at the end of lines containing only markup should be +deleted. This is not yet implemented. This is the default mode for +the SGML dialect. + + \termitem{space}{preserve} +White space is passed literally to the application. This mode leaves all +white space handling to the application. This is the default mode for +the XML dialect. + + \termitem{space}{default} +In addition to \const{sgml} space-mode, all consequtive white-space is +reduced to a single space-character. This mode canonises all white +space. + + \termitem{space}{remove} +In addition to \const{default}, all leading and trailing white-space is +removed from \const{CDATA} objects. If, as a result, the \const{CDATA} +becomes empty, nothing is passed to the application. This mode is +especially handy for processing `data-oriented' documents, such as RDF. +It is not suitable for normal text documents. Consider the HTML +fragment below. When processed in this mode, the spaces between the +three modified words are lost. This mode is not part of any standard; +XML 1.0 allows only \const{default} and \const{preserve}. + +\begin{code} +Consider adjacent bold
      and
    italic words. +\end{code} +\end{description} + +\subsection{XML documents} \label{sec:xml} + +The parser can operate in two modes: \const{sgml} mode and \const{xml} mode, as +defined by the \term{dialect}{Dialect} option. Regardless of this +option, if the first line of the document reads as below, the parser is +switched automatically into XML mode. + +\begin{code} + +\end{code} + +Currently switching to XML mode implies: + +\begin{itemlist} + \item [XML empty elements] +The construct \verb$$ is recognised as +an empty element. + + \item [Predefined entities] +The following entitities are predefined: \const{lt} (\verb$<$), \const{gt} +(\verb$>$), \const{amp} (\verb$&$), \const{apos} (\verb$'$) +and \const{quot} (\verb$"$). + + \item [Case sensitivity] +In XML mode, names are treated case-sensitive, except for the DTD +reserved names (i.e. \exam{ELEMENT}, \emph{etc.}). + + \item [Character classes] +In XML mode, underscores (\verb$_$) and colon (\verb$:$) are +allowed in names. + + \item [White-space handling] +White space mode is set to \const{preserve}. In addition to setting +white-space handling at the toplevel the XML reserved attribute + +\end{code} +\end{itemlist} + + +\subsubsection{XML Namespaces} \label{sec:xmlns} + +Using the \jargon{dialect} \const{xmlns}, the parser will interpret XML +namespaces. In this case, the names of elements are returned as a term +of the format + +\begin{quote} +\arg{URL}\const{:}\arg{LocalName} +\end{quote} + +If an identifier has no namespace and there is no default namespace it +is returned as a simple atom. If an identifier has a namespace but this +namespace is undeclared, the namespace name rather than the related URL +is returned. + +Attributes declaring namespaces ({\tt xmlns:=}) are reported +as if \const{xmlns} were not a defined resource. + +In many cases, getting attribute-names as \arg{url}:\arg{name} +is not desirable. Such terms are hard to unify and sometimes multiple +URLs may be mapped to the same identifier. This may happen due to poor +version management, poor standardisation or because the the application +doesn't care too much about versions. This package defines two +call-backs that can be set using set_sgml_parser/2 to deal +with this problem. + +The call-back \const{xmlns} is called as XML namespaces are noticed. +It can be used to extend a canonical mapping for later use +by the \const{urlns} call-back. The following illustrates this behaviour. +Any namespace containing \const{rdf-syntax} in its URL or that is used as +\const{rdf} namespace is canonised to \const{rdf}. This implies that any +attribute and element name from the RDF namespace appears as +\verb$rdf:$ + +\begin{code} +:- dynamic + xmlns/3. + +on_xmlns(rdf, URL, _Parser) :- !, + asserta(xmlns(URL, rdf, _)). +on_xmlns(_, URL, _Parser) :- + sub_atom(URL, _, _, _, 'rdf-syntax'), !, + asserta(xmlns(URL, rdf, _)). + +load_rdf_xml(File, Term) :- + load_structure(File, Term, + [ dialect(xmlns), + call(xmlns, on_xmlns), + call(urlns, xmlns) + ]). +\end{code} + +\subsection{DTD-Handling} + +The DTD (\textbf{D}ocument \textbf{T}ype \textbf{D}efinition) is a +separate entity in sgml2pl, that can be created, freed, defined and +inspected. Like the parser itself, it is filled by opening it as a +Prolog output stream and sending data to it. This section summarises the +predicates for handling the DTD. + +\begin{description} + \predicate{new_dtd}{2}{+DocType, -DTD} +Creates an empty DTD for the named \arg{DocType}. The returned +DTD-reference is an opaque term that can be used in the other predicates +of this package. + + \predicate{free_dtd}{1}{+DTD} +Deallocate all resources associated to the DTD. Further use of \arg{DTD} +is invalid. + + \predicate{load_dtd}{2}{+DTD, +File} +Define the DTD by loading the SGML-DTD file \arg{File}. Same +as load_dtd/3 with empty option list. + + \predicate{load_dtd}{3}{+DTD, +File, +Options} +Define the DTD by loading \arg{File}. Defined options are the +\const{dialect} option from open_dtd/3 and the \const{encoding} +option from open/4. Notably the \const{dialect} option must +match the dialect used for subsequent parsing using this DTD. + + \predicate{open_dtd}{3}{+DTD, +Options, -OutStream} +Open a DTD as an output stream. See load_dtd/2 for an example. +Defined options are: + +\begin{description} + \termitem{dialect}{Dialect} +Define the DTD dialect. Default is \const{sgml}. Using \const{xml} or +\const{xmlns} processes the DTD case-sensitive. +\end{description} + + \predicate{dtd}{2}{+DocType, -DTD} +Find the DTD representing the indicated \jargon{doctype}. This predicate +uses a cache of DTD objects. If a doctype has no associated dtd, it +searches for a file using the file search path \exam{dtd} using the call: + +\begin{code} +..., +absolute_file_name(dtd(Type), + [ extensions([dtd]), + access(read) + ], DtdFile), +... +\end{code} + +Note that DTD objects may be modified while processing errornous +documents. For example, loading an SGML document starting with +\verb$$ switches the DTD to XML mode and encountering unknown +elements adds these elements to the DTD object. Re-using a DTD object to +parse multiple documents should be restricted to situations where the +documents processed are known to be error-free. + + \predicate{dtd_property}{2}{+DTD, ?Property} +This predicate is used to examine the content of a DTD. Property is one +of: + +\begin{description} + \termitem{doctype}{DocType} +An atom representing the document-type defined by this DTD. + + \termitem{elements}{ListOfElements} +A list of atoms representing the names of the elements in this DTD. + + \termitem{element}{Name, Omit, Content} +The DTD contains an element with the given name. \arg{Omit} is a term of +the format \term{omit}{OmitOpen, OmitClose}, where both arguments are +booleans (\const{true} or \const{false} representing whether the open- +or close-tag may be omitted. \arg{Content} is the content-model of the +element represented as a Prolog term. This term takes the following +form: + + \begin{description} + \termitem{empty}{} +The element has no content. + + \termitem{cdata}{} +The element contains non-parsed character data. All data up to the +matching end-tag is included in the data (\jargon{declared content}). + + \termitem{rcdata}{} +As \const{cdata}, but entity-references are expanded. + + \termitem{any}{} +The element may contain any number of any element from the DTD in +any order. + + \termitem{\#pcdata}{} +The element contains parsed character data . + + \termitem{\arg{element}} An element with this name. + + \termitem{*}{SubModel} +0 or more appearances. + + \termitem{?}{SubModel} +0 or one appearance. + + \termitem{+}{SubModel} +1 or more appearances. + + \termitem{,}{SubModel1, SubModel2} +\arg{SubModel1} followed by \arg{SubModel2}. + + \termitem{\&}{SubModel1, SubModel2} +\arg{SubModel1} and \arg{SubModel2} in any order. + + \termitem{\chr{|}}{SubModel1, SubModel2} +\arg{SubModel1} or \arg{SubModel2}. +\end{description} + + \termitem{attributes}{Element, ListOfAttributes} +\arg{ListOfAttributes} is a list of atoms representing the attributes +of the element \arg{Element}. + + \termitem{attribute}{Element, Attribute, Type, Default} +Query an element. \arg{Type} is one of \const{cdata}, \const{entity}, +\const{id}, \const{idref}, \const{name}, \const{nmtoken}, +\const{notation}, \const{number} or \const{nutoken}. For DTD types that +allow for a list, the notation \term{list}{Type} is used. Finally, the +DTD construct \verb$(a|b|...)$ is mapped to the term +\term{nameof}{ListOfValues}. + +\arg{Default} describes the sgml default. It is one \const{required}, +\const{current}, \const{conref} or \const{implied}. If a real default is +present, it is one of \term{default}{Value} or \term{fixed}{Value}. + + \termitem{entities}{ListOfEntities} +\arg{ListOfEntities} is a list of atoms representing the names of the +defined entities. + + \termitem{entity}{Name, Value} +\arg{Name} is the name of an entity with given value. Value is one of +\begin{description} + + \termitem{\arg{Atom}}{} +If the value is atomic, it represents the literal value of the entity. + + \termitem{system}{Url} +\arg{Url} is the URL of the system external entity. + + \termitem{public}{Id, Url} +For external public entities, \arg{Id} is the identifier. If an URL is +provided this is returned in \arg{Url}. Otherwise this argument is +unbound. +\end{description} + + \termitem{notations}{ListOfNotations} +Returns a list holding the names of all \const{NOTATION} declarations. + + \termitem{notation}{Name, Decl} +Unify \arg{Decl} with a list if \term{system}{+File} and/or +\term{public}{+PublicId}. +\end{description} +\end{description} + +\subsubsection{The DOCTYPE declaration} + +As this parser allows for processing partial documents and process the +DTD separately, the DOCTYPE declaration plays a special role. + +If a document has no DOCTYPE declaraction, the parser returns a list +holding all elements and CDATA found. If the document has a DOCTYPE +declaraction, the parser will open the element defined in the DOCTYPE as +soon as the first real data is encountered. + +\subsection{Extracting a DTD} \label{sec:implicitdtd} + +Some documents have no DTD. One of the neat facilities of this library +is that it builds a DTD while parsing a document with an +\end{code} + +Any encountered attribute is added to the attribute list with the type +\const{CDATA} and default \const{\#IMPLIED}. + +The example below extracts the elements used in an unknown XML document. + +\begin{code} +elements_in_xml_document(File, Elements) :- + load_structure(File, _, + [ dialect(xml), + dtd(DTD) + ]), + dtd_property(DTD, elements(Elements)), + free_dtd(DTD). +\end{code} + +\subsection{Parsing Primitives} + +\begin{description} + \predicate{new_sgml_parser}{2}{-Parser, +Options} +Creates a new parser. A parser can be used one or multiple times for +parsing documents or parts thereof. It may be bound to a DTD or the DTD +may be left implicit, in which case it is created from the document +prologue or parsing is performed without a DTD. Options: +\begin{description} + \termitem{dtd}{?DTD} +If specified with an initialised DTD, this DTD is used for parsing the +document, regardless of the document prologue. If specified using as a +variable, a reference to the created DTD is returned. This DTD may be +created from the document prologue or build implicitely from the +document's content. +\end{description} + + \predicate{free_sgml_parser}{1}{+Parser} +Destroy all resources related to the parser. This does not destroy the +DTD if the parser was created using the \term{dtd}{DTD} option. + + \predicate{set_sgml_parser}{2}{+Parser, +Option} +Sets attributes to the parser. Currently defined attributes: + +\begin{description} + \termitem{file}{File} +Sets the file for reporting errors and warnings. Sets the line to 1. + \termitem{line}{Line} +Sets the current line. Useful if the stream is not at the start of the +(file) object for generating proper line-numbers. + \termitem{charpos}{Offset} +Sets the current character location. See also the \term{file}{File} +option. + \termitem{dialect}{Dialect} +Set the markup dialect. Known dialects: +\begin{description} + + \termitem{sgml}{} +The default dialect is to process as SGML. This implies markup is +case-insensitive and standard SGML abbreviation is allowed (abreviated +attributes and omitted tags). + + \termitem{xml}{} +This dialect is selected automatically if the processing instruction +\verb$$ is encountered. See \secref{xml} for details. + + \termitem{xmlns}{} +Process file as XML file with namespace support. See \secref{xmlns} for +details. See also the \verb$qualify_attributes$ option below. +\end{description} + + \termitem{qualify_attributes}{Boolean} +How to handle unqualified attribute (i.e. without an explicit namespace) +in XML namespace (\const{xmlns}) mode. Default and standard compliant is +not to qualify such elements. If \const{true}, such attributes are +qualified with the namespace of the element they appear in. This option +is for backward compatibility as this is the behaviour of older +versions. In addition, the namespace document suggests unqualified +attributes are often interpreted in the namespace of their element. + + \termitem{space}{SpaceMode} +Define the initial handling of white-space in PCDATA. This attribute is +described in \secref{space}. + + \termitem{number}{NumberMode} +If \const{token} (default), attributes of type number are passed as a +Prolog atom. If \const{integer}, such attributes are translated into +Prolog integers. If the conversion fails (e.g. due to overflow) a +warning is issued and the value is passed as an atom. + + \termitem{encoding}{Encoding} +Set the initial encoding. The default initial encoding for XML documents is +UTF-8 and for SGML documents ISO-8859-1. XML documents may change the +encoding using the encoding= attribute in the header. Explicit +use of this option is only required to parse non-conforming documents. +Currently accepted values are \const{iso-8859-1} and \const{utf-8}. + + \termitem{doctype}{Element} +Defines the toplevel element expected. If a \verb$doctype(_). This feature is especially +useful when parsing part of a document (see the \const{parse} option to +sgml_parse/2. +\end{description} + + \predicate{get_sgml_parser}{2}{+Parser, -Option} +Retrieve infomation on the current status of the parser. Notably useful +if the parser is used in the call-back mode. Currently defined options: + +\begin{description} + \termitem{file}{-File} +Current file-name. Note that this may be different from the provided +file if an external entity is being loaded. + + \termitem{line}{-Line} +Line-offset from where the parser started its processing in the file-object. + + \termitem{charpos}{-CharPos} +Offset from where the parser started its processing in the file-object. +See \secref{indexaccess}. + + \termitem{charpos}{-Start, -End} +Character offsets of the start and end of the source processed causing the +current call-back. Used in \program{PceEmacs} to for colouring +text in SGML and XML modes. + + \termitem{source}{-Stream} +Prolog stream being processed. May be used in the \const{on_begin}, \emph{etc.} +callbacks from sgml_parse/2. + + \termitem{dialect}{-Dialect} +Return the current dialect used by the parser (\const{sgml}, \const{xml} or \const{xmlns}). + + \termitem{event_class}{-Class} +The \jargon{event class} can be requested in call-back events. It +denotes the cause of the event, providing useful information for syntax +highlighting. Defined values are: + \begin{description} + + \termitem{explicit}{} + The code generating this event is explicitely present in the + document. + + \termitem{omitted}{} + The current event is caused by the insertion of an omitted tag. + This may be a normal event in SGML mode or an error in XML mode. + + \termitem{shorttag}{} + The current event (\const{begin} or \const{end}) is caused by an + element written down using the \jargon{shorttag} notation + (\verb$$. + + \termitem{shortref}{} + The current event is caused by the expansion of a + \jargon{shortref}. This allows for highlighting shortref strings + in the source-text. + \end{description} + + \termitem{doctype}{-Element} +Return the defined document-type (= toplevel element). See also +set_sgml_parser/2. + + \termitem{dtd}{-DTD} +Return the currently used DTD. See dtd_property/2 for obtaining information +on the DTD such as element and attribute properties. + + \termitem{context}{-StackOfElements} +Returns the stack of currently open elements as a list. The head of this +list is the current element. This can be used to determine the context +of, for example, CDATA events in call-back mode. The elements +are passed as atoms. Currently no access to the attributes is provided. + + \termitem{allowed}{-Elements} +Determines which elements may be inserted at the current location. This +information is returned as a list of element-names. If character data is +allowed in the current location, \const{\#pcdata} is part of +\arg{Elements}. If no element is open, the \jargon{doctype} is returned. + +This option is intended to support syntax-sensitive editors. Such an +editor should load the DTD, find an appropriate starting point and then +feed all data between the starting point and the caret into the parser. +Next it can use this option to determine the elements allowed at this +point. Below is a code fragment illustrating this use given a parser +with loaded DTD, an input stream and a start-location. + +\begin{code} + ..., + seek(In, Start, bof, _), + set_sgml_parser(Parser, charpos(Start)), + set_sgml_parser(Parser, doctype(_)), + Len is Caret - Start, + sgml_parse(Parser, + [ source(In), + content_length(Len), + parse(input) % do not complete document + ]), + get_sgml_parser(Parser, allowed(Allowed)), + ... +\end{code} +\end{description} + + \predicate{sgml_parse}{2}{+Parser, +Options} +Parse an XML file. The parser can operate in two input and two output +modes. Output is either a structured term as described with +load_structure/2 or call-backs on predefined events. The +first is especially suitable for manipulating not-too-large documents, +while the latter provides a primitive means for handling very large +documents. + +Input is a stream. A full description of the option-list is below. + +\begin{description} + \termitem{document}{+Term} +A variable that will be unified with a list describing the content of +the document (see load_structure/2). + \termitem{source}{+Stream} +An input stream that is read. This option print_message/2. + \termitem{style}{} + Print dubious input such as attempts for redefinitions in the DTD + using print_message/2 with severity + \const{informational}. + \end{description} + \termitem{call}{+Event, :PredicateName} +Issue call-backs on the specified events. \arg{PredicateName} is the +name of the predicate to call on this event, possibly prefixed with a +module identifier. If the handler throws an exception, parsing is stopped +and sgml_parse/2 re-throws the exception. The defined events are: +\begin{description} + \termitem{begin}{} +An open-tag has been parsed. The named handler is called with three +arguments: \term{\arg{Handler}}{+Tag, +Attributes, +Parser}. + \termitem{end}{} +A close-tag has been parsed. The named handler is called with two +arguments: \term{\arg{Handler}}{+Tag, +Parser}. + + \termitem{cdata}{} +CDATA has been parsed. The named handler is called with two arguments: +\term{Handler}{+CDATA, +Parser}, where CDATA is an atom +representing the data. + + \termitem{pi}{} +A processing instruction has been parsed. The named handler is called +with two arguments: \term{\arg{Handler}}{+Text, +Parser}, where +\arg{Text} is the text of the processing instruction. + + \termitem{decl}{} +A declaration (\verb$$) has been read. The named handler is +called with two arguments: \term{\arg{Handler}}{+Text, +Parser}, +where \arg{Text} is the text of the declaration with comments removed. + +This option is expecially useful for highlighting declarations and comments in +editor support, where the location of the declaration is extracted using +get_sgml_parser/2. + + \termitem{error}{} +An error has been encountered. the named handler is called with three +arguments: \term{\arg{Handler}}{+Severity, +Message, +Parser}, where +\arg{Severity} is one of \const{warning} or \const{error} and +\arg{Message} is an atom representing the diagnostic message. The +location of the error can be determined using get_sgml_parser/2 + +If this option is present, errors and warnings are not reported using +print_message/3 + + \termitem{xmlns}{} +When parsing an in \const{xmlns} mode, a new namespace declaraction is +pushed on the environment. The named handler is called with three +arguments: \term{\arg{Handler}}{+NameSpace, +URL, +Parser}. +See \secref{xmlns} for details. + + \termitem{urlns}{} +When parsing an in \const{xmlns} mode, this predicate can be used to map a +url into either a canonical URL for this namespace or another internal +identifier. See \secref{xmlns} for details. +\end{description} +\end{description} +\end{description} + +\subsubsection{Partial Parsing} + +In some cases, part of a document needs to be parsed. One option is to +use load_structure/2 or one of its variations and extract +the desired elements from the returned structure. This is a clean +solution, especially on small and medium-sized documents. It however is +unsuitable for parsing really big documents. Such documents can only be +handled with the call-back output interface realised by the +\term{call}{Event, Action} option of sgml_parse/2. +Event-driven processing is not very natural in Prolog. + +The SGML2PL library allows for a mixed approach. Consider the case where +we want to process all descriptions from RDF elements in a document. The +code below calls process_rdf_description(Element) on each element +that is directly inside an RDF element. + +\begin{code} +:- dynamic + in_rdf/0. + +load_rdf(File) :- + retractall(in_rdf), + open(File, read, In), + new_sgml_parser(Parser, []), + set_sgml_parser(Parser, file(File)), + set_sgml_parser(Parser, dialect(xml)), + sgml_parse(Parser, + [ source(In), + call(begin, on_begin), + call(end, on_end) + ]), + close(In). + +on_end('RDF', _) :- + retractall(in_rdf). + +on_begin('RDF', _, _) :- + assert(in_rdf). +on_begin(Tag, Attr, Parser) :- + in_rdf, !, + sgml_parse(Parser, + [ document(Content), + parse(content) + ]), + process_rdf_description(element(Tag, Attr, Content)). +\end{code} + +\subsection{Type checking} + +\begin{description} + \predicate{xml_is_dom}{1}{@{Term}} +True if \arg{Term} is an SGML/XML term as produced by one of the above +predciates and acceptable by xml_write/3 and friends. +\end{description} + +\section{Stream encoding issues} \label{sec:encoding} + +The parser can deal with ISO Latin-1 and UTF-8 encoded files, doing +decoding based on the encoding argument provided to +set_sgml_parser/2 or, for XML, based on the \const{encoding} +attribute of the XML header. The parser reads from SWI-Prolog streams, +which also provide encoding handling. Therefore, there are two modes +for parsing. If the SWI-Prolog stream has encoding \const{octet} (which +is the default for binary streams), the decoder of the SGML parser will +be used and positions reported by the parser are octet offsets in the +stream. In other cases, the Prolog stream decoder is used and offsets +are character code counts. + +\section{Processing Indexed Files} \label{sec:indexaccess} + +In some cases applications wish to process small portions of large +SGML, XML or RDF files. For example, the \emph{OpenDirectory} project +by Netscape has produced a 90MB RDF file representing the main index. +The parser described here can process this document as a unit, but +loading takes 85 seconds on a Pentium-II 450 and the resulting term +requires about 70MB global stack. One option is to process the entire +document and output it as a Prolog fact-base of RDF triplets, but in +many cases this is undesirable. Another example is a large SGML file +containing online documentation. The application normally wishes to +provide only small portions at a time to the user. Loading the entire +document into memory is then undesirable. + +Using the \term{parse}{element} option, we open a file, seek +(using seek/4) to the position of the element and +read the desired element. + +The index can be built using the call-back interface of +sgml_parse/2. For example, the following code makes an +index of the \file{ structure.rdf} file of the OpenDirectory +project: + +\begin{code} +:- dynamic + location/3. % Id, File, Offset + +rdf_index(File) :- + retractall(location(_,_)), + open(File, read, In, [type(binary)]), + new_sgml_parser(Parser, []), + set_sgml_parser(Parser, file(File)), + set_sgml_parser(Parser, dialect(xml)), + sgml_parse(Parser, + [ source(In), + call(begin, index_on_begin) + ]), + close(In). + +index_on_begin(_Element, Attributes, Parser) :- + memberchk('r:id'=Id, Attributes), + get_sgml_parser(Parser, charpos(Offset)), + get_sgml_parser(Parser, file(File)), + assert(location(Id, File, Offset)). +\end{code} + +The following code extracts the RDF element with required id: + +\begin{code} +rdf_element(Id, Term) :- + location(Id, File, Offset), + load_structure(File, Term, + [ dialect(xml), + offset(Offset), + parse(element) + ]). +\end{code} + +\section{External entities} + +While processing an SGML document the document may refer to external +data. This occurs in three places: external parameter entities, normal +external entities and the \const{DOCTYPE} declaration. The current version +of this tool deals rather primitively with external data. External +entities can only be loaded from a file and the mapping between the +entity names and the file is done using a \jargon{catalog} file in a +format compatible with that used by James Clark's SP Parser, +based on the SGML Open (now OASIS) specification. + +Catalog files can be specified using two primitives: the predicate +sgml_register_catalog_file/2 or the environment variable +\env{SGML_CATALOG_FILES} (compatible with the SP package). + +\begin{description} + \predicate{sgml_register_catalog_file}{2}{+File, +Location} +Register the indicated \arg{File} as a catalog file. \arg{Location} is +either \const{start} or \const{end} and defines whether the catalog is +considered first or last. This predicate has no effect if \arg{File} is +already part of the catalog. + +If no files are registered using this predicate, the first query on the +catalog examines \env{SGML_CATALOG_FILES} and fills the catalog with +all files in this path. +\end{description} + +Two types of lines are used by this package. + +\begin{quote} +\const{DOCTYPE} \arg{doctype} \arg{file} \\ +\const{PUBLIC} \exam{"}\arg{Id}\exam{"} \arg{file} +\end{quote} + +The specified \arg{file} path is taken relative to the location of the +catolog file. For the \const{DOCTYPE} declaraction, \pllib{sgml} first +makes an attempt to resolve the \const{SYSTEM} or \const{PUBLIC} +identifier. If this fails it tries to resolve the \arg{doctype} using +the provided catalog files. + +Strictly speaking, \pllib{sgml} breaks the rules for XML, +where system identifiers must be Universal Resource Indicators, not +local file names. Simple uses of relative URIs will work correctly under +UNIX and Windows. + +In the future we will design a call-back mechanism for locating and +processing external entities, so Prolog-based file-location and Prolog +resources can be used to store external entities. + +\section{Writing markup} + +\subsection{Writing documents} + +The library \pllib{sgml_write} provides the inverse of the parser, +converting the parser's output back into a file. This process is fairly +simple for XML, but due to the power of the SGML DTD it is much harder +to achieve a reasonable generic result for SGML. + +These predicates can write the output in two encoding schemas depending +on the encoding of the \arg{Stream}. In UTF-8 mode, all characters are +encoded using UTF-8 sequences. In ISO Latin-1 mode, characters outside +the ISO Latin-1 range are represented using a named character entity if +provided by the DTD or a numeric character entity. + +\begin{description} + \predicate{xml_write}{3}{+Stream, +Term, +Options} +Write the XML header with encoding information and the content of +the document as represented by \arg{Term} to \arg{Stream}. This +predicate deals with XML with or without namespaces. If namespace +identifiers are not provided they are generated. This predicate +defines the following \arg{Options} + +\begin{description} + \termitem{dtd}{DTD} +Specify the DTD. In SGML documents the DTD is required to distinguish +between elements that are declared empty in the DTD and elements that +just happen to have no content. Further optimisation (shortref, omitted +tags, etc.) could be considered in the future. The DTD is also used to +find the declared named character entities. + \termitem{doctype}{Doctype} +Document type to include in the header. When omitted it is taken from +the outer element. + \termitem{header}{Bool} +If \arg{Bool} is \const{false}, the XML header is suppressed. Useful for +embedding in other XML streams. + \termitem{layout}{Bool} +Do/do not emit layout characters to make the output readable, Default is +to emit layout. With layout enabled, elements only containing other +elements are written using increasing indentation. This introduces +(depending on the mode and defined whitespace handling) CDATA sequences +with only layout between elements when read back in. If \const{false}, no +layout characters are added. As this mode does not need to analyse the +document it is faster and guarantees correct output when read back. +Unfortunately the output is hardly human readable and causes problems +with many editors. + \termitem{indent}{Integer} +Set the initial element indentation. It more than zero, the indent +is written before the document. + \termitem{nsmap}{Map} +Set the initial namespace map. \arg{Map} is a list of +\arg{Name} = \arg{URI}. This option, together with \const{header} and +\const{ident} is added to use xml_write/3 to generate XML +that is embedded in a larger XML document. + \termitem{net}{Bool} + +Use/do not use \jargon{Null End Tags}. For XML, this applies only to +empty elements, so you get \verb$$ (default, +\term{net}{true}) or \verb$$ +(\term{net}{false}). For SGML, this applies to empty elements, so +you get \verb$$ (if foo is declared to be \const{EMPTY} in the DTD), +\verb$$ (default, \term{net}{false}) or +\verb$/ can be emitted as \verb$xxx$ +(default, \term{net}{false} or \verb$&"$.% + \footnote{Older versions also mapped \texttt{'} to + \texttt{\'}.} +Characters that cannot represented in \arg{Encoding} are mapped to XML +character entities. + + \predicate{xml_quote_attribute}{2}{+In, -Quoted} +Backward compatibility version for xml_quote_attribute/3. +Assumes \const{ascii} encoding. + + \predicate{xml_quote_cdata}{3}{+In, -Quoted, +Encoding} +Very similar to xml_quote_attribute/3, but does not quote the +single- and double-quotes. + + \predicate{xml_quote_cdata}{2}{+In, -Quoted} +Backward compatibility version for xml_quote_cdata/3. +Assumes \const{ascii} encoding. + + \predicate{xml_name}{2}{+In, +Encoding} +Succeed if \arg{In} is an atom or string that satisfies the rules for +a valid XML element or attribute name. As with the other predicates in +this group, if \arg{Encoding} cannot represent one of the characters, this +function fails. It uses a hard-coded table for ASCII-range characters and +iswalpha()/iswalnum() for the first and remaining characters of the name. + + \predicate{xml_name}{1}{+In} +Backward compatibility version for xml_name/2. Assumes \const{ascii} +encoding. +\end{description} + +\section{Unsupported features} + +The current parser is rather limited. While it is able to deal with many +serious documents, it omits several less-used features of SGML and XML. +Known missing SGML features include + +\begin{itemlist} + \item [NOTATION on entities] +Though notation is parsed, notation attributes on external entity +declarations are not handed to the user. + \item [NOTATION attributes] +SGML notations may have attributes, declared using +\verb$$. Those data attributes +are provided when you declare an external CDATA, NDATA, or SDATA entity. + +XML does not include external CDATA, NDATA, or SDATA entities, +nor any of the other uses to which data attributes are put in SGML, +so it doesn't include data attributes for notations either. + +Sgml2pl does not support this feature and is unlikely to; +you should be aware that SGML documents using this feature cannot +be converted faithfully to XML. + \item [SHORTTAG] +The SGML SHORTTAG syntax is only partially implemented. Currently, +\verb$content$, which can also be written as +\verb$content$. +Empty start tags (\verb$<>$), unclosed start tags +(\verb$) and unclosed end tags ( true + ; Base = Type + ), + absolute_file_name(dtd(Base), + [ extensions([dtd]), + access(read) + ], DtdFile), + load_dtd(DTD, DtdFile), + register_cleanup, + asserta(current_dtd(Type, DTD)). + +%% load_dtd(+DTD, +DtdFile, +Options) +% +% Load file into a DTD. Defined options are: +% +% * dialect(+Dialect) +% Dialect to use (xml, xmlns, sgml) +% +% * encoding(+Encoding) +% Encoding of DTD file + +load_dtd(DTD, DtdFile) :- + load_dtd(DTD, DtdFile, []). +load_dtd(DTD, DtdFile, Options) :- + split_dtd_options(Options, DTDOptions, FileOptions), + open_dtd(DTD, DTDOptions, DtdOut), + swi:swi_open(DtdFile, read, DtdIn, FileOptions), + swi:swi_copy_stream_data(DtdIn, DtdOut), + swi:swi_close(DtdIn), + swi:swi_close(DtdOut). + +split_dtd_options([], [], []). +split_dtd_options([H|T], [H|TD], S) :- + dtd_option(H), !, + split_dtd_options(T, TD, S). +split_dtd_options([H|T], TD, [H|S]) :- + split_dtd_options(T, TD, S). + +dtd_option(dialect(_)). + + +%% destroy_dtds +% +% Destroy DTDs cached by this thread as they will become +% unreachable anyway. + +destroy_dtds :- + ( current_dtd(_Type, DTD), + free_dtd(DTD), + fail + ; true + ). + +%% register_cleanup +% +% Register cleanup of DTDs created for this thread. + +register_cleanup :- + registered_cleanup, !. +register_cleanup :- + catch(thread_at_exit(destroy_dtds), _, true), + assert(registered_cleanup). + + + /******************************* + * EXAMINE DTD * + *******************************/ + +prop(doctype(_), _). +prop(elements(_), _). +prop(entities(_), _). +prop(notations(_), _). +prop(entity(E, _), DTD) :- + ( nonvar(E) + -> true + ; '$dtd_property'(DTD, entities(EL)), + member(E, EL) + ). +prop(element(E, _, _), DTD) :- + ( nonvar(E) + -> true + ; '$dtd_property'(DTD, elements(EL)), + member(E, EL) + ). +prop(attributes(E, _), DTD) :- + ( nonvar(E) + -> true + ; '$dtd_property'(DTD, elements(EL)), + member(E, EL) + ). +prop(attribute(E, A, _, _), DTD) :- + ( nonvar(E) + -> true + ; '$dtd_property'(DTD, elements(EL)), + member(E, EL) + ), + ( nonvar(A) + -> true + ; '$dtd_property'(DTD, attributes(E, AL)), + member(A, AL) + ). +prop(notation(N, _), DTD) :- + ( nonvar(N) + -> true + ; '$dtd_property'(DTD, notations(NL)), + member(N, NL) + ). + +dtd_property(DTD, Prop) :- + prop(Prop, DTD), + '$dtd_property'(DTD, Prop). + + + /******************************* + * SGML * + *******************************/ + +parser_option(dialect(_)). +parser_option(shorttag(_)). +parser_option(file(_)). +parser_option(line(_)). +parser_option(space(_)). +parser_option(number(_)). +parser_option(defaults(_)). +parser_option(doctype(_)). +parser_option(qualify_attributes(_)). +parser_option(encoding(_)). + +set_parser_options(Parser, Options, RestOptions) :- + parser_option(Option), + select_option(Option, Options, RestOptions0), !, + set_sgml_parser(Parser, Option), + set_parser_options(Parser, RestOptions0, RestOptions). +set_parser_options(_, Options, Options). + + +load_structure(stream(In), Term, Options) :- !, + ( select_option(offset(Offset), Options, Options1) + -> seek(In, Offset, bof, _) + ; Options1 = Options + ), + ( select_option(dtd(DTD), Options1, Options2) + -> ExplicitDTD = true + ; ExplicitDTD = false, + Options2 = Options1 + ), + new_sgml_parser(Parser, + [ dtd(DTD) + ]), + def_entities(Options2, DTD, Options3), + call_cleanup(parse(Parser, Options3, TermRead, In), + free_sgml_parser(Parser)), + ( ExplicitDTD == true + -> ( DTD = dtd(_, DocType), + dtd_property(DTD, doctype(DocType)) + -> true + ; true + ) + ; free_dtd(DTD) + ), + Term = TermRead. +load_structure(Stream, Term, Options) :- + swi:swi_is_stream(Stream), !, + load_structure(stream(Stream), Term, Options). +load_structure(File, Term, Options) :- + swi:swi_open(File, read, In, [type(binary)]), + load_structure(stream(In), Term, [file(File)|Options]), + swi:swi_close(In). + +parse(Parser, Options, Document, In) :- + set_parser_options(Parser, Options, Options1), + sgml_parse(Parser, + [ document(Document), + source(In) + | Options1 + ]). + +def_entities([], _, []). +def_entities([entity(Name, Value)|T], DTD, Opts) :- !, + def_entity(DTD, Name, Value), + def_entities(T, DTD, Opts). +def_entities([H|T0], DTD, [H|T]) :- + def_entities(T0, DTD, T). + +def_entity(DTD, Name, Value) :- + open_dtd(DTD, [], Stream), + xml_quote_attribute(Value, QValue), + swi:swi_format(Stream, '~n', [Name, QValue]), + swi:close(Stream). + + + /******************************* + * UTILITIES * + *******************************/ + +load_sgml_file(File, Term) :- + load_structure(File, Term, [dialect(sgml)]). + +load_xml_file(File, Term) :- + load_structure(File, Term, [dialect(xml)]). + +load_html_file(File, Term) :- + dtd(html, DTD), + load_structure(File, Term, + [ dtd(DTD), + dialect(sgml), + shorttag(false) + ]). + + + /******************************* + * ENCODING * + *******************************/ + +% xml_quote_attribute(+In, -Quoted) +% xml_quote_cdata(+In, -Quoted) +% +% Backward compatibility for versions that allow to specify +% encoding. All characters that cannot fit the encoding are mapped +% to XML character entities (&#dd;). Using ASCII is the safest +% value. + +xml_quote_attribute(In, Quoted) :- + xml_quote_attribute(In, Quoted, ascii). + +xml_quote_cdata(In, Quoted) :- + xml_quote_cdata(In, Quoted, ascii). + +xml_name(In) :- + xml_name(In, ascii). + + + /******************************* + * TYPE CHECKING * + *******************************/ + +% xml_is_dome(@Term) +% +% True if term statisfies the structure as returned by +% load_structure/3 and friends. + +xml_is_dom(0) :- !, fail. % catch variables +xml_is_dom([]) :- !. +xml_is_dom([H|T]) :- !, + xml_is_dom(H), + xml_is_dom(T). +xml_is_dom(element(Name, Attributes, Content)) :- !, + dom_name(Name), + dom_attributes(Attributes), + xml_is_dom(Content). +xml_is_dom(pi(Pi)) :- !, + atom(Pi). +xml_is_dom(CDATA) :- + atom(CDATA). + +dom_name(NS:Local) :- + atom(NS), + atom(Local), !. +dom_name(Local) :- + atom(Local). + +dom_attributes(0) :- !, fail. +dom_attributes([]). +dom_attributes([H|T]) :- + dom_attribute(H), + dom_attributes(T). + +dom_attribute(Name=Value) :- + dom_name(Name), + atomic(Value). + + + /******************************* + * MESSAGES * + *******************************/ +:- multifile + prolog:message/3. + +% Catch messages. sgml/4 is generated by the SGML2PL binding. + +prolog:message(sgml(Parser, File, Line, Message)) --> + { get_sgml_parser(Parser, dialect(Dialect)) + }, + [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. + + + /******************************* + * XREF SUPPORT * + *******************************/ + +:- multifile + prolog:called_by/2. + +prolog:called_by(sgml_parse(_, Options), Called) :- + is_list(Options), + findall(G+3, + ( member(call(_, G), Options), + callable(G) + ), + Called). diff --git a/packages/sgml/sgml2pl.c b/packages/sgml/sgml2pl.c new file mode 100644 index 000000000..0dde74a5c --- /dev/null +++ b/packages/sgml/sgml2pl.c @@ -0,0 +1,2401 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2008, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ + +#ifdef __WINDOWS__ +#include +#endif + +#define DTD_MINOR_ERRORS 1 /* get detailed errors */ + +#include +#include "dtd.h" +#include "catalog.h" +#include "model.h" +#include +#include +#include +#include "error.h" +#include +#include +#include +#include + +#define streq(s1, s2) (strcmp(s1, s2) == 0) + +#define MAX_ERRORS 50 +#define MAX_WARNINGS 50 + +#define ENDSNUL ((size_t)-1) + + /******************************* + * PARSER CONTEXT DATA * + *******************************/ + +#define PD_MAGIC 0x36472ba1 /* just a number */ + +typedef enum +{ SA_FILE = 0, /* Stop at end-of-file */ + SA_INPUT, /* Do not complete input */ + SA_ELEMENT, /* Stop after first element */ + SA_CONTENT, /* Stop after close */ + SA_DECL /* Stop after declaration */ +} stopat; + +typedef enum +{ EM_QUIET = 0, /* Suppress messages */ + EM_PRINT, /* Print message */ + EM_STYLE /* include style-messages */ +} errormode; + +typedef struct _env +{ term_t tail; + struct _env *parent; +} env; + + +typedef struct _parser_data +{ int magic; /* PD_MAGIC */ + dtd_parser *parser; /* parser itself */ + + int warnings; /* #warnings seen */ + int errors; /* #errors seen */ + int max_errors; /* error limit */ + int max_warnings; /* warning limit */ + errormode error_mode; /* how to handle errors */ + int positions; /* report file-positions */ + int exception; /* pending exception from callback */ + + predicate_t on_begin; /* begin element */ + predicate_t on_end; /* end element */ + predicate_t on_cdata; /* cdata */ + predicate_t on_entity; /* entity */ + predicate_t on_pi; /* processing instruction */ + predicate_t on_xmlns; /* xmlns */ + predicate_t on_urlns; /* url --> namespace */ + predicate_t on_error; /* errors */ + predicate_t on_decl; /* declarations */ + + stopat stopat; /* Where to stop */ + int stopped; /* Environment is complete */ + + IOSTREAM* source; /* Where we are reading from */ + + term_t list; /* output term (if any) */ + term_t tail; /* tail of the list */ + env *stack; /* environment stack */ + int free_on_close; /* sgml_free parser on close */ +} parser_data; + + + /******************************* + * CONSTANTS * + *******************************/ + +static functor_t FUNCTOR_and2; +static functor_t FUNCTOR_bar2; +static functor_t FUNCTOR_comma2; +static functor_t FUNCTOR_default1; +static functor_t FUNCTOR_dialect1; +static functor_t FUNCTOR_document1; +static functor_t FUNCTOR_dtd1; +static functor_t FUNCTOR_dtd2; +static functor_t FUNCTOR_element3; +static functor_t FUNCTOR_entity1; +static functor_t FUNCTOR_equal2; +static functor_t FUNCTOR_file1; +static functor_t FUNCTOR_fixed1; +static functor_t FUNCTOR_line1; +static functor_t FUNCTOR_list1; +static functor_t FUNCTOR_max_errors1; +static functor_t FUNCTOR_nameof1; +static functor_t FUNCTOR_notation1; +static functor_t FUNCTOR_omit2; +static functor_t FUNCTOR_opt1; +static functor_t FUNCTOR_plus1; +static functor_t FUNCTOR_rep1; +static functor_t FUNCTOR_sgml_parser1; +static functor_t FUNCTOR_parse1; +static functor_t FUNCTOR_source1; +static functor_t FUNCTOR_content_length1; +static functor_t FUNCTOR_call2; +static functor_t FUNCTOR_charpos1; +static functor_t FUNCTOR_charpos2; +static functor_t FUNCTOR_ns2; /* :/2 */ +static functor_t FUNCTOR_space1; +static functor_t FUNCTOR_pi1; +static functor_t FUNCTOR_sdata1; +static functor_t FUNCTOR_ndata1; +static functor_t FUNCTOR_number1; +static functor_t FUNCTOR_syntax_errors1; +static functor_t FUNCTOR_minus2; +static functor_t FUNCTOR_positions1; +static functor_t FUNCTOR_event_class1; +static functor_t FUNCTOR_doctype1; +static functor_t FUNCTOR_allowed1; +static functor_t FUNCTOR_context1; +static functor_t FUNCTOR_defaults1; +static functor_t FUNCTOR_shorttag1; +static functor_t FUNCTOR_qualify_attributes1; +static functor_t FUNCTOR_encoding1; + +static atom_t ATOM_true; +static atom_t ATOM_false; +static atom_t ATOM_cdata; +static atom_t ATOM_rcdata; +static atom_t ATOM_pcdata; +static atom_t ATOM_empty; +static atom_t ATOM_any; +static atom_t ATOM_position; + +#define mkfunctor(n, a) PL_new_functor(PL_new_atom(n), a) + +static void +initConstants(void) +{ + FUNCTOR_sgml_parser1 = mkfunctor("sgml_parser", 1); + FUNCTOR_equal2 = mkfunctor("=", 2); + FUNCTOR_dtd1 = mkfunctor("dtd", 1); + FUNCTOR_element3 = mkfunctor("element", 3); + FUNCTOR_entity1 = mkfunctor("entity", 1); + FUNCTOR_document1 = mkfunctor("document", 1); + FUNCTOR_dtd2 = mkfunctor("dtd", 2); + FUNCTOR_omit2 = mkfunctor("omit", 2); + FUNCTOR_and2 = mkfunctor("&", 2); + FUNCTOR_comma2 = mkfunctor(",", 2); + FUNCTOR_bar2 = mkfunctor("|", 2); + FUNCTOR_opt1 = mkfunctor("?", 1); + FUNCTOR_rep1 = mkfunctor("*", 1); + FUNCTOR_plus1 = mkfunctor("+", 1); + FUNCTOR_default1 = mkfunctor("default", 1); + FUNCTOR_fixed1 = mkfunctor("fixed", 1); + FUNCTOR_list1 = mkfunctor("list", 1); + FUNCTOR_nameof1 = mkfunctor("nameof", 1); + FUNCTOR_notation1 = mkfunctor("notation", 1); + FUNCTOR_file1 = mkfunctor("file", 1); + FUNCTOR_line1 = mkfunctor("line", 1); + FUNCTOR_dialect1 = mkfunctor("dialect", 1); + FUNCTOR_max_errors1 = mkfunctor("max_errors", 1); + FUNCTOR_parse1 = mkfunctor("parse", 1); + FUNCTOR_source1 = mkfunctor("source", 1); + FUNCTOR_content_length1= mkfunctor("content_length", 1); + FUNCTOR_call2 = mkfunctor("call", 2); + FUNCTOR_charpos1 = mkfunctor("charpos", 1); + FUNCTOR_charpos2 = mkfunctor("charpos", 2); + FUNCTOR_ns2 = mkfunctor(":", 2); + FUNCTOR_space1 = mkfunctor("space", 1); + FUNCTOR_pi1 = mkfunctor("pi", 1); + FUNCTOR_sdata1 = mkfunctor("sdata", 1); + FUNCTOR_ndata1 = mkfunctor("ndata", 1); + FUNCTOR_number1 = mkfunctor("number", 1); + FUNCTOR_syntax_errors1 = mkfunctor("syntax_errors", 1); + FUNCTOR_minus2 = mkfunctor("-", 2); + FUNCTOR_positions1 = mkfunctor("positions", 1); + FUNCTOR_event_class1 = mkfunctor("event_class", 1); + FUNCTOR_doctype1 = mkfunctor("doctype", 1); + FUNCTOR_allowed1 = mkfunctor("allowed", 1); + FUNCTOR_context1 = mkfunctor("context", 1); + FUNCTOR_defaults1 = mkfunctor("defaults", 1); + FUNCTOR_shorttag1 = mkfunctor("shorttag", 1); + FUNCTOR_qualify_attributes1 = mkfunctor("qualify_attributes", 1); + FUNCTOR_encoding1 = mkfunctor("encoding", 1); + + ATOM_true = PL_new_atom("true"); + ATOM_false = PL_new_atom("false"); + ATOM_cdata = PL_new_atom("cdata"); + ATOM_rcdata = PL_new_atom("rcdata"); + ATOM_pcdata = PL_new_atom("#pcdata"); + ATOM_empty = PL_new_atom("empty"); + ATOM_any = PL_new_atom("any"); + ATOM_position = PL_new_atom("#position"); +} + + /******************************* + * ACCESS * + *******************************/ + +static int +unify_parser(term_t parser, dtd_parser *p) +{ return PL_unify_term(parser, PL_FUNCTOR, FUNCTOR_sgml_parser1, + PL_POINTER, p); +} + + +static int +get_parser(term_t parser, dtd_parser **p) +{ if ( PL_is_functor(parser, FUNCTOR_sgml_parser1) ) + { term_t a = PL_new_term_ref(); + void *ptr; + + PL_get_arg(1, parser, a); + if ( PL_get_pointer(a, &ptr) ) + { dtd_parser *tmp = ptr; + + if ( tmp->magic == SGML_PARSER_MAGIC ) + { *p = tmp; + + return TRUE; + } + return sgml2pl_error(ERR_EXISTENCE, "sgml_parser", parser); + } + } + + return sgml2pl_error(ERR_TYPE, "sgml_parser", parser); +} + + +static int +unify_dtd(term_t t, dtd *dtd) +{ if ( dtd->doctype ) + return PL_unify_term(t, PL_FUNCTOR, FUNCTOR_dtd2, + PL_POINTER, dtd, + PL_CHARS, dtd->doctype); + else + return PL_unify_term(t, PL_FUNCTOR, FUNCTOR_dtd2, + PL_POINTER, dtd, + PL_VARIABLE); +} + + +static int +get_dtd(term_t t, dtd **dtdp) +{ if ( PL_is_functor(t, FUNCTOR_dtd2) ) + { term_t a = PL_new_term_ref(); + void *ptr; + + PL_get_arg(1, t, a); + if ( PL_get_pointer(a, &ptr) ) + { dtd *tmp = ptr; + + if ( tmp->magic == SGML_DTD_MAGIC ) + { *dtdp = tmp; + + return TRUE; + } + return sgml2pl_error(ERR_EXISTENCE, "dtd", t); + } + } + + return sgml2pl_error(ERR_TYPE, "dtd", t); +} + + + /******************************* + * NEW/FREE * + *******************************/ + +static foreign_t +pl_new_sgml_parser(term_t ref, term_t options) +{ term_t head = PL_new_term_ref(); + term_t tail = PL_copy_term_ref(options); + term_t tmp = PL_new_term_ref(); + + dtd *dtd = NULL; + dtd_parser *p; + + while ( PL_get_list(tail, head, tail) ) + { if ( PL_is_functor(head, FUNCTOR_dtd1) ) + { PL_get_arg(1, head, tmp); + + if ( PL_is_variable(tmp) ) /* dtd(X) */ + { dtd = new_dtd(NULL); /* no known doctype */ + dtd->references++; + unify_dtd(tmp, dtd); + } else if ( !get_dtd(tmp, &dtd) ) + return FALSE; + } + } + if ( !PL_get_nil(tail) ) + return sgml2pl_error(ERR_TYPE, "list", tail); + + p = new_dtd_parser(dtd); + + return unify_parser(ref, p); +} + + +static foreign_t +pl_free_sgml_parser(term_t parser) +{ dtd_parser *p; + + if ( get_parser(parser, &p) ) + { free_dtd_parser(p); + return TRUE; + } + + return FALSE; +} + + +static foreign_t +pl_new_dtd(term_t doctype, term_t ref) +{ ichar *dt; + dtd *dtd; + + if ( !PL_get_wchars(doctype, NULL, &dt, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + + if ( !(dtd=new_dtd(dt)) ) + return FALSE; + + dtd->references++; + + return unify_dtd(ref, dtd); +} + + +static foreign_t +pl_free_dtd(term_t t) +{ dtd *dtd; + + if ( get_dtd(t, &dtd) ) + { free_dtd(dtd); + return TRUE; + } + + return FALSE; +} + + /******************************* + * DATA EXCHANGE * + *******************************/ + +static void +put_atom_wchars(term_t t, wchar_t const *s) +{ PL_put_variable(t); + PL_unify_wchars(t, PL_ATOM, ENDSNUL, s); +} + + + /******************************* + * PROPERTIES * + *******************************/ + +static foreign_t +pl_set_sgml_parser(term_t parser, term_t option) +{ dtd_parser *p; + + if ( !get_parser(parser, &p) ) + return FALSE; + + if ( PL_is_functor(option, FUNCTOR_file1) ) + { term_t a = PL_new_term_ref(); + wchar_t *file; + dtd_symbol *fs; + + PL_get_arg(1, option, a); + if ( !PL_get_wchars(a, NULL, &file, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + fs = dtd_add_symbol(p->dtd, file); /* symbol will be freed */ + set_file_dtd_parser(p, IN_FILE, fs->name); + } else if ( PL_is_functor(option, FUNCTOR_line1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + if ( !PL_get_integer(a, &p->location.line) ) + return sgml2pl_error(ERR_TYPE, "integer", a); + } else if ( PL_is_functor(option, FUNCTOR_charpos1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + if ( !PL_get_long(a, &p->location.charpos) ) + return sgml2pl_error(ERR_TYPE, "integer", a); + } else if ( PL_is_functor(option, FUNCTOR_dialect1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "xml") ) + set_dialect_dtd(p->dtd, DL_XML); + else if ( streq(s, "xmlns") ) + set_dialect_dtd(p->dtd, DL_XMLNS); + else if ( streq(s, "sgml") ) + set_dialect_dtd(p->dtd, DL_SGML); + else + return sgml2pl_error(ERR_DOMAIN, "sgml_dialect", a); + } else if ( PL_is_functor(option, FUNCTOR_space1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "preserve") ) + p->dtd->space_mode = SP_PRESERVE; + else if ( streq(s, "default") ) + p->dtd->space_mode = SP_DEFAULT; + else if ( streq(s, "remove") ) + p->dtd->space_mode = SP_REMOVE; + else if ( streq(s, "sgml") ) + p->dtd->space_mode = SP_SGML; + + else + return sgml2pl_error(ERR_DOMAIN, "space", a); + } else if ( PL_is_functor(option, FUNCTOR_defaults1) ) + { term_t a = PL_new_term_ref(); + int val; + + PL_get_arg(1, option, a); + if ( !PL_get_bool(a, &val) ) + return sgml2pl_error(ERR_TYPE, "boolean", a); + + if ( val ) + p->flags &= ~SGML_PARSER_NODEFS; + else + p->flags |= SGML_PARSER_NODEFS; + } else if ( PL_is_functor(option, FUNCTOR_qualify_attributes1) ) + { term_t a = PL_new_term_ref(); + int val; + + PL_get_arg(1, option, a); + if ( !PL_get_bool(a, &val) ) + return sgml2pl_error(ERR_TYPE, "boolean", a); + + if ( val ) + p->flags |= SGML_PARSER_QUALIFY_ATTS; + else + p->flags &= ~SGML_PARSER_QUALIFY_ATTS; + } else if ( PL_is_functor(option, FUNCTOR_shorttag1) ) + { term_t a = PL_new_term_ref(); + int val; + + PL_get_arg(1, option, a); + if ( !PL_get_bool(a, &val) ) + return sgml2pl_error(ERR_TYPE, "boolean", a); + + set_option_dtd(p->dtd, OPT_SHORTTAG, val); + } else if ( PL_is_functor(option, FUNCTOR_number1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "token") ) + p->dtd->number_mode = NU_TOKEN; + else if ( streq(s, "integer") ) + p->dtd->number_mode = NU_INTEGER; + else + return sgml2pl_error(ERR_DOMAIN, "number", a); + } else if ( PL_is_functor(option, FUNCTOR_encoding1) ) + { term_t a = PL_new_term_ref(); + char *val; + + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &val) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + if ( !xml_set_encoding(p, val) ) + return sgml2pl_error(ERR_DOMAIN, "encoding", a); + } else if ( PL_is_functor(option, FUNCTOR_doctype1) ) + { term_t a = PL_new_term_ref(); + ichar *s; + + PL_get_arg(1, option, a); + if ( PL_is_variable(a) ) + { p->enforce_outer_element = NULL; + } else + { if ( !PL_get_wchars(a, NULL, &s, CVT_ATOM) ) + return sgml2pl_error(ERR_TYPE, "atom_or_variable", a); + + p->enforce_outer_element = dtd_add_symbol(p->dtd, s); + } + } else + return sgml2pl_error(ERR_DOMAIN, "sgml_parser_option", option); + + return TRUE; +} + + +static dtd_srcloc * +file_location(dtd_parser *p, dtd_srcloc *l) +{ while(l->parent && l->type != IN_FILE) + l = l->parent; + + return l; +} + + +static foreign_t +pl_get_sgml_parser(term_t parser, term_t option) +{ dtd_parser *p; + + if ( !get_parser(parser, &p) ) + return FALSE; + + if ( PL_is_functor(option, FUNCTOR_charpos1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + return PL_unify_integer(a, file_location(p, &p->startloc)->charpos); + } else if ( PL_is_functor(option, FUNCTOR_line1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + return PL_unify_integer(a, file_location(p, &p->startloc)->line); + } else if ( PL_is_functor(option, FUNCTOR_charpos2) ) + { term_t a = PL_new_term_ref(); + + if ( PL_get_arg(1, option, a) && + PL_unify_integer(a, file_location(p, &p->startloc)->charpos) && + PL_get_arg(2, option, a) && + PL_unify_integer(a, file_location(p, &p->location)->charpos) ) + return TRUE; + else + return FALSE; + } else if ( PL_is_functor(option, FUNCTOR_file1) ) + { dtd_srcloc *l = file_location(p, &p->location); + + if ( l->type == IN_FILE && l->name.file ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + return PL_unify_wchars(a, PL_ATOM, ENDSNUL, l->name.file); + } + } else if ( PL_is_functor(option, FUNCTOR_source1) ) + { parser_data *pd = p->closure; + + if ( pd && pd->magic == PD_MAGIC && pd->source ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + return PL_unify_stream(a, pd->source); + } + } else if ( PL_is_functor(option, FUNCTOR_dialect1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + switch(p->dtd->dialect) + { case DL_SGML: + return PL_unify_atom_chars(a, "sgml"); + case DL_XML: + return PL_unify_atom_chars(a, "xml"); + case DL_XMLNS: + return PL_unify_atom_chars(a, "xmlns"); + } + } else if ( PL_is_functor(option, FUNCTOR_event_class1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + switch(p->event_class) + { case EV_EXPLICIT: + return PL_unify_atom_chars(a, "explicit"); + case EV_OMITTED: + return PL_unify_atom_chars(a, "omitted"); + case EV_SHORTTAG: + return PL_unify_atom_chars(a, "shorttag"); + case EV_SHORTREF: + return PL_unify_atom_chars(a, "shortref"); + } + } else if ( PL_is_functor(option, FUNCTOR_dtd1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + + return unify_dtd(a, p->dtd); + } else if ( PL_is_functor(option, FUNCTOR_doctype1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, option, a); + if ( p->enforce_outer_element ) + return PL_unify_wchars(a, PL_ATOM, ENDSNUL, + p->enforce_outer_element->name); + else + return TRUE; /* leave variable */ + } else if ( PL_is_functor(option, FUNCTOR_allowed1) ) + { term_t tail = PL_new_term_ref(); + term_t head = PL_new_term_ref(); + term_t tmp = PL_new_term_ref(); + sgml_environment *env = p->environments; + + PL_get_arg(1, option, tail); + + if ( env ) + { for( ; env; env = env->parent) + { dtd_element *buf[256]; /* MAX_VISITED! */ + int n = sizeof(buf)/sizeof(dtd_element *); /* not yet used! */ + int i; + + state_allows_for(env->state, buf, &n); + + for(i=0; iname->name); + + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, tmp) ) + return FALSE; + } + + if ( !env->element->structure || + !env->element->structure->omit_close ) + break; + } + } else if ( p->enforce_outer_element ) + { put_atom_wchars(tmp, p->enforce_outer_element->name); + + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, tmp) ) + return FALSE; + } + + return PL_unify_nil(tail); + } else if ( PL_is_functor(option, FUNCTOR_context1) ) + { term_t tail = PL_new_term_ref(); + term_t head = PL_new_term_ref(); + term_t tmp = PL_new_term_ref(); + sgml_environment *env = p->environments; + + PL_get_arg(1, option, tail); + + for( ; env; env = env->parent) + { put_atom_wchars(tmp, env->element->name->name); + + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, tmp) ) + return FALSE; + } + + return PL_unify_nil(tail); + } else + return sgml2pl_error(ERR_DOMAIN, "parser_option", option); + + return FALSE; +} + + +static int +call_prolog(parser_data *pd, predicate_t pred, term_t av) +{ qid_t qid = PL_open_query(NULL, PL_Q_PASS_EXCEPTION, pred, av); + int rc = PL_next_solution(qid); + + if ( !rc && PL_exception(qid) ) + pd->exception = TRUE; + else + pd->exception = FALSE; + + PL_close_query(qid); + + return rc; +} + + +static void +end_frame(fid_t fid, int ex) +{ if ( ex ) + PL_close_foreign_frame(fid); + else + PL_discard_foreign_frame(fid); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +put_url(dtd_parser *p, term_t t, const ichar *url) + Store the url-part of a name-space qualifier in term. We call + xml:xmlns(-Canonical, +Full) trying to resolve the specified + namespace to an internal canonical namespace. + + We do a little caching as there will generally be only a very + small pool of urls in use. We assume the url-pointers we get + life for the time of the parser. It might be possible that + multiple url pointers point to the same url, but this only clobbers + the cache a little. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#define URL_CACHE 4 /* # entries cached */ + +typedef struct +{ const ichar *url; /* URL pointer */ + atom_t canonical; +} url_cache; + +static url_cache cache[URL_CACHE]; + +static void +reset_url_cache(void) +{ int i; + url_cache *c = cache; + + for(i=0; iclosure; + int i; + + if ( !pd->on_urlns ) + { put_atom_wchars(t, url); + return; + } + + for(i=0; i0; i--) + cache[i] = cache[i-1]; + cache[0].url = url; + cache[0].canonical = 0; + + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(3); + atom_t a; + + put_atom_wchars(av+0, url); + unify_parser(av+2, p); + if ( PL_call_predicate(NULL, PL_Q_NORMAL, pd->on_urlns, av) && + PL_get_atom(av+1, &a) ) + { PL_register_atom(a); + cache[0].canonical = a; + PL_put_atom(t, a); + } else + { put_atom_wchars(t, url); + } + PL_discard_foreign_frame(fid); + } +} + + +static void +put_attribute_name(dtd_parser *p, term_t t, dtd_symbol *nm) +{ const ichar *url, *local; + + if ( p->dtd->dialect == DL_XMLNS ) + { xmlns_resolve_attribute(p, nm, &local, &url); + + if ( url ) + { term_t av = PL_new_term_refs(2); + + put_url(p, av+0, url); + put_atom_wchars(av+1, local); + PL_cons_functor_v(t, FUNCTOR_ns2, av); + } else + put_atom_wchars(t, local); + } else + put_atom_wchars(t, nm->name); +} + + +static void +put_element_name(dtd_parser *p, term_t t, dtd_element *e) +{ const ichar *url, *local; + + if ( p->dtd->dialect == DL_XMLNS ) + { assert(p->environments->element == e); + xmlns_resolve_element(p, &local, &url); + + if ( url ) + { term_t av = PL_new_term_refs(2); + + put_url(p, av+0, url); + put_atom_wchars(av+1, local); + PL_cons_functor_v(t, FUNCTOR_ns2, av); + } else + put_atom_wchars(t, local); + } else + put_atom_wchars(t, e->name->name); +} + + +static ichar * +istrblank(const ichar *s) +{ for( ; *s; s++ ) + { if ( iswspace(*s) ) + return (ichar *)s; + } + + return NULL; +} + + +static int +unify_listval(dtd_parser *p, + term_t t, attrtype type, size_t len, const ichar *text) +{ if ( type == AT_NUMBERS && p->dtd->number_mode == NU_INTEGER ) + { wchar_t *e; + +#if SIZEOF_LONG == 4 && defined(HAVE_WCSTOLL) + int64_t v = wcstoll(text, &e, 10); + if ( (size_t)(e-text) == len && errno != ERANGE ) + return PL_unify_int64(t, v); +#else + long v = wcstol(text, &e, 10); + + if ( (size_t)(e-text) == len && errno != ERANGE ) + return PL_unify_integer(t, v); +#endif + /* TBD: Error!? */ + } + + return PL_unify_wchars(t, PL_ATOM, len, text); +} + + +static int +put_att_text(term_t t, sgml_attribute *a) +{ if ( a->value.textW ) + { PL_put_variable(t); + PL_unify_wchars(t, PL_ATOM, a->value.number, a->value.textW); + return TRUE; + } else + return FALSE; +} + + +static void +put_attribute_value(dtd_parser *p, term_t t, sgml_attribute *a) +{ switch(a->definition->type) + { case AT_CDATA: + put_att_text(t, a); + break; + case AT_NUMBER: + { if ( !put_att_text(t, a) ) + PL_put_integer(t, a->value.number); + break; + } + default: /* multi-valued attribute */ + { if ( a->definition->islist && a->value.textW ) + { term_t tail, head = PL_new_term_ref(); + const ichar *val = a->value.textW; + const ichar *e; + + PL_put_variable(t); + tail = PL_copy_term_ref(t); + + for(e=istrblank(val); e; val = e+1, e=istrblank(val)) + { if ( e == val ) + continue; /* skip spaces */ + PL_unify_list(tail, head, tail); + unify_listval(p, head, a->definition->type, e-val, val); + } + PL_unify_list(tail, head, tail); + unify_listval(p, head, a->definition->type, istrlen(val), val); + PL_unify_nil(tail); + } else + put_att_text(t, a); + } + } +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Produce a tag-location in the format + + start_location=file:char-char +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static int +put_tag_position(dtd_parser *p, term_t pos) +{ dtd_srcloc *l = &p->startloc; + + if ( l->type == IN_FILE && l->name.file ) + { PL_put_variable(pos); + PL_unify_term(pos, + PL_FUNCTOR, FUNCTOR_ns2, + PL_NWCHARS, wcslen(l->name.file), l->name.file, + PL_FUNCTOR, FUNCTOR_minus2, + PL_LONG, l->charpos, + PL_LONG, p->location.charpos); + return TRUE; + } + + return FALSE; +} + + + +static int +unify_attribute_list(dtd_parser *p, term_t alist, + int argc, sgml_attribute *argv) +{ int i; + term_t tail = PL_copy_term_ref(alist); + term_t h = PL_new_term_ref(); + term_t a = PL_new_term_refs(2); + parser_data *pd = p->closure; + + for(i=0; iname); + put_attribute_value(p, a+1, &argv[i]); + PL_cons_functor_v(a, FUNCTOR_equal2, a); + if ( !PL_unify_list(tail, h, tail) || + !PL_unify(h, a) ) + return FALSE; + } + + if ( pd->positions && put_tag_position(p, a+1) ) + { PL_put_atom(a, ATOM_position); + + PL_cons_functor_v(a, FUNCTOR_equal2, a); + if ( !PL_unify_list(tail, h, tail) || + !PL_unify(h, a) ) + return FALSE; + } + + if ( PL_unify_nil(tail) ) + { PL_reset_term_refs(tail); + + return TRUE; + } + + return FALSE; +} + + + +static int +on_begin(dtd_parser *p, dtd_element *e, int argc, sgml_attribute *argv) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->tail ) + { term_t content = PL_new_term_ref(); /* element content */ + term_t alist = PL_new_term_ref(); /* attribute list */ + term_t et = PL_new_term_ref(); /* element structure */ + term_t h = PL_new_term_ref(); + + put_element_name(p, h, e); + unify_attribute_list(p, alist, argc, argv); + PL_unify_term(et, PL_FUNCTOR, FUNCTOR_element3, + PL_TERM, h, + PL_TERM, alist, + PL_TERM, content); + if ( PL_unify_list(pd->tail, h, pd->tail) && + PL_unify(h, et) ) + { env *env = sgml_calloc(1, sizeof(*env)); + + env->tail = pd->tail; + env->parent = pd->stack; + pd->stack = env; + + pd->tail = content; + PL_reset_term_refs(alist); + } + + return TRUE; + } + + if ( pd->on_begin ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(3); + + put_element_name(p, av+0, e); + unify_attribute_list(p, av+1, argc, argv); + unify_parser(av+2, p); + + call_prolog(pd, pd->on_begin, av); + end_frame(fid, pd->exception); + } + + return TRUE; +} + + +static int +on_end(dtd_parser *p, dtd_element *e) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->on_end ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(2); + + put_element_name(p, av+0, e); + unify_parser(av+1, p); + + call_prolog(pd, pd->on_end, av); + end_frame(fid, pd->exception); + } + + if ( pd->tail && !pd->stopped ) + { PL_unify_nil(pd->tail); + PL_reset_term_refs(pd->tail); /* ? */ + + if ( pd->stack ) + { env *parent = pd->stack->parent; + + pd->tail = pd->stack->tail; + sgml_free(pd->stack); + pd->stack = parent; + } else + { if ( pd->stopat == SA_CONTENT ) + pd->stopped = TRUE; + } + } + + if ( pd->stopat == SA_ELEMENT && !p->environments->parent ) + pd->stopped = TRUE; + + return TRUE; +} + + +static int +on_entity(dtd_parser *p, dtd_entity *e, int chr) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->on_entity ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(2); + + if ( e ) + put_atom_wchars(av+0, e->name->name); + else + PL_put_integer(av+0, chr); + + unify_parser(av+1, p); + + call_prolog(pd, pd->on_end, av); + end_frame(fid, pd->exception); + } + + if ( pd->tail ) + { term_t h = PL_new_term_ref(); + + if ( !PL_unify_list(pd->tail, h, pd->tail) ) + return FALSE; + + if ( e ) + PL_unify_term(h, + PL_FUNCTOR, FUNCTOR_entity1, + PL_CHARS, e->name->name); + else + PL_unify_term(h, + PL_FUNCTOR, FUNCTOR_entity1, + PL_INT, chr); + + PL_reset_term_refs(h); + } + + return TRUE; +} + + +static int +on_data(dtd_parser *p, data_type type, int len, const wchar_t *data) +{ parser_data *pd = p->closure; + + if ( pd->on_cdata ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(2); + + PL_unify_wchars(av+0, PL_ATOM, len, data); + + unify_parser(av+1, p); + + call_prolog(pd, pd->on_cdata, av); + end_frame(fid, pd->exception); + } + + if ( pd->tail && !pd->stopped ) + { term_t h = PL_new_term_ref(); + + if ( PL_unify_list(pd->tail, h, pd->tail) ) + { int rval = TRUE; + term_t a; + + switch(type) + { case EC_CDATA: + a = h; + break; + case EC_SDATA: + { term_t d = PL_new_term_ref(); + + a = d; + rval = PL_unify_term(h, PL_FUNCTOR, FUNCTOR_sdata1, PL_TERM, d); + break; + } + case EC_NDATA: + { term_t d = PL_new_term_ref(); + + a = d; + rval = PL_unify_term(h, PL_FUNCTOR, FUNCTOR_ndata1, PL_TERM, d); + break; + } + default: + rval = FALSE; + assert(0); + } + + if ( rval ) + rval = PL_unify_wchars(a, PL_ATOM, len, data); + + if ( rval ) + { PL_reset_term_refs(h); + return TRUE; + } + } + } + + return FALSE; +} + + +static int +on_cdata(dtd_parser *p, data_type type, int len, const wchar_t *data) +{ return on_data(p, type, len, data); +} + + +static int +can_end_omitted(dtd_parser *p) +{ sgml_environment *env; + + for(env=p->environments; env; env = env->parent) + { dtd_element *e = env->element; + + if ( !(e->structure && e->structure->omit_close) ) + return FALSE; + } + + return TRUE; +} + + +static int +on_error(dtd_parser *p, dtd_error *error) +{ parser_data *pd = p->closure; + const char *severity; + + if ( pd->stopped ) + return TRUE; + + if ( pd->stopat == SA_ELEMENT && + (error->minor == ERC_NOT_OPEN || error->minor == ERC_NOT_ALLOWED) && + can_end_omitted(p) ) + { end_document_dtd_parser(p); + sgml_cplocation(&p->location, &p->startloc); + pd->stopped = TRUE; + return TRUE; + } + + switch(error->severity) + { case ERS_STYLE: + if ( pd->error_mode != EM_STYLE ) + return TRUE; + severity = "informational"; + break; + case ERS_WARNING: + pd->warnings++; + severity = "warning"; + break; + case ERS_ERROR: + default: /* make compiler happy */ + pd->errors++; + severity = "error"; + break; + } + + if ( pd->on_error ) /* msg, parser */ + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(3); + + PL_put_atom_chars(av+0, severity); + PL_unify_wchars(av+1, PL_ATOM, + wcslen(error->plain_message), error->plain_message); + unify_parser(av+2, p); + + call_prolog(pd, pd->on_error, av); + end_frame(fid, pd->exception); + } else if ( pd->error_mode != EM_QUIET ) + { fid_t fid = PL_open_foreign_frame(); + predicate_t pred = PL_predicate("print_message", 2, "user"); + term_t av = PL_new_term_refs(2); + term_t src = PL_new_term_ref(); + term_t parser = PL_new_term_ref(); + dtd_srcloc *l = file_location(p, &p->startloc); + + unify_parser(parser, p); + PL_put_atom_chars(av+0, severity); + + if ( l->name.file ) + { if ( l->type == IN_FILE ) + put_atom_wchars(src, l->name.file); + else + put_atom_wchars(src, l->name.entity); + } else + { PL_put_nil(src); + } + + PL_unify_term(av+1, + PL_FUNCTOR_CHARS, "sgml", 4, + PL_TERM, parser, + PL_TERM, src, + PL_INT, l->line, + PL_NWCHARS, wcslen(error->plain_message), error->plain_message); + + PL_call_predicate(NULL, PL_Q_NODEBUG, pred, av); + + PL_discard_foreign_frame(fid); + } + + return TRUE; +} + + +static int +on_xmlns(dtd_parser *p, dtd_symbol *ns, dtd_symbol *url) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->on_xmlns ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(3); + + if ( ns ) + put_atom_wchars(av+0, ns->name); + else + PL_put_nil(av+0); + put_atom_wchars(av+1, url->name); + unify_parser(av+2, p); + + call_prolog(pd, pd->on_xmlns, av); + end_frame(fid, pd->exception); + } + + return TRUE; +} + + +static int +on_pi(dtd_parser *p, const ichar *pi) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->on_pi ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(2); + + put_atom_wchars(av+0, pi); + unify_parser(av+1, p); + + call_prolog(pd, pd->on_pi, av); + end_frame(fid, pd->exception); + } + + if ( pd->tail ) + { term_t h = PL_new_term_ref(); + + if ( !PL_unify_list(pd->tail, h, pd->tail) ) + return FALSE; + + PL_unify_term(h, + PL_FUNCTOR, FUNCTOR_pi1, + PL_NWCHARS, wcslen(pi), pi); + + PL_reset_term_refs(h); + } + + return TRUE; +} + + +static int +on_decl(dtd_parser *p, const ichar *decl) +{ parser_data *pd = p->closure; + + if ( pd->stopped ) + return TRUE; + + if ( pd->on_decl ) + { fid_t fid = PL_open_foreign_frame(); + term_t av = PL_new_term_refs(2); + + put_atom_wchars(av+0, decl); + unify_parser(av+1, p); + + call_prolog(pd, pd->on_decl, av); + end_frame(fid, pd->exception); + } + + if ( pd->stopat == SA_DECL ) + pd->stopped = TRUE; + + return TRUE; +} + + +static int +write_parser(void *h, char *buf, int len) +{ parser_data *pd = h; + unsigned char *s = (unsigned char *)buf; + unsigned char *e = s+len; + + if ( !pd->parser || pd->parser->magic != SGML_PARSER_MAGIC ) + { errno = EINVAL; + return -1; + } + + if ( (pd->errors > pd->max_errors && pd->max_errors >= 0) || pd->stopped ) + { errno = EIO; + return -1; + } + + for(; sparser, *s); + + return len; +} + + +static int +close_parser(void *h) +{ parser_data *pd = h; + dtd_parser *p; + + if ( !(p=pd->parser) || p->magic != SGML_PARSER_MAGIC ) + { errno = EINVAL; + return -1; + } + + if ( pd->tail ) + PL_unify_nil(pd->tail); + + if ( p->dmode == DM_DTD ) + p->dtd->implicit = FALSE; /* assume we loaded a DTD */ + + if ( pd->free_on_close ) + free_dtd_parser(p); + else + p->closure = NULL; + + sgml_free(pd); + + return 0; +} + + +static IOFUNCTIONS sgml_stream_functions = +{ (Sread_function) NULL, + (Swrite_function) write_parser, + (Sseek_function) NULL, + (Sclose_function) close_parser, + NULL +}; + + +static parser_data * +new_parser_data(dtd_parser *p) +{ parser_data *pd; + + pd = sgml_calloc(1, sizeof(*pd)); + pd->magic = PD_MAGIC; + pd->parser = p; + pd->max_errors = MAX_ERRORS; + pd->max_warnings = MAX_WARNINGS; + pd->error_mode = EM_PRINT; + pd->exception = FALSE; + p->closure = pd; + + return pd; +} + + +static foreign_t +pl_open_dtd(term_t ref, term_t options, term_t stream) +{ dtd *dtd; + dtd_parser *p; + parser_data *pd; + IOSTREAM *s; + term_t tail = PL_copy_term_ref(options); + term_t option = PL_new_term_ref(); + + if ( !get_dtd(ref, &dtd) ) + return FALSE; + p = new_dtd_parser(dtd); + p->dmode = DM_DTD; + pd = new_parser_data(p); + pd->free_on_close = TRUE; + + while( PL_get_list(tail, option, tail) ) + { if ( PL_is_functor(option, FUNCTOR_dialect1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "xml") ) + set_dialect_dtd(dtd, DL_XML); + else if ( streq(s, "xmlns") ) + set_dialect_dtd(dtd, DL_XMLNS); + else if ( streq(s, "sgml") ) + set_dialect_dtd(dtd, DL_SGML); + else + return sgml2pl_error(ERR_DOMAIN, "sgml_dialect", a); + } else + return sgml2pl_error(ERR_DOMAIN, "dtd_option", option); + } + if ( !PL_get_nil(tail) ) + return sgml2pl_error(ERR_TYPE, "list", options); + + s = Snew(pd, SIO_OUTPUT|SIO_FBUF, &sgml_stream_functions); + + if ( !PL_open_stream(stream, s) ) + return FALSE; + + return TRUE; +} + + +static int +set_callback_predicates(parser_data *pd, term_t option) +{ term_t a = PL_new_term_ref(); + char *fname; + atom_t pname; + predicate_t *pp = NULL; /* keep compiler happy */ + int arity; + module_t m = NULL; + + PL_get_arg(2, option, a); + PL_strip_module(a, &m, a); + if ( !PL_get_atom(a, &pname) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + PL_get_arg(1, option, a); + if ( !PL_get_atom_chars(a, &fname) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(fname, "begin") ) + { pp = &pd->on_begin; /* tag, attributes, parser */ + arity = 3; + } else if ( streq(fname, "end") ) + { pp = &pd->on_end; /* tag, parser */ + arity = 2; + } else if ( streq(fname, "cdata") ) + { pp = &pd->on_cdata; /* cdata, parser */ + arity = 2; + } else if ( streq(fname, "entity") ) + { pp = &pd->on_entity; /* name, parser */ + arity = 2; + } else if ( streq(fname, "pi") ) + { pp = &pd->on_pi; /* pi, parser */ + arity = 2; + } else if ( streq(fname, "xmlns") ) + { pp = &pd->on_xmlns; /* ns, url, parser */ + arity = 3; + } else if ( streq(fname, "urlns") ) + { pp = &pd->on_urlns; /* url, ns, parser */ + arity = 3; + } else if ( streq(fname, "error") ) + { pp = &pd->on_error; /* severity, message, parser */ + arity = 3; + } else if ( streq(fname, "decl") ) + { pp = &pd->on_decl; /* decl, parser */ + arity = 2; + } else + return sgml2pl_error(ERR_DOMAIN, "sgml_callback", a); + + *pp = PL_pred(PL_new_functor(pname, arity), m); + return TRUE; +} + + +static foreign_t +pl_sgml_parse(term_t parser, term_t options) +{ dtd_parser *p; + parser_data *pd; + parser_data *oldpd; + term_t head = PL_new_term_ref(); + term_t tail = PL_copy_term_ref(options); + IOSTREAM *in = NULL; + int recursive; + int has_content_length = FALSE; + int64_t content_length = 0; /* content_length(Len) */ + int count = 0; + int rc = TRUE; + + if ( !get_parser(parser, &p) ) + return FALSE; + + if ( p->closure ) /* recursive call */ + { recursive = TRUE; + + oldpd = p->closure; + if ( oldpd->magic != PD_MAGIC || oldpd->parser != p ) + return sgml2pl_error(ERR_MISC, "sgml", + "Parser associated with illegal data"); + + pd = sgml_calloc(1, sizeof(*pd)); + *pd = *oldpd; + p->closure = pd; + + in = pd->source; + } else + { recursive = FALSE; + oldpd = NULL; /* keep compiler happy */ + + set_mode_dtd_parser(p, DM_DATA); + + p->on_begin_element = on_begin; + p->on_end_element = on_end; + p->on_entity = on_entity; + p->on_pi = on_pi; + p->on_data = on_cdata; + p->on_error = on_error; + p->on_xmlns = on_xmlns; + p->on_decl = on_decl; + + pd = new_parser_data(p); + } + + while ( PL_get_list(tail, head, tail) ) + { if ( PL_is_functor(head, FUNCTOR_document1) ) + { pd->list = PL_new_term_ref(); + PL_get_arg(1, head, pd->list); + pd->tail = PL_copy_term_ref(pd->list); + pd->stack = NULL; + } else if ( PL_is_functor(head, FUNCTOR_source1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, head, a); + if ( !PL_get_stream_handle(a, &in) ) + return FALSE; + } else if ( PL_is_functor(head, FUNCTOR_content_length1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, head, a); + if ( !PL_get_int64(a, &content_length) ) + return sgml2pl_error(ERR_TYPE, "integer", a); + has_content_length = TRUE; + } else if ( PL_is_functor(head, FUNCTOR_call2) ) + { if ( !set_callback_predicates(pd, head) ) + return FALSE; + } else if ( PL_is_functor(head, FUNCTOR_parse1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, head, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + if ( streq(s, "element") ) + pd->stopat = SA_ELEMENT; + else if ( streq(s, "content") ) + pd->stopat = SA_CONTENT; + else if ( streq(s, "file") ) + pd->stopat = SA_FILE; + else if ( streq(s, "input") ) + pd->stopat = SA_INPUT; + else if ( streq(s, "declaration") ) + pd->stopat = SA_DECL; + else + return sgml2pl_error(ERR_DOMAIN, "parse", a); + } else if ( PL_is_functor(head, FUNCTOR_max_errors1) ) + { term_t a = PL_new_term_ref(); + + PL_get_arg(1, head, a); + if ( !PL_get_integer(a, &pd->max_errors) ) + return sgml2pl_error(ERR_TYPE, "integer", a); + } else if ( PL_is_functor(head, FUNCTOR_syntax_errors1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, head, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "quiet") ) + pd->error_mode = EM_QUIET; + else if ( streq(s, "print") ) + pd->error_mode = EM_PRINT; + else if ( streq(s, "style") ) + pd->error_mode = EM_STYLE; + else + return sgml2pl_error(ERR_DOMAIN, "syntax_error", a); + } else if ( PL_is_functor(head, FUNCTOR_positions1) ) + { term_t a = PL_new_term_ref(); + char *s; + + PL_get_arg(1, head, a); + if ( !PL_get_atom_chars(a, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", a); + + if ( streq(s, "true") ) + pd->positions = TRUE; + else if ( streq(s, "false") ) + pd->positions = FALSE; + else + return sgml2pl_error(ERR_DOMAIN, "positions", a); + } else + return sgml2pl_error(ERR_DOMAIN, "sgml_option", head); + } + if ( !PL_get_nil(tail) ) + return sgml2pl_error(ERR_TYPE, "list", tail); + + /* Parsing input from a stream */ +#define CHECKERROR \ + { if ( pd->exception ) \ + { rc = FALSE; \ + goto out; \ + } \ + if ( pd->errors > pd->max_errors && pd->max_errors >= 0 ) \ + { rc = sgml2pl_error(ERR_LIMIT, "max_errors", (long)pd->max_errors); \ + goto out; \ + } \ + } + + if ( pd->stopat == SA_CONTENT && p->empty_element ) + goto out; + + if ( in ) + { int eof = FALSE; + + if ( in->encoding == ENC_OCTET ) + p->encoded = TRUE; /* parser must decode */ + else + p->encoded = FALSE; /* already decoded */ + + if ( !recursive ) + { pd->source = in; + begin_document_dtd_parser(p); + } + + while(!eof) + { int c, ateof; + + if ( (++count % 8192) == 0 && PL_handle_signals() < 0 ) + { rc = FALSE; + goto out; + } + + if ( has_content_length ) + { if ( content_length <= 0 ) + c = EOF; + else + c = Sgetcode(in); + ateof = (--content_length <= 0); + } else + { c = Sgetcode(in); + ateof = Sfeof(in); + } + + if ( ateof ) + { eof = TRUE; + if ( c == LF ) /* file ends in LF */ + c = CR; + else if ( c != CR ) /* file ends in normal char */ + { if ( has_content_length && in->position ) + { int64_t bc0 = in->position->byteno; + putchar_dtd_parser(p, c); + content_length -= in->position->byteno-bc0; + } else + { putchar_dtd_parser(p, c); + } + CHECKERROR; + if ( pd->stopped ) + goto stopped; + c = CR; + } + } + + if ( has_content_length && in->position ) + { int64_t bc0 = in->position->byteno; + putchar_dtd_parser(p, c); + content_length -= in->position->byteno-bc0; + } else + { putchar_dtd_parser(p, c); + } + CHECKERROR; + if ( pd->stopped ) + { stopped: + pd->stopped = FALSE; + if ( pd->stopat != SA_CONTENT ) + reset_document_dtd_parser(p); /* ensure a clean start */ + goto out; + } + } + + if ( !recursive && pd->stopat != SA_INPUT ) + end_document_dtd_parser(p); + CHECKERROR; + + out: + reset_url_cache(); + if ( pd->tail ) + PL_unify_nil(pd->tail); + + if ( recursive ) + { p->closure = oldpd; + } else + { p->closure = NULL; + } + + pd->magic = 0; /* invalidate */ + sgml_free(pd); + + return rc; + } + + reset_url_cache(); + + return TRUE; +} + + + /******************************* + * DTD PROPERTIES * + *******************************/ + +static void put_model(term_t t, dtd_model *m); + +/* doctype(DocType) */ + +static int +dtd_prop_doctype(dtd *dtd, term_t prop) +{ if ( dtd->doctype ) + return PL_unify_wchars(prop, PL_ATOM, ENDSNUL, dtd->doctype); + return FALSE; +} + + +/* elements(ListOfElements) */ + +static void +make_model_list(term_t t, dtd_model *m, functor_t f) +{ if ( !m->next ) + { put_model(t, m); + } else + { term_t av = PL_new_term_refs(2); + + put_model(av+0, m); + make_model_list(av+1, m->next, f); + PL_cons_functor_v(t, f, av); + PL_reset_term_refs(av); + } +} + + +static void +put_model(term_t t, dtd_model *m) +{ functor_t f; + + switch(m->type) + { case MT_PCDATA: + PL_put_atom(t, ATOM_pcdata); + goto card; + case MT_ELEMENT: + put_atom_wchars(t, m->content.element->name->name); + goto card; + case MT_AND: + f = FUNCTOR_and2; + break; + case MT_SEQ: + f = FUNCTOR_comma2; + break; + case MT_OR: + f = FUNCTOR_bar2; + break; + case MT_UNDEF: + default: + assert(0); + f = 0; + break; + } + + if ( !m->content.group ) + PL_put_atom(t, ATOM_empty); + else + make_model_list(t, m->content.group, f); + +card: + switch(m->cardinality) + { case MC_ONE: + break; + case MC_OPT: + PL_cons_functor_v(t, FUNCTOR_opt1, t); + break; + case MC_REP: + PL_cons_functor_v(t, FUNCTOR_rep1, t); + break; + case MC_PLUS: + PL_cons_functor_v(t, FUNCTOR_plus1, t); + break; + } +} + + +static void +put_content(term_t t, dtd_edef *def) +{ switch(def->type) + { case C_EMPTY: + PL_put_atom(t, ATOM_empty); + return; + case C_CDATA: + PL_put_atom(t, ATOM_cdata); + return; + case C_RCDATA: + PL_put_atom(t, ATOM_rcdata); + return; + case C_ANY: + PL_put_atom(t, ATOM_any); + return; + default: + if ( def->content ) + put_model(t, def->content); + } +} + + +static int +dtd_prop_elements(dtd *dtd, term_t prop) +{ term_t tail = PL_copy_term_ref(prop); + term_t head = PL_new_term_ref(); + term_t et = PL_new_term_ref(); + dtd_element *e; + + for( e=dtd->elements; e; e=e->next ) + { put_atom_wchars(et, e->name->name); + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, et) ) + return FALSE; + } + + return PL_unify_nil(tail); +} + + +static int +get_element(dtd *dtd, term_t name, dtd_element **elem) +{ ichar *s; + dtd_element *e; + dtd_symbol *id; + + if ( !PL_get_wchars(name, NULL, &s, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + + if ( !(id=dtd_find_symbol(dtd, s)) || + !(e=id->element) ) + return FALSE; + + *elem = e; + return TRUE; +} + + + + +static int +dtd_prop_element(dtd *dtd, term_t name, term_t omit, term_t content) +{ dtd_element *e; + dtd_edef *def; + term_t model = PL_new_term_ref(); + + if ( !get_element(dtd, name, &e) || !(def=e->structure) ) + return FALSE; + + if ( !PL_unify_term(omit, PL_FUNCTOR, FUNCTOR_omit2, + PL_ATOM, def->omit_open ? ATOM_true : ATOM_false, + PL_ATOM, def->omit_close ? ATOM_true : ATOM_false) ) + return FALSE; + + put_content(model, def); + return PL_unify(content, model); +} + + +static int +dtd_prop_attributes(dtd *dtd, term_t ename, term_t atts) +{ dtd_element *e; + term_t tail = PL_copy_term_ref(atts); + term_t head = PL_new_term_ref(); + term_t elem = PL_new_term_ref(); + dtd_attr_list *al; + + if ( !get_element(dtd, ename, &e) ) + return FALSE; + + for(al=e->attributes; al; al=al->next) + { put_atom_wchars(elem, al->attribute->name->name); + + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, elem) ) + return FALSE; + } + + return PL_unify_nil(tail); +} + + +typedef struct _plattrdef +{ attrtype type; /* AT_* */ + const char * name; /* name */ + int islist; /* list-type */ + atom_t atom; /* name as atom */ +} plattrdef; + +static plattrdef plattrs[] = +{ + { AT_CDATA, "cdata", FALSE }, + { AT_ENTITY, "entity", FALSE }, + { AT_ENTITIES, "entity", TRUE }, + { AT_ID, "id", FALSE }, + { AT_IDREF, "idref", FALSE }, + { AT_IDREFS, "idref", TRUE }, + { AT_NAME, "name", FALSE }, + { AT_NAMES, "name", TRUE }, +/*{ AT_NAMEOF, "nameof", FALSE },*/ + { AT_NMTOKEN, "nmtoken", FALSE }, + { AT_NMTOKENS, "nmtoken", TRUE }, + { AT_NUMBER, "number", FALSE }, + { AT_NUMBERS, "number", TRUE }, + { AT_NUTOKEN, "nutoken", FALSE }, + { AT_NUTOKENS, "nutoken", TRUE }, + { AT_NOTATION, "notation", FALSE }, + + { AT_CDATA, NULL, FALSE } +}; + + +static int +unify_attribute_type(term_t type, dtd_attr *a) +{ plattrdef *ad = plattrs; + + for( ; ad->name; ad++ ) + { if ( ad->type == a->type ) + { if ( !ad->atom ) + ad->atom = PL_new_atom(ad->name); + + if ( ad->islist ) + return PL_unify_term(type, PL_FUNCTOR, FUNCTOR_list1, + PL_ATOM, ad->atom); + else + return PL_unify_atom(type, ad->atom); + } + } + + if ( a->type == AT_NAMEOF || a->type == AT_NOTATION ) + { dtd_name_list *nl; + term_t tail = PL_new_term_ref(); + term_t head = PL_new_term_ref(); + term_t elem = PL_new_term_ref(); + + if ( !PL_unify_functor(type, + a->type == AT_NAMEOF ? + FUNCTOR_nameof1 : + FUNCTOR_notation1) ) + return FALSE; + PL_get_arg(1, type, tail); + + for(nl = a->typeex.nameof; nl; nl = nl->next) + { put_atom_wchars(elem, nl->value->name); + + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, elem) ) + return FALSE; + } + return PL_unify_nil(tail); + } + + assert(0); + return FALSE; +} + + + +static int +unify_attribute_default(term_t defval, dtd_attr *a) +{ int v; + + switch(a->def) + { case AT_REQUIRED: + return PL_unify_atom_chars(defval, "required"); + case AT_CURRENT: + return PL_unify_atom_chars(defval, "current"); + case AT_CONREF: + return PL_unify_atom_chars(defval, "conref"); + case AT_IMPLIED: + return PL_unify_atom_chars(defval, "implied"); + case AT_DEFAULT: + v = PL_unify_functor(defval, FUNCTOR_default1); + goto common; + case AT_FIXED: + v = PL_unify_functor(defval, FUNCTOR_fixed1); + common: + if ( v ) + { term_t tmp = PL_new_term_ref(); + + PL_get_arg(1, defval, tmp); + switch( a->type ) + { case AT_CDATA: + return PL_unify_wchars(tmp, PL_ATOM, ENDSNUL, a->att_def.cdata); + case AT_NAME: + case AT_NMTOKEN: + case AT_NAMEOF: + case AT_NOTATION: + return PL_unify_wchars(tmp, PL_ATOM, ENDSNUL, a->att_def.name->name); + case AT_NUMBER: + return PL_unify_integer(tmp, a->att_def.number); + default: + assert(0); + } + } else + return FALSE; + default: + assert(0); + return FALSE; + } +} + + +static int +dtd_prop_attribute(dtd *dtd, term_t ename, term_t aname, + term_t type, term_t def_value) +{ dtd_element *e; + ichar *achars; + dtd_symbol *asym; + dtd_attr_list *al; + + + if ( !get_element(dtd, ename, &e) ) + return FALSE; + if ( !PL_get_wchars(aname, NULL, &achars, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + if ( !(asym=dtd_find_symbol(dtd, achars)) ) + return FALSE; + + for(al=e->attributes; al; al=al->next) + { if ( al->attribute->name == asym ) + { if ( unify_attribute_type(type, al->attribute) && + unify_attribute_default(def_value, al->attribute) ) + return TRUE; + + return FALSE; + } + } + + return FALSE; +} + + +static int +dtd_prop_entities(dtd *dtd, term_t list) +{ term_t tail = PL_copy_term_ref(list); + term_t head = PL_new_term_ref(); + term_t et = PL_new_term_ref(); + dtd_entity *e; + + for( e=dtd->entities; e; e=e->next ) + { put_atom_wchars(et, e->name->name); + if ( !PL_unify_list(tail, head, tail) || + !PL_unify(head, et) ) + return FALSE; + } + + return PL_unify_nil(tail); +} + + +static int +dtd_prop_entity(dtd *dtd, term_t ename, term_t value) +{ ichar *s; + dtd_entity *e; + dtd_symbol *id; + + if ( !PL_get_wchars(ename, NULL, &s, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + + if ( !(id=dtd_find_symbol(dtd, s)) || + !(e=id->entity) ) + return FALSE; + + switch(e->type) + { case ET_SYSTEM: + return PL_unify_term(value, PL_FUNCTOR_CHARS, "system", 1, + PL_CHARS, e->exturl); + case ET_PUBLIC: + if ( e->exturl ) + return PL_unify_term(value, PL_FUNCTOR_CHARS, "public", 2, + PL_CHARS, e->extid, + PL_CHARS, e->exturl); + else + return PL_unify_term(value, PL_FUNCTOR_CHARS, "public", 2, + PL_CHARS, e->extid, + PL_VARIABLE); + + case ET_LITERAL: + default: + if ( e->value ) + { const char *wrap; + + switch(e->content) + { case EC_SGML: wrap = "sgml"; break; + case EC_STARTTAG: wrap = "start_tag"; break; + case EC_ENDTAG: wrap = "end_tag"; break; + case EC_CDATA: wrap = NULL; break; + case EC_SDATA: wrap = "sdata"; break; + case EC_NDATA: wrap = "ndata"; break; + case EC_PI: wrap = "pi"; break; + default: wrap = NULL; assert(0); + } + + if ( wrap ) + return PL_unify_term(value, PL_FUNCTOR_CHARS, wrap, 1, + PL_CHARS, e->value); + else + return PL_unify_wchars(value, PL_ATOM, wcslen(e->value), e->value); + } + } + + assert(0); + return FALSE; +} + + +static int +dtd_prop_notations(dtd *dtd, term_t list) +{ dtd_notation *n; + term_t tail = PL_copy_term_ref(list); + term_t head = PL_new_term_ref(); + + for(n=dtd->notations; n; n=n->next) + { if ( PL_unify_list(tail, head, tail) && + PL_unify_wchars(head, PL_ATOM, wcslen(n->name->name), n->name->name) ) + continue; + + return FALSE; + } + + return PL_unify_nil(tail); +} + + +static int +dtd_prop_notation(dtd *dtd, term_t nname, term_t desc) +{ char *s; + dtd_symbol *id; + + if ( !PL_get_atom_chars(nname, &s) ) + return sgml2pl_error(ERR_TYPE, "atom", nname); + + if ( (id=dtd_find_symbol(dtd, (ichar *)s)) ) + { dtd_notation *n; + + for(n=dtd->notations; n; n=n->next) + { if ( n->name == id ) + { term_t tail = PL_copy_term_ref(desc); + term_t head = PL_new_term_ref(); + + if ( n->system ) + { if ( !PL_unify_list(tail, head, tail) || + !PL_unify_term(head, + PL_FUNCTOR_CHARS, "system", 1, + PL_CHARS, n->system) ) + return FALSE; + } + if ( n->public ) + { if ( !PL_unify_list(tail, head, tail) || + !PL_unify_term(head, + PL_FUNCTOR_CHARS, "public", 1, + PL_CHARS, n->public) ) + return FALSE; + } + + return PL_unify_nil(tail); + } + } + } + + return FALSE; +} + + + +typedef struct _prop +{ int (*func)(); + const char *name; + int arity; + functor_t functor; +} prop; + + +static prop dtd_props[] = +{ { dtd_prop_doctype, "doctype", 1 }, + { dtd_prop_elements, "elements", 1 }, + { dtd_prop_element, "element", 3 }, + { dtd_prop_attributes, "attributes", 2, }, + { dtd_prop_attribute, "attribute", 4, }, + { dtd_prop_entities, "entities", 1, }, + { dtd_prop_entity, "entity", 2, }, + { dtd_prop_notations, "notations", 1, }, + { dtd_prop_notation, "notation", 2, }, + { NULL } +}; + + +static void +initprops(void) +{ static int done = FALSE; + + if ( !done ) + { prop *p; + + done = TRUE; + for(p=dtd_props; p->func; p++) + p->functor = PL_new_functor(PL_new_atom(p->name), p->arity); + } +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +dtd_property(DTD, doctype(DocType)) +dtd_property(DTD, elements(ListOfNames)) +dtd_property(DTD, element(Name, Omit, Model)) +dtd_property(DTD, attributes(ElementName, ListOfAttributes)), +dtd_property(DTD, attribute(ElementName, AttributeName, Type, Default)) +dtd_property(DTD, entities(ListOfEntityNames)) +dtd_property(DTD, entity(Name, Type)) +dtd_property(DTD, notations(ListOfNotationNames) +dtd_property(DTD, notation(Name, File)) +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +static foreign_t +pl_dtd_property(term_t ref, term_t property) +{ dtd *dtd; + const prop *p; + + initprops(); + + if ( !get_dtd(ref, &dtd) ) + return FALSE; + + for(p=dtd_props; p->func; p++) + { if ( PL_is_functor(property, p->functor) ) + { term_t a = PL_new_term_refs(p->arity); + int i; + + for(i=0; iarity; i++) + PL_get_arg(i+1, property, a+i); + + switch(p->arity) + { case 1: + return (*p->func)(dtd, a+0); + case 2: + return (*p->func)(dtd, a+0, a+1); + case 3: + return (*p->func)(dtd, a+0, a+1, a+2); + case 4: + return (*p->func)(dtd, a+0, a+1, a+2, a+3); + default: + assert(0); + return FALSE; + } + } + } + + return sgml2pl_error(ERR_DOMAIN, "dtd_property", property); +} + + /******************************* + * CATALOG * + *******************************/ + +static foreign_t +pl_sgml_register_catalog_file(term_t file, term_t where) +{ wchar_t *fn; + char *w; + catalog_location loc; + + if ( !PL_get_wchars(file, NULL, &fn, CVT_ATOM|CVT_EXCEPTION) ) + return FALSE; + if ( !PL_get_atom_chars(where, &w) ) + return sgml2pl_error(ERR_TYPE, "atom", where); + + if ( streq(w, "start") ) + loc = CTL_START; + else if ( streq(w, "end") ) + loc = CTL_END; + else + return sgml2pl_error(ERR_DOMAIN, "location", where); + + return register_catalog_file(fn, loc); +} + + + /******************************* + * INSTALL * + *******************************/ + +extern install_t install_xml_quote(void); +#ifdef O_STATISTICS +extern void sgml_statistics(void); +#endif + +install_t +install(void) +{ initConstants(); + + PL_register_foreign("new_dtd", 2, pl_new_dtd, 0); + PL_register_foreign("free_dtd", 1, pl_free_dtd, 0); + PL_register_foreign("new_sgml_parser", 2, pl_new_sgml_parser, 0); + PL_register_foreign("free_sgml_parser", 1, pl_free_sgml_parser, 0); + PL_register_foreign("set_sgml_parser", 2, pl_set_sgml_parser, 0); + PL_register_foreign("get_sgml_parser", 2, pl_get_sgml_parser, 0); + PL_register_foreign("open_dtd", 3, pl_open_dtd, 0); + PL_register_foreign("sgml_parse", 2, pl_sgml_parse, + PL_FA_TRANSPARENT); + PL_register_foreign("_sgml_register_catalog_file", 2, + pl_sgml_register_catalog_file, 0); + PL_register_foreign("$dtd_property", 2, pl_dtd_property, + 0); + + install_xml_quote(); +#ifdef O_STATISTICS + atexit(sgml_statistics); +#endif +} + diff --git a/packages/sgml/sgml_mode.html b/packages/sgml/sgml_mode.html new file mode 100644 index 000000000..518b6e706 --- /dev/null +++ b/packages/sgml/sgml_mode.html @@ -0,0 +1,115 @@ + + + + +PceEmacs SGML mode + + + +

    PceEmacs SGML mode

    + +

    Syntax highlighting

    + +

    +These PceEmacs modes are designed to be simple. Colouring uses the +following colours + +

    + +
    bold blue + begin tag +
    + bold blue onlight-grey + + SGML SHORTREF expansion +
    blue + end tag +
    sea-green + CDATA declared content element +
    orange + warning +
    red + error +
    + +

    +Colouring the whole buffer is the only really safe colouring method, but +unfortunately doesn't scale with large files. Therefore, this mode is +only initiated after loading a file in this mode, after typing +Control-L (recenter) or after an explicit +colourise_buffer command. + +

    +The more error-prone colourise_element is activated after many +commands as well as after an idle-period of 2 seconds. This command +searches backward to the first element that encloses the caret and then +colourises this element. It is errorprone for two reasons. First of +all, it does not validate whether the checked element itself is at a +valid location. Second, it does not see elements allowed throught the +DTD's +(Model) construct. + + +

    Errors and warnings

    + +

    +Errors and warning encountered during parsing cause the suspect region +to be coloured orange (warning) or red (error). Moving the caret in the suspect +area causes the error to be printed in the status line at the bottom +of the editor. + + +

    Syntax-guided editing

    + +

    +Most syntax-guided editing is supported using a context-sensitive +popup-menu associated with the right mouse button. To create an +new element, press the right mouse-button at the desired location +and select the desired element. The modifier buttons define how +the element is inserted: + +

    + +
    control Insert adjacent begin- and end-tag. +
    shift Place begin- and end-tag each on their own line +
    alt (SGML only) create a SHORTTAG element +
    + +

    +To tag the current selection with a tag, first make a selection and +then press the right mouse-button inside the selection. The same +modifiers as above apply. If no modifier is given, the default +is to place the tag at their own line of the selection contains +complete lines and `in-line' otherwise. + +

    +To add attributes to an element, press the right-button inside the +elements begin-tag and select the desired elements. + + +

    Bugs and TODO

    + +
      +
    • Cache the parsed DTD +
    • Deal with included +(Group) elements properly +
    • Display total number of encountered errors and provide navigation + on large files +
    • Personalise layout: style to use for certain tags, syntax + highlighting, etc. +
    • Take are of already present attributes and elements. +
    + + + + + + diff --git a/packages/sgml/sgml_mode.pl b/packages/sgml/sgml_mode.pl new file mode 100644 index 000000000..55c3a5f3b --- /dev/null +++ b/packages/sgml/sgml_mode.pl @@ -0,0 +1,1081 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(emacs_sgml_mode, []). +:- use_module(library(pce)). +:- use_module(library(emacs_extend)). + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +This module exploits the SGML/XML parser from the SWI-Prolog package +sgml2pl to provide syntax colouring for SGML, XML and HTML modes. Based +on a true parser, we can provide much better feedback as heuristic +parsers used in most syntax-driven editors. For example, we can provide +feedback on SHORTREF matches in SGML mode by highlighting the tokens +acting as a short reference. We can also easily give the scope of +elements that are closed by omited elements. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +:- emacs_begin_mode(sgml, language, + "Mode for editing SGML documents", + % BINDINGS + [ open_document = button(sgml), + reload_dtd = button(sgml), + colourise_buffer = button(sgml), + colourise_and_recenter = key('\\C-l'), + tag_selection = key('\\e,'), + colourise_element = key('\\C-c\\C-s'), + forward_move_out = key('\\ee') + ], + % SYNTAX TABLE + [ '"' = string_quote('"'), + '\'' = string_quote('\''), + paragraph_end('\\s *$\\|^

    +This page describes the PceEmacs modes SGML, HTML and +XML. These modes are based on the +SWI-Prolog +package sgml2pl, an SGML/XML parser for Prolog. It exploits this parser +to achieve both colouring syntax elements and errors as well as to +provided menu-driven editing of SGML, HTML and XML documents. + + +

    \\|\\s +<') + ]). + +class_variable(auto_colourise_size_limit, int, 100000, + "Auto-colourise if buffer is smaller then this"). + +variable(dialect, + {sgml,xml,html}:=sgml, + both, + "?ML Dialect used to parse"). +variable(upcase_elements, + bool := @off, + both, + "Upcase inserted elements?"). +variable(parser, + prolog, + none, + "Associated (DTD) parser"). +variable(auto_colourise_size_limit, + int, + both, + "Auto-colourise if buffer is smaller then this"). + +% make_parser(M, Parser) +% +% Create a parser suitable for the current mode and load the DTD +% into it. + +make_parser(M, Parser) :- + get(M, dialect, Dialect), + get(M, text_buffer, TB), + get(TB, file, File), + get(File, name, FileName), + get(M, dialect, Dialect), + ( Dialect == html + -> TheDialect = sgml, + dtd(html, DTD), + Options = [dtd(DTD)] + ; TheDialect = Dialect, + Options = [] + ), + new_sgml_parser(Parser, Options), + set_sgml_parser(Parser, file(FileName)), + set_sgml_parser(Parser, dialect(TheDialect)). + + +% load_dtd(+Mode, +Parser) +% +% Load the document DTD into the given parser. + +load_dtd(M, _) :- + get(M, dialect, html), !. +load_dtd(M, Parser) :- + get(M, text_buffer, TB), + new(Re, regex(' get(Re, register_start, Start), + pce_open(TB, read, In), + seek(In, Start, bof, _), + catch(sgml_parse(Parser, + [ source(In), + parse(declaration) + ]), + E, + show_message(M, E)), + close(In) + ; send(M, report, warning, 'No true + ; send(M, destroy_dtd), + make_parser(M, Parser), + load_dtd(M, Parser), + send(M, slot, parser, Parser), + set_sgml_parser(Parser, doctype(_)) % use for partial parsing + ). + + +dtd(M, Reload:[bool], DTD:prolog) :<- + "Fetch the current DTD":: + get(M, parser, Reload, Parser), + get_sgml_parser(Parser, dtd(DTD)). + + +reload_dtd(M) :-> + "Reload the DTD":: + get(M, dtd, _). + + +destroy_dtd(M) :-> + "Destroy the associated DTD object":: + ( get(M, slot, parser, Parser), + Parser = sgml_parser(_) + -> free_sgml_parser(Parser), + send(M, slot, parser, []) + ; true + ). + + + /******************************* + * INITIALISE * + *******************************/ + +setup_mode(E) :-> + "Switch editor into fill-mode":: + send(E, fill_mode, @on). + + +unlink(M) :-> + send(M, destroy_dtd), + send_super(M, unlink). + + +open_document(M, DT:doctype=name) :-> + "Insert \n\n', DT), + send(M, backward_char, 4). + + + /******************************* + * HELP * + *******************************/ + +help_on_mode(M) :-> + ( absolute_file_name(sgml_mode, + [ extensions([html]), + access(read), + file_errors(fail) + ], + HTML) + -> atom_concat('file:', HTML, URI), + www_open_url(URI) + ; send(M, report, warning, 'Can''t find help file') + ). + + /******************************* + * RECOULOR POLICIES * + *******************************/ + +idle(M) :-> + "Idle event was received, colour the current element":: + send(M, colourise_element, @off). + +setup_styles(M) :-> + "Associate defined syntax-styles":: + ( get(M, attribute, styles_initialised, @on) + -> true + ; send(M, reload_styles), + send(M, attribute, styles_initialised, @on) + ). + +set_caret_and_inform(M) :-> + get(M, editor, Editor), + get(Editor?image, index, @event, Caret), + send(M, caret, Caret), + get(M?text_buffer, find_all_fragments, + message(@arg1, overlap, Caret), + Fragments), + send(Fragments, sort, ?(@arg1?length, compare, @arg2?length)), + get(Fragments, find, ?(@arg1, attribute, balloon), Frag), + get(Frag, balloon, Balloon), + send(M, report, warning, 'SGML warning: %s', Balloon). + +event(M, Ev:event) :-> + "Show insert-menu on right-down":: + send(Ev, is_a, ms_right_down), + ( get(M?image, index, Ev, I) + -> send(M, caret, I) + ; true + ), + send(M, show_menu, Ev). + +reload_styles(M) :-> + "Force reloading the styles":: + ( style_object(Name, Style), + send(M, style, Name, Style), + fail + ; true + ). + +colourise_element(M, Warn:[bool]) :-> + "Colour element at location":: + send(M, setup_styles), + get(M, caret, Caret), + get(M, text_buffer, TB), + new(Re, regex('<\\w+')), + make_parser(M, Parser), + load_dtd(M, Parser), + set_sgml_parser(Parser, doctype(_)), + pce_open(TB, read, In), + ( get(TB, scan, Caret, line, -2, start, Start), +% format('Starting from ~w~n', [Start]), + find_element(M, Parser, Re, In, Start, From-To), + Caret < To + -> send(M, remove_syntax_fragments, From, To), +% colour_item(element, TB, From, To), + seek(In, From, bof, _), + set_sgml_parser(Parser, charpos(From)), + colourise(M, Parser, + [ source(In), + parse(element) + ]) + ; Warn == @off + -> true + ; send(M, report, warning, 'Could not find element') + ), + close(In), + free_sgml_parser(Parser). + +% find_element(+Mode, +Parser, +BeginRegex, +In, +Caret, -From-To) +% +% Find the start and end of the current element. We do so by scanning +% backwards to '<\\w+' (Re). Then we parse the element and see where +% it ends. If this isn't passed the current caret location we look +% further backward. +% +% This predicate is non-deterministic, broadening the scope on +% backtracking. +% +% By asserting caret/1 before calling this predicate, it will +% assert a term element(Tag, Attributes, Start, End), where both +% locations are terms of the form loc(Class, Start, End) indicating +% the location and type of the begin- and end-tag. + +:- dynamic + caret/1, % Caret + element/4, % Tag, Attributes, Start, End + stack/5. % Tag, Attributes, Class, Fro, To + +set_caret(Caret) :- + retractall(caret(Caret)), + assert(caret(Caret)). + +unset_caret :- + retractall(caret(_)). + +find_element(M, Caret, Range) :- + get(M, parser, Parser), + get(M, text_buffer, TB), + pce_open(TB, read, In), + new(Re, regex('<\\w+')), + ( find_element(M, Parser, Re, In, Caret, Range) + -> close(In) + ; close(In), + fail + ). + +find_element(M, Parser, Re, In, Caret, Range) :- + get(M, text_buffer, TB), + send(Re, search, TB, Caret, 0), + get(Re, register_start, 0, Start0), + find_element(M, Parser, Re, In, Caret, Start0, Range). + +find_element(M, Parser, _Re, In, Caret, Start, Start-To) :- + \+ get(M?text_buffer, find_fragment, + and(message(@arg1, overlap, Start), + @arg1?parsed == @off), + _), + seek(In, Start, bof, _), + set_sgml_parser(Parser, charpos(Start)), + ( caret(_) + -> retractall(element(_,_,_,_)), + retractall(stack(_,_,_,_,_)), + Extra = [ call(begin, emacs_sgml_mode:find_on_begin), + call(end, emacs_sgml_mode:find_on_end) + ] + ; Extra = [] + ), + catch(sgml_parse(Parser, + [ source(In), + parse(element), + syntax_errors(quiet) + | Extra + ]), + E, + show_message(M, E)), + % charpos/1 yields start-position + get_sgml_parser(Parser, charpos(_, To)), +% format('Found ~d-~d~n', [Start, To]), + To-1 > Caret. +find_element(M, Parser, Re, In, Caret, Start0, Range) :- + get(M, text_buffer, TB), + send(Re, search, TB, Start0, 0), + get(Re, register_start, 0, Start1), + find_element(M, Parser, Re, In, Caret, Start1, Range). + +find_on_begin(Tag, Attributes, Parser) :- + get_sgml_parser(Parser, charpos(From, To)), + get_sgml_parser(Parser, event_class(Class)), +% format('BEGIN: ~w ~w-~w (~w)~n', [Tag, From, To, Class]), + asserta(stack(Tag, Attributes, Class, From, To)). +find_on_end(Tag, Parser) :- + get_sgml_parser(Parser, charpos(EFrom, ETo)), +% format('END: ~w ~w-~w~n', [Tag, EFrom, ETo]), + retract(stack(Tag, Attributes, BClass, BFrom, BTo)), + caret(Caret), + ( between(BFrom, ETo, Caret) + -> get_sgml_parser(Parser, event_class(EClass)), + ( element(_,_,_,_) + -> true + ; assert(element(Tag, Attributes, + loc(BClass, BFrom, BTo), + loc(EClass, EFrom, ETo))) + ) + ; true + ). + + + /******************************* + * COLOURISING * + *******************************/ + +colourise_and_recenter(M) :-> + "Colour according to syntax and recenter":: + send(M, auto_colourise_buffer), + send(M, recenter). + +colourise_buffer(M) :-> + OldTime is cputime, + new(Class, class(sgml_mode_fragment)), + get(Class, no_created, OldCreated), + + send(M, setup_styles), + send(M, remove_syntax_fragments), + send(M, report, progress, 'Colourising buffer ...'), + colourise_buffer(M), + Used is cputime - OldTime, + get(Class, no_created, NewCreated), + Created is NewCreated - OldCreated, + send(M, report, done, + 'done, %.2f seconds, %d fragments', Used, Created). + +:- dynamic + current_tb/2. + +colourise_buffer(M) :- + make_parser(M, Parser), + get(M, text_buffer, TB), + pce_open(TB, read, In), + colourise(M, Parser, + [ source(In) + ]), + free_sgml_parser(Parser). + +colourise(M, Parser, Options) :- + get_sgml_parser(Parser, file(File)), + get(M, text_buffer, TB), + asserta(current_tb(TB, File), Ref), + catch(sgml_parse(Parser, + [ call(begin, emacs_sgml_mode:on_begin), + call(end, emacs_sgml_mode:on_end), + call(cdata, emacs_sgml_mode:on_cdata), + call(decl, emacs_sgml_mode:on_decl), + call(error, emacs_sgml_mode:on_error) + | Options + ]), + E, + show_message(M, E)), + erase(Ref). + +on_begin(_Tag, _Attributes, Parser) :- + get_sgml_parser(Parser, file(File)), + current_tb(TB, File), +% format('Tag ~w, Attr = ~p~n', [Tag, Attributes]), + get_sgml_parser(Parser, charpos(From, To)), + get_sgml_parser(Parser, event_class(Class)), + Class \== omitted, + colour_item(tag(begin, Class), TB, From, To). +on_end(_Tag, Parser) :- + get_sgml_parser(Parser, file(File)), + current_tb(TB, File), + get_sgml_parser(Parser, charpos(From, To)), + get_sgml_parser(Parser, event_class(Class)), +% format('At ~d-~d: Class = ~w~n', [From, To, Class]), + Class \== omitted, + colour_item(tag(end, Class), TB, From, To). +on_cdata(_CDATA, Parser) :- + get_sgml_parser(Parser, file(File)), + current_tb(TB, File), + get_sgml_parser(Parser, charpos(From, To)), + ( get_sgml_parser(Parser, context([Tag|_])) + -> ( get_sgml_parser(Parser, dtd(DTD)), + dtd_property(DTD, element(Tag, _, Model)), + ( Model == cdata + ; Model == rcdata + ) + -> Type = cdata + ; Type = pcdata + ) + ), +% format('CDATA from ~d-~d~n', [From, To]), + colour_item(cdata(Type), TB, From, To, Fragment), + ( Type == cdata + -> send(Fragment, parsed, @off) + ; true + ). +on_decl(DECL, Parser) :- + get_sgml_parser(Parser, file(File)), + current_tb(TB, File), + get_sgml_parser(Parser, event_class(explicit)), + get_sgml_parser(Parser, charpos(From, To)), +% format('Decl ~d-~d: ~w~n', [From, To, DECL]), + ( DECL == '' + -> colour_item(comment, TB, From, To, Fragment), + send(Fragment, parsed, @off) + ; send(regex('DOCTYPE', @off), match, DECL) + -> colour_item(doctype, TB, From, To) + ; new(Re, regex('\\w*')), + send(Re, match, DECL), + get(Re, register_value, DECL, 0, name, DeclType0), + get(DeclType0, downcase, DeclType), +% format('Decl(~w)~n', [DeclType]), + colour_item(decl(DeclType), TB, From, To) + ). +on_error(Severity, Message, Parser) :- + current_tb(TB, File), + ( get_sgml_parser(Parser, file(File)) + -> get_sgml_parser(Parser, charpos(From, To)), + colour_item(error(Severity), TB, From, To, Fragment), + ( Fragment \== @nil + -> send(Fragment, message, Message), + send(Fragment, severity, Severity) + ; true + ) + ; format(user_error, 'SGML: Error in other file!~n', []) + ). + +% colour_item(+Class, +TB, +Pos) +% +% colourise region if a style is defined for this class. + +colour_item(Class, TB, From, To) :- + colour_item(Class, TB, From, To, _Fragment). + +colour_item(Class, TB, From, To, Fragment) :- + style_name(Class, Name), !, + Len is To - From, + Len > 0, + new(Fragment, sgml_mode_fragment(TB, From, Len, Name)). +colour_item(_, _, _, _, @nil). + + + /******************************* + * STYLES * + *******************************/ + +:- discontiguous + style_name/2, % +Pattern, -StyleName + style_object/2. % +Name, -Style + +term_expansion(style(Pattern, Style), + [ style_name(Pattern, Name), + style_object(Name, Style) + ]) :- + gensym(syntax_style_, Name). + +style(tag(begin, shortref), style(colour := blue, + background := grey90, + bold := @on)). +style(tag(begin, _), style(colour := blue, + bold := @on)). +style(tag(end, shorttag), style(colour := blue, + bold := @on)). +style(tag(end, shortref), style(colour := blue, + background := grey90, + bold := @on)). +style(tag(end, _), style(colour := blue)). +style(cdata(cdata), style(colour := sea_green)). +style(doctype, style(bold := @on)). +style(comment, style(colour := dark_green, + background := grey90)). +style(decl(_), style(colour := purple)). +style(error(warning), style(background := orange)). +style(error(_), style(background := red)). +style(entity, style(colour := dark_green)). +style(element, style(background := pale_turquoise)). + + + /******************************* + * TAGGING * + *******************************/ + +set_insert_point(M, Point:[int]) :-> + "Set mark at point if not set":: + get(M, mark, Mark), + ( Mark == 0 + -> ( Point == @default + -> send(M, mark, M?caret) + ; send(M, mark, Point) + ) + ; true + ). + +insert_begin(M, Tag:name) :-> + "Insert begin-tag and required attributes":: + fix_case(M, Tag, TheTag), + send(M, format, '<%s>', TheTag), + get(M, dtd, DTD), + findall(A, dtd_property(DTD, attribute(Tag, A, _, required)), List), + send(M, backward_char), + insert_attributes(List, M), + send(M, forward_char). + + +insert_attributes([], _). +insert_attributes([H|T], M) :- + send(M, format, ' %s=""', H), + send(M, set_insert_point, M?caret-1), + insert_attributes(T, M). + + +insert_end(M, Tag:name) :-> + "Insert end-tag for element":: + fix_case(M, Tag, TheTag), + send(M, format, '', TheTag). + + +fix_case(M, Tag, TheTag) :- + ( get(M, upcase_elements, @on) + -> get(Tag, upcase, TheTag) + ; TheTag = Tag + ). + + +style_for_event(Ev, Style) :- + ( send(Ev, has_modifier, c) + -> Style = inline + ; send(Ev, has_modifier, s) + -> Style = block + ; send(Ev, has_modifier, m) + -> Style = shorttag + ; Style = @default + ). + + +show_menu(M, Ev:event) :-> + "Show menu to insert-tag/tag selection":: + ( send(M, in_tag) + -> send(M, show_attribute_menu, Ev) + ; send(M, show_element_menu, Ev) + ). + + +in_tag(M) :-> + "Test whether caret is between <>":: + get(M, caret, Caret), + get(M, text_buffer, TB), + send(regex('<[^>]*'), match, TB, Caret, 0), + send(regex('[^<]*[>/]'), match, TB, Caret). % / for shortag + + +show_element_menu(M, Ev:event) :-> + "Show menu for inserting a new element":: + ( get(M, allowed_elements, List), + delete(List, '#pcdata', Elems), + Elems \== [], + sort(Elems, Sorted) + -> ( get(M, selection, point(A,B)), B > A + -> Label = tag_selection + ; Label = insert_element + ), + get(Ev, button, Button), + style_for_event(Ev, Style), + new(G, popup_gesture(new(P, popup(Label, + message(M, popup_tag_selection, + @arg1, Style))), + Button, new(modifier))), + send(P, show_label, @on), + length(Sorted, Len), + Cols is max(1, Len // 20), + send(P, columns, Cols), + send_list(P, append, Sorted), + send(G, event, Ev) + ; send(M, report, warning, 'No element allowed here') + ). + + +show_attribute_menu(M, Ev:event) :-> + "Show menu for adding attributes":: + get(M, caret, Caret), + get(M, text_buffer, TB), + new(Re, regex('<[^>]*')), + send(Re, match, TB, Caret, 0), + get(Re, register_start, 0, Start), + ( get(M, looking_at_element, Start, E) + -> make_parser(M, Parser), + load_dtd(M, Parser), + get_sgml_parser(Parser, dtd(DTD)), + dtd_property(DTD, attributes(E, Atts)), + ( Atts == [] + -> free_sgml_parser(Parser), + send(M, report, warning, 'Element "%s" has no attributes', E) + ; sort(Atts, Sorted), +% format('Atts = ~p~n', [Sorted]), + get(Ev, button, Button), + new(G, popup_gesture(new(P, popup(add_attribute, + message(M, insert_attribute, + @arg1))), + Button, + new(modifier))), + send(P, show_label, @on), + length(Sorted, Len), + Cols is max(1, Len // 10), + send(P, columns, Cols), + fill_attribute_menu(Sorted, DTD, E, P, M), + free_sgml_parser(Parser), + send(G, event, Ev) + ) + ; send(M, report, warning, 'Not in begin-tag') + ). + + +fill_attribute_menu([], _, _, _, _). +fill_attribute_menu([A|T], DTD, E, P, Mode) :- + dtd_property(DTD, attribute(E, A, Type, Default)), + add_attribute_menu(Type, Default, A, P, Mode), + fill_attribute_menu(T, DTD, E, P, Mode). + +add_attribute_menu(nameof(List), Def, A, P, Mode) :- !, + send(P, append, new(P2, popup(A, message(Mode, insert_attribute, + A, @arg1)))), + add_attribute_values(List, Def, P2). +add_attribute_menu(Type, Def, A, P, _Mode) :- !, + type_label(Type, TypeLabel), + send(P, append, new(MI, menu_item(A, @default, + string('%s (%s)', A, TypeLabel)))), + ( Def == required + -> send(MI, font, bold) + ; true + ). + +type_label(list(Type), Label) :- !, + atom_concat(Type, s, Label). +type_label(Type, Type). + +add_attribute_values([], _, _). +add_attribute_values([H|T], Def, P) :- + send(P, append, new(MI, menu_item(H))), + ( Def == default(H) + -> send(MI, font, bold) + ; true + ), + add_attribute_values(T, Def, P). + + +insert_attribute(M, Att:name, Val:'[name|int|real]') :-> + "Add attribute-value pair":: + get(M, text_buffer, TB), + get(M, caret, Caret), + new(Re, regex('\\(\\s +\\|[/>]\\)')), + send(Re, search, TB, Caret), % find place to insert + get(Re, register_start, 0, Where), + ( send(regex('\\s +'), match, TB, Where) + -> get(Re, register_end, 0, NewCaret), % after blanks + send(M, caret, NewCaret) + ; send(M, caret, Where), + send(M, format, ' ') + ), + ( Val == @default + -> send(M, format, '%s=""', Att), + get(M, caret, C), + ( send(M, looking_at, '\\s \\|[/>]') + -> true + ; send(M, format, ' ') + ), + send(M, caret, C-1) + ; send(M, format, '%s="%s"', Att, Val), + ( send(M, looking_at, '\\s \\|[/>]') + -> true + ; send(M, format, ' ') + ) + ), + send(M, mark_undo). % called from popup! + + +popup_tag_selection(M, Tag:name, Style0:[{inline,shorttag,block}]) :-> + "->tag_selection wrapper for popup":: + ( Style0 == @default + -> style_for_event(@event, Style) + ; Style = Style0 + ), + send(M, tag_selection, Tag, Style), + send(M, mark_undo). + + +tag_selection(M, Tag:[name], Style:[{inline,block,shorttag}]) :-> + "Tag the current selection using element":: + ( Tag == @default + -> new(TI, text_item(element)), + ( get(M, allowed_elements, List), +% format('Allowed: ~p~n', [List]), + delete(List, '#pcdata', Elems), + sort(Elems, Sorted) + -> send(TI, value_set, Sorted) + ; true + ), + get(M, prompt_using, TI, String), + get(String, value, TheTag) + ; TheTag = Tag + ), + ( get(M, selection, point(A,B)), + B > A + -> send(M, tag_region, TheTag, A, B, Style), + send(M, selection, 0, 0), + send(M, colourise_element) + ; send(M, insert_element, TheTag, Style) + ). + + +tag_region(M, Tag:[name], From:int, To:int, + Style:[{inline,block,shorttag}]) :-> + "Tag a defined region":: + fix_case(M, Tag, TheTag), + get(M, text_buffer, TB), + ( Style == shorttag + -> send(TB, insert, To, /), + send(TB, insert, From, string('<%s/', TheTag)) + ; Style == block + -> ( get(M, column, To, 0) + -> send(TB, insert, To, string('\n', TheTag)) + ; send(TB, insert, From, string('\n\n', TheTag)) + ), + ( get(M, column, From, 0) + -> send(TB, insert, From, string('<%s>\n', TheTag)) + ; send(TB, insert, From, string('\n<%s>\n', TheTag)) + ) + ; Style == inline + -> send(TB, insert, To, string('', TheTag)), + send(TB, insert, From, string('<%s>', TheTag)) + ; get(M, column, From, 0), + get(M, column, To, 0) + -> send(M, tag_region, Tag, From, To, block) + ; send(M, tag_region, Tag, From, To, inline) + ). + + +insert_element(M, Tag:element=name, Style:[{inline,shorttag,block}]) :-> + "Insert a new empty element":: + ( get(M, dtd, DTD), + dtd_property(DTD, element(Tag, Omit, Content)) + -> true + ; Omit = omit(false, false), + Content = '#pcdata' + ), + fix_case(M, Tag, TheTag), + send(M, prepare_insert), + send(M, mark, 0), % put insert position here + insert_by_style(Style, M, TheTag, Omit, Content, _), + send(M, colourise_element), + ( get(M, mark, Mark), + Mark > 0 + -> send(M, caret, Mark) + ; true + ). + +prepare_insert(M) :-> + "Find location to insert a new tag":: + get(M, caret, Caret), + ( find_element(M, Caret, From-_To) + -> get(M, looking_at_element, From, E), +% format('~p: Inserting in "~w" at ~w~n', [M, E, From]), + get(M, dtd, DTD), + dtd_property(DTD, element(E, _, Content)), + ( mixed_content(Content) + -> true + ; get(M, column, From, Col0), + Col is Col0+2, + get(M, text_buffer, TB), + get(TB, scan, Caret, line, 0, start, SOL), + ( new(Re, regex('\\s *')), + send(Re, match, TB, SOL, Caret), + get(Re, register_end, Caret) + -> true % at a blank line + ; send(M, newline) + ), + send(M, align_line, Col) + ) + ; true + ). + +insert_by_style(_, M, Tag, _, empty, End) :- !, + send(M, insert_begin, Tag), + ( get(M, dialect, xml) + -> send(M, backward_char), + send(M, format, /), + send(M, forward_char) + ; true + ), + get(M, caret, End). +insert_by_style(Style, M, Tag, _, Model, End) :- + required_content(Model, List), + ( mixed_content(Model) + -> def_style(Style, inline, TheStyle), + insert_by_style(TheStyle, M, Tag, End), + send(M, set_insert_point) + ; insert_by_style(block, M, Tag, End0), + get(M, text_buffer, TB), + new(Mark, fragment(TB, End0, 0)), + insert_sub_elements(List, M), + get(Mark, start, End), + free(Mark) + ). + +mixed_content(M) :- + term_member('#pcdata', M), !. + +term_member(X, X). +term_member(X, C) :- + compound(C), + arg(_, C, A), + term_member(X, A). + +insert_by_style(shorttag, M, Tag, End) :- !, + send(M, insert_begin, Tag), + send(M, backward_delete_char), + send(M, format, '//'), + get(M, caret, End), + send(M, backward_char). +insert_by_style(inline, M, Tag, End) :- !, + send(M, insert_begin, Tag), + get(M, caret, New), + send(M, insert_end, Tag), + get(M, caret, End), + send(M, caret, New). +insert_by_style(block, M, Tag, End) :- !, + send(M, insert_begin, Tag), + get(M, caret, Insert), + send(M, newline_and_indent), + send(M, insert_end, Tag), + get(M, caret, End), + send(M, caret, Insert). +insert_by_style(@default, M, Tag, End) :- + get(M, text_buffer, TB), + get(M, caret, Caret), + get(TB, scan, Caret, line, 0, start, SOL), + ( send(regex('\\s *$'), match, TB, SOL) + -> insert_by_style(block, M, Tag, End) + ; insert_by_style(inline, M, Tag, End) + ). + + +def_style(@default, Style, Style) :- !. +def_style(Style, _, Style). + +insert_sub_elements([], _). +insert_sub_elements([H|T], M) :- + send(M, format, ' '), + get(M, dtd, DTD), + dtd_property(DTD, element(H, Omit, Content)), + send(M, prepare_insert), + insert_by_style(@default, M, H, Omit, Content, End), + ( T == [] + -> true + ; send(M, caret, End), + insert_sub_elements(T, M) + ). + + +required_content(empty, []). +required_content(cdata, []). +required_content(Model, Elems) :- + phrase(required_content(Model), Elems). + +required_content((A,B)) --> !, + required_content(A), + required_content(B). +required_content(&(A,B)) --> !, + required_content(A), + required_content(B). +required_content('|'(_,_)) --> !, + []. +required_content(?(_)) --> + []. +required_content(*(_)) --> + []. +required_content(+(A)) --> + required_content(A). +required_content('#pcdata') --> !, + []. +required_content(A) --> + [A]. + + +looking_at_element(M, From:int, Elem:name) :<- + new(Re, regex('<\\([-_:a-zA-Z0-9]+\\)')), + get(M, text_buffer, TB), + send(Re, match, TB, From), + get(Re, register_value, TB, 1, name, Elem). + + +allowed_elements(M, Allowed:prolog) :<- + "Show elements allowed here":: + get(M, caret, Caret), + get(M, text_buffer, TB), + new(Re, regex('<\\w+')), + make_parser(M, Parser), + load_dtd(M, Parser), + get_sgml_parser(Parser, dtd(DTD)), + set_sgml_parser(Parser, doctype(_)), + pce_open(TB, read, In), + set_caret(Caret), + ( find_element(M, Parser, Re, In, Caret, From-_To), + get(M, looking_at_element, From, E), +% format('Looking at ~w~n', [E]), + ( dtd_property(DTD, doctype(E)) + ; dtd_property(DTD, element(E, omit(_, false), _)) + ) + -> unset_caret, + seek(In, From, bof, _), + set_sgml_parser(Parser, charpos(From)), + Len is Caret - From, + catch(sgml_parse(Parser, + [ goal(emacs_sgml_mode:feed(In, Len)), + syntax_errors(quiet), + parse(input) % do not complete document + ]), + E, + show_message(M, E)), + ( element(_,_,_,loc(explicit,_,_)) + -> format('End-tag available~n', []) + ; true + ), + get_sgml_parser(Parser, allowed(Allowed)) + ; unset_caret, + dtd_property(DTD, doctype(DocType)), + atom(DocType) + -> Allowed = [DocType] + ; send(M, report, warning, 'No current element'), + Allowed = [] + ), + close(In), + free_sgml_parser(Parser). + +feed(In, Len, Parser) :- + copy_stream_data(In, Parser, Len). + +report_allowed(M) :-> % DEBUGGING + "Report allowed elements at point":: + get(M, allowed_elements, Allowed), + concat_atom(Allowed, ', ', Atom), + send(M, report, status, 'Allowed: %s', Atom). + +show_message(M, E) :- + message_to_string(E, String), + send(M, report, warning, 'Caught error: %s', String). + + + /******************************* + * MOVING AROUND * + *******************************/ + +forward_move_out(M) :-> + "Move forwards to end of current element":: + get(M, caret, Caret), + ( find_element(M, Caret, _From-To) + -> send(M, caret, To) + ; send(M, report, warning, 'Cannot find element') + ). + +:- emacs_end_mode. + + + + /******************************* + * XML * + *******************************/ + +:- emacs_begin_mode(xml, sgml, + "Mode for editing XML documents", + [], + []). + +initialise(M) :-> + send_super(M, initialise), + send(M, dialect, xml). + +open_document(M, DTD:doctype=name) :-> + "Insert document header":: + send(M, format, '\n'), + send_super(M, open_document, DTD). + +:- emacs_end_mode. + + + /******************************* + * HTML * + *******************************/ + +:- emacs_begin_mode(html, sgml, + "Mode for editing HTML documents", + [], + []). + +initialise(M) :-> + send_super(M, initialise), + send(M, dialect, html). + +open_document(M) :-> + "Insert document header":: + send(M, format, + '\n\n'), + send(M, insert_element, html). + +:- emacs_end_mode. + + + /******************************* + * FRAGMENT * + *******************************/ + +:- pce_begin_class(sgml_mode_fragment, emacs_colour_fragment, + "Provide colourised region"). + +variable(parsed, bool := @on, both, "@off for unparsed fragments"). + +:- pce_end_class. diff --git a/packages/sgml/sgml_write.pl b/packages/sgml/sgml_write.pl new file mode 100644 index 000000000..0517be1d8 --- /dev/null +++ b/packages/sgml/sgml_write.pl @@ -0,0 +1,808 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker & Richard O'Keefe + E-mail: wielemaker@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2004, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(sgml_write, + [ html_write/2, % +Data, +Options + html_write/3, % +Stream, +Data, +Options + sgml_write/2, % +Data, +Options + sgml_write/3, % +Stream, +Data, +Options + xml_write/2, % +Data, +Options + xml_write/3 % +Stream, +Data, +Options + ]). +:- use_module(library(lists)). +:- use_module(library(sgml)). +:- use_module(library(debug)). +:- use_module(library(assoc)). +:- use_module(library(option)). +:- use_module(library(error)). + +/** XML/SGML writer module + +This library provides the inverse functionality of the sgml.pl parser +library, writing XML, SGML and HTML documents from the parsed output. It +is intended to allow rewriting in a different dialect or encoding or to +perform document transformation in Prolog on the parsed representation. + +The current implementation is particularly keen on getting character +encoding and the use of character entities right. Some work has been +done providing layout, but space handling in XML and SGML make this a +very hazardous area. + +The Prolog-based low-level character and escape handling is the real +bottleneck in this library and will probably be moved to C in a later +stage. + +@see library(http/html_write) provides a high-level library for + emitting HTML and XHTML. +*/ + +%% xml_write(+Data, +Options) is det. +%% sgml_write(+Data, +Options) is det. +%% html_write(+Data, +Options) is det. +%% xml_write(+Stream, +Data, +Options) is det. +%% sgml_write(+Stream, +Data, +Options) is det. +%% html_write(+Stream, +Data, +Options) is det. +% +% Write a term as created by the SGML/XML parser to a stream in +% SGML or XML format. Options: +% +% * dtd(DTD) +% The DTD. This is needed for SGML documents that contain +% elements with content model EMPTY. Characters which may +% not be written directly in the Stream's encoding will be +% written using character data entities from the DTD if at +% all possible, otherwise as numeric character references. +% Note that the DTD will NOT be written out at all; as yet +% there is no way to write out an internal subset, though +% it would not be hard to add one. +% +% * doctype(DocType) +% Document type for the SGML document type declaration. +% If omitted it is taken from the root element. There is +% never any point in having this be disagree with the +% root element. A declaration will be written +% if and only if at least one of doctype(_), public(_), or +% system(_) is provided in Options. +% +% * public(PubId) +% The public identifier to be written in the line. +% +% * system(SysId) +% The system identifier to be written in the line. +% +% * header(Bool) +% If Bool is 'false', do not emit the header +% line. (xml_write/3 only) +% +% * nsmap(Map:list(Id=URI)) +% When emitting embedded XML, assume these namespaces +% are already defined from the environment. (xml_write/3 +% only). +% +% * indent(Indent) +% Indentation of the document (for embedding) +% +% * layout(Bool) +% Emit/do not emit layout characters to make output +% readable. +% +% * net(Bool) +% Use/do not use Null End Tags. +% For XML, this applies only to empty elements, so you get +% +% == +% (default, net(true)) +% (net(false)) +% == +% +% For SGML, this applies to empty elements, so you get +% +% == +% (if foo is declared to be EMPTY in the DTD) +% (default, net(false)) +% xxx (default, net(false)) +% set_stream(Stream, encoding(utf8)), + call_cleanup(xml_write(Stream, Data, Options), + set_stream(Stream, encoding(text))) + ; new_state(xml, State), + init_state(Options, State), + get_state(State, nsmap, NSMap), + add_missing_namespaces(Data, NSMap, Data1), + emit_xml_encoding(Stream, Options), + emit_doctype(Options, Data, Stream), + write_initial_indent(State, Stream), + emit(Data1, Stream, State) + ). + + +sgml_write(Data, Options) :- + current_output(Stream), + sgml_write(Stream, Data, Options). + +sgml_write(Stream0, Data, Options) :- + fix_user_stream(Stream0, Stream), + ( stream_property(Stream, encoding(text)) + -> set_stream(Stream, encoding(utf8)), + call_cleanup(sgml_write(Stream, Data, Options), + set_stream(Stream, encoding(text))) + ; new_state(sgml, State), + init_state(Options, State), + write_initial_indent(State, Stream), + emit_doctype(Options, Data, Stream), + emit(Data, Stream, State) + ). + + +html_write(Data, Options) :- + current_output(Stream), + html_write(Stream, Data, Options). + +html_write(Stream, Data, Options) :- + sgml_write(Stream, Data, + [ dtd(html) + | Options + ]). + +fix_user_stream(user, user_output) :- !. +fix_user_stream(Stream, Stream). + + +init_state([], _). +init_state([H|T], State) :- + update_state(H, State), + init_state(T, State). + +update_state(dtd(DTD), State) :- !, + ( atom(DTD) + -> dtd(DTD, DTDObj) + ; DTDObj = DTD + ), + set_state(State, dtd, DTDObj), + dtd_character_entities(DTDObj, EntityMap), + set_state(State, entity_map, EntityMap). +update_state(nsmap(Map), State) :- !, + set_state(State, nsmap, Map). +update_state(indent(Indent), State) :- !, + must_be(integer, Indent), + set_state(State, indent, Indent). +update_state(layout(Bool), State) :- !, + must_be(boolean, Bool), + set_state(State, layout, Bool). +update_state(doctype(_), _) :- !. +update_state(public(_), _) :- !. +update_state(system(_), _) :- !. +update_state(net(Bool), State) :- !, + must_be(boolean, Bool), + set_state(State, net, Bool). +update_state(header(Bool), _) :- !, + must_be(boolean, Bool). +update_state(Option, _) :- + domain_error(xml_write_option, Option). + +% emit_xml_encoding(+Stream, +Options) +% +% Emit the XML fileheader with encoding information. Setting the +% right encoding on the output stream must be done before calling +% xml_write/3. + +emit_xml_encoding(Out, Options) :- + option(header(Hdr), Options, true), + Hdr == true, !, + stream_property(Out, encoding(Encoding)), + ( Encoding == utf8 + -> format(Out, '~n~n', []) + ; Encoding == iso_latin_1 + -> format(Out, '~n~n', []) + ; domain_error(xml_encoding, Encoding) + ). +emit_xml_encoding(_, _). + + +%% emit_doctype(+Options, +Data, +Stream) +% +% Emit the document-type declaration. +% There is a problem with the first clause if we are emitting SGML: +% the SGML DTDs for HTML up to HTML 4 *do not allow* any 'version' +% attribute; so the only time this is useful is when it is illegal! + +emit_doctype(_Options, Data, Out) :- + ( memberchk(element(html,Att,_), Data) + ; Data = element(html,Att,_) + ), + memberchk(version=Version, Att), + !, + format(Out, '~n~n', [Version]). +emit_doctype(Options, Data, Out) :- + ( memberchk(public(PubId), Options) -> true + ; PubId = (-) + ), + ( memberchk(system(SysId), Options) -> true + ; SysId = (-) + ), + \+ (PubId == (-), + SysId == (-), + \+ memberchk(doctype(_), Options) + ), + ( Data = element(DocType,_,_) + ; memberchk(element(DocType,_,_), Data) + ; memberchk(doctype(DocType), Options) + ), + !, + write_doctype(Out, DocType, PubId, SysId). +emit_doctype(_, _, _). + +write_doctype(Out, DocType, -, -) :- !, + format(Out, '~n~n', [DocType]). +write_doctype(Out, DocType, -, SysId) :- !, + format(Out, '~n~n', [DocType,SysId]). +write_doctype(Out, DocType, PubId, -) :- !, + format(Out, '~n~n', [DocType,PubId]). +write_doctype(Out, DocType, PubId, SysId) :- + format(Out, '~n~n', [DocType,PubId,SysId]). + + +%% emit(+Element, +Out, +State, +Options) +% +% Emit a single element + +emit([], _, _) :- !. +emit([H|T], Out, State) :- !, + emit(H, Out, State), + emit(T, Out, State). +emit(Element, Out, State) :- + \+ \+ emit_element(Element, Out, State). + +emit_element(pi(PI), Out, State) :- + get_state(State, entity_map, EntityMap), + write(Out, + write(Out, ?>) + ; write(Out, >) + ). +emit_element(element(Name, Attributes, Content), Out, State) :- + att_length(Attributes, State, Alen), + ( Alen > 60, + get_state(State, layout, true) + -> Sep = nl, + AttIndent = 4 + ; Sep = sp, + AttIndent = 0 + ), + ( get_state(State, dialect, xml) + -> update_nsmap(Attributes, State) + ; true + ), + put_char(Out, '<'), + emit_name(Name, Out, State), + ( AttIndent > 0 + -> \+ \+ ( inc_indent(State, AttIndent), + attributes(Attributes, Sep, Out, State) + ) + ; attributes(Attributes, Sep, Out, State) + ), + content(Content, Out, Name, State). + +attributes([], _, _, _). +attributes([H|T], Sep, Out, State) :- + ( Sep == nl + -> write_indent(State, Out) + ; put_char(Out, ' ') + ), + attribute(H, Out, State), + attributes(T, Sep, Out, State). + +attribute(Name=Value, Out, State) :- + emit_name(Name, Out, State), + put_char(Out, =), + sgml_write_attribute(Out, Value, State). + +att_length(Atts, State, Len) :- + att_length(Atts, State, 0, Len). + +att_length([], _, Len, Len). +att_length([A0|T], State, Len0, Len) :- + alen(A0, State, AL), + Len1 is Len0 + 1 + AL, + att_length(T, State, Len1, Len). + +alen(URI:Name=Value, State, Len) :- !, + atom_length(Value, AL), + vlen(Name, NL), + get_state(State, nsmap, Nsmap), + ( memberchk(NS=URI, Nsmap) + -> atom_length(NS, NsL) + ; atom_length(URI, NsL) + ), + Len is AL+NL+NsL+3. +alen(Name=Value, _, Len) :- + atom_length(Name, NL), + vlen(Value, AL), + Len is AL+NL+3. + +vlen(Value, Len) :- + is_list(Value), !, + vlen_list(Value, 0, Len). +vlen(Value, Len) :- + atom_length(Value, Len). + +vlen_list([], L, L). +vlen_list([H|T], L0, L) :- + atom_length(H, HL), + ( L0 == 0 + -> L1 is L0 + HL + ; L1 is L0 + HL + 1 + ), + vlen_list(T, L1, L). + + +emit_name(Name, Out, _) :- + atom(Name), !, + write(Out, Name). +emit_name(URI:Name, Out, State) :- + get_state(State, nsmap, NSMap), + memberchk(NS=URI, NSMap), !, + ( NS == [] + -> write(Out, Name) + ; format(Out, '~w:~w', [NS, Name]) + ). +emit_name(Term, Out, _) :- + write(Out, Term). + +%% update_nsmap(+Attributes, !State) +% +% Modify the nsmap of State to reflect modifications due to xmlns +% arguments. + +update_nsmap(Attributes, State) :- + get_state(State, nsmap, Map0), + update_nsmap(Attributes, Map0, Map), + set_state(State, nsmap, Map). + +update_nsmap([], Map, Map). +update_nsmap([xmlns:NS=URI|T], Map0, Map) :- !, + set_nsmap(NS, URI, Map0, Map1), + update_nsmap(T, Map1, Map). +update_nsmap([xmlns=URI|T], Map0, Map) :- !, + set_nsmap([], URI, Map0, Map1), + update_nsmap(T, Map1, Map). +update_nsmap([_|T], Map0, Map) :- !, + update_nsmap(T, Map0, Map). + +set_nsmap(NS, URI, Map0, Map) :- + select(NS=_, Map0, Map1), !, + Map = [NS=URI|Map1]. +set_nsmap(NS, URI, Map, [NS=URI|Map]). + + +%% content(+Content, +Out, +Element, +State, +Options) +% +% Emit the content part of a structure as well as the termination +% for the content. For empty content we have three versions: XML +% style '/>', SGML declared EMPTY element (nothing) or normal SGML +% element (we must close with the same element name). + +content([], Out, Element, State) :- !, % empty element + ( get_state(State, net, true) + -> ( get_state(State, dialect, xml) -> + write(Out, />) + ; empty_element(State, Element) -> + write(Out, >) + ; write(Out, //) + ) + ;/* get_state(State, net, false) */ + write(Out, >), + ( get_state(State, dialect, sgml), + empty_element(State, Element) + -> true + ; emit_close(Element, Out, State) + ) + ). +content([Atom], Out, Element, State) :- + atom(Atom), !, + ( get_state(State, dialect, sgml), + get_state(State, net, true), + \+ sub_atom(Atom, _, _, _, /), + atom_length(Atom, Len), + Len < 20 + -> write(Out, /), + sgml_write_content(Out, Atom, State), + write(Out, /) + ;/* XML or not NET */ + write(Out, >), + sgml_write_content(Out, Atom, State), + emit_close(Element, Out, State) + ). +content(Content, Out, Element, State) :- + get_state(State, layout, true), + /* If xml:space='preserve' is present, */ + /* we MUST NOT tamper with white space at all. */ + \+ (Element = element(_,Atts,_), + memberchk('xml:space'=preserve, Atts) + ), + element_content(Content, Elements), + !, + format(Out, >, []), + \+ \+ ( + inc_indent(State), + write_element_content(Elements, Out, State) + ), + write_indent(State, Out), + emit_close(Element, Out, State). +content(Content, Out, Element, State) :- + format(Out, >, []), + write_mixed_content(Content, Out, Element, State), + emit_close(Element, Out, State). + + +emit_close(Element, Out, State) :- + write(Out, ''). + + +write_mixed_content([], _, _, _). +write_mixed_content([H|T], Out, Element, State) :- + write_mixed_content_element(H, Out, State), + write_mixed_content(T, Out, Element, State). + +write_mixed_content_element(H, Out, State) :- + ( atom(H) + -> sgml_write_content(Out, H, State) + ; functor(H, element, 3) + -> emit(H, Out, State) + ; functor(H, pi, 1) + -> emit(H, Out, State) + ; H = sdata(Data) % cannot be written without entity! + -> print_message(warning, sgml_write(sdata_as_cdata(Data))), + sgml_write_content(Out, Data, State) + ; assertion(fail) + ). + + +element_content([], []). +element_content([element(Name,Atts,C)|T0], [element(Name,Atts,C)|T]) :- !, + element_content(T0, T). +element_content([Blank|T0], T) :- + atom(Blank), + atom_codes(Blank, Codes), + all_blanks(Codes), + element_content(T0, T). + +all_blanks([]). +all_blanks([H|T]) :- + code_type(H, space), + all_blanks(T). + +write_element_content([], _, _). +write_element_content([H|T], Out, State) :- + write_indent(State, Out), + emit(H, Out, State), + write_element_content(T, Out, State). + + + /******************************* + * NAMESPACES * + *******************************/ + +%% add_missing_namespaces(+DOM0, +NsMap, -DOM) +% +% Add xmlns:NS=URI definitions to the toplevel element(s) to +% deal with missing namespaces. + +add_missing_namespaces([], _, []) :- !. +add_missing_namespaces([H0|T0], Def, [H|T]) :- !, + add_missing_namespaces(H0, Def, H), + add_missing_namespaces(T0, Def, T). +add_missing_namespaces(Elem0, Def, Elem) :- + Elem0 = element(Name, Atts0, Content), !, + missing_namespaces(Elem0, Def, Missing), + ( Missing == [] + -> Elem = Elem0 + ; add_missing_ns(Missing, Atts0, Atts), + Elem = element(Name, Atts, Content) + ). +add_missing_namespaces(DOM, _, DOM). % CDATA, etc. + +add_missing_ns([], Atts, Atts). +add_missing_ns([H|T], Atts0, Atts) :- + generate_ns(H, NS), + add_missing_ns(T, [xmlns:NS=H|Atts0], Atts). + +%% generate_ns(+URI, -NS) is det. +% +% Generate a namespace (NS) identifier for URI. + +generate_ns(URI, NS) :- + default_ns(URI, NS), !. +generate_ns(_, NS) :- + gensym(xns, NS). + +:- multifile + rdf_db:ns/2. + +default_ns('http://www.w3.org/2001/XMLSchema-instance', xsi). +default_ns(URI, NS) :- + rdf_db:ns(NS, URI). + +%% missing_namespaces(+DOM, +NSMap, -Missing) +% +% Return a list of URIs appearing in DOM that are not covered +% by xmlns definitions. + +missing_namespaces(DOM, Defined, Missing) :- + missing_namespaces(DOM, Defined, [], Missing). + +missing_namespaces([], _, L, L) :- !. +missing_namespaces([H|T], Def, L0, L) :- !, + missing_namespaces(H, Def, L0, L1), + missing_namespaces(T, Def, L1, L). +missing_namespaces(element(Name, Atts, Content), Def, L0, L) :- !, + update_nsmap(Atts, Def, Def1), + missing_ns(Name, Def1, L0, L1), + missing_att_ns(Atts, Def1, L1, L2), + missing_namespaces(Content, Def1, L2, L). +missing_namespaces(_, _, L, L). + +missing_att_ns([], _, M, M). +missing_att_ns([Name=_|T], Def, M0, M) :- + missing_ns(Name, Def, M0, M1), + missing_att_ns(T, Def, M1, M). + +missing_ns(URI:_, Def, M0, M) :- !, + ( ( memberchk(_=URI, Def) + ; memberchk(URI, M0) + ; URI = xml % predefined ones + ; URI = xmlns + ) + -> M = M0 + ; M = [URI|M0] + ). +missing_ns(_, _, M, M). + + /******************************* + * QUOTED WRITE * + *******************************/ + +sgml_write_attribute(Out, Values, State) :- + is_list(Values), !, + get_state(State, entity_map, EntityMap), + put_char(Out, '"'), + write_quoted_list(Values, Out, """<&>", EntityMap), + put_char(Out, '"'). +sgml_write_attribute(Out, Value, State) :- + get_state(State, entity_map, EntityMap), + put_char(Out, '"'), + write_quoted(Out, Value, """<&>", EntityMap), + put_char(Out, '"'). + +write_quoted_list([], _, _, _). +write_quoted_list([H|T], Out, Escape, EntityMap) :- + write_quoted(Out, H, Escape, EntityMap), + ( T == [] + -> true + ; put_char(Out, ' '), + write_quoted_list(T, Out, Escape, EntityMap) + ). + + +sgml_write_content(Out, Value, State) :- + get_state(State, entity_map, EntityMap), + write_quoted(Out, Value, "<&>", EntityMap). + + +write_quoted(Out, Atom, Escape, EntityMap) :- + atom_codes(Atom, Codes), + writeq(Codes, Out, Escape, EntityMap). + + +writeq([], _, _, _). +writeq([H|T], Out, Escape, EntityMap) :- + ( memberchk(H, Escape) + -> write_entity(H, Out, EntityMap) + ; H >= 256 + -> ( stream_property(Out, encoding(utf8)) + -> put_code(Out, H) + ; write_entity(H, Out, EntityMap) + ) + ; put_code(Out, H) + ), + writeq(T, Out, Escape, EntityMap). + + +write_entity(Code, Out, EntityMap) :- + ( get_assoc(Code, EntityMap, EntityName) + -> format(Out, '&~w;', [EntityName]) + ; format(Out, '&#~w;', [Code]) + ). + + + /******************************* + * INDENTATION * + *******************************/ + +write_initial_indent(State, Out) :- + ( get_state(State, indent, Indent), + Indent > 0 + -> emit_indent(Indent, Out) + ; true + ). + +write_indent(State, _) :- + get_state(State, layout, false), !. +write_indent(State, Out) :- + get_state(State, indent, Indent), + emit_indent(Indent, Out). + +emit_indent(Indent, Out) :- + Tabs is Indent // 8, + Spaces is Indent mod 8, + format(Out, '~N', []), + write_n(Tabs, '\t', Out), + write_n(Spaces, ' ', Out). + +write_n(N, Char, Out) :- + ( N > 0 + -> put_char(Out, Char), + N2 is N - 1, + write_n(N2, Char, Out) + ; true + ). + +inc_indent(State) :- + inc_indent(State, 2). + +inc_indent(State, Inc) :- + state(indent, Arg), + arg(Arg, State, I0), + I is I0 + Inc, + setarg(Arg, State, I). + + + /******************************* + * DTD HANDLING * + *******************************/ + +%% empty_element(+State, +Element) +% +% True if Element is declared with EMPTY content in the (SGML) +% DTD. + +empty_element(State, Element) :- + get_state(State, dtd, DTD), + DTD \== (-), + dtd_property(DTD, element(Element, _, empty)). + +%% dtd_character_entities(+DTD, -Map) +% +% Return an assoc mapping character entities to their name. Note +% that the entity representation is a bit dubious. Entities should +% allow for a wide-character version and avoid the &#..; trick. + +dtd_character_entities(DTD, Map) :- + empty_assoc(Empty), + dtd_property(DTD, entities(Entities)), + fill_entity_map(Entities, DTD, Empty, Map). + +fill_entity_map([], _, Map, Map). +fill_entity_map([H|T], DTD, Map0, Map) :- + ( dtd_property(DTD, entity(H, CharEntity)), + atom(CharEntity), + ( sub_atom(CharEntity, 0, _, _, '&#'), + sub_atom(CharEntity, _, _, 0, ';') + -> sub_atom(CharEntity, 2, _, 1, Name), + atom_number(Name, Code) + ; atom_length(CharEntity, 1), + char_code(CharEntity, Code) + ) + -> put_assoc(Code, Map0, H, Map1), + fill_entity_map(T, DTD, Map1, Map) + ; fill_entity_map(T, DTD, Map0, Map) + ). + + + + /******************************* + * FIELDS * + *******************************/ + +state(indent, 1). % current indentation +state(layout, 2). % use layout (true/false) +state(dtd, 3). % DTD for entity names +state(entity_map, 4). % compiled entity-map +state(dialect, 5). % xml/sgml +state(nsmap, 6). % defined namespaces +state(net, 7). % Should null end-tags be used? + +new_state(Dialect, + state( + 0, % indent + true, % layout + -, % DTD + EntityMap, % entity_map + Dialect, % dialect + [], % NS=Full map + Net % Null End-Tags? + )) :- + ( Dialect == sgml + -> Net = false, + empty_assoc(EntityMap) + ; Net = true, + xml_entities(EntityMap) + ). + +get_state(State, Field, Value) :- + state(Field, Arg), + arg(Arg, State, Value). + +set_state(State, Field, Value) :- + state(Field, Arg), + setarg(Arg, State, Value). + +xml_entities(Map) :- + list_to_assoc([ 60 - lt, + 61 - amp, + 62 - gt, + 39 - apos, + 34 - quot + ], Map). + + + /******************************* + * MESSAGES * + *******************************/ + +:- multifile + prolog:message/3. + +prolog:message(sgml_write(sdata_as_cdata(Data))) --> + [ 'SGML-write: emitting SDATA as CDATA: "~p"'-[Data] ]. diff --git a/packages/sgml/sgmldefs.h b/packages/sgml/sgmldefs.h new file mode 100644 index 000000000..9b0879b25 --- /dev/null +++ b/packages/sgml/sgmldefs.h @@ -0,0 +1,85 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef SGMLDEFS_H_INCLUDED +#define SGMLDEFS_H_INCLUDED + +#ifdef HAVE_CONFIG_H +#include +#else +#ifdef __WINDOWS__ +#define HAVE_MALLOC_H 1 +#define HAVE_IO_H 1 +#endif +#endif + +#ifdef HAVE_DMALLOC_H +#include +#endif + +#define UTF8 1 /* Include UTF-8 decoding */ +#define XMLNS 1 /* support XML namespaces */ + +#include + + /******************************* + * INPUT/OUTPUT CHARACTERS * + *******************************/ + +typedef wchar_t ichar; /* input character */ + +#define SHORTMAP_SIZE 256 /* shortmaps in 0..255 */ + +#define USE_STRING_FUNCTIONS 1 /* use built-in str* functions */ + + + /******************************* + * LIMITS * + *******************************/ + +#define INPUT_CHARSET_SIZE 256 /* for now */ +#define SYMBOLHASHSIZE 256 +#define MAXSTRINGLEN 2048 +#define MAXNMLEN 256 +#define MAXDECL 10240 +#define MAXATTELEM 256 /* #elements in one ATTLIST */ +#define MAXNAMEGROUP 256 /* #names in a (group) */ +#define MAXATTRIBUTES 256 /* attributes per element */ +#define MAXMAPLEN 32 /* max sequence length for SHORTREF */ +#define SHORTENTITYFILE 100 /* short external entities in mem */ + + + /******************************* + * CONSTANTS * + *******************************/ + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +#define LF 10 +#define CR 13 + +#endif /*SGMLDEFS_H_INCLUDED*/ diff --git a/packages/sgml/utf8.c b/packages/sgml/utf8.c new file mode 100644 index 000000000..a5189c3bd --- /dev/null +++ b/packages/sgml/utf8.c @@ -0,0 +1,117 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include "utf8.h" + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +UTF-8 Decoding, based on http://www.cl.cam.ac.uk/~mgk25/unicode.html +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#define CONT(i) ((in[i]&0xc0) == 0x80) +#define VAL(i, s) ((in[i]&0x3f) << s) + +char * +sgml__utf8_get_char(const char *in, int *chr) +{ /* 2-byte, 0x80-0x7ff */ + if ( (in[0]&0xe0) == 0xc0 && CONT(1) ) + { *chr = ((in[0]&0x1f) << 6)|VAL(1,0); + return (char *)in+2; + } + /* 3-byte, 0x800-0xffff */ + if ( (in[0]&0xf0) == 0xe0 && CONT(1) && CONT(2) ) + { *chr = ((in[0]&0xf) << 12)|VAL(1,6)|VAL(2,0); + return (char *)in+3; + } + /* 4-byte, 0x10000-0x1FFFFF */ + if ( (in[0]&0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3) ) + { *chr = ((in[0]&0x7) << 18)|VAL(1,12)|VAL(2,6)|VAL(3,0); + return (char *)in+4; + } + /* 5-byte, 0x200000-0x3FFFFFF */ + if ( (in[0]&0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4) ) + { *chr = ((in[0]&0x3) << 24)|VAL(1,18)|VAL(2,12)|VAL(3,6)|VAL(4,0); + return (char *)in+5; + } + /* 6-byte, 0x400000-0x7FFFFFF */ + if ( (in[0]&0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) && CONT(5) ) + { *chr = ((in[0]&0x1) << 30)|VAL(1,24)|VAL(2,18)|VAL(3,12)|VAL(4,6)|VAL(5,0); + return (char *)in+4; + } + + *chr = *in; + + return (char *)in+1; +} + + +char * +sgml_utf8_put_char(char *out, int chr) +{ if ( chr < 0x80 ) + { *out++ = chr; + } else if ( chr < 0x800 ) + { *out++ = 0xc0|((chr>>6)&0x1f); + *out++ = 0x80|(chr&0x3f); + } else if ( chr < 0x10000 ) + { *out++ = 0xe0|((chr>>12)&0x0f); + *out++ = 0x80|((chr>>6)&0x3f); + *out++ = 0x80|(chr&0x3f); + } else if ( chr < 0x200000 ) + { *out++ = 0xf0|((chr>>18)&0x07); + *out++ = 0x80|((chr>>12)&0x3f); + *out++ = 0x80|((chr>>6)&0x3f); + *out++ = 0x80|(chr&0x3f); + } else if ( chr < 0x4000000 ) + { *out++ = 0xf8|((chr>>24)&0x03); + *out++ = 0x80|((chr>>18)&0x3f); + *out++ = 0x80|((chr>>12)&0x3f); + *out++ = 0x80|((chr>>6)&0x3f); + *out++ = 0x80|(chr&0x3f); + } else if ( (unsigned)chr < 0x80000000 ) + { *out++ = 0xfc|((chr>>30)&0x01); + *out++ = 0x80|((chr>>24)&0x3f); + *out++ = 0x80|((chr>>18)&0x3f); + *out++ = 0x80|((chr>>12)&0x3f); + *out++ = 0x80|((chr>>6)&0x3f); + *out++ = 0x80|(chr&0x3f); + } + + return out; +} + + +size_t +sgml_utf8_strlen(const char *s, size_t len) +{ const char *e = &s[len]; + unsigned int l = 0; + + while(s= 0xc0 && (unsigned)(c) <= 0xfd) + +#define utf8_get_char(in, chr) \ + (*(in) & 0x80 ? sgml__utf8_get_char(in, chr) \ + : (*(chr) = *(in), (char *)(in)+1)) + +extern char *sgml__utf8_get_char(const char *in, int *chr); +#define utf8_get_uchar(in, chr) (unsigned char*)utf8_get_char((char*)(in), chr) + +extern char *sgml_utf8_put_char(char *out, int chr); +#define utf8_put_char(out, chr) \ + ((chr) < 0x80 ? out[0]=(char)(chr), out+1 \ + : sgml_utf8_put_char(out, (chr))) + +extern size_t sgml_utf8_strlen(const char *s, size_t len); +#define utf8_strlen sgml_utf8_strlen + +#endif /*UTF8_H_INCLUDED*/ diff --git a/packages/sgml/util.c b/packages/sgml/util.c new file mode 100644 index 000000000..001c2349c --- /dev/null +++ b/packages/sgml/util.c @@ -0,0 +1,747 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ + +#define UTIL_H_IMPLEMENTATION +#include "util.h" +#include +#include +#include +#include +#ifdef HAVE_MALLOC_H +#include +#endif +#include +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_IO_H +#include +#endif +#include +#include +#include +#include "utf8.h" + +size_t +istrlen(const ichar *s) +{ size_t len =0; + + while(*s++) + len++; + + return len; +} + + +ichar * +istrdup(const ichar *s) +{ if ( s ) + { ichar *dup = sgml_malloc((istrlen(s)+1)*sizeof(ichar)); + ichar *d = dup; + + while(*s) + *d++ = *s++; + *d = 0; + + return dup; + } else + { return NULL; + } +} + + +ichar * +istrndup(const ichar *s, int len) +{ ichar *dup = sgml_malloc((len+1)*sizeof(ichar)); + ichar *d = dup; + + while(--len >= 0) + *d++ = *s++; + *d = 0; + + return dup; +} + + +ichar * +istrcpy(ichar *d, const ichar *s) +{ ichar *r = d; + + while(*s) + *d++ = *s++; + *d = 0; + + return r; +} + + +ichar * +istrcat(ichar *d, const ichar *s) +{ ichar *r = d; + + d += istrlen(d); + istrcpy(d, s); + + return r; +} + + +ichar * +istrncpy(ichar *d, const ichar *s, size_t len) +{ ichar *r = d; + + while(*s && len-- > 0) + *d++ = *s++; + + return r; +} + + + +int +istrcaseeq(const ichar *s1, const ichar *s2) +{ ichar c; + + while ((c = *s1++) != '\0') + { if (towlower(*s2++) != towlower(c)) + return FALSE; + } + + return *s2 == '\0'; +} + + +int +istreq(const ichar *s1, const ichar *s2) +{ while(*s1 && *s1 == *s2) + s1++, s2++; + + if ( *s1 == 0 && *s2 == 0 ) + return TRUE; + + return FALSE; +} + + +int +istrncaseeq(const ichar *s1, const ichar *s2, int len) +{ while(--len >= 0 && towlower(*s1) == towlower(*s2)) + s1++, s2++; + + if ( len < 0 ) + return TRUE; + + return FALSE; +} + + +int +istrprefix(const ichar *pref, const ichar *s) +{ while(*pref && *pref == *s) + pref++, s++; + + if ( *pref == 0 ) + return TRUE; + + return FALSE; +} + + +ichar * +istrchr(const ichar *s, int c) +{ for( ; *s; s++ ) + { if ( c == *s ) + return (ichar *)s; + } + + return NULL; +} + + +ichar * +istrupper(ichar *s) +{ ichar *r = s; + + for( ; *s; s++) + *s = toupper(*s); + + return r; +} + + +ichar * +istrlower(ichar *s) +{ ichar *r = s; + + for( ; *s; s++) + *s = towlower(*s); + + return r; +} + + +int +istrhash(const ichar *t, int tsize) +{ unsigned int value = 0; + unsigned int shift = 5; + + while(*t) + { unsigned int c = *t++; + + c -= 'a'; + value ^= c << (shift & 0xf); + shift ^= c; + } + + value = value ^ (value >> 16); + + return value % tsize; +} + + +int +istrcasehash(const ichar *t, int tsize) +{ unsigned int value = 0; + unsigned int shift = 5; + + while(*t) + { unsigned int c = towlower(*t++); /* case insensitive */ + + c -= 'a'; + value ^= c << (shift & 0xf); + shift ^= c; + } + + value = value ^ (value >> 16); + + return value % tsize; +} + + +int +istrtol(const ichar *s, long *val) +{ long v; + ichar *e; + + if ( *s ) + { v = wcstol(s, &e, 10); + if ( !e[0] && errno != ERANGE ) + { *val = v; + return TRUE; + } + } + + return FALSE; +} + + + + /******************************* + * INPUT CHARACTER BUFFER * + *******************************/ + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Input character buffer is used to collect data between SGML markup, such +as <...> +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +icharbuf * +new_icharbuf() +{ icharbuf *buf = sgml_malloc(sizeof(*buf)); + + buf->allocated = 0; + buf->size = 0; + buf->data = NULL; + + return buf; +} + + +void +free_icharbuf(icharbuf *buf) +{ if ( buf->data ) + sgml_free(buf->data); + + sgml_free(buf); +} + + +void +__add_icharbuf(icharbuf *buf, int chr) +{ if ( buf->size == buf->allocated ) + { buf->allocated = (buf->allocated ? buf->allocated*2 : 128); + + if ( buf->data ) + buf->data = sgml_realloc(buf->data, buf->allocated*sizeof(ichar)); + else + buf->data = sgml_malloc(buf->allocated*sizeof(ichar)); + } + + buf->data[buf->size++] = chr; +} + + +void +del_icharbuf(icharbuf *buf) +{ if ( buf->size > 0 ) + buf->size--; +} + + +void +terminate_icharbuf(icharbuf *buf) +{ add_icharbuf(buf, '\0'); + buf->size--; +} + + +void +empty_icharbuf(icharbuf *buf) +{ buf->size = 0; +} + + + /******************************* + * OUTPUT CHARACTER BUFFER * + *******************************/ + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Output character buffer deals with two representations: ISO Latin-1 and +UCS. It starts life as ISO Latin-1 and is upgraded to UCS as the first +character that doesn't fit ISO Latin-1 is added to the buffer. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +ocharbuf * +init_ocharbuf(ocharbuf *buf) +{ buf->size = 0; + buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t); + buf->data.w = buf->localbuf; + + return buf; +} + + +ocharbuf * +new_ocharbuf() +{ ocharbuf *buf = sgml_malloc(sizeof(*buf)); + + return init_ocharbuf(buf); +} + + +void +free_ocharbuf(ocharbuf *buf) +{ if ( buf->data.w && buf->data.w != buf->localbuf ) + sgml_free(buf->data.w); + + sgml_free(buf); +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Make sure the data of the buffer is malloc'ed and nul-terminated. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +ocharbuf * +malloc_ocharbuf(ocharbuf *buf) +{ if ( buf->data.w == buf->localbuf ) + { int bytes = (buf->size+1) * sizeof(wchar_t); + + buf->data.w = sgml_malloc(bytes); + memcpy(buf->data.w, buf->localbuf, bytes); + buf->data.w[buf->size] = 0; + } else + terminate_ocharbuf(buf); + + return buf; +} + + +void +add_ocharbuf(ocharbuf *buf, int chr) +{ if ( buf->size == buf->allocated ) + { buf->allocated *= 2; + + if ( buf->data.w != (wchar_t*)buf->localbuf ) + { buf->data.w = sgml_realloc(buf->data.w, buf->allocated*sizeof(wchar_t)); + } else + { buf->data.w = sgml_malloc(buf->allocated*sizeof(wchar_t)); + memcpy(buf->data.w, buf->localbuf, sizeof(buf->localbuf)); + } + } + buf->data.w[buf->size++] = chr; +} + + +void +del_ocharbuf(ocharbuf *buf) +{ if ( buf->size > 0 ) + buf->size--; +} + + +void +terminate_ocharbuf(ocharbuf *buf) +{ add_ocharbuf(buf, '\0'); + buf->size--; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +empty_ocharbuf() frees the associated buffer after a big lump has been +in it. Otherwise it simply sets the size to 0. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +void +empty_ocharbuf(ocharbuf *buf) +{ buf->size = 0; + + if ( buf->allocated > 8192 ) + { assert(buf->data.w != buf->localbuf); + sgml_free(buf->data.w); + + buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t); + buf->data.w = buf->localbuf; + } +} + + + /******************************* + * BUFFER RING * + *******************************/ + +#define RINGSIZE 16 +static void *ring[RINGSIZE]; +static int ringp; + +wchar_t * +str2ring(const wchar_t *in) +{ wchar_t *copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t)); + + if ( !copy ) + { sgml_nomem(); + return NULL; + } + + wcscpy(copy, in); + if ( ring[ringp] ) + sgml_free(ring[ringp]); + ring[ringp++] = copy; + if ( ringp == RINGSIZE ) + ringp = 0; + + return copy; +} + + +void * +ringallo(size_t size) +{ char *result = sgml_malloc(size); + + if ( ring[ringp] ) + sgml_free(ring[ringp]); + ring[ringp++] = result; + if ( ringp == RINGSIZE ) + ringp = 0; + + return result; +} + + + /******************************* + * MISC * + *******************************/ + +wchar_t const * +str_summary(wchar_t const *s, int len) +{ wchar_t *buf; + size_t l = wcslen(s); + + if ( l < (size_t)len ) + return s; + buf = ringallo((len + 10)*sizeof(wchar_t)); + wcsncpy(buf, s, len-5); + wcscpy(&buf[len-5], L" ... "); + wcscpy(&buf[len], &s[l-5]); + + return buf; +} + + +wchar_t * +utf8towcs(const char *in) +{ size_t sl = strlen(in); + size_t len = utf8_strlen(in, sl); + wchar_t *buf = sgml_malloc((len + 1)*sizeof(wchar_t)); + const char *e = in+sl; + int i; + + for(i=0; in < e;) + { int chr; + + in = utf8_get_char(in, &chr); + buf[i++] = chr; + } + + buf[i] = 0; + return buf; +} + + +char * +wcstoutf8(const wchar_t *in) +{ size_t size = 0; + const wchar_t *s; + char *rc, *o; + + for(s=in; *s; s++) + { char buf[6]; + + if ( *s >= 0x80 ) + { char *o2 = utf8_put_char(buf, *s); + size += o2-buf; + } else + { size++; + } + } + + rc = sgml_malloc(size+1); + for(o=rc, s=in; *s; s++) + { o = utf8_put_char(o, *s); + } + *o = '\0'; + + return rc; +} + + + /******************************* + * FILES * + *******************************/ + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Load a file into memory. This would be so easy if we didn't had to deal +with &#RE/&#RS handling that forces us to create the proper record start +and end. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +FILE * +wfopen(const wchar_t *name, const char *mode) +{ size_t mbl = wcstombs(NULL, name, 0); + + if ( mbl > 0 ) + { char *mbs = sgml_malloc(mbl+1); + FILE *f; + + wcstombs(mbs, name, mbl+1); + f = fopen(mbs, mode); + sgml_free(mbs); + + return f; + } + + return NULL; +} + + +static int +wopen(const wchar_t *name, int flags) +{ size_t mbl = wcstombs(NULL, name, 0); + + if ( mbl > 0 ) + { char *mbs = sgml_malloc(mbl+1); + int fd; + + wcstombs(mbs, name, mbl+1); + fd = open(mbs, flags); + sgml_free(mbs); + + return fd; + } + + return -1; +} + + +ichar * +load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length) +{ int fd; + + if ( (fd = wopen(file, O_RDONLY|O_BINARY)) >= 0 ) + { struct stat buf; + + if ( fstat(fd, &buf) == 0 ) + { size_t len = buf.st_size; + char *r = sgml_malloc(len+1); + + if ( r ) + { char *s = r; + + while(len>0) + { int n; + + if ( (n=(int)read(fd, s, (unsigned int)len)) < 0 ) + { close(fd); /* I/O error */ + sgml_free(r); + return NULL; + } else if ( n == 0 ) + break; + len -= n; + s += n; + } + + len = s-r; + *s = '\0'; /* ensure closing EOS */ + close(fd); + + { int nl; + int last_is_lf; + ichar *r2, *t; + + if ( normalise_rsre ) + { last_is_lf = (len > 0 && s[-1] == '\n'); + for(s=r, nl=0; *s; s++) + { if ( *s == '\n' && s>r && s[-1] != '\r' ) + nl++; + } + } else + { nl = 0; + last_is_lf = 0; + } + + r2 = sgml_malloc((len+nl+1)*sizeof(ichar)); + for(s=r, t=r2; *s; s++) + { if ( *s == '\n' ) + { if ( s>r && s[-1] != '\r' ) + *t++ = CR; + *t++ = LF; + } else + *t++ = *s; + } + len = t-r2; + *t = '\0'; + + if ( last_is_lf ) + r2[--len] = '\0'; /* delete last LF */ + + if ( length ) + *length = len; + sgml_free(r); + return r2; + } + } + } + } + + return NULL; +} + + + /******************************* + * ALLOCATION * + *******************************/ + +#ifdef _WINDOWS +#include +#endif + +void +sgml_nomem() +{ fprintf(stderr, "SGML: Fatal: out of memory\n"); + +#ifdef _WINDOWS + MessageBox(NULL, "SGML: Fatal: out of memory", "SGML", MB_OK|MB_TASKMODAL); +#endif + + exit(1); +} + + +void * +sgml_malloc(size_t size) +{ void *mem; + + if ( size == 0 ) + return NULL; + + if ( (mem = malloc(size)) ) + return mem; + + sgml_nomem(); + return NULL; +} + + +void * +sgml_realloc(void *old, size_t size) +{ void *mem; + + if ( old ) + { if ( (mem = realloc(old, size)) ) + return mem; + } else + { if ( (mem = malloc(size)) ) + return mem; + } + + sgml_nomem(); + return NULL; +} + + +void * +sgml_calloc(size_t n, size_t size) +{ void *mem; + + if ( (mem=calloc(n, size)) ) + return mem; + + sgml_nomem(); + return NULL; +} + + +void +sgml_free(void *mem) +{ if ( mem ) + free(mem); +} + + + /******************************* + * DEBUG * + *******************************/ + +void +wputs(ichar *s) +{ fwprintf(stderr, L"%ls", s); +} diff --git a/packages/sgml/util.h b/packages/sgml/util.h new file mode 100644 index 000000000..db01138cb --- /dev/null +++ b/packages/sgml/util.h @@ -0,0 +1,119 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef DTD_UTIL_H_INCLUDED +#define DTD_UTIL_H_INCLUDED +#include "sgmldefs.h" + +#include +#include +#include + +#ifdef _WINDOWS /* get size_t */ +#include +#endif + +typedef struct +{ int allocated; + int size; + ichar *data; +} icharbuf; + +typedef struct +{ int allocated; + int size; + union + { wchar_t *w; /* UCS */ + } data; + wchar_t localbuf[256]; /* Initial local store */ +} ocharbuf; + +size_t istrlen(const ichar *s); +ichar * istrdup(const ichar *s); +ichar * istrndup(const ichar *s, int len); +ichar * istrcpy(ichar *d, const ichar *s); +ichar * istrcat(ichar *d, const ichar *s); +ichar * istrncpy(ichar *d, const ichar *s, size_t len); +ichar * istrupper(ichar *s); +ichar * istrlower(ichar *s); +int istrprefix(const ichar *pref, const ichar *s); +int istreq(const ichar *s1, const ichar *s2); +int istrcaseeq(const ichar *s1, const ichar *s2); +int istrncaseeq(const ichar *s1, const ichar *s2, int len); +int istrhash(const ichar *t, int tsize); +int istrcasehash(const ichar *t, int tsize); +ichar * istrchr(const ichar *s, int c); +int istrtol(const ichar *s, long *val); +void * sgml_malloc(size_t size); +void * sgml_calloc(size_t n, size_t size); +void sgml_free(void *mem); +void * sgml_realloc(void *old, size_t size); +void sgml_nomem(void); + +#define add_icharbuf(buf, chr) \ + do { if ( buf->size < buf->allocated && chr < 128 ) \ + buf->data[buf->size++] = chr; \ + else \ + __add_icharbuf(buf, chr); \ + } while(0) + +icharbuf * new_icharbuf(void); +void free_icharbuf(icharbuf *buf); +void __add_icharbuf(icharbuf *buf, int chr); +void del_icharbuf(icharbuf *buf); +void terminate_icharbuf(icharbuf *buf); +void empty_icharbuf(icharbuf *buf); + +ocharbuf * init_ocharbuf(ocharbuf *buf); +ocharbuf * new_ocharbuf(void); +void free_ocharbuf(ocharbuf *buf); +ocharbuf * malloc_ocharbuf(ocharbuf *buf); +void add_ocharbuf(ocharbuf *buf, int chr); +void del_ocharbuf(ocharbuf *buf); +void terminate_ocharbuf(ocharbuf *buf); +void empty_ocharbuf(ocharbuf *buf); +#define fetch_ocharbuf(buf, at) ((wint_t)buf->data.w[at]) +#define poke_ocharbuf(buf, at, chr) \ + { buf->data.w[at] = chr; \ + } + +const wchar_t * str_summary(const wchar_t *s, int len); +wchar_t * str2ring(const wchar_t *in); +void * ringallo(size_t); +wchar_t * utf8towcs(const char *in); +char * wcstoutf8(const wchar_t *in); +ichar * load_sgml_file_to_charp(const ichar *file, int normalise_rsre, + size_t *len); +FILE * wfopen(const wchar_t *name, const char *mode); + +void wputs(ichar *s); + +#if defined(USE_STRING_FUNCTIONS) && !defined(UTIL_H_IMPLEMENTATION) + +#define istrlen(s1) wcslen((s1)) +#define istreq(s1,s2) (wcscmp((s1),(s2))==0) + +#endif + +#endif /*DTD_UTIL_H_INCLUDED*/ diff --git a/packages/sgml/xml_unicode.c b/packages/sgml/xml_unicode.c new file mode 100644 index 000000000..f6ecb19a3 --- /dev/null +++ b/packages/sgml/xml_unicode.c @@ -0,0 +1,1260 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +#include "xml_unicode.h" + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +XML character classification. + +The core of this file is generated by xml_unicode.pl using the data from +http://www.w3.org/TR/2006/REC-xml-20060816. +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +int +xml_basechar(int c) +{ if ( c <= 0x0c10 ) + { if ( c <= 0x0939 ) + { if ( c <= 0x03e0 ) + { if ( c <= 0x01f5 ) + { if ( c <= 0x0131 ) + { if ( c <= 0x00d6 ) + { if ( c <= 0x005a ) + { return (c >= 0x0041 && c <= 0x005a); + } else + { if ( c <= 0x007a ) + { return (c >= 0x0061 && c <= 0x007a); + } else + { return (c >= 0x00c0 && c <= 0x00d6); + } + } + } else + { if ( c <= 0x00f6 ) + { return (c >= 0x00d8 && c <= 0x00f6); + } else + { if ( c <= 0x00ff ) + { return (c >= 0x00f8 && c <= 0x00ff); + } else + { return (c >= 0x0100 && c <= 0x0131); + } + } + } + } else + { if ( c <= 0x017e ) + { if ( c <= 0x013e ) + { return (c >= 0x0134 && c <= 0x013e); + } else + { if ( c <= 0x0148 ) + { return (c >= 0x0141 && c <= 0x0148); + } else + { return (c >= 0x014a && c <= 0x017e); + } + } + } else + { if ( c <= 0x01c3 ) + { return (c >= 0x0180 && c <= 0x01c3); + } else + { if ( c <= 0x01f0 ) + { return (c >= 0x01cd && c <= 0x01f0); + } else + { return (c >= 0x01f4 && c <= 0x01f5); + } + } + } + } + } else + { if ( c <= 0x038c ) + { if ( c <= 0x02c1 ) + { if ( c <= 0x0217 ) + { return (c >= 0x01fa && c <= 0x0217); + } else + { if ( c <= 0x02a8 ) + { return (c >= 0x0250 && c <= 0x02a8); + } else + { return (c >= 0x02bb && c <= 0x02c1); + } + } + } else + { if ( c <= 0x0386 ) + { return (c == 0x0386);} else + { if ( c <= 0x038a ) + { return (c >= 0x0388 && c <= 0x038a); + } else + { return (c == 0x038c);} + } + } + } else + { if ( c <= 0x03d6 ) + { if ( c <= 0x03a1 ) + { return (c >= 0x038e && c <= 0x03a1); + } else + { if ( c <= 0x03ce ) + { return (c >= 0x03a3 && c <= 0x03ce); + } else + { return (c >= 0x03d0 && c <= 0x03d6); + } + } + } else + { if ( c <= 0x03dc ) + { if ( c <= 0x03da ) + { return (c == 0x03da);} else + { return (c == 0x03dc);} + } else + { if ( c <= 0x03de ) + { return (c == 0x03de);} else + { return (c == 0x03e0);} + } + } + } + } + } else + { if ( c <= 0x0556 ) + { if ( c <= 0x04c4 ) + { if ( c <= 0x044f ) + { if ( c <= 0x03f3 ) + { return (c >= 0x03e2 && c <= 0x03f3); + } else + { if ( c <= 0x040c ) + { return (c >= 0x0401 && c <= 0x040c); + } else + { return (c >= 0x040e && c <= 0x044f); + } + } + } else + { if ( c <= 0x045c ) + { return (c >= 0x0451 && c <= 0x045c); + } else + { if ( c <= 0x0481 ) + { return (c >= 0x045e && c <= 0x0481); + } else + { return (c >= 0x0490 && c <= 0x04c4); + } + } + } + } else + { if ( c <= 0x04eb ) + { if ( c <= 0x04c8 ) + { return (c >= 0x04c7 && c <= 0x04c8); + } else + { if ( c <= 0x04cc ) + { return (c >= 0x04cb && c <= 0x04cc); + } else + { return (c >= 0x04d0 && c <= 0x04eb); + } + } + } else + { if ( c <= 0x04f5 ) + { return (c >= 0x04ee && c <= 0x04f5); + } else + { if ( c <= 0x04f9 ) + { return (c >= 0x04f8 && c <= 0x04f9); + } else + { return (c >= 0x0531 && c <= 0x0556); + } + } + } + } + } else + { if ( c <= 0x064a ) + { if ( c <= 0x05ea ) + { if ( c <= 0x0559 ) + { return (c == 0x0559);} else + { if ( c <= 0x0586 ) + { return (c >= 0x0561 && c <= 0x0586); + } else + { return (c >= 0x05d0 && c <= 0x05ea); + } + } + } else + { if ( c <= 0x05f2 ) + { return (c >= 0x05f0 && c <= 0x05f2); + } else + { if ( c <= 0x063a ) + { return (c >= 0x0621 && c <= 0x063a); + } else + { return (c >= 0x0641 && c <= 0x064a); + } + } + } + } else + { if ( c <= 0x06ce ) + { if ( c <= 0x06b7 ) + { return (c >= 0x0671 && c <= 0x06b7); + } else + { if ( c <= 0x06be ) + { return (c >= 0x06ba && c <= 0x06be); + } else + { return (c >= 0x06c0 && c <= 0x06ce); + } + } + } else + { if ( c <= 0x06d5 ) + { if ( c <= 0x06d3 ) + { return (c >= 0x06d0 && c <= 0x06d3); + } else + { return (c == 0x06d5);} + } else + { if ( c <= 0x06e6 ) + { return (c >= 0x06e5 && c <= 0x06e6); + } else + { return (c >= 0x0905 && c <= 0x0939); + } + } + } + } + } + } + } else + { if ( c <= 0x0aa8 ) + { if ( c <= 0x0a0a ) + { if ( c <= 0x09b0 ) + { if ( c <= 0x098c ) + { if ( c <= 0x093d ) + { return (c == 0x093d);} else + { if ( c <= 0x0961 ) + { return (c >= 0x0958 && c <= 0x0961); + } else + { return (c >= 0x0985 && c <= 0x098c); + } + } + } else + { if ( c <= 0x0990 ) + { return (c >= 0x098f && c <= 0x0990); + } else + { if ( c <= 0x09a8 ) + { return (c >= 0x0993 && c <= 0x09a8); + } else + { return (c >= 0x09aa && c <= 0x09b0); + } + } + } + } else + { if ( c <= 0x09dd ) + { if ( c <= 0x09b2 ) + { return (c == 0x09b2);} else + { if ( c <= 0x09b9 ) + { return (c >= 0x09b6 && c <= 0x09b9); + } else + { return (c >= 0x09dc && c <= 0x09dd); + } + } + } else + { if ( c <= 0x09e1 ) + { return (c >= 0x09df && c <= 0x09e1); + } else + { if ( c <= 0x09f1 ) + { return (c >= 0x09f0 && c <= 0x09f1); + } else + { return (c >= 0x0a05 && c <= 0x0a0a); + } + } + } + } + } else + { if ( c <= 0x0a39 ) + { if ( c <= 0x0a30 ) + { if ( c <= 0x0a10 ) + { return (c >= 0x0a0f && c <= 0x0a10); + } else + { if ( c <= 0x0a28 ) + { return (c >= 0x0a13 && c <= 0x0a28); + } else + { return (c >= 0x0a2a && c <= 0x0a30); + } + } + } else + { if ( c <= 0x0a33 ) + { return (c >= 0x0a32 && c <= 0x0a33); + } else + { if ( c <= 0x0a36 ) + { return (c >= 0x0a35 && c <= 0x0a36); + } else + { return (c >= 0x0a38 && c <= 0x0a39); + } + } + } + } else + { if ( c <= 0x0a74 ) + { if ( c <= 0x0a5c ) + { return (c >= 0x0a59 && c <= 0x0a5c); + } else + { if ( c <= 0x0a5e ) + { return (c == 0x0a5e);} else + { return (c >= 0x0a72 && c <= 0x0a74); + } + } + } else + { if ( c <= 0x0a8d ) + { if ( c <= 0x0a8b ) + { return (c >= 0x0a85 && c <= 0x0a8b); + } else + { return (c == 0x0a8d);} + } else + { if ( c <= 0x0a91 ) + { return (c >= 0x0a8f && c <= 0x0a91); + } else + { return (c >= 0x0a93 && c <= 0x0aa8); + } + } + } + } + } + } else + { if ( c <= 0x0b5d ) + { if ( c <= 0x0b0c ) + { if ( c <= 0x0ab9 ) + { if ( c <= 0x0ab0 ) + { return (c >= 0x0aaa && c <= 0x0ab0); + } else + { if ( c <= 0x0ab3 ) + { return (c >= 0x0ab2 && c <= 0x0ab3); + } else + { return (c >= 0x0ab5 && c <= 0x0ab9); + } + } + } else + { if ( c <= 0x0abd ) + { return (c == 0x0abd);} else + { if ( c <= 0x0ae0 ) + { return (c == 0x0ae0);} else + { return (c >= 0x0b05 && c <= 0x0b0c); + } + } + } + } else + { if ( c <= 0x0b30 ) + { if ( c <= 0x0b10 ) + { return (c >= 0x0b0f && c <= 0x0b10); + } else + { if ( c <= 0x0b28 ) + { return (c >= 0x0b13 && c <= 0x0b28); + } else + { return (c >= 0x0b2a && c <= 0x0b30); + } + } + } else + { if ( c <= 0x0b39 ) + { if ( c <= 0x0b33 ) + { return (c >= 0x0b32 && c <= 0x0b33); + } else + { return (c >= 0x0b36 && c <= 0x0b39); + } + } else + { if ( c <= 0x0b3d ) + { return (c == 0x0b3d);} else + { return (c >= 0x0b5c && c <= 0x0b5d); + } + } + } + } + } else + { if ( c <= 0x0b9c ) + { if ( c <= 0x0b90 ) + { if ( c <= 0x0b61 ) + { return (c >= 0x0b5f && c <= 0x0b61); + } else + { if ( c <= 0x0b8a ) + { return (c >= 0x0b85 && c <= 0x0b8a); + } else + { return (c >= 0x0b8e && c <= 0x0b90); + } + } + } else + { if ( c <= 0x0b95 ) + { return (c >= 0x0b92 && c <= 0x0b95); + } else + { if ( c <= 0x0b9a ) + { return (c >= 0x0b99 && c <= 0x0b9a); + } else + { return (c == 0x0b9c);} + } + } + } else + { if ( c <= 0x0baa ) + { if ( c <= 0x0b9f ) + { return (c >= 0x0b9e && c <= 0x0b9f); + } else + { if ( c <= 0x0ba4 ) + { return (c >= 0x0ba3 && c <= 0x0ba4); + } else + { return (c >= 0x0ba8 && c <= 0x0baa); + } + } + } else + { if ( c <= 0x0bb9 ) + { if ( c <= 0x0bb5 ) + { return (c >= 0x0bae && c <= 0x0bb5); + } else + { return (c >= 0x0bb7 && c <= 0x0bb9); + } + } else + { if ( c <= 0x0c0c ) + { return (c >= 0x0c05 && c <= 0x0c0c); + } else + { return (c >= 0x0c0e && c <= 0x0c10); + } + } + } + } + } + } + } + } else + { if ( c <= 0x114c ) + { if ( c <= 0x0e8d ) + { if ( c <= 0x0d0c ) + { if ( c <= 0x0c90 ) + { if ( c <= 0x0c39 ) + { if ( c <= 0x0c28 ) + { return (c >= 0x0c12 && c <= 0x0c28); + } else + { if ( c <= 0x0c33 ) + { return (c >= 0x0c2a && c <= 0x0c33); + } else + { return (c >= 0x0c35 && c <= 0x0c39); + } + } + } else + { if ( c <= 0x0c61 ) + { return (c >= 0x0c60 && c <= 0x0c61); + } else + { if ( c <= 0x0c8c ) + { return (c >= 0x0c85 && c <= 0x0c8c); + } else + { return (c >= 0x0c8e && c <= 0x0c90); + } + } + } + } else + { if ( c <= 0x0cb9 ) + { if ( c <= 0x0ca8 ) + { return (c >= 0x0c92 && c <= 0x0ca8); + } else + { if ( c <= 0x0cb3 ) + { return (c >= 0x0caa && c <= 0x0cb3); + } else + { return (c >= 0x0cb5 && c <= 0x0cb9); + } + } + } else + { if ( c <= 0x0cde ) + { return (c == 0x0cde);} else + { if ( c <= 0x0ce1 ) + { return (c >= 0x0ce0 && c <= 0x0ce1); + } else + { return (c >= 0x0d05 && c <= 0x0d0c); + } + } + } + } + } else + { if ( c <= 0x0e30 ) + { if ( c <= 0x0d39 ) + { if ( c <= 0x0d10 ) + { return (c >= 0x0d0e && c <= 0x0d10); + } else + { if ( c <= 0x0d28 ) + { return (c >= 0x0d12 && c <= 0x0d28); + } else + { return (c >= 0x0d2a && c <= 0x0d39); + } + } + } else + { if ( c <= 0x0d61 ) + { return (c >= 0x0d60 && c <= 0x0d61); + } else + { if ( c <= 0x0e2e ) + { return (c >= 0x0e01 && c <= 0x0e2e); + } else + { return (c == 0x0e30);} + } + } + } else + { if ( c <= 0x0e82 ) + { if ( c <= 0x0e33 ) + { return (c >= 0x0e32 && c <= 0x0e33); + } else + { if ( c <= 0x0e45 ) + { return (c >= 0x0e40 && c <= 0x0e45); + } else + { return (c >= 0x0e81 && c <= 0x0e82); + } + } + } else + { if ( c <= 0x0e88 ) + { if ( c <= 0x0e84 ) + { return (c == 0x0e84);} else + { return (c >= 0x0e87 && c <= 0x0e88); + } + } else + { if ( c <= 0x0e8a ) + { return (c == 0x0e8a);} else + { return (c == 0x0e8d);} + } + } + } + } + } else + { if ( c <= 0x0f47 ) + { if ( c <= 0x0eab ) + { if ( c <= 0x0ea3 ) + { if ( c <= 0x0e97 ) + { return (c >= 0x0e94 && c <= 0x0e97); + } else + { if ( c <= 0x0e9f ) + { return (c >= 0x0e99 && c <= 0x0e9f); + } else + { return (c >= 0x0ea1 && c <= 0x0ea3); + } + } + } else + { if ( c <= 0x0ea5 ) + { return (c == 0x0ea5);} else + { if ( c <= 0x0ea7 ) + { return (c == 0x0ea7);} else + { return (c >= 0x0eaa && c <= 0x0eab); + } + } + } + } else + { if ( c <= 0x0eb3 ) + { if ( c <= 0x0eae ) + { return (c >= 0x0ead && c <= 0x0eae); + } else + { if ( c <= 0x0eb0 ) + { return (c == 0x0eb0);} else + { return (c >= 0x0eb2 && c <= 0x0eb3); + } + } + } else + { if ( c <= 0x0ebd ) + { return (c == 0x0ebd);} else + { if ( c <= 0x0ec4 ) + { return (c >= 0x0ec0 && c <= 0x0ec4); + } else + { return (c >= 0x0f40 && c <= 0x0f47); + } + } + } + } + } else + { if ( c <= 0x1107 ) + { if ( c <= 0x10f6 ) + { if ( c <= 0x0f69 ) + { return (c >= 0x0f49 && c <= 0x0f69); + } else + { if ( c <= 0x10c5 ) + { return (c >= 0x10a0 && c <= 0x10c5); + } else + { return (c >= 0x10d0 && c <= 0x10f6); + } + } + } else + { if ( c <= 0x1100 ) + { return (c == 0x1100);} else + { if ( c <= 0x1103 ) + { return (c >= 0x1102 && c <= 0x1103); + } else + { return (c >= 0x1105 && c <= 0x1107); + } + } + } + } else + { if ( c <= 0x1112 ) + { if ( c <= 0x1109 ) + { return (c == 0x1109);} else + { if ( c <= 0x110c ) + { return (c >= 0x110b && c <= 0x110c); + } else + { return (c >= 0x110e && c <= 0x1112); + } + } + } else + { if ( c <= 0x113e ) + { if ( c <= 0x113c ) + { return (c == 0x113c);} else + { return (c == 0x113e);} + } else + { if ( c <= 0x1140 ) + { return (c == 0x1140);} else + { return (c == 0x114c);} + } + } + } + } + } + } else + { if ( c <= 0x1f15 ) + { if ( c <= 0x1175 ) + { if ( c <= 0x1163 ) + { if ( c <= 0x1155 ) + { if ( c <= 0x114e ) + { return (c == 0x114e);} else + { if ( c <= 0x1150 ) + { return (c == 0x1150);} else + { return (c >= 0x1154 && c <= 0x1155); + } + } + } else + { if ( c <= 0x1159 ) + { return (c == 0x1159);} else + { if ( c <= 0x1161 ) + { return (c >= 0x115f && c <= 0x1161); + } else + { return (c == 0x1163);} + } + } + } else + { if ( c <= 0x1169 ) + { if ( c <= 0x1165 ) + { return (c == 0x1165);} else + { if ( c <= 0x1167 ) + { return (c == 0x1167);} else + { return (c == 0x1169);} + } + } else + { if ( c <= 0x116e ) + { return (c >= 0x116d && c <= 0x116e); + } else + { if ( c <= 0x1173 ) + { return (c >= 0x1172 && c <= 0x1173); + } else + { return (c == 0x1175);} + } + } + } + } else + { if ( c <= 0x11ba ) + { if ( c <= 0x11ab ) + { if ( c <= 0x119e ) + { return (c == 0x119e);} else + { if ( c <= 0x11a8 ) + { return (c == 0x11a8);} else + { return (c == 0x11ab);} + } + } else + { if ( c <= 0x11af ) + { return (c >= 0x11ae && c <= 0x11af); + } else + { if ( c <= 0x11b8 ) + { return (c >= 0x11b7 && c <= 0x11b8); + } else + { return (c == 0x11ba);} + } + } + } else + { if ( c <= 0x11f0 ) + { if ( c <= 0x11c2 ) + { return (c >= 0x11bc && c <= 0x11c2); + } else + { if ( c <= 0x11eb ) + { return (c == 0x11eb);} else + { return (c == 0x11f0);} + } + } else + { if ( c <= 0x1e9b ) + { if ( c <= 0x11f9 ) + { return (c == 0x11f9);} else + { return (c >= 0x1e00 && c <= 0x1e9b); + } + } else + { if ( c <= 0x1ef9 ) + { return (c >= 0x1ea0 && c <= 0x1ef9); + } else + { return (c >= 0x1f00 && c <= 0x1f15); + } + } + } + } + } + } else + { if ( c <= 0x1fcc ) + { if ( c <= 0x1f5b ) + { if ( c <= 0x1f4d ) + { if ( c <= 0x1f1d ) + { return (c >= 0x1f18 && c <= 0x1f1d); + } else + { if ( c <= 0x1f45 ) + { return (c >= 0x1f20 && c <= 0x1f45); + } else + { return (c >= 0x1f48 && c <= 0x1f4d); + } + } + } else + { if ( c <= 0x1f57 ) + { return (c >= 0x1f50 && c <= 0x1f57); + } else + { if ( c <= 0x1f59 ) + { return (c == 0x1f59);} else + { return (c == 0x1f5b);} + } + } + } else + { if ( c <= 0x1fb4 ) + { if ( c <= 0x1f5d ) + { return (c == 0x1f5d);} else + { if ( c <= 0x1f7d ) + { return (c >= 0x1f5f && c <= 0x1f7d); + } else + { return (c >= 0x1f80 && c <= 0x1fb4); + } + } + } else + { if ( c <= 0x1fbe ) + { if ( c <= 0x1fbc ) + { return (c >= 0x1fb6 && c <= 0x1fbc); + } else + { return (c == 0x1fbe);} + } else + { if ( c <= 0x1fc4 ) + { return (c >= 0x1fc2 && c <= 0x1fc4); + } else + { return (c >= 0x1fc6 && c <= 0x1fcc); + } + } + } + } + } else + { if ( c <= 0x2126 ) + { if ( c <= 0x1fec ) + { if ( c <= 0x1fd3 ) + { return (c >= 0x1fd0 && c <= 0x1fd3); + } else + { if ( c <= 0x1fdb ) + { return (c >= 0x1fd6 && c <= 0x1fdb); + } else + { return (c >= 0x1fe0 && c <= 0x1fec); + } + } + } else + { if ( c <= 0x1ff4 ) + { return (c >= 0x1ff2 && c <= 0x1ff4); + } else + { if ( c <= 0x1ffc ) + { return (c >= 0x1ff6 && c <= 0x1ffc); + } else + { return (c == 0x2126);} + } + } + } else + { if ( c <= 0x2182 ) + { if ( c <= 0x212b ) + { return (c >= 0x212a && c <= 0x212b); + } else + { if ( c <= 0x212e ) + { return (c == 0x212e);} else + { return (c >= 0x2180 && c <= 0x2182); + } + } + } else + { if ( c <= 0x30fa ) + { if ( c <= 0x3094 ) + { return (c >= 0x3041 && c <= 0x3094); + } else + { return (c >= 0x30a1 && c <= 0x30fa); + } + } else + { if ( c <= 0x312c ) + { return (c >= 0x3105 && c <= 0x312c); + } else + { return (c >= 0xac00 && c <= 0xd7a3); + } + } + } + } + } + } + } + } +} + +int +xml_ideographic(int c) +{ if ( c <= 0x9fa5 ) + { return (c >= 0x4e00 && c <= 0x9fa5); + } else + { if ( c <= 0x3007 ) + { return (c == 0x3007);} else + { return (c >= 0x3021 && c <= 0x3029); + } + } +} + +int +xml_combining_char(int c) +{ if ( c <= 0x0b43 ) + { if ( c <= 0x0983 ) + { if ( c <= 0x0670 ) + { if ( c <= 0x05b9 ) + { if ( c <= 0x0361 ) + { if ( c <= 0x0345 ) + { return (c >= 0x0300 && c <= 0x0345); + } else + { return (c >= 0x0360 && c <= 0x0361); + } + } else + { if ( c <= 0x0486 ) + { return (c >= 0x0483 && c <= 0x0486); + } else + { if ( c <= 0x05a1 ) + { return (c >= 0x0591 && c <= 0x05a1); + } else + { return (c >= 0x05a3 && c <= 0x05b9); + } + } + } + } else + { if ( c <= 0x05c2 ) + { if ( c <= 0x05bd ) + { return (c >= 0x05bb && c <= 0x05bd); + } else + { if ( c <= 0x05bf ) + { return (c == 0x05bf);} else + { return (c >= 0x05c1 && c <= 0x05c2); + } + } + } else + { if ( c <= 0x05c4 ) + { return (c == 0x05c4);} else + { if ( c <= 0x0652 ) + { return (c >= 0x064b && c <= 0x0652); + } else + { return (c == 0x0670);} + } + } + } + } else + { if ( c <= 0x0903 ) + { if ( c <= 0x06e4 ) + { if ( c <= 0x06dc ) + { return (c >= 0x06d6 && c <= 0x06dc); + } else + { if ( c <= 0x06df ) + { return (c >= 0x06dd && c <= 0x06df); + } else + { return (c >= 0x06e0 && c <= 0x06e4); + } + } + } else + { if ( c <= 0x06e8 ) + { return (c >= 0x06e7 && c <= 0x06e8); + } else + { if ( c <= 0x06ed ) + { return (c >= 0x06ea && c <= 0x06ed); + } else + { return (c >= 0x0901 && c <= 0x0903); + } + } + } + } else + { if ( c <= 0x094d ) + { if ( c <= 0x093c ) + { return (c == 0x093c);} else + { if ( c <= 0x094c ) + { return (c >= 0x093e && c <= 0x094c); + } else + { return (c == 0x094d);} + } + } else + { if ( c <= 0x0954 ) + { return (c >= 0x0951 && c <= 0x0954); + } else + { if ( c <= 0x0963 ) + { return (c >= 0x0962 && c <= 0x0963); + } else + { return (c >= 0x0981 && c <= 0x0983); + } + } + } + } + } + } else + { if ( c <= 0x0a3f ) + { if ( c <= 0x09cd ) + { if ( c <= 0x09bf ) + { if ( c <= 0x09bc ) + { return (c == 0x09bc);} else + { if ( c <= 0x09be ) + { return (c == 0x09be);} else + { return (c == 0x09bf);} + } + } else + { if ( c <= 0x09c4 ) + { return (c >= 0x09c0 && c <= 0x09c4); + } else + { if ( c <= 0x09c8 ) + { return (c >= 0x09c7 && c <= 0x09c8); + } else + { return (c >= 0x09cb && c <= 0x09cd); + } + } + } + } else + { if ( c <= 0x0a02 ) + { if ( c <= 0x09d7 ) + { return (c == 0x09d7);} else + { if ( c <= 0x09e3 ) + { return (c >= 0x09e2 && c <= 0x09e3); + } else + { return (c == 0x0a02);} + } + } else + { if ( c <= 0x0a3c ) + { return (c == 0x0a3c);} else + { if ( c <= 0x0a3e ) + { return (c == 0x0a3e);} else + { return (c == 0x0a3f);} + } + } + } + } else + { if ( c <= 0x0abc ) + { if ( c <= 0x0a4d ) + { if ( c <= 0x0a42 ) + { return (c >= 0x0a40 && c <= 0x0a42); + } else + { if ( c <= 0x0a48 ) + { return (c >= 0x0a47 && c <= 0x0a48); + } else + { return (c >= 0x0a4b && c <= 0x0a4d); + } + } + } else + { if ( c <= 0x0a71 ) + { return (c >= 0x0a70 && c <= 0x0a71); + } else + { if ( c <= 0x0a83 ) + { return (c >= 0x0a81 && c <= 0x0a83); + } else + { return (c == 0x0abc);} + } + } + } else + { if ( c <= 0x0acd ) + { if ( c <= 0x0ac5 ) + { return (c >= 0x0abe && c <= 0x0ac5); + } else + { if ( c <= 0x0ac9 ) + { return (c >= 0x0ac7 && c <= 0x0ac9); + } else + { return (c >= 0x0acb && c <= 0x0acd); + } + } + } else + { if ( c <= 0x0b03 ) + { return (c >= 0x0b01 && c <= 0x0b03); + } else + { if ( c <= 0x0b3c ) + { return (c == 0x0b3c);} else + { return (c >= 0x0b3e && c <= 0x0b43); + } + } + } + } + } + } + } else + { if ( c <= 0x0e31 ) + { if ( c <= 0x0c4d ) + { if ( c <= 0x0bc8 ) + { if ( c <= 0x0b57 ) + { if ( c <= 0x0b48 ) + { return (c >= 0x0b47 && c <= 0x0b48); + } else + { if ( c <= 0x0b4d ) + { return (c >= 0x0b4b && c <= 0x0b4d); + } else + { return (c >= 0x0b56 && c <= 0x0b57); + } + } + } else + { if ( c <= 0x0b83 ) + { return (c >= 0x0b82 && c <= 0x0b83); + } else + { if ( c <= 0x0bc2 ) + { return (c >= 0x0bbe && c <= 0x0bc2); + } else + { return (c >= 0x0bc6 && c <= 0x0bc8); + } + } + } + } else + { if ( c <= 0x0c03 ) + { if ( c <= 0x0bcd ) + { return (c >= 0x0bca && c <= 0x0bcd); + } else + { if ( c <= 0x0bd7 ) + { return (c == 0x0bd7);} else + { return (c >= 0x0c01 && c <= 0x0c03); + } + } + } else + { if ( c <= 0x0c44 ) + { return (c >= 0x0c3e && c <= 0x0c44); + } else + { if ( c <= 0x0c48 ) + { return (c >= 0x0c46 && c <= 0x0c48); + } else + { return (c >= 0x0c4a && c <= 0x0c4d); + } + } + } + } + } else + { if ( c <= 0x0cd6 ) + { if ( c <= 0x0cc4 ) + { if ( c <= 0x0c56 ) + { return (c >= 0x0c55 && c <= 0x0c56); + } else + { if ( c <= 0x0c83 ) + { return (c >= 0x0c82 && c <= 0x0c83); + } else + { return (c >= 0x0cbe && c <= 0x0cc4); + } + } + } else + { if ( c <= 0x0cc8 ) + { return (c >= 0x0cc6 && c <= 0x0cc8); + } else + { if ( c <= 0x0ccd ) + { return (c >= 0x0cca && c <= 0x0ccd); + } else + { return (c >= 0x0cd5 && c <= 0x0cd6); + } + } + } + } else + { if ( c <= 0x0d48 ) + { if ( c <= 0x0d03 ) + { return (c >= 0x0d02 && c <= 0x0d03); + } else + { if ( c <= 0x0d43 ) + { return (c >= 0x0d3e && c <= 0x0d43); + } else + { return (c >= 0x0d46 && c <= 0x0d48); + } + } + } else + { if ( c <= 0x0d4d ) + { return (c >= 0x0d4a && c <= 0x0d4d); + } else + { if ( c <= 0x0d57 ) + { return (c == 0x0d57);} else + { return (c == 0x0e31);} + } + } + } + } + } else + { if ( c <= 0x0f3f ) + { if ( c <= 0x0ecd ) + { if ( c <= 0x0eb1 ) + { if ( c <= 0x0e3a ) + { return (c >= 0x0e34 && c <= 0x0e3a); + } else + { if ( c <= 0x0e4e ) + { return (c >= 0x0e47 && c <= 0x0e4e); + } else + { return (c == 0x0eb1);} + } + } else + { if ( c <= 0x0eb9 ) + { return (c >= 0x0eb4 && c <= 0x0eb9); + } else + { if ( c <= 0x0ebc ) + { return (c >= 0x0ebb && c <= 0x0ebc); + } else + { return (c >= 0x0ec8 && c <= 0x0ecd); + } + } + } + } else + { if ( c <= 0x0f37 ) + { if ( c <= 0x0f19 ) + { return (c >= 0x0f18 && c <= 0x0f19); + } else + { if ( c <= 0x0f35 ) + { return (c == 0x0f35);} else + { return (c == 0x0f37);} + } + } else + { if ( c <= 0x0f39 ) + { return (c == 0x0f39);} else + { if ( c <= 0x0f3e ) + { return (c == 0x0f3e);} else + { return (c == 0x0f3f);} + } + } + } + } else + { if ( c <= 0x0fb7 ) + { if ( c <= 0x0f95 ) + { if ( c <= 0x0f84 ) + { return (c >= 0x0f71 && c <= 0x0f84); + } else + { if ( c <= 0x0f8b ) + { return (c >= 0x0f86 && c <= 0x0f8b); + } else + { return (c >= 0x0f90 && c <= 0x0f95); + } + } + } else + { if ( c <= 0x0f97 ) + { return (c == 0x0f97);} else + { if ( c <= 0x0fad ) + { return (c >= 0x0f99 && c <= 0x0fad); + } else + { return (c >= 0x0fb1 && c <= 0x0fb7); + } + } + } + } else + { if ( c <= 0x20e1 ) + { if ( c <= 0x0fb9 ) + { return (c == 0x0fb9);} else + { if ( c <= 0x20dc ) + { return (c >= 0x20d0 && c <= 0x20dc); + } else + { return (c == 0x20e1);} + } + } else + { if ( c <= 0x302f ) + { return (c >= 0x302a && c <= 0x302f); + } else + { if ( c <= 0x3099 ) + { return (c == 0x3099);} else + { return (c == 0x309a);} + } + } + } + } + } + } +} + +int +xml_digit(int c) +{ if ( c <= 0x0aef ) + { if ( c <= 0x06f9 ) + { if ( c <= 0x0039 ) + { return (c >= 0x0030 && c <= 0x0039); + } else + { if ( c <= 0x0669 ) + { return (c >= 0x0660 && c <= 0x0669); + } else + { return (c >= 0x06f0 && c <= 0x06f9); + } + } + } else + { if ( c <= 0x09ef ) + { if ( c <= 0x096f ) + { return (c >= 0x0966 && c <= 0x096f); + } else + { return (c >= 0x09e6 && c <= 0x09ef); + } + } else + { if ( c <= 0x0a6f ) + { return (c >= 0x0a66 && c <= 0x0a6f); + } else + { return (c >= 0x0ae6 && c <= 0x0aef); + } + } + } + } else + { if ( c <= 0x0cef ) + { if ( c <= 0x0bef ) + { if ( c <= 0x0b6f ) + { return (c >= 0x0b66 && c <= 0x0b6f); + } else + { return (c >= 0x0be7 && c <= 0x0bef); + } + } else + { if ( c <= 0x0c6f ) + { return (c >= 0x0c66 && c <= 0x0c6f); + } else + { return (c >= 0x0ce6 && c <= 0x0cef); + } + } + } else + { if ( c <= 0x0e59 ) + { if ( c <= 0x0d6f ) + { return (c >= 0x0d66 && c <= 0x0d6f); + } else + { return (c >= 0x0e50 && c <= 0x0e59); + } + } else + { if ( c <= 0x0ed9 ) + { return (c >= 0x0ed0 && c <= 0x0ed9); + } else + { return (c >= 0x0f20 && c <= 0x0f29); + } + } + } + } +} + +int +xml_extender(int c) +{ if ( c <= 0x0640 ) + { if ( c <= 0x02d0 ) + { if ( c <= 0x00b7 ) + { return (c == 0x00b7);} else + { return (c == 0x02d0);} + } else + { if ( c <= 0x02d1 ) + { return (c == 0x02d1);} else + { if ( c <= 0x0387 ) + { return (c == 0x0387);} else + { return (c == 0x0640);} + } + } + } else + { if ( c <= 0x3005 ) + { if ( c <= 0x0e46 ) + { return (c == 0x0e46);} else + { if ( c <= 0x0ec6 ) + { return (c == 0x0ec6);} else + { return (c == 0x3005);} + } + } else + { if ( c <= 0x3035 ) + { return (c >= 0x3031 && c <= 0x3035); + } else + { if ( c <= 0x309e ) + { return (c >= 0x309d && c <= 0x309e); + } else + { return (c >= 0x30fc && c <= 0x30fe); + } + } + } + } +} + diff --git a/packages/sgml/xml_unicode.h b/packages/sgml/xml_unicode.h new file mode 100644 index 000000000..d0d17bed4 --- /dev/null +++ b/packages/sgml/xml_unicode.h @@ -0,0 +1,41 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +#ifndef XML_UNICODE_H_INCLUDED +#define XML_UNICODE_H_INCLUDED + +int xml_basechar(int c); +int xml_ideographic(int c); +int xml_combining_char(int c); +int xml_digit(int c); +int xml_extender(int c); + +#endif /*XML_UNICODE_H_INCLUDED*/ diff --git a/packages/sgml/xml_unicode.pl b/packages/sgml/xml_unicode.pl new file mode 100644 index 000000000..029b6064c --- /dev/null +++ b/packages/sgml/xml_unicode.pl @@ -0,0 +1,436 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(xml_unicode, + [ mkclassify/0 + ]). + +%% mkclassify +% +% Generate the core of xml_unicode.c. + +mkclassify :- + forall(list(List, _), + mkfunc(List)). + +mkfunc(Name) :- + format('int~n'), + format('wcis_~w(int c)~n', [Name]), + format('{ '), + list(Name, List), + mkswitch(List), + format('}~n~n'). + +mkswitch(List) :- + mkswitch(List, 2). + +mkswitch([Low-High], Indent) :- !, + indent(Indent), + format('return (c >= 0x~|~`0t~16r~4+ && c <= 0x~|~`0t~16r~4+);~n', [Low, High]). +mkswitch([Value], Indent) :- !, + indent(Indent), + format('return (c == 0x~|~`0t~16r~4+);', [Value]). +mkswitch(List, Indent) :- + split(List, Low, High), + end(Low, MaxLow), + indent(Indent), + NextIndent is Indent + 2, + format('if ( c <= 0x~|~`0t~16r~4+ )~n', [MaxLow]), + indent(Indent), + format('{ '), + mkswitch(Low, NextIndent), + indent(Indent), + format('} else~n'), + indent(Indent), + format('{ '), + mkswitch(High, NextIndent), + indent(Indent), + format('}~n'). + +end(List, Max) :- + last(List, Last), + ( Last = _-Max + -> true + ; Max = Last + ). + +split(List, Low, High) :- + length(List, Len), + Mid is Len//2, + length(Low, Mid), + append(Low, High, List). + +indent(N) :- + line_position(current_output, Pos), + Spaces is N - Pos, + format('~*c', [Spaces, 32]). + + + +list(basechar, + [ 0x0041-0x005A, + 0x0061-0x007A, + 0x00C0-0x00D6, + 0x00D8-0x00F6, + 0x00F8-0x00FF, + 0x0100-0x0131, + 0x0134-0x013E, + 0x0141-0x0148, + 0x014A-0x017E, + 0x0180-0x01C3, + 0x01CD-0x01F0, + 0x01F4-0x01F5, + 0x01FA-0x0217, + 0x0250-0x02A8, + 0x02BB-0x02C1, + 0x0386, + 0x0388-0x038A, + 0x038C, + 0x038E-0x03A1, + 0x03A3-0x03CE, + 0x03D0-0x03D6, + 0x03DA, + 0x03DC, + 0x03DE, + 0x03E0, + 0x03E2-0x03F3, + 0x0401-0x040C, + 0x040E-0x044F, + 0x0451-0x045C, + 0x045E-0x0481, + 0x0490-0x04C4, + 0x04C7-0x04C8, + 0x04CB-0x04CC, + 0x04D0-0x04EB, + 0x04EE-0x04F5, + 0x04F8-0x04F9, + 0x0531-0x0556, + 0x0559, + 0x0561-0x0586, + 0x05D0-0x05EA, + 0x05F0-0x05F2, + 0x0621-0x063A, + 0x0641-0x064A, + 0x0671-0x06B7, + 0x06BA-0x06BE, + 0x06C0-0x06CE, + 0x06D0-0x06D3, + 0x06D5, + 0x06E5-0x06E6, + 0x0905-0x0939, + 0x093D, + 0x0958-0x0961, + 0x0985-0x098C, + 0x098F-0x0990, + 0x0993-0x09A8, + 0x09AA-0x09B0, + 0x09B2, + 0x09B6-0x09B9, + 0x09DC-0x09DD, + 0x09DF-0x09E1, + 0x09F0-0x09F1, + 0x0A05-0x0A0A, + 0x0A0F-0x0A10, + 0x0A13-0x0A28, + 0x0A2A-0x0A30, + 0x0A32-0x0A33, + 0x0A35-0x0A36, + 0x0A38-0x0A39, + 0x0A59-0x0A5C, + 0x0A5E, + 0x0A72-0x0A74, + 0x0A85-0x0A8B, + 0x0A8D, + 0x0A8F-0x0A91, + 0x0A93-0x0AA8, + 0x0AAA-0x0AB0, + 0x0AB2-0x0AB3, + 0x0AB5-0x0AB9, + 0x0ABD, + 0x0AE0, + 0x0B05-0x0B0C, + 0x0B0F-0x0B10, + 0x0B13-0x0B28, + 0x0B2A-0x0B30, + 0x0B32-0x0B33, + 0x0B36-0x0B39, + 0x0B3D, + 0x0B5C-0x0B5D, + 0x0B5F-0x0B61, + 0x0B85-0x0B8A, + 0x0B8E-0x0B90, + 0x0B92-0x0B95, + 0x0B99-0x0B9A, + 0x0B9C, + 0x0B9E-0x0B9F, + 0x0BA3-0x0BA4, + 0x0BA8-0x0BAA, + 0x0BAE-0x0BB5, + 0x0BB7-0x0BB9, + 0x0C05-0x0C0C, + 0x0C0E-0x0C10, + 0x0C12-0x0C28, + 0x0C2A-0x0C33, + 0x0C35-0x0C39, + 0x0C60-0x0C61, + 0x0C85-0x0C8C, + 0x0C8E-0x0C90, + 0x0C92-0x0CA8, + 0x0CAA-0x0CB3, + 0x0CB5-0x0CB9, + 0x0CDE, + 0x0CE0-0x0CE1, + 0x0D05-0x0D0C, + 0x0D0E-0x0D10, + 0x0D12-0x0D28, + 0x0D2A-0x0D39, + 0x0D60-0x0D61, + 0x0E01-0x0E2E, + 0x0E30, + 0x0E32-0x0E33, + 0x0E40-0x0E45, + 0x0E81-0x0E82, + 0x0E84, + 0x0E87-0x0E88, + 0x0E8A, + 0x0E8D, + 0x0E94-0x0E97, + 0x0E99-0x0E9F, + 0x0EA1-0x0EA3, + 0x0EA5, + 0x0EA7, + 0x0EAA-0x0EAB, + 0x0EAD-0x0EAE, + 0x0EB0, + 0x0EB2-0x0EB3, + 0x0EBD, + 0x0EC0-0x0EC4, + 0x0F40-0x0F47, + 0x0F49-0x0F69, + 0x10A0-0x10C5, + 0x10D0-0x10F6, + 0x1100, + 0x1102-0x1103, + 0x1105-0x1107, + 0x1109, + 0x110B-0x110C, + 0x110E-0x1112, + 0x113C, + 0x113E, + 0x1140, + 0x114C, + 0x114E, + 0x1150, + 0x1154-0x1155, + 0x1159, + 0x115F-0x1161, + 0x1163, + 0x1165, + 0x1167, + 0x1169, + 0x116D-0x116E, + 0x1172-0x1173, + 0x1175, + 0x119E, + 0x11A8, + 0x11AB, + 0x11AE-0x11AF, + 0x11B7-0x11B8, + 0x11BA, + 0x11BC-0x11C2, + 0x11EB, + 0x11F0, + 0x11F9, + 0x1E00-0x1E9B, + 0x1EA0-0x1EF9, + 0x1F00-0x1F15, + 0x1F18-0x1F1D, + 0x1F20-0x1F45, + 0x1F48-0x1F4D, + 0x1F50-0x1F57, + 0x1F59, + 0x1F5B, + 0x1F5D, + 0x1F5F-0x1F7D, + 0x1F80-0x1FB4, + 0x1FB6-0x1FBC, + 0x1FBE, + 0x1FC2-0x1FC4, + 0x1FC6-0x1FCC, + 0x1FD0-0x1FD3, + 0x1FD6-0x1FDB, + 0x1FE0-0x1FEC, + 0x1FF2-0x1FF4, + 0x1FF6-0x1FFC, + 0x2126, + 0x212A-0x212B, + 0x212E, + 0x2180-0x2182, + 0x3041-0x3094, + 0x30A1-0x30FA, + 0x3105-0x312C, + 0xAC00-0xD7A3]). + +list(ideographic, + [ 0x4E00-0x9FA5, + 0x3007, + 0x3021-0x3029 + ]). + +list(combining_char, + [ 0x0300-0x0345, + 0x0360-0x0361, + 0x0483-0x0486, + 0x0591-0x05A1, + 0x05A3-0x05B9, + 0x05BB-0x05BD, + 0x05BF, + 0x05C1-0x05C2, + 0x05C4, + 0x064B-0x0652, + 0x0670, + 0x06D6-0x06DC, + 0x06DD-0x06DF, + 0x06E0-0x06E4, + 0x06E7-0x06E8, + 0x06EA-0x06ED, + 0x0901-0x0903, + 0x093C, + 0x093E-0x094C, + 0x094D, + 0x0951-0x0954, + 0x0962-0x0963, + 0x0981-0x0983, + 0x09BC, + 0x09BE, + 0x09BF, + 0x09C0-0x09C4, + 0x09C7-0x09C8, + 0x09CB-0x09CD, + 0x09D7, + 0x09E2-0x09E3, + 0x0A02, + 0x0A3C, + 0x0A3E, + 0x0A3F, + 0x0A40-0x0A42, + 0x0A47-0x0A48, + 0x0A4B-0x0A4D, + 0x0A70-0x0A71, + 0x0A81-0x0A83, + 0x0ABC, + 0x0ABE-0x0AC5, + 0x0AC7-0x0AC9, + 0x0ACB-0x0ACD, + 0x0B01-0x0B03, + 0x0B3C, + 0x0B3E-0x0B43, + 0x0B47-0x0B48, + 0x0B4B-0x0B4D, + 0x0B56-0x0B57, + 0x0B82-0x0B83, + 0x0BBE-0x0BC2, + 0x0BC6-0x0BC8, + 0x0BCA-0x0BCD, + 0x0BD7, + 0x0C01-0x0C03, + 0x0C3E-0x0C44, + 0x0C46-0x0C48, + 0x0C4A-0x0C4D, + 0x0C55-0x0C56, + 0x0C82-0x0C83, + 0x0CBE-0x0CC4, + 0x0CC6-0x0CC8, + 0x0CCA-0x0CCD, + 0x0CD5-0x0CD6, + 0x0D02-0x0D03, + 0x0D3E-0x0D43, + 0x0D46-0x0D48, + 0x0D4A-0x0D4D, + 0x0D57, + 0x0E31, + 0x0E34-0x0E3A, + 0x0E47-0x0E4E, + 0x0EB1, + 0x0EB4-0x0EB9, + 0x0EBB-0x0EBC, + 0x0EC8-0x0ECD, + 0x0F18-0x0F19, + 0x0F35, + 0x0F37, + 0x0F39, + 0x0F3E, + 0x0F3F, + 0x0F71-0x0F84, + 0x0F86-0x0F8B, + 0x0F90-0x0F95, + 0x0F97, + 0x0F99-0x0FAD, + 0x0FB1-0x0FB7, + 0x0FB9, + 0x20D0-0x20DC, + 0x20E1, + 0x302A-0x302F, + 0x3099, + 0x309A + ]). + +list(digit, + [ 0x0030-0x0039, + 0x0660-0x0669, + 0x06F0-0x06F9, + 0x0966-0x096F, + 0x09E6-0x09EF, + 0x0A66-0x0A6F, + 0x0AE6-0x0AEF, + 0x0B66-0x0B6F, + 0x0BE7-0x0BEF, + 0x0C66-0x0C6F, + 0x0CE6-0x0CEF, + 0x0D66-0x0D6F, + 0x0E50-0x0E59, + 0x0ED0-0x0ED9, + 0x0F20-0x0F29 + ]). + +list(extender, + [ 0x00B7, + 0x02D0, + 0x02D1, + 0x0387, + 0x0640, + 0x0E46, + 0x0EC6, + 0x3005, + 0x3031-0x3035, + 0x309D-0x309E, + 0x30FC-0x30FE + ]). diff --git a/packages/sgml/xmlns.c b/packages/sgml/xmlns.c new file mode 100644 index 000000000..30b31b7a2 --- /dev/null +++ b/packages/sgml/xmlns.c @@ -0,0 +1,244 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include "dtd.h" +#include "parser.h" + +#ifdef XMLNS + +static xmlns * +xmlns_push(dtd_parser *p, const ichar *ns, const ichar *url) +{ sgml_environment *env = p->environments; + dtd_symbol *n = (*ns ? dtd_add_symbol(p->dtd, ns) : (dtd_symbol *)NULL); + dtd_symbol *u = dtd_add_symbol(p->dtd, url); /* TBD: ochar/ichar */ + + if ( p->on_xmlns ) + (*p->on_xmlns)(p, n, u); + + if ( env ) + { xmlns *x = sgml_malloc(sizeof(*n)); + + x->name = n; + x->url = u; + x->next = env->xmlns; + env->xmlns = x; + + return x; + } + + return NULL; +} + + +void +xmlns_free(sgml_environment *env) +{ xmlns *n, *next; + + for(n = env->xmlns; n; n = next) + { next = n->next; + + sgml_free(n); + } +} + + +xmlns * +xmlns_find(sgml_environment *env, dtd_symbol *ns) +{ for(; env; env = env->parent) + { xmlns *n; + + for(n=env->xmlns; n; n = n->next) + { if ( n->name == ns ) + return n; + } + } + + return NULL; +} + + +static ichar * +isxmlns(const ichar *s, int nschr) +{ if ( s[0]=='x' && s[1]=='m' && s[2]=='l' && s[3] =='n'&& s[4]=='s' ) + { if ( !s[5] ) + return (ichar *)s+5; /* implicit */ + if ( s[5] == nschr ) + return (ichar *)s+6; + } + + return NULL; +} + + +void +update_xmlns(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts) +{ dtd_attr_list *al; + int nschr = p->dtd->charfunc->func[CF_NS]; /* : */ + + for(al=e->attributes; al; al=al->next) + { dtd_attr *a = al->attribute; + const ichar *name = a->name->name; + + if ( (name = isxmlns(name, nschr)) && /* TBD: flag when processing DTD */ + a->type == AT_CDATA && + (a->def == AT_FIXED || a->def == AT_DEFAULT) ) + xmlns_push(p, name, a->att_def.cdata); + } + + for( ; natts-- > 0; atts++ ) + { const ichar *name = atts->definition->name->name; + + if ( (name=isxmlns(name, nschr)) && + atts->definition->type == AT_CDATA && + atts->value.textW ) + xmlns_push(p, name, atts->value.textW); + } +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +xmlns_resolve() + Convert a symbol as returned by the XML level-1.0 parser to its namespace + tuple {url}localname. This function is not used internally, but provided + for use from the call-back functions of the parser. + + It exploits the stack of namespace-environments managed by the parser + itself (see update_xmlns()) +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +int +xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id, + const ichar **local, const ichar **url) +{ dtd *dtd = p->dtd; + int nschr = dtd->charfunc->func[CF_NS]; /* : */ + ichar buf[MAXNMLEN]; + ichar *o = buf; + const ichar *s; + xmlns *ns; + + for(s=id->name; *s; s++) + { if ( *s == nschr ) + { dtd_symbol *n; + + *o = '\0'; + *local = s+1; + n = dtd_add_symbol(dtd, buf); + + if ( istrprefix(L"xml", buf) ) /* XML reserved namespaces */ + { *url = n->name; + return TRUE; + } else if ( (ns = xmlns_find(p->environments, n)) ) + { if ( ns->url->name[0] ) + *url = ns->url->name; + else + *url = NULL; + return TRUE; + } else + { *url = n->name; /* undefined namespace */ + gripe(ERC_EXISTENCE, L"namespace", n->name); + return FALSE; + } + } + *o++ = *s; + } + + *local = id->name; + + if ( (p->flags & SGML_PARSER_QUALIFY_ATTS) && + (ns = p->environments->thisns) && ns->url->name[0] ) + *url = ns->url->name; + else + *url = NULL; /* no default namespace is defined */ + + return TRUE; +} + + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Resolve the namespace for the current element. This namespace is stored +in the environment as `thisns' and acts as default for resolving the +namespaces of the attributes (see above). +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +int +xmlns_resolve_element(dtd_parser *p, const ichar **local, const ichar **url) +{ sgml_environment *e; + + if ( (e=p->environments) ) + { dtd_symbol *id = e->element->name; + dtd *dtd = p->dtd; + int nschr = dtd->charfunc->func[CF_NS]; /* : */ + ichar buf[MAXNMLEN]; + ichar *o = buf; + const ichar *s; + xmlns *ns; + + for(s=id->name; *s; s++) + { if ( *s == nschr ) /* explicit namespace */ + { dtd_symbol *n; + + *o = '\0'; + *local = s+1; + n = dtd_add_symbol(dtd, buf); + + if ( (ns = xmlns_find(p->environments, n)) ) + { if ( ns->url->name[0] ) + *url = ns->url->name; + else + *url = NULL; + e->thisns = ns; /* default for attributes */ + return TRUE; + } else + { *url = n->name; /* undefined namespace */ + gripe(ERC_EXISTENCE, "namespace", n->name); + e->thisns = xmlns_push(p, n->name, n->name); /* define implicitly */ + return FALSE; + } + } + *o++ = *s; + } + + *local = id->name; + + if ( (ns = xmlns_find(p->environments, NULL)) ) + { if ( ns->url->name[0] ) + *url = ns->url->name; + else + *url = NULL; + e->thisns = ns; + } else + { *url = NULL; /* no default namespace is defined */ + e->thisns = NULL; + } + + return TRUE; + } else + return FALSE; +} + + +#endif /*XMLNS*/ + diff --git a/packages/sgml/xmlns.h b/packages/sgml/xmlns.h new file mode 100644 index 000000000..51a1579ef --- /dev/null +++ b/packages/sgml/xmlns.h @@ -0,0 +1,43 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: jan@swi.psy.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2002, University of Amsterdam + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef XMLNS_H_INCLUDED +#define XMLNS_H_INCLUDED + +typedef struct _xmlns +{ dtd_symbol *name; /* Prefix of the NS */ + dtd_symbol *url; /* pointed-to URL */ + struct _xmlns *next; /* next name */ +} xmlns; + +void xmlns_free(sgml_environment *env); +xmlns* xmlns_find(sgml_environment *env, dtd_symbol *ns); +void update_xmlns(dtd_parser *p, dtd_element *e, + int natts, sgml_attribute *atts); +int xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id, + const ichar **local, const ichar **url); +int xmlns_resolve_element(dtd_parser *p, + const ichar **local, const ichar **url); + +#endif /*XMLNS_H_INCLUDED*/ diff --git a/packages/sgml/xsdp_types.pl b/packages/sgml/xsdp_types.pl new file mode 100644 index 000000000..dc07eb361 --- /dev/null +++ b/packages/sgml/xsdp_types.pl @@ -0,0 +1,214 @@ +/* $Id$ + + Part of SWI-Prolog + + Author: Jan Wielemaker + E-mail: wielemak@science.uva.nl + WWW: http://www.swi-prolog.org + Copyright (C): 1985-2006, University of Amsterdam + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + As a special exception, if you link this library with other files, + compiled with a Free Software compiler, to produce an executable, this + library does not by itself cause the resulting executable to be covered + by the GNU General Public License. This exception does not however + invalidate any other reasons why the executable file might be covered by + the GNU General Public License. +*/ + +:- module(xsdp_type, + [ xsdp_type/1, % ?Type + xsdp_numeric_uri/2, % ?URI, ?Primary + xsdp_subtype_of/2, % ?Type, ?Super + xsdp_convert/3 % +Type, +Content, -Value + ]). + + +/** XML-Schema primitie types + +This modules provides support for the primitive XML-Schema (XSD) +datatypes. It defines the type hierarchy which allows for reasoning over +types as well as xsdp_convert/3 to convert XML content to a natural +Prolog representation of the XSD type. + +Based on the W3C definitions at + + * http://www.w3.org/TR/xmlschema-2/#built-in-datatypes + +The current implementation is incomplete and only there to test the API +and its integration with rdf:dataType=Type handling in the RDF parser. + +The extra 'p' in the module prefix (xsdp_*) is used to allow for a +module xsd_*, providing full user-defined XSD types on top of this +module. +*/ + +ns('http://www.w3.org/2001/XMLSchema#'). + + + /******************************* + * PRIMITIVE TYPE HIERARCHY * + *******************************/ + +%% xsdp_type(?Type) +% +% Test/generate the names for the XML schema primitive types + +xsdp_type(Type) :- + subtype_of(Type, _). + +%% xsdp_subtype_of(?Type, ?Super) +% +% True if Type is a (transitive) subtype of Super. + +xsdp_subtype_of(Type, Type). +xsdp_subtype_of(Type, Super) :- + ( nonvar(Type) + -> subtype_of(Type, Super0), + Super0 \== (-), + xsdp_subtype_of(Type, Super) + ; subtype_of(Sub0, Super), + xsdp_subtype_of(Type, Sub0) + ). + +subtype_of(anyType, -). +subtype_of(anySimpleType, anyType). + % string hierarchy +subtype_of(string, anySimpleType). +subtype_of(normalizedString, string). +subtype_of(token, normalizedString). +subtype_of(language, token). +subtype_of('Name', token). +subtype_of('NCName', 'Name'). +subtype_of('NMTOKEN', token). +subtype_of('NMTOKENS', 'NMTOKEN'). +subtype_of('ID', 'NCName'). +subtype_of('IDREF', 'NCName'). +subtype_of('IDREFS', 'IDREF'). +subtype_of('ENTITY', 'NCName'). +subtype_of('ENTITIES', 'ENTITY'). + % numeric hierarchy +subtype_of(decimal, anySimpleType). +subtype_of(integer, decimal). +subtype_of(nonPositiveInteger, integer). +subtype_of(negativeInteger, nonPositiveInteger). +subtype_of(long, integer). +subtype_of(int, long). +subtype_of(short, int). +subtype_of(byte, short). +subtype_of(nonNegativeInteger, integer). +subtype_of(unsignedLong, nonNegativeInteger). +subtype_of(positiveInteger, nonNegativeInteger). +subtype_of(unsignedInt, unsignedLong). +subtype_of(unsignedShort, unsignedInt). +subtype_of(unsignedByte, unsignedShort). + % other simple types +subtype_of(duration, anySimpleType). +subtype_of(dateTime, anySimpleType). +subtype_of(time, anySimpleType). +subtype_of(date, anySimpleType). +subtype_of(gYearMonth, anySimpleType). +subtype_of(gYear, anySimpleType). +subtype_of(gMonthDay, anySimpleType). +subtype_of(gDay, anySimpleType). +subtype_of(gMonth, anySimpleType). +subtype_of(boolean, anySimpleType). +subtype_of(base64Binary, anySimpleType). +subtype_of(hexBinary, anySimpleType). +subtype_of(float, anySimpleType). +subtype_of(double, anySimpleType). +subtype_of(anyURI, anySimpleType). +subtype_of('QName', anySimpleType). +subtype_of('NOTATION', anySimpleType). + +%% xsdp_numeric_uri(?URI, -PromoteURI) is nondet. +% +% Table mapping all XML-Schema numeric URIs into the type they +% promote to. Types are promoted to =integer=, =float=, =double= +% and =decimal=. + +term_expansion(integer_types, Clauses) :- + findall(integer_type(Type), xsdp_subtype_of(Type, integer), Clauses). +term_expansion(xsd_local_ids, Clauses) :- + ns(NS), + findall(xsd_local_id(URI, Type), + ( xsdp_type(Type), + atom_concat(NS, Type, URI) + ), + Clauses). +term_expansion(numeric_uirs, Clauses) :- + findall(xsdp_numeric_uri(URI, PrimaryURI), + ( ( integer_type(Type), Primary = integer + ; Type = float, Primary = float + ; Type = double, Primary = double + ; Type = decimal, Primary = decimal + ), + xsd_local_id(URI, Type), + xsd_local_id(PrimaryURI, Primary) + ), + Clauses). + +integer_types. +xsd_local_ids. + +numeric_uirs. + +%% xsdp_convert(+Type, +Content, -Value) +% +% Convert the content model Content to an object of the given XSD +% type and return the Prolog value in Value. + +xsdp_convert(URI, Content, Value) :- + ( xsd_local_id(URI, Type) + -> convert(Type, Content, Value) + ; convert(URI, Content, Value) + ). + +convert(anyType, Term, Term) :- !. +convert(anySimpleType, [Simple], Simple) :- !. + % strings +convert(string, [String], String) :- !. + % numbers +convert(IntType, [Text], Integer) :- + integer_type(IntType), !, + atom_number(Text, Integer), + ( integer(Integer), + validate_int_domain(IntType, Integer) + -> true + ; throw(error(domain_error(Text, IntType), _)) + ). +convert(float, [Text], Float) :- !, + atom_number(Text, Number), + Float is float(Number). +convert(double, [Text], Float) :- !, + atom_number(Text, Number), + Float is float(Number). +convert(_Any, [X], X) :- !. % TBD: provide for more types +convert(_Any, X, X). + +validate_int_domain(integer, _). +validate_int_domain(int, _). +validate_int_domain(long, _). +validate_int_domain(nonPositiveInteger, I) :- \+ I > 0. +validate_int_domain(negativeInteger, I) :- I < 0. +validate_int_domain(short, I) :- between(-32768, 32767, I). +validate_int_domain(byte, I) :- between(-128, 127, I). +validate_int_domain(nonNegativeInteger, I) :- \+ I < 0. +validate_int_domain(unsignedLong, I) :- I >= 0. +validate_int_domain(positiveInteger, I) :- I > 0. +validate_int_domain(unsignedInt, I) :- I >= 0. +validate_int_domain(unsignedShort, I) :- between(0, 65535, I). +validate_int_domain(unsignedByte, I) :- between(0, 255, I).