From 2f49994b09bdf36a571a8bba6a713fa416665391 Mon Sep 17 00:00:00 2001 From: Jez Higgins Date: Fri, 14 Sep 2012 11:12:50 +0100 Subject: [PATCH] Deal properly with the [dtd] pseudo-entity Some parsers report the [dtd] pseudo entity through the LexicalHandler. Arabica was incorrectly trying to create an Entity Reference for it. --- include/DOM/SAX2DOM/SAX2DOM.hpp | 893 +++++++++++++++-------------- include/SAX/ext/LexicalHandler.hpp | 396 ++++++------- 2 files changed, 650 insertions(+), 639 deletions(-) diff --git a/include/DOM/SAX2DOM/SAX2DOM.hpp b/include/DOM/SAX2DOM/SAX2DOM.hpp index 87990c66..c1bccfb5 100644 --- a/include/DOM/SAX2DOM/SAX2DOM.hpp +++ b/include/DOM/SAX2DOM/SAX2DOM.hpp @@ -1,445 +1,448 @@ -#ifndef JEZUK_SAX2DOM_PARSER_H -#define JEZUK_SAX2DOM_PARSER_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Arabica -{ -namespace SAX2DOM -{ - -template -struct ParserTypes -{ - typedef typename Arabica::get_param, - T0, - T1>::type string_adaptor; - typedef typename Arabica::get_param, - T1, - T0>::type SAX_parser_type; -}; - -template -class Parser : protected Arabica::SAX::DefaultHandler::string_adaptor> -{ - typedef typename ParserTypes::string_adaptor string_adaptorT; - typedef typename ParserTypes::SAX_parser_type SAX_parser_type; - typedef Arabica::SAX::XMLReaderInterface XMLReaderInterfaceT; - typedef Arabica::SAX::TextCoalescer TextCoalescerT; - typedef Arabica::SAX::Attributes AttributesT; - typedef Arabica::SAX::EntityResolver EntityResolverT; - typedef Arabica::SAX::ErrorHandler ErrorHandlerT; - typedef Arabica::SAX::LexicalHandler LexicalHandlerT; - typedef Arabica::SAX::DeclHandler DeclHandlerT; - typedef Arabica::SAX::InputSource InputSourceT; - typedef Arabica::SimpleDOM::EntityImpl EntityT; - typedef Arabica::SimpleDOM::NotationImpl NotationT; - typedef Arabica::SimpleDOM::ElementImpl ElementT; - typedef typename 
ErrorHandlerT::SAXParseExceptionT SAXParseExceptionT; - - public: - Parser() : - documentType_(0), - entityResolver_(0), - errorHandler_(0) - { - Arabica::SAX::FeatureNames fNames; - features_.insert(std::make_pair(fNames.namespaces, true)); - features_.insert(std::make_pair(fNames.namespace_prefixes, true)); - features_.insert(std::make_pair(fNames.validation, false)); - } // Parser - - void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; } - EntityResolverT* getEntityResolver() const { return entityResolver_; } - - void setErrorHandler(ErrorHandlerT& handler) { errorHandler_ = &handler; } - ErrorHandlerT* getErrorHandler() const { return errorHandler_; } - - void setFeature(const stringT& name, bool value) - { - typename Features::iterator f = features_.find(name); - if(f == features_.end()) - features_.insert(std::make_pair(name, value)); - else - f->second = value; - } // setFeature - - bool getFeature(const stringT& name) const - { - typename Features::const_iterator f = features_.find(name); - if(f == features_.end()) - throw Arabica::SAX::SAXNotRecognizedException(std::string("Feature not recognized ") + string_adaptorT::asStdString(name)); - return f->second; - } // getFeature - - bool parse(const stringT& systemId) - { - InputSourceT is(systemId); - return parse(is); - } // loadDOM - - bool parse(InputSourceT& source) - { - Arabica::SAX::PropertyNames pNames; - - DOM::DOMImplementation di = Arabica::SimpleDOM::DOMImplementation::getDOMImplementation(); - document_ = di.createDocument(string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8(""), 0); - currentNode_ = document_; - inCDATA_ = false; - inDTD_ = false; - inEntity_ = 0; - - SAX_parser_type base_parser; - TextCoalescerT parser(base_parser); - parser.setContentHandler(*this); - parser.setErrorHandler(*this); - if(entityResolver_) - parser.setEntityResolver(*entityResolver_); - - parser.setLexicalHandler(*this); - parser.setDeclHandler(*this); - - 
setParserFeatures(parser); - - try - { - parser.parse(source); - } - catch(const DOM::DOMException& de) - { - document_ = 0; - - if(errorHandler_) - { - SAXParseExceptionT pe(de.what()); - errorHandler_->fatalError(pe); - } // if ... - } // catch - - return (document_ != 0); - } // loadDOM - - DOM::Document getDocument() const - { - return document_; - } // getDocument - - void reset() - { - currentNode_ = 0; - document_ = 0; - } // reset - - protected: - DOM::Node& currentNode() { return currentNode_; } - - private: - // no implementations - Parser(const Parser&); - bool operator==(const Parser&) const; - Parser& operator=(const Parser&); - - // instance variables - DOM::Document document_; - DocumentType* documentType_; - DOM::Node currentNode_; - DOM::Node cachedCurrent_; - - typedef std::map Features; - Features features_; - - bool inCDATA_; - bool inDTD_; - int inEntity_; - - std::map declaredEntities_; - - EntityResolverT* entityResolver_; - ErrorHandlerT* errorHandler_; - Arabica::SAX::AttributeTypes attributeTypes_; - - protected: - void setParserFeatures(XMLReaderInterfaceT& parser) const - { - for(typename Features::const_iterator f = features_.begin(), e = features_.end(); f != e; ++f) - try { - parser.setFeature(f->first, f->second); - } - catch(const Arabica::SAX::SAXException&) { } - } // setParserFeatures - - /////////////////////////////////////////////////////////// - // ContentHandler - virtual void endDocument() - { - currentNode_ = 0; - } // endDocument - - virtual void startElement(const stringT& namespaceURI, - const stringT& /*localName*/, - const stringT& qName, - const AttributesT& atts) - { - if(currentNode_ == 0) - return; - - try - { - DOM::Element elem = document_.createElementNS(namespaceURI, qName); - currentNode_.appendChild(elem); - - // attributes here - for(int i = 0; i < atts.getLength(); ++i) - { - stringT qName = atts.getQName(i); - if(string_adaptorT::empty(qName)) - qName = atts.getLocalName(i); - 
elem.setAttributeNS(atts.getURI(i), qName, atts.getValue(i)); - } - - currentNode_ = elem; - } - catch(const DOM::DOMException& de) - { - reset(); - - if(errorHandler_) - { - SAXParseExceptionT pe(de.what()); - errorHandler_->fatalError(pe); - } // if ... - } // catch - } // startElement - - virtual void endElement(const stringT& /*namespaceURI*/, - const stringT& /*localName*/, - const stringT& /*qName*/) - { - if(currentNode_ == 0) - return; - - currentNode_ = currentNode_.getParentNode(); - } // endElement - - virtual void characters(const stringT& ch) - { - if(currentNode_ == 0) - return; - - if(!inCDATA_) - currentNode_.appendChild(document_.createTextNode(ch)); - else - currentNode_.appendChild(document_.createCDATASection(ch)); - } // characters - - virtual void processingInstruction(const stringT& target, const stringT& data) - { - if(currentNode_ == 0) - return; - - currentNode_.appendChild(document_.createProcessingInstruction(target, data)); - } // processingInstruction - - virtual void skippedEntity(const stringT& name) - { - if(currentNode_ == 0 || inDTD_ == true) - return; - - currentNode_.appendChild(document_.createEntityReference(name)); - } // skippedEntity - - //////////////////////////////////////////////////// - // ErrorHandler - virtual void warning(const SAXParseExceptionT& e) - { - if(errorHandler_) - errorHandler_->warning(e); - } // warning - - virtual void error(const SAXParseExceptionT& e) - { - if(errorHandler_) - errorHandler_->error(e); - reset(); - } // error - - virtual void fatalError(const SAXParseExceptionT& e) - { - if(errorHandler_) - errorHandler_->fatalError(e); - reset(); - } // fatalError - - ///////////////////////////////////////////////////// - // LexicalHandler - virtual void startDTD(const stringT& name, - const stringT& publicId, - const stringT& systemId) - { - documentType_ = new DocumentType(name, publicId, systemId); - document_.insertBefore(documentType_, 0); - inDTD_ = true; - } // startDTD - - virtual void 
endDTD() - { - documentType_->setReadOnly(true); - inDTD_ = false; - } // endDTD - - virtual void startEntity(const stringT& name) - { - if(currentNode_ == 0) - return; - - if(++inEntity_ == 1) - { - cachedCurrent_ = currentNode_; - currentNode_ = declaredEntities_[name]; - if(currentNode_ != 0 && currentNode_.hasChildNodes() == true) // already populated - currentNode_ = 0; - } - } // startEntity - - virtual void endEntity(const stringT& name) - { - if(--inEntity_ == 0) - currentNode_ = cachedCurrent_; - - currentNode_.appendChild(document_.createEntityReference(name)); - } // endEntity - - virtual void startCDATA() - { - inCDATA_ = true; - } // startCDATA - - virtual void endCDATA() - { - inCDATA_ = false; - } // endCDATA - - virtual void comment(const stringT& text) - { - if(currentNode_ == 0) - return; - - currentNode_.appendChild(document_.createComment(text)); - } // comment - - ////////////////////////////////////////////////////////////////////// - // DeclHandler - virtual void elementDecl(const stringT& name, const stringT& /*model*/) - { - if(!documentType_) - return; - documentType_->addElement(name); - } // elementDecl - - virtual void attributeDecl(const stringT& elementName, - const stringT& attributeName, - const stringT& type, - const stringT& /*valueDefault*/, - const stringT& value) - { - if(!documentType_) - return; - if(!string_adaptorT::empty(value)) - documentType_->addDefaultAttr(elementName, attributeName, value); - if(type == attributeTypes_.id) - documentType_->addElementId(attributeName); - } // attributeDecl - - virtual void internalEntityDecl(const stringT& name, const stringT& value) - { - if(!documentType_) - return; - - static const stringT LEFT_ANGLE_BRACKET = string_adaptorT::construct_from_utf8("<"); - - EntityT* entity = new EntityT(0, name, string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8("")); - declaredEntities_.insert(std::make_pair(name, entity)); - 
documentType_->addEntity(entity); - DOM::Node n = entity; - - if(string_adaptorT::find(value, LEFT_ANGLE_BRACKET) == string_adaptorT::npos()) - { - n.appendChild(document_.createTextNode(value)); - return; - } // if ... - - // parse the value into a Document - // this may not quite do the right thing for some custom strug types, - // but at the time I've writing this, the code has been missing this - // stuff for something like 8 years and nobody's noticed so it's not - // massively used. - // I only noticed myself when I started running the DOM conformance tests - std::stringstream ss; - ss << "" << string_adaptorT::asStdString(value) << ""; - - Arabica::SAX::InputSource is(ss); - Arabica::SAX2DOM::Parser parser; - parser.parse(is); - - DOM::Document entityDoc = parser.getDocument(); - DOM::Element entityElem = entityDoc.getDocumentElement(); - DOM::Node child = entityElem.getFirstChild(); - while(child != 0) - { - // import the contents thereof - DOM::Node imported = document_.importNode(child, true); - // append to entity - n.appendChild(imported); - - child = child.getNextSibling(); - } // while - } // internalEntityDecl - - virtual void externalEntityDecl(const stringT& name, const stringT& publicId, const stringT& systemId) - { - if(!documentType_) - return; - EntityT* entity = new EntityT(0, name, publicId, systemId, string_adaptorT::construct_from_utf8("")); - declaredEntities_.insert(std::make_pair(name, entity)); // we'll populate it later - documentType_->addEntity(entity); - } // externalEntityDecl - - ///////////////////////////////////////////////////////////////////////// - // DTDHandler - virtual void notationDecl(const stringT& name, const stringT& publicId, const stringT& systemId) - { - if(!documentType_) - return; - documentType_->addNotation(new NotationT(0, name, publicId, systemId)); - } // notationDecl - - virtual void unparsedEntityDecl(const stringT& name, const stringT& publicId, const stringT& systemId, const stringT& notationName) - { - 
if(!documentType_) - return; - documentType_->addEntity(new EntityT(0, name, publicId, systemId, notationName)); - } // unparsedEntityDecl -}; // class Parser - -} // namespace SAX2DOM -} // namespace Arabica - -#endif - +#ifndef JEZUK_SAX2DOM_PARSER_H +#define JEZUK_SAX2DOM_PARSER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Arabica +{ +namespace SAX2DOM +{ + +template +struct ParserTypes +{ + typedef typename Arabica::get_param, + T0, + T1>::type string_adaptor; + typedef typename Arabica::get_param, + T1, + T0>::type SAX_parser_type; +}; + +template +class Parser : protected Arabica::SAX::DefaultHandler::string_adaptor> +{ + typedef typename ParserTypes::string_adaptor string_adaptorT; + typedef typename ParserTypes::SAX_parser_type SAX_parser_type; + typedef Arabica::SAX::XMLReaderInterface XMLReaderInterfaceT; + typedef Arabica::SAX::TextCoalescer TextCoalescerT; + typedef Arabica::SAX::Attributes AttributesT; + typedef Arabica::SAX::EntityResolver EntityResolverT; + typedef Arabica::SAX::ErrorHandler ErrorHandlerT; + typedef Arabica::SAX::LexicalHandler LexicalHandlerT; + typedef Arabica::SAX::DeclHandler DeclHandlerT; + typedef Arabica::SAX::InputSource InputSourceT; + typedef Arabica::SimpleDOM::EntityImpl EntityT; + typedef Arabica::SimpleDOM::NotationImpl NotationT; + typedef Arabica::SimpleDOM::ElementImpl ElementT; + typedef typename ErrorHandlerT::SAXParseExceptionT SAXParseExceptionT; + + public: + Parser() : + documentType_(0), + entityResolver_(0), + errorHandler_(0) + { + Arabica::SAX::FeatureNames fNames; + features_.insert(std::make_pair(fNames.namespaces, true)); + features_.insert(std::make_pair(fNames.namespace_prefixes, true)); + features_.insert(std::make_pair(fNames.validation, false)); + } // Parser + + void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; } + EntityResolverT* getEntityResolver() const 
{ return entityResolver_; } + + void setErrorHandler(ErrorHandlerT& handler) { errorHandler_ = &handler; } + ErrorHandlerT* getErrorHandler() const { return errorHandler_; } + + void setFeature(const stringT& name, bool value) + { + typename Features::iterator f = features_.find(name); + if(f == features_.end()) + features_.insert(std::make_pair(name, value)); + else + f->second = value; + } // setFeature + + bool getFeature(const stringT& name) const + { + typename Features::const_iterator f = features_.find(name); + if(f == features_.end()) + throw Arabica::SAX::SAXNotRecognizedException(std::string("Feature not recognized ") + string_adaptorT::asStdString(name)); + return f->second; + } // getFeature + + bool parse(const stringT& systemId) + { + InputSourceT is(systemId); + return parse(is); + } // loadDOM + + bool parse(InputSourceT& source) + { + Arabica::SAX::PropertyNames pNames; + + DOM::DOMImplementation di = Arabica::SimpleDOM::DOMImplementation::getDOMImplementation(); + document_ = di.createDocument(string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8(""), 0); + currentNode_ = document_; + inCDATA_ = false; + inDTD_ = false; + inEntity_ = 0; + + SAX_parser_type base_parser; + TextCoalescerT parser(base_parser); + parser.setContentHandler(*this); + parser.setErrorHandler(*this); + if(entityResolver_) + parser.setEntityResolver(*entityResolver_); + + parser.setLexicalHandler(*this); + parser.setDeclHandler(*this); + + setParserFeatures(parser); + + try + { + parser.parse(source); + } + catch(const DOM::DOMException& de) + { + document_ = 0; + + if(errorHandler_) + { + SAXParseExceptionT pe(de.what()); + errorHandler_->fatalError(pe); + } // if ... 
+ } // catch + + return (document_ != 0); + } // loadDOM + + DOM::Document getDocument() const + { + return document_; + } // getDocument + + void reset() + { + currentNode_ = 0; + document_ = 0; + } // reset + + protected: + DOM::Node& currentNode() { return currentNode_; } + + private: + // no implementations + Parser(const Parser&); + bool operator==(const Parser&) const; + Parser& operator=(const Parser&); + + // instance variables + DOM::Document document_; + DocumentType* documentType_; + DOM::Node currentNode_; + DOM::Node cachedCurrent_; + + typedef std::map Features; + Features features_; + + bool inCDATA_; + bool inDTD_; + int inEntity_; + + std::map declaredEntities_; + + EntityResolverT* entityResolver_; + ErrorHandlerT* errorHandler_; + Arabica::SAX::AttributeTypes attributeTypes_; + + protected: + void setParserFeatures(XMLReaderInterfaceT& parser) const + { + for(typename Features::const_iterator f = features_.begin(), e = features_.end(); f != e; ++f) + try { + parser.setFeature(f->first, f->second); + } + catch(const Arabica::SAX::SAXException&) { } + } // setParserFeatures + + /////////////////////////////////////////////////////////// + // ContentHandler + virtual void endDocument() + { + currentNode_ = 0; + } // endDocument + + virtual void startElement(const stringT& namespaceURI, + const stringT& /*localName*/, + const stringT& qName, + const AttributesT& atts) + { + if(currentNode_ == 0) + return; + + try + { + DOM::Element elem = document_.createElementNS(namespaceURI, qName); + currentNode_.appendChild(elem); + + // attributes here + for(int i = 0; i < atts.getLength(); ++i) + { + stringT qName = atts.getQName(i); + if(string_adaptorT::empty(qName)) + qName = atts.getLocalName(i); + elem.setAttributeNS(atts.getURI(i), qName, atts.getValue(i)); + } + + currentNode_ = elem; + } + catch(const DOM::DOMException& de) + { + reset(); + + if(errorHandler_) + { + SAXParseExceptionT pe(de.what()); + errorHandler_->fatalError(pe); + } // if ... 
+ } // catch + } // startElement + + virtual void endElement(const stringT& /*namespaceURI*/, + const stringT& /*localName*/, + const stringT& /*qName*/) + { + if(currentNode_ == 0) + return; + + currentNode_ = currentNode_.getParentNode(); + } // endElement + + virtual void characters(const stringT& ch) + { + if(currentNode_ == 0) + return; + + if(!inCDATA_) + currentNode_.appendChild(document_.createTextNode(ch)); + else + currentNode_.appendChild(document_.createCDATASection(ch)); + } // characters + + virtual void processingInstruction(const stringT& target, const stringT& data) + { + if(currentNode_ == 0) + return; + + currentNode_.appendChild(document_.createProcessingInstruction(target, data)); + } // processingInstruction + + virtual void skippedEntity(const stringT& name) + { + if(currentNode_ == 0 || inDTD_ == true) + return; + + currentNode_.appendChild(document_.createEntityReference(name)); + } // skippedEntity + + //////////////////////////////////////////////////// + // ErrorHandler + virtual void warning(const SAXParseExceptionT& e) + { + if(errorHandler_) + errorHandler_->warning(e); + } // warning + + virtual void error(const SAXParseExceptionT& e) + { + if(errorHandler_) + errorHandler_->error(e); + reset(); + } // error + + virtual void fatalError(const SAXParseExceptionT& e) + { + if(errorHandler_) + errorHandler_->fatalError(e); + reset(); + } // fatalError + + ///////////////////////////////////////////////////// + // LexicalHandler + virtual void startDTD(const stringT& name, + const stringT& publicId, + const stringT& systemId) + { + documentType_ = new DocumentType(name, publicId, systemId); + document_.insertBefore(documentType_, 0); + inDTD_ = true; + } // startDTD + + virtual void endDTD() + { + documentType_->setReadOnly(true); + inDTD_ = false; + } // endDTD + + virtual void startEntity(const stringT& name) + { + if(currentNode_ == 0) + return; + + if(++inEntity_ == 1) + { + cachedCurrent_ = currentNode_; + currentNode_ = 
declaredEntities_[name]; + if(currentNode_ != 0 && currentNode_.hasChildNodes() == true) // already populated + currentNode_ = 0; + } + } // startEntity + + virtual void endEntity(const stringT& name) + { + if(--inEntity_ == 0) + currentNode_ = cachedCurrent_; + + if(dtd_pseudo_entity == name) + return; + + currentNode_.appendChild(document_.createEntityReference(name)); + } // endEntity + + virtual void startCDATA() + { + inCDATA_ = true; + } // startCDATA + + virtual void endCDATA() + { + inCDATA_ = false; + } // endCDATA + + virtual void comment(const stringT& text) + { + if(currentNode_ == 0) + return; + + currentNode_.appendChild(document_.createComment(text)); + } // comment + + ////////////////////////////////////////////////////////////////////// + // DeclHandler + virtual void elementDecl(const stringT& name, const stringT& /*model*/) + { + if(!documentType_) + return; + documentType_->addElement(name); + } // elementDecl + + virtual void attributeDecl(const stringT& elementName, + const stringT& attributeName, + const stringT& type, + const stringT& /*valueDefault*/, + const stringT& value) + { + if(!documentType_) + return; + if(!string_adaptorT::empty(value)) + documentType_->addDefaultAttr(elementName, attributeName, value); + if(type == attributeTypes_.id) + documentType_->addElementId(attributeName); + } // attributeDecl + + virtual void internalEntityDecl(const stringT& name, const stringT& value) + { + if(!documentType_) + return; + + static const stringT LEFT_ANGLE_BRACKET = string_adaptorT::construct_from_utf8("<"); + + EntityT* entity = new EntityT(0, name, string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8(""), string_adaptorT::construct_from_utf8("")); + declaredEntities_.insert(std::make_pair(name, entity)); + documentType_->addEntity(entity); + DOM::Node n = entity; + + if(string_adaptorT::find(value, LEFT_ANGLE_BRACKET) == string_adaptorT::npos()) + { + n.appendChild(document_.createTextNode(value)); + return; + 
} // if ... + + // parse the value into a Document + // this may not quite do the right thing for some custom string types, + // but at the time I'm writing this, the code has been missing this + // stuff for something like 8 years and nobody's noticed so it's not + // massively used. + // I only noticed myself when I started running the DOM conformance tests + std::stringstream ss; + ss << "" << string_adaptorT::asStdString(value) << ""; + + Arabica::SAX::InputSource is(ss); + Arabica::SAX2DOM::Parser parser; + parser.parse(is); + + DOM::Document entityDoc = parser.getDocument(); + DOM::Element entityElem = entityDoc.getDocumentElement(); + DOM::Node child = entityElem.getFirstChild(); + while(child != 0) + { + // import the contents thereof + DOM::Node imported = document_.importNode(child, true); + // append to entity + n.appendChild(imported); + + child = child.getNextSibling(); + } // while + } // internalEntityDecl + + virtual void externalEntityDecl(const stringT& name, const stringT& publicId, const stringT& systemId) + { + if(!documentType_) + return; + EntityT* entity = new EntityT(0, name, publicId, systemId, string_adaptorT::construct_from_utf8("")); + declaredEntities_.insert(std::make_pair(name, entity)); // we'll populate it later + documentType_->addEntity(entity); + } // externalEntityDecl + + ///////////////////////////////////////////////////////////////////////// + // DTDHandler + virtual void notationDecl(const stringT& name, const stringT& publicId, const stringT& systemId) + { + if(!documentType_) + return; + documentType_->addNotation(new NotationT(0, name, publicId, systemId)); + } // notationDecl + + virtual void unparsedEntityDecl(const stringT& name, const stringT& publicId, const stringT& systemId, const stringT& notationName) + { + if(!documentType_) + return; + documentType_->addEntity(new EntityT(0, name, publicId, systemId, notationName)); + } // unparsedEntityDecl +}; // class Parser + +} // namespace SAX2DOM +} // namespace
Arabica + +#endif + diff --git a/include/SAX/ext/LexicalHandler.hpp b/include/SAX/ext/LexicalHandler.hpp index 8610bd7f..3e57576a 100644 --- a/include/SAX/ext/LexicalHandler.hpp +++ b/include/SAX/ext/LexicalHandler.hpp @@ -1,194 +1,202 @@ -#ifndef ARABICA_LEXICAL_HANDLER_H -#define ARABICA_LEXICAL_HANDLER_H - -// LexicalHandler.h -// $Id$ - -#include -#include -#include - -namespace Arabica -{ -namespace SAX -{ - -/** - * SAX2 extension handler for lexical events. - * - *

This is an optional extension handler for SAX2 to provide - * lexical information about an XML document, such as comments - * and CDATA section boundaries; XML readers are not required to - * support this handler, and it is not part of the core SAX2 - * distribution.

- * - *

The events in the lexical handler apply to the entire document, - * not just to the document element, and all lexical handler events - * must appear between the content handler's startDocument and - * endDocument events.

- * - *

To set the LexicalHandler for an XML reader, use the - * {@link XMLReader#setProperty setProperty} method - * with the propertyId "http://xml.org/sax/properties/lexical-handler". - * If the reader does not support lexical events, it will throw a - * {@link SAXNotRecognizedException SAXNotRecognizedException} - * or a - * {@link SAXNotSupportedException SAXNotSupportedException} - * when you attempt to register the handler.

- * - * @since 2.0 - * @author Jez Higgins, - * jez@jezuk.co.uk - * @version 1.0 - * @see XMLReader#setProperty - * @see SAXNotRecognizedException - * @see SAXNotSupportedException - */ -template > -class LexicalHandler -{ -public: - virtual ~LexicalHandler() { } - - /** - * Report the start of DTD declarations, if any. - * - *

This method is intended to report the beginning of the - * DOCTYPE declaration; if the document has no DOCTYPE declaration, - * this method will not be invoked.

- * - *

All declarations reported through - * {@link DTDHandler DTDHandler} or - * {@link DeclHandler DeclHandler} events must appear - * between the startDTD and {@link #endDTD endDTD} events. - * Declarations are assumed to belong to the internal DTD subset - * unless they appear between {@link #startEntity startEntity} - * and {@link #endEntity endEntity} events. Comments and - * processing instructions from the DTD should also be reported - * between the startDTD and endDTD events, in their original - * order of (logical) occurrence; they are not required to - * appear in their correct locations relative to DTDHandler - * or DeclHandler events, however.

- * - *

Note that the start/endDTD events will appear within - * the start/endDocument events from ContentHandler and - * before the first - * {@link ContentHandler#startElement startElement} - * event.

- * - * @param name The document type name. - * @param publicId The declared public identifier for the - * external DTD subset, or an empty string if none was declared. - * @param systemId The declared system identifier for the - * external DTD subset, or an empty string if none was declared. - * @see #endDTD - * @see #startEntity - */ - virtual void startDTD(const string_type& name, - const string_type& publicId, - const string_type& systemId) = 0; - /** - * Report the end of DTD declarations. - * - *

This method is intended to report the end of the - * DOCTYPE declaration; if the document has no DOCTYPE declaration, - * this method will not be invoked.

- * - * @see #startDTD - */ - virtual void endDTD() = 0; - - /** - * Report the beginning of some internal and external XML entities. - * - *

The reporting of parameter entities (including - * the external DTD subset) is optional, and SAX2 drivers that - * support LexicalHandler may not support it; you can use the - * http://xml.org/sax/features/lexical-handler/parameter-entities - * feature to query or control the reporting of parameter entities.

- * - *

General entities are reported with their regular names, - * parameter entities have '%' prepended to their names, and - * the external DTD subset has the pseudo-entity name "[dtd]".

- * - *

When a SAX2 driver is providing these events, all other - * events must be properly nested within start/end entity - * events. There is no additional requirement that events from - * {@link DeclHandler DeclHandler} or - * {@link DTDHandler DTDHandler} be properly ordered.

- * - *

Note that skipped entities will be reported through the - * {@link ContentHandler#skippedEntity skippedEntity} - * event, which is part of the ContentHandler interface.

- * - *

Because of the streaming event model that SAX uses, some - * entity boundaries cannot be reported under any - * circumstances:

- * - *
    - *
  • general entities within attribute values
  • - *
  • parameter entities within declarations
  • - *
- * - *

These will be silently expanded, with no indication of where - * the original entity boundaries were.

- * - *

Note also that the boundaries of character references (which - * are not really entities anyway) are not reported.

- * - *

All start/endEntity events must be properly nested. - * - * @param name The name of the entity. If it is a parameter - * entity, the name will begin with '%', and if it is the - * external DTD subset, it will be "[dtd]". - * @see #endEntity - * @see DeclHandler#internalEntityDecl - * @see DeclHandler#externalEntityDecl - */ - virtual void startEntity(const string_type& name) = 0; - /** - * Report the end of an entity. - * - * @param name The name of the entity that is ending. - * @see #startEntity - */ - virtual void endEntity(const string_type& name) = 0; - - /** - * Report the start of a CDATA section. - * - *

The contents of the CDATA section will be reported through - * the regular {@link ContentHandler#characters - * characters} event; this event is intended only to report - * the boundary.

- * - * @see #endCDATA - */ - virtual void startCDATA() = 0; - /** - * Report the end of a CDATA section. - * - * @see #startCDATA - */ - virtual void endCDATA() = 0; - - /** - * Report an XML comment anywhere in the document. - * - *

This callback will be used for comments inside or outside the - * document element, including comments in the external DTD - * subset (if read). Comments in the DTD must be properly - * nested inside start/endDTD and start/endEntity events (if - * used).

- * - * @param text A string holding the comment. - */ - virtual void comment(const string_type& text) = 0; -}; // class LexicalHandler - -} // namespace SAX -} // namespace Arabica - -#endif -// end of file +#ifndef ARABICA_LEXICAL_HANDLER_H +#define ARABICA_LEXICAL_HANDLER_H + +// LexicalHandler.h +// $Id$ + +#include +#include +#include + +namespace Arabica +{ +namespace SAX +{ + +/** + * SAX2 extension handler for lexical events. + * + *

This is an optional extension handler for SAX2 to provide + * lexical information about an XML document, such as comments + * and CDATA section boundaries; XML readers are not required to + * support this handler, and it is not part of the core SAX2 + * distribution.

+ * + *

The events in the lexical handler apply to the entire document, + * not just to the document element, and all lexical handler events + * must appear between the content handler's startDocument and + * endDocument events.

+ * + *

To set the LexicalHandler for an XML reader, use the + * {@link XMLReader#setProperty setProperty} method + * with the propertyId "http://xml.org/sax/properties/lexical-handler". + * If the reader does not support lexical events, it will throw a + * {@link SAXNotRecognizedException SAXNotRecognizedException} + * or a + * {@link SAXNotSupportedException SAXNotSupportedException} + * when you attempt to register the handler.

+ * + * @since 2.0 + * @author Jez Higgins, + * jez@jezuk.co.uk + * @version 1.0 + * @see XMLReader#setProperty + * @see SAXNotRecognizedException + * @see SAXNotSupportedException + */ +template > +class LexicalHandler +{ +public: + virtual ~LexicalHandler() { } + + /** + * Report the start of DTD declarations, if any. + * + *

This method is intended to report the beginning of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + *

All declarations reported through + * {@link DTDHandler DTDHandler} or + * {@link DeclHandler DeclHandler} events must appear + * between the startDTD and {@link #endDTD endDTD} events. + * Declarations are assumed to belong to the internal DTD subset + * unless they appear between {@link #startEntity startEntity} + * and {@link #endEntity endEntity} events. Comments and + * processing instructions from the DTD should also be reported + * between the startDTD and endDTD events, in their original + * order of (logical) occurrence; they are not required to + * appear in their correct locations relative to DTDHandler + * or DeclHandler events, however.

+ * + *

Note that the start/endDTD events will appear within + * the start/endDocument events from ContentHandler and + * before the first + * {@link ContentHandler#startElement startElement} + * event.

+ * + * @param name The document type name. + * @param publicId The declared public identifier for the + * external DTD subset, or an empty string if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or an empty string if none was declared. + * @see #endDTD + * @see #startEntity + */ + virtual void startDTD(const string_type& name, + const string_type& publicId, + const string_type& systemId) = 0; + /** + * Report the end of DTD declarations. + * + *

This method is intended to report the end of the + * DOCTYPE declaration; if the document has no DOCTYPE declaration, + * this method will not be invoked.

+ * + * @see #startDTD + */ + virtual void endDTD() = 0; + + /** + * Report the beginning of some internal and external XML entities. + * + *

The reporting of parameter entities (including + * the external DTD subset) is optional, and SAX2 drivers that + * support LexicalHandler may not support it; you can use the + * http://xml.org/sax/features/lexical-handler/parameter-entities + * feature to query or control the reporting of parameter entities.

+ * + *

General entities are reported with their regular names, + * parameter entities have '%' prepended to their names, and + * the external DTD subset has the pseudo-entity name "[dtd]".

+ * + *

When a SAX2 driver is providing these events, all other + * events must be properly nested within start/end entity + * events. There is no additional requirement that events from + * {@link DeclHandler DeclHandler} or + * {@link DTDHandler DTDHandler} be properly ordered.

+ * + *

Note that skipped entities will be reported through the + * {@link ContentHandler#skippedEntity skippedEntity} + * event, which is part of the ContentHandler interface.

+ * + *

Because of the streaming event model that SAX uses, some + * entity boundaries cannot be reported under any + * circumstances:

+ * + *
    + *
  • general entities within attribute values
  • + *
  • parameter entities within declarations
  • + *
+ * + *

These will be silently expanded, with no indication of where + * the original entity boundaries were.

+ * + *

Note also that the boundaries of character references (which + * are not really entities anyway) are not reported.

+ * + *

All start/endEntity events must be properly nested. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%', and if it is the + * external DTD subset, it will be "[dtd]". + * @see #endEntity + * @see DeclHandler#internalEntityDecl + * @see DeclHandler#externalEntityDecl + */ + virtual void startEntity(const string_type& name) = 0; + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + virtual void endEntity(const string_type& name) = 0; + + /** + * Report the start of a CDATA section. + * + *

The contents of the CDATA section will be reported through + * the regular {@link ContentHandler#characters + * characters} event; this event is intended only to report + * the boundary.

+ * + * @see #endCDATA + */ + virtual void startCDATA() = 0; + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + virtual void endCDATA() = 0; + + /** + * Report an XML comment anywhere in the document. + * + *

This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). Comments in the DTD must be properly + * nested inside start/endDTD and start/endEntity events (if + * used).

+ * + * @param text A string holding the comment. + */ + virtual void comment(const string_type& text) = 0; + + /** The pseudo-entity name "[dtd]" in the handler's own string type; per the startEntity documentation this is the name used for the external DTD subset. Public and const: it is set once by the constructor so implementations can compare entity names against it. */ const string_type dtd_pseudo_entity; + + protected: + /** Protected constructor, reachable only from derived handlers; builds dtd_pseudo_entity from the UTF-8 literal "[dtd]" via string_adaptor::construct_from_utf8. */ LexicalHandler() : + dtd_pseudo_entity(string_adaptor::construct_from_utf8("[dtd]")) + { + } +}; // class LexicalHandler + +} // namespace SAX +} // namespace Arabica + +#endif +// end of file