#ifndef ARABICA_GARDEN_H #define ARABICA_GARDEN_H #include #include #if BOOST_VERSION >= 103800 #define BOOST_SPIRIT_USE_OLD_NAMESPACE 1 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include namespace Arabica { namespace SAX { template class Garden : public SAX::XMLReaderInterface::type> { public: typedef SAX::XMLReaderInterface::type> XMLReaderT; typedef typename XMLReaderT::string_adaptor string_adaptor; typedef EntityResolver EntityResolverT; typedef DTDHandler DTDHandlerT; typedef ContentHandler ContentHandlerT; typedef InputSource InputSourceT; typedef AttributesImpl AttributesImplT; typedef ErrorHandler ErrorHandlerT; typedef DeclHandler declHandlerT; typedef LexicalHandler lexicalHandlerT; typedef typename XMLReaderT::PropertyBase PropertyBaseT; Garden(); virtual bool getFeature(const string_type& name) const; virtual void setFeature(const string_type& name, bool value); virtual void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; } virtual EntityResolverT* getEntityResolver() const { return entityResolver_; } virtual void setDTDHandler(DTDHandlerT& handler) { dtdHandler_ = &handler; } virtual DTDHandlerT* getDTDHandler() const { return dtdHandler_; } virtual void setContentHandler(ContentHandlerT& handler) { contentHandler_ = &handler; } virtual ContentHandlerT* getContentHandler() const { return contentHandler_; } virtual void setErrorHandler(ErrorHandlerT& handler) { errorHandler_ = &handler; } virtual ErrorHandlerT* getErrorHandler() const { return errorHandler_; } virtual void setDeclHandler(declHandlerT& handler) { throw SAXNotSupportedException("decl-handler"); } virtual declHandlerT* getDeclHandler() const { throw SAXNotSupportedException("decl-handler"); } virtual void setLexicalHandler(lexicalHandlerT& handler) { throw SAXNotSupportedException("lexical-handler"); } virtual lexicalHandlerT* getLexicalHandler() const { throw SAXNotSupportedException("lexical-handler"); } virtual void parse(InputSourceT& input); virtual std::auto_ptr doGetProperty(const string_type& name); virtual void doSetProperty(const string_type& name, std::auto_ptr value); private: void reportError(const std::string& message, bool fatal = false); typedef typename string_adaptor::value_type char_t; typedef std::vector vector_t; typedef typename vector_t::iterator iterator_t; typedef boost::spirit::scanner scanner_t; typedef boost::spirit::rule rule_t; void openElement(iterator_t s, iterator_t e); void closeElement(iterator_t s, iterator_t e); void closeEmptyElement(iterator_t s, iterator_t e); void endElementName(iterator_t s, iterator_t e); void endElement(iterator_t s, iterator_t e); void attributeName(iterator_t s, iterator_t e); void attributeValue(iterator_t s, iterator_t e); void elementContent(iterator_t s, iterator_t e); void piTarget(iterator_t s, iterator_t e); void piData(iterator_t s, iterator_t e); void piEnd(iterator_t s, iterator_t e); void entityRef(iterator_t s, iterator_t e); void decimalCharacterRef(iterator_t s, iterator_t e); void hexCharacterRef(iterator_t s, iterator_t e); void characterRef(iterator_t s, iterator_t e, int base); // Start grammar definition rule_t prolog, element, Misc, Reference, CDSect, CDStart, CData, CDEnd, PI, PITarget, PIData, doctypedecl, XMLDecl, SDDecl, VersionInfo, EncodingDecl, VersionNum, Eq, EmptyElemTag, STag, content, ETag, Attribute, AttValue, CharData, Comment, CharRef, EntityRef, EncName, document_, Name, Comment1, Spaces; string_type str(iterator_t s, iterator_t e, int trim = 0) { return string_adaptor::construct(s, e-trim); } // str ////////////////////////////// // member variables EntityResolverT* entityResolver_; DTDHandlerT* dtdHandler_; ContentHandlerT* contentHandler_; ErrorHandlerT* errorHandler_; std::stack elements_; AttributesImplT attrs_; typedef typename AttributesImplT::Attr Attr; Attr currentAttr_; string_type piTarget_; string_type piData_; string_type entityRef_; std::map declaredEntities_; std::map conversion_; }; // parser template Garden::Garden() : entityResolver_(0), dtdHandler_(0), contentHandler_(0), errorHandler_(0) { // define the parsing rules typedef boost::spirit::chset chset_t; typedef boost::spirit::chlit chlit_t; // characters chset_t Char("\x9\xA\xD\x20-\xFF"); chset_t SpaceChar("\x20\x9\xD\xA"); Spaces = +(SpaceChar); chset_t Letter("\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF"); chset_t Digit("0-9"); chlit_t Extender('\xB7'); chset_t NameChar = Letter | Digit | chset_t("._:-") | Extender; Name = (Letter | '_' | ':') >> *(NameChar); document_ = prolog >> element >> *Misc; chset_t CharDataChar (boost::spirit::anychar_p - (chset_t('<') | chset_t('&'))); CharData = (*(CharDataChar - boost::spirit::str_p("]]>")))[boost::bind(&Garden::elementContent, this, _1, _2)]; // Section 2.5 - Comments Comment = boost::spirit::str_p(""); Comment1 = *((Char - boost::spirit::ch_p('-')) | (boost::spirit::ch_p('-') >> (Char - boost::spirit::ch_p('-')))); // Section 2.6 - Processing Instructions PI = boost::spirit::str_p("> (PITarget)[boost::bind(&Garden::piTarget, this, _1, _2)] >> !Spaces >> (PIData)[boost::bind(&Garden::piData, this, _1, _2)] >> (boost::spirit::str_p("?>"))[boost::bind(&Garden::piEnd, this, _1, _2)]; PITarget = Name - boost::spirit::as_lower_d[boost::spirit::str_p("xml")]; PIData = !(!Spaces >> (*(Char - boost::spirit::str_p("?>")))); // Section 2.7 - CDATA CDSect = CDStart >> (CData)[boost::bind(&Garden::elementContent, this, _1, _2)] >> CDEnd; CDStart = boost::spirit::str_p("")); CDEnd = boost::spirit::str_p("]]>"); // bits before the root elemenet prolog = !XMLDecl >> *Misc >> !(doctypedecl >> *Misc); XMLDecl = boost::spirit::str_p("> VersionInfo >> !EncodingDecl >> !SDDecl >> !Spaces >> boost::spirit::str_p("?>"); VersionInfo = Spaces >> boost::spirit::str_p("version") >> Eq >> (boost::spirit::ch_p('\'') >> VersionNum >>'\'' | boost::spirit::ch_p('"') >> VersionNum >> '"'); chset_t VersionNumCh("A-Za-z0-9_.:-"); VersionNum = +(VersionNumCh); doctypedecl = boost::spirit::str_p("> *(Char - (chset_t('[') | '>')) >> !('[' >> *(Char - ']') >> ']') >> '>'; SDDecl = Spaces >> boost::spirit::str_p("standalone") >> Eq >> ((boost::spirit::ch_p('\'') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '\'') | (boost::spirit::ch_p('"') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '"')); // odd bits Eq = !Spaces >> '=' >> !Spaces; Misc = Comment | Spaces | PI; // Elements element = STag >> (EmptyElemTag | (boost::spirit::str_p(">"))[boost::bind(&Garden::closeElement, this, _1, _2)] >> content >> ETag); STag = '<' >> (Name)[boost::bind(&Garden::openElement, this, _1, _2)] >> *(Spaces >> Attribute) >> !Spaces; Attribute = (Name)[boost::bind(&Garden::attributeName, this, _1, _2)] >> Eq >> AttValue; EmptyElemTag = (boost::spirit::str_p("/>"))[boost::bind(&Garden::closeEmptyElement, this, _1, _2)]; ETag = (boost::spirit::str_p("> (Name)[boost::bind(&Garden::endElementName, this, _1, _2)] >> !Spaces >> '>')[boost::bind(&Garden::endElement, this, _1, _2)]; AttValue = '"' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '"')) | Reference))[boost::bind(&Garden::attributeValue, this, _1, _2)] >> '"' | '\'' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '\'')) | Reference))[boost::bind(&Garden::attributeValue, this, _1, _2)] >> '\''; content = !CharData >> *((element | Reference | CDSect | Comment | PI) >> !CharData); // Section 4.1 - Character and entity references CharRef = boost::spirit::str_p("&#") >> (+boost::spirit::digit_p >> ';')[boost::bind(&Garden::decimalCharacterRef, this, _1, _2)] | boost::spirit::str_p("&#x") >> (+boost::spirit::xdigit_p >> ';')[boost::bind(&Garden::hexCharacterRef, this, _1, _2)]; Reference = EntityRef | CharRef; EntityRef = '&' >> (Name >> boost::spirit::ch_p(';'))[boost::bind(&Garden::entityRef, this, _1, _2)]; EncodingDecl = Spaces >> boost::spirit::str_p("encoding") >> Eq >> (boost::spirit::ch_p('"') >> EncName >> '"' | boost::spirit::ch_p('\'') >> EncName >> '\''); chset_t EncNameCh = VersionNumCh - chset_t(':'); EncName = boost::spirit::alpha_p >> *(EncNameCh); ///////////////// declaredEntities_.insert(std::make_pair(string_adaptor::construct("lt"), string_adaptor::construct("<"))); declaredEntities_.insert(std::make_pair(string_adaptor::construct("gt"), string_adaptor::construct(">"))); declaredEntities_.insert(std::make_pair(string_adaptor::construct("amp"), string_adaptor::construct("&"))); declaredEntities_.insert(std::make_pair(string_adaptor::construct("apos"), string_adaptor::construct("'"))); declaredEntities_.insert(std::make_pair(string_adaptor::construct("quot"), string_adaptor::construct("\""))); conversion_.insert(std::make_pair('0', 0)); conversion_.insert(std::make_pair('1', 1)); conversion_.insert(std::make_pair('2', 2)); conversion_.insert(std::make_pair('3', 3)); conversion_.insert(std::make_pair('4', 4)); conversion_.insert(std::make_pair('5', 5)); conversion_.insert(std::make_pair('6', 6)); conversion_.insert(std::make_pair('7', 7)); conversion_.insert(std::make_pair('8', 8)); conversion_.insert(std::make_pair('9', 9)); conversion_.insert(std::make_pair('a', 10)); conversion_.insert(std::make_pair('b', 11)); conversion_.insert(std::make_pair('c', 12)); conversion_.insert(std::make_pair('d', 13)); conversion_.insert(std::make_pair('e', 14)); conversion_.insert(std::make_pair('f', 15)); conversion_.insert(std::make_pair('A', 10)); conversion_.insert(std::make_pair('B', 11)); conversion_.insert(std::make_pair('C', 12)); conversion_.insert(std::make_pair('D', 13)); conversion_.insert(std::make_pair('E', 14)); conversion_.insert(std::make_pair('F', 15)); } // XMLparser ////////////////////////////////////// // features template bool Garden::getFeature(const string_type& name) const { throw SAXNotRecognizedException(string_adaptor::asStdString(name)); } // getFeature template void Garden::setFeature(const string_type& name, bool value) { throw SAXNotRecognizedException(string_adaptor::asStdString(name)); } // setFeature /////////////////////////////////////// // properties template std::auto_ptr::PropertyBaseT> Garden::doGetProperty(const string_type& name) { throw SAXNotRecognizedException(string_adaptor::asStdString(name)); } // doGetProperty template void Garden::doSetProperty(const string_type& name, std::auto_ptr value) { throw SAXNotRecognizedException(string_adaptor::asStdString(name)); } // doSetProperty ////////////////////////////////////////// // parse template void Garden::parse(InputSourceT& input) { InputSourceResolver is(input, string_adaptor()); if(is.resolve() == 0) { reportError("Could not resolve XML document", true); return; } // if(is.resolver() == 0) // Turn of white space skipping on the stream is.resolve()->unsetf(std::ios::skipws); vector_t data(std::istream_iterator(*is.resolve()), std::istream_iterator()); iterator_t first = data.begin(); iterator_t last = data.end(); scanner_t scanner(first, last); typedef typename boost::spirit::parser_result::type result_t; if(contentHandler_) contentHandler_->startDocument(); result_t r = document_.parse(scanner); if(contentHandler_) contentHandler_->endDocument(); if(!(r && first == last)) { std::cout << string_adaptor::asStdString(input.getSystemId()) << " Fails Parsing\n" << std::endl; for (int i = 0; i < 50; ++i) { std::cout << *first++; } std::cout << std::endl; } // if ... } // parse template void Garden::openElement(iterator_t s, iterator_t e) { elements_.push(str(s, e)); attrs_.clear(); } // openElement template void Garden::closeElement(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->startElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string(), attrs_); } // closeElement template void Garden::closeEmptyElement(iterator_t s, iterator_t e) { if(contentHandler_) { contentHandler_->startElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string(), attrs_); contentHandler_->endElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string()); elements_.pop(); } // if ... } // closeEmptyElement template void Garden::endElementName(iterator_t s, iterator_t e) { string_type name = str(s, e); if(name != elements_.top()) reportError("Expect end element " + string_adaptor::asStdString(elements_.top()), true); } // endElementName template void Garden::endElement(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->endElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string()); elements_.pop(); } // endElement template void Garden::attributeName(iterator_t s, iterator_t e) { currentAttr_ = typename AttributesImplT::Attr(); currentAttr_.localName_ = str(s, e); } // attributeName template void Garden::attributeValue(iterator_t s, iterator_t e) { currentAttr_.value_ = str(s, e); currentAttr_.type_ = AttributeType::CDATA; attrs_.addAttribute(currentAttr_); } // attributeValue template void Garden::elementContent(iterator_t s, iterator_t e) { if(contentHandler_ && (s != e)) contentHandler_->characters(str(s, e)); } // Garden::elementContent // processing instructions template void Garden::piTarget(iterator_t s, iterator_t e) { piTarget_ = str(s, e); piData_ = string_adaptor::empty_string(); } // piTarget template void Garden::piData(iterator_t s, iterator_t e) { piData_ = str(s, e); } // piData template void Garden::piEnd(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->processingInstruction(piTarget_, piData_); } // piEnd //entity refs template void Garden::entityRef(iterator_t s, iterator_t e) { if(contentHandler_) { string_type name(str(s, e, 1)); typedef typename std::map::iterator entity_iterator; entity_iterator ent = declaredEntities_.find(name); if(ent != declaredEntities_.end()) { contentHandler_->characters((*ent).second); return; } else { reportError("Undeclared entity " + string_adaptor::asStdString(name)); return; } // if ... contentHandler_->skippedEntity(name); } // if ... } // entityRef template void Garden::decimalCharacterRef(iterator_t s, iterator_t e) { characterRef(s, e, 10); } // decimalCharacterRef template void Garden::hexCharacterRef(iterator_t s, iterator_t e) { characterRef(s, e, 16); } // hexCharacterRef template void Garden::characterRef(iterator_t s, iterator_t e, int base) { if(!contentHandler_) return; int val = 0; char next = *s; while(++s != e) { val *= base; val += conversion_[next]; next = *s; } contentHandler_->characters(string_adaptor::construct("?"));//string_type(1, val)); } // characterRef /////////////////////////////// template void Garden::reportError(const std::string& message, bool fatal) { if(!errorHandler_) return; SAX::SAXParseException e(message); if(fatal) errorHandler_->fatalError(e); else errorHandler_->error(e); } // reportError } // namespace SAX } // namespace Arabica #endif