#ifndef ARABICA_GARDEN_H #define ARABICA_GARDEN_H #include #include #include #include #include #include #include #include #include #include #include #include #include namespace SAX { template class Garden : public basic_XMLReader { public: typedef string_type stringT; typedef basic_EntityResolver EntityResolverT; typedef basic_DTDHandler DTDHandlerT; typedef basic_ContentHandler ContentHandlerT; typedef basic_InputSource InputSourceT; typedef basic_AttributesImpl AttributesImplT; typedef typename basic_XMLReader::PropertyBase PropertyBase; Garden(); virtual bool getFeature(const stringT& name) const; virtual void setFeature(const stringT& name, bool value); virtual void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; } virtual EntityResolverT* getEntityResolver() const { return entityResolver_; } virtual void setDTDHandler(DTDHandlerT& handler) { dtdHandler_ = &handler; } virtual DTDHandlerT* getDTDHandler() const { return dtdHandler_; } virtual void setContentHandler(ContentHandlerT& handler) { contentHandler_ = &handler; } virtual ContentHandlerT* getContentHandler() const { return contentHandler_; } virtual void setErrorHandler(SAX::ErrorHandler& handler) { errorHandler_ = &handler; } virtual SAX::ErrorHandler* getErrorHandler() const { return errorHandler_; } virtual void parse(InputSourceT& input); virtual std::auto_ptr doGetProperty(const stringT& name); virtual void doSetProperty(const stringT& name, std::auto_ptr value); private: void reportError(const std::string& message, bool fatal = false); typedef typename stringT::value_type char_t; typedef std::vector vector_t; typedef typename vector_t::iterator iterator_t; typedef boost::spirit::scanner scanner_t; typedef boost::spirit::rule rule_t; void openElement(iterator_t s, iterator_t e); void closeElement(iterator_t s, iterator_t e); void closeEmptyElement(iterator_t s, iterator_t e); void endElementName(iterator_t s, iterator_t e); void endElement(iterator_t s, iterator_t e); void attributeName(iterator_t s, iterator_t e); void attributeValue(iterator_t s, iterator_t e); void elementContent(iterator_t s, iterator_t e); void piTarget(iterator_t s, iterator_t e); void piData(iterator_t s, iterator_t e); void piEnd(iterator_t s, iterator_t e); void entityRef(iterator_t s, iterator_t e); void decimalCharacterRef(iterator_t s, iterator_t e); void hexCharacterRef(iterator_t s, iterator_t e); void characterRef(iterator_t s, iterator_t e, int base); // Start grammar definition rule_t prolog, element, Misc, Reference, CDSect, CDStart, CData, CDEnd, PI, PITarget, PIData, doctypedecl, XMLDecl, SDDecl, VersionInfo, EncodingDecl, VersionNum, Eq, EmptyElemTag, STag, content, ETag, Attribute, AttValue, CharData, Comment, CharRef, EntityRef, EncName, document_, Name, Comment1, Spaces; stringT str(iterator_t s, iterator_t e, int trim = 0); ////////////////////////////// // member variables EntityResolverT* entityResolver_; DTDHandlerT* dtdHandler_; ContentHandlerT* contentHandler_; ErrorHandler* errorHandler_; std::stack elements_; AttributesImplT attrs_; typedef typename AttributesImplT::Attr Attr; Attr currentAttr_; stringT piTarget_; stringT piData_; stringT entityRef_; std::map declaredEntities_; std::map conversion_; }; // parser template Garden::Garden() : entityResolver_(0), dtdHandler_(0), contentHandler_(0), errorHandler_(0) { // define the parsing rules typedef boost::spirit::chset chset_t; typedef boost::spirit::chlit chlit_t; // characters chset_t Char("\x9\xA\xD\x20-\xFF"); chset_t SpaceChar("\x20\x9\xD\xA"); Spaces = +(SpaceChar); chset_t Letter("\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF"); chset_t Digit("0-9"); chlit_t Extender('\xB7'); chset_t NameChar = Letter | Digit | chset_t("._:-") | Extender; Name = (Letter | '_' | ':') >> *(NameChar); document_ = prolog >> element >> *Misc; chset_t CharDataChar (boost::spirit::anychar_p - (chset_t('<') | chset_t('&'))); CharData = (*(CharDataChar - boost::spirit::str_p("]]>")))[boost::bind(&Garden::elementContent, this, _1, _2)]; // Section 2.5 - Comments Comment = boost::spirit::str_p(""); Comment1 = *((Char - boost::spirit::ch_p('-')) | (boost::spirit::ch_p('-') >> (Char - boost::spirit::ch_p('-')))); // Section 2.6 - Processing Instructions PI = boost::spirit::str_p("> (PITarget)[boost::bind(&Garden::piTarget, this, _1, _2)] >> !Spaces >> (PIData)[boost::bind(&Garden::piData, this, _1, _2)] >> (boost::spirit::str_p("?>"))[boost::bind(&Garden::piEnd, this, _1, _2)]; PITarget = Name - boost::spirit::as_lower_d[boost::spirit::str_p("xml")]; PIData = !(!Spaces >> (*(Char - boost::spirit::str_p("?>")))); // Section 2.7 - CDATA CDSect = CDStart >> (CData)[boost::bind(&Garden::elementContent, this, _1, _2)] >> CDEnd; CDStart = boost::spirit::str_p("")); CDEnd = boost::spirit::str_p("]]>"); // bits before the root elemenet prolog = !XMLDecl >> *Misc >> !(doctypedecl >> *Misc); XMLDecl = boost::spirit::str_p("> VersionInfo >> !EncodingDecl >> !SDDecl >> !Spaces >> boost::spirit::str_p("?>"); VersionInfo = Spaces >> boost::spirit::str_p("version") >> Eq >> (boost::spirit::ch_p('\'') >> VersionNum >>'\'' | boost::spirit::ch_p('"') >> VersionNum >> '"'); chset_t VersionNumCh("A-Za-z0-9_.:-"); VersionNum = +(VersionNumCh); doctypedecl = boost::spirit::str_p("> *(Char - (chset_t('[') | '>')) >> !('[' >> *(Char - ']') >> ']') >> '>'; SDDecl = Spaces >> boost::spirit::str_p("standalone") >> Eq >> ((boost::spirit::ch_p('\'') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '\'') | (boost::spirit::ch_p('"') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '"')); // odd bits Eq = !Spaces >> '=' >> !Spaces; Misc = Comment | Spaces | PI; // Elements element = STag >> (EmptyElemTag | (boost::spirit::str_p(">"))[boost::bind(&Garden::closeElement, this, _1, _2)] >> content >> ETag); STag = '<' >> (Name)[boost::bind(&Garden::openElement, this, _1, _2)] >> *(Spaces >> Attribute) >> !Spaces; Attribute = (Name)[boost::bind(&Garden::attributeName, this, _1, _2)] >> Eq >> AttValue; EmptyElemTag = (boost::spirit::str_p("/>"))[boost::bind(&Garden::closeEmptyElement, this, _1, _2)]; ETag = (boost::spirit::str_p("> (Name)[boost::bind(&Garden::endElementName, this, _1, _2)] >> !Spaces >> '>')[boost::bind(&Garden::endElement, this, _1, _2)]; AttValue = '"' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '"')) | Reference))[boost::bind(&Garden::attributeValue, this, _1, _2)] >> '"' | '\'' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '\'')) | Reference))[boost::bind(&Garden::attributeValue, this, _1, _2)] >> '\''; content = !CharData >> *((element | Reference | CDSect | Comment | PI) >> !CharData); // Section 4.1 - Character and entity references CharRef = boost::spirit::str_p("&#") >> (+boost::spirit::digit_p >> ';')[boost::bind(&Garden::decimalCharacterRef, this, _1, _2)] | boost::spirit::str_p("&#x") >> (+boost::spirit::xdigit_p >> ';')[boost::bind(&Garden::hexCharacterRef, this, _1, _2)]; Reference = EntityRef | CharRef; EntityRef = '&' >> (Name >> boost::spirit::ch_p(';'))[boost::bind(&Garden::entityRef, this, _1, _2)]; EncodingDecl = Spaces >> boost::spirit::str_p("encoding") >> Eq >> (boost::spirit::ch_p('"') >> EncName >> '"' | boost::spirit::ch_p('\'') >> EncName >> '\''); chset_t EncNameCh = VersionNumCh - chset_t(':'); EncName = boost::spirit::alpha_p >> *(EncNameCh); ///////////////// declaredEntities_.insert(std::make_pair("lt", "<")); declaredEntities_.insert(std::make_pair("gt", ">")); declaredEntities_.insert(std::make_pair("amp", "&")); declaredEntities_.insert(std::make_pair("apos", "'")); declaredEntities_.insert(std::make_pair("quot", "\"")); conversion_.insert(std::make_pair('0', 0)); conversion_.insert(std::make_pair('1', 1)); conversion_.insert(std::make_pair('2', 2)); conversion_.insert(std::make_pair('3', 3)); conversion_.insert(std::make_pair('4', 4)); conversion_.insert(std::make_pair('5', 5)); conversion_.insert(std::make_pair('6', 6)); conversion_.insert(std::make_pair('7', 7)); conversion_.insert(std::make_pair('8', 8)); conversion_.insert(std::make_pair('9', 9)); conversion_.insert(std::make_pair('a', 10)); conversion_.insert(std::make_pair('b', 11)); conversion_.insert(std::make_pair('c', 12)); conversion_.insert(std::make_pair('d', 13)); conversion_.insert(std::make_pair('e', 14)); conversion_.insert(std::make_pair('f', 15)); conversion_.insert(std::make_pair('A', 10)); conversion_.insert(std::make_pair('B', 11)); conversion_.insert(std::make_pair('C', 12)); conversion_.insert(std::make_pair('D', 13)); conversion_.insert(std::make_pair('E', 14)); conversion_.insert(std::make_pair('F', 15)); } // XMLparser ////////////////////////////////////// // features template bool Garden::getFeature(const stringT& name) const { throw SAXNotRecognizedException(name); } // getFeature template void Garden::setFeature(const stringT& name, bool value) { throw SAXNotRecognizedException(name); } // setFeature /////////////////////////////////////// // properties template std::auto_ptr::PropertyBase> Garden::doGetProperty(const stringT& name) { throw SAXNotRecognizedException(name); } // doGetProperty template void Garden::doSetProperty(const stringT& name, std::auto_ptr::PropertyBase> value) { throw SAXNotRecognizedException(name); } // doSetProperty ////////////////////////////////////////// // parse template void Garden::parse(InputSourceT& input) { Arabica::default_string_adaptor SA; InputSourceResolver is(input, SA); if(is.resolve() == 0) { reportError("Could not resolve XML document", true); return; } // if(is.resolver() == 0) // Turn of white space skipping on the stream is.resolve()->unsetf(std::ios::skipws); vector_t data(std::istream_iterator(*is.resolve()), std::istream_iterator()); iterator_t first = data.begin(); iterator_t last = data.end(); scanner_t scanner(first, last); typedef typename boost::spirit::parser_result::type result_t; if(contentHandler_) contentHandler_->startDocument(); result_t r = document_.parse(scanner); if(contentHandler_) contentHandler_->endDocument(); if(!(r && first == last)) { std::cout << input.getSystemId() << " Fails Parsing\n" << std::endl; for (int i = 0; i < 50; ++i) { std::cout << *first++; } std::cout << std::endl; } // if ... } // parse template void Garden::openElement(iterator_t s, iterator_t e) { elements_.push(str(s, e)); attrs_.clear(); } // openElement template void Garden::closeElement(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->startElement("", elements_.top(), "", attrs_); } // closeElement template void Garden::closeEmptyElement(iterator_t s, iterator_t e) { if(contentHandler_) { contentHandler_->startElement("", elements_.top(), "", attrs_); contentHandler_->endElement("", elements_.top(), ""); elements_.pop(); } // if ... } // closeEmptyElement template void Garden::endElementName(iterator_t s, iterator_t e) { stringT name = str(s, e); if(name != elements_.top()) reportError("Expect end element " + elements_.top(), true); } // endElementName template void Garden::endElement(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->endElement("", elements_.top(), ""); elements_.pop(); } // endElement template void Garden::attributeName(iterator_t s, iterator_t e) { currentAttr_ = typename AttributesImplT::Attr(); currentAttr_.localName_ = str(s, e); } // attributeName template void Garden::attributeValue(iterator_t s, iterator_t e) { currentAttr_.value_ = str(s, e); currentAttr_.type_ = "CDATA"; attrs_.addAttribute(currentAttr_); } // attributeValue template void Garden::elementContent(iterator_t s, iterator_t e) { if(contentHandler_ && (s != e)) contentHandler_->characters(str(s, e)); } // Garden::elementContent // processing instructions template void Garden::piTarget(iterator_t s, iterator_t e) { piTarget_ = str(s, e); piData_.erase(); } // piTarget template void Garden::piData(iterator_t s, iterator_t e) { piData_ = str(s, e); } // piData template void Garden::piEnd(iterator_t s, iterator_t e) { if(contentHandler_) contentHandler_->processingInstruction(piTarget_, piData_); } // piEnd //entity refs template void Garden::entityRef(iterator_t s, iterator_t e) { if(contentHandler_) { stringT name(str(s, e, 1)); typedef typename std::map::iterator entity_iterator; entity_iterator ent = declaredEntities_.find(name); if(ent != declaredEntities_.end()) { contentHandler_->characters((*ent).second); return; } else { reportError("Undeclared entity " + name); return; } // if ... contentHandler_->skippedEntity(name); } // if ... } // entityRef template void Garden::decimalCharacterRef(iterator_t s, iterator_t e) { characterRef(s, e, 10); } // decimalCharacterRef template void Garden::hexCharacterRef(iterator_t s, iterator_t e) { characterRef(s, e, 16); } // hexCharacterRef template void Garden::characterRef(iterator_t s, iterator_t e, int base) { if(!contentHandler_) return; int val = 0; char next = *s; while(++s != e) { val *= base; val += conversion_[next]; next = *s; } contentHandler_->characters(stringT(1, val)); } // characterRef /////////////////////////////// template typename Garden::stringT Garden::str(iterator_t s, iterator_t e, int trim) { stringT str; std::copy(s, e, std::inserter(str, str.begin())); if(trim) str.erase(str.end() - trim); return str; } // str template void Garden::reportError(const std::string& message, bool fatal) { if(!errorHandler_) return; SAX::basic_SAXParseException e(message); if(fatal) errorHandler_->fatalError(e); else errorHandler_->error(e); } // reportError } // namespace SAX #endif