mirror of
https://github.com/jezhiggins/arabica
synced 2024-12-26 21:58:39 +01:00
6bf75264de
For nearly everyone, their use can be replaced by either setLexicalHandler or setDeclHandler. For the particular case of the Xerces extended properties I'll loop back round and add specific member methods for those.
452 lines
18 KiB
C++
452 lines
18 KiB
C++
#ifndef ARABICA_GARDEN_H
|
|
#define ARABICA_GARDEN_H
|
|
|
|
#include <SAX/ArabicaConfig.hpp>
|
|
#include <boost/version.hpp>
|
|
#if BOOST_VERSION >= 103800
|
|
#define BOOST_SPIRIT_USE_OLD_NAMESPACE 1
|
|
#include <boost/spirit/include/classic_core.hpp>
|
|
#include <boost/spirit/include/classic_chset.hpp>
|
|
#else
|
|
#include <boost/spirit/core.hpp>
|
|
#include <boost/spirit/utility/chset.hpp>
|
|
#endif
|
|
#include <boost/bind.hpp>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <stack>
|
|
#include <map>
|
|
#include <SAX/XMLReader.hpp>
|
|
#include <SAX/SAXParseException.hpp>
|
|
#include <SAX/SAXNotRecognizedException.hpp>
|
|
#include <SAX/helpers/InputSourceResolver.hpp>
|
|
#include <SAX/helpers/AttributesImpl.hpp>
|
|
#include <Arabica/StringAdaptor.hpp>
|
|
#include <Arabica/getparam.hpp>
|
|
|
|
namespace Arabica
|
|
{
|
|
namespace SAX
|
|
{
|
|
|
|
template<class string_type,
|
|
class T0 = Arabica::nil_t,
|
|
class T1 = Arabica::nil_t>
|
|
class Garden :
|
|
public SAX::XMLReaderInterface<string_type,
|
|
typename Arabica::get_string_adaptor<string_type, T0, T1>::type>
|
|
{
|
|
public:
|
|
typedef SAX::XMLReaderInterface<string_type,
|
|
typename Arabica::get_string_adaptor<string_type, T0, T1>::type> XMLReaderT;
|
|
typedef typename XMLReaderT::string_adaptor string_adaptor;
|
|
typedef EntityResolver<string_type, string_adaptor> EntityResolverT;
|
|
typedef DTDHandler<string_type, string_adaptor> DTDHandlerT;
|
|
typedef ContentHandler<string_type, string_adaptor> ContentHandlerT;
|
|
typedef InputSource<string_type, string_adaptor> InputSourceT;
|
|
typedef AttributesImpl<string_type, string_adaptor> AttributesImplT;
|
|
typedef ErrorHandler<string_type, string_adaptor> ErrorHandlerT;
|
|
typedef DeclHandler<string_type, string_adaptor> declHandlerT;
|
|
typedef LexicalHandler<string_type, string_adaptor> lexicalHandlerT;
|
|
|
|
Garden();
|
|
|
|
virtual bool getFeature(const string_type& name) const;
|
|
virtual void setFeature(const string_type& name, bool value);
|
|
|
|
virtual void setEntityResolver(EntityResolverT& resolver) { entityResolver_ = &resolver; }
|
|
virtual EntityResolverT* getEntityResolver() const { return entityResolver_; }
|
|
virtual void setDTDHandler(DTDHandlerT& handler) { dtdHandler_ = &handler; }
|
|
virtual DTDHandlerT* getDTDHandler() const { return dtdHandler_; }
|
|
virtual void setContentHandler(ContentHandlerT& handler) { contentHandler_ = &handler; }
|
|
virtual ContentHandlerT* getContentHandler() const { return contentHandler_; }
|
|
virtual void setErrorHandler(ErrorHandlerT& handler) { errorHandler_ = &handler; }
|
|
virtual ErrorHandlerT* getErrorHandler() const { return errorHandler_; }
|
|
virtual void setDeclHandler(declHandlerT& handler) { throw SAXNotSupportedException("decl-handler"); }
|
|
virtual declHandlerT* getDeclHandler() const { throw SAXNotSupportedException("decl-handler"); }
|
|
virtual void setLexicalHandler(lexicalHandlerT& handler) { throw SAXNotSupportedException("lexical-handler"); }
|
|
virtual lexicalHandlerT* getLexicalHandler() const { throw SAXNotSupportedException("lexical-handler"); }
|
|
|
|
virtual void parse(InputSourceT& input);
|
|
|
|
private:
|
|
void reportError(const std::string& message, bool fatal = false);
|
|
|
|
typedef typename string_adaptor::value_type char_t;
|
|
typedef std::vector<char_t> vector_t;
|
|
typedef typename vector_t::iterator iterator_t;
|
|
typedef boost::spirit::scanner<iterator_t> scanner_t;
|
|
typedef boost::spirit::rule<scanner_t> rule_t;
|
|
|
|
void openElement(iterator_t s, iterator_t e);
|
|
void closeElement(iterator_t s, iterator_t e);
|
|
void closeEmptyElement(iterator_t s, iterator_t e);
|
|
void endElementName(iterator_t s, iterator_t e);
|
|
void endElement(iterator_t s, iterator_t e);
|
|
void attributeName(iterator_t s, iterator_t e);
|
|
void attributeValue(iterator_t s, iterator_t e);
|
|
void elementContent(iterator_t s, iterator_t e);
|
|
void piTarget(iterator_t s, iterator_t e);
|
|
void piData(iterator_t s, iterator_t e);
|
|
void piEnd(iterator_t s, iterator_t e);
|
|
void entityRef(iterator_t s, iterator_t e);
|
|
void decimalCharacterRef(iterator_t s, iterator_t e);
|
|
void hexCharacterRef(iterator_t s, iterator_t e);
|
|
void characterRef(iterator_t s, iterator_t e, int base);
|
|
|
|
// Start grammar definition
|
|
rule_t prolog, element, Misc, Reference,
|
|
CDSect, CDStart, CData, CDEnd,
|
|
PI, PITarget, PIData,
|
|
doctypedecl, XMLDecl, SDDecl, VersionInfo, EncodingDecl,
|
|
VersionNum, Eq, EmptyElemTag, STag, content, ETag, Attribute,
|
|
AttValue, CharData, Comment,
|
|
CharRef, EntityRef, EncName, document_,
|
|
Name, Comment1, Spaces;
|
|
|
|
string_type str(iterator_t s, iterator_t e, int trim = 0)
|
|
{
|
|
return string_adaptor::construct(s, e-trim);
|
|
} // str
|
|
|
|
//////////////////////////////
|
|
// member variables
|
|
EntityResolverT* entityResolver_;
|
|
DTDHandlerT* dtdHandler_;
|
|
ContentHandlerT* contentHandler_;
|
|
ErrorHandlerT* errorHandler_;
|
|
|
|
std::stack<string_type> elements_;
|
|
AttributesImplT attrs_;
|
|
typedef typename AttributesImplT::Attr Attr;
|
|
Attr currentAttr_;
|
|
string_type piTarget_;
|
|
string_type piData_;
|
|
string_type entityRef_;
|
|
std::map<string_type, string_type> declaredEntities_;
|
|
std::map<char, int> conversion_;
|
|
}; // parser
|
|
|
|
template<class string_type, class T0, class T1>
|
|
Garden<string_type, T0, T1>::Garden() :
|
|
entityResolver_(0),
|
|
dtdHandler_(0),
|
|
contentHandler_(0),
|
|
errorHandler_(0)
|
|
{
|
|
// define the parsing rules
|
|
typedef boost::spirit::chset<char_t> chset_t;
|
|
typedef boost::spirit::chlit<char_t> chlit_t;
|
|
|
|
// characters
|
|
chset_t Char("\x9\xA\xD\x20-\xFF");
|
|
chset_t SpaceChar("\x20\x9\xD\xA");
|
|
Spaces = +(SpaceChar);
|
|
chset_t Letter("\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF");
|
|
chset_t Digit("0-9");
|
|
chlit_t Extender('\xB7');
|
|
chset_t NameChar = Letter | Digit | chset_t("._:-") | Extender;
|
|
Name = (Letter | '_' | ':') >> *(NameChar);
|
|
|
|
document_ = prolog >> element >> *Misc;
|
|
|
|
chset_t CharDataChar (boost::spirit::anychar_p - (chset_t('<') | chset_t('&')));
|
|
CharData = (*(CharDataChar - boost::spirit::str_p("]]>")))[boost::bind(&Garden<string_type, T0, T1>::elementContent, this, _1, _2)];
|
|
|
|
// Section 2.5 - Comments
|
|
Comment = boost::spirit::str_p("<!--") >> Comment1 >> boost::spirit::str_p("-->");
|
|
Comment1 = *((Char - boost::spirit::ch_p('-')) | (boost::spirit::ch_p('-') >> (Char - boost::spirit::ch_p('-'))));
|
|
|
|
// Section 2.6 - Processing Instructions
|
|
PI = boost::spirit::str_p("<?") >> (PITarget)[boost::bind(&Garden<string_type, T0, T1>::piTarget, this, _1, _2)] >> !Spaces >> (PIData)[boost::bind(&Garden<string_type, T0, T1>::piData, this, _1, _2)] >> (boost::spirit::str_p("?>"))[boost::bind(&Garden<string_type, T0, T1>::piEnd, this, _1, _2)];
|
|
PITarget = Name - boost::spirit::as_lower_d[boost::spirit::str_p("xml")];
|
|
PIData = !(!Spaces >> (*(Char - boost::spirit::str_p("?>"))));
|
|
|
|
// Section 2.7 - CDATA
|
|
CDSect = CDStart >> (CData)[boost::bind(&Garden<string_type, T0, T1>::elementContent, this, _1, _2)] >> CDEnd;
|
|
CDStart = boost::spirit::str_p("<![CDATA[");
|
|
CData = *(Char - boost::spirit::str_p("]]>"));
|
|
CDEnd = boost::spirit::str_p("]]>");
|
|
|
|
// bits before the root elemenet
|
|
prolog = !XMLDecl >> *Misc >> !(doctypedecl >> *Misc);
|
|
XMLDecl = boost::spirit::str_p("<?xml") >> VersionInfo >> !EncodingDecl >> !SDDecl >> !Spaces >> boost::spirit::str_p("?>");
|
|
VersionInfo = Spaces >> boost::spirit::str_p("version") >> Eq >> (boost::spirit::ch_p('\'') >> VersionNum >>'\''
|
|
| boost::spirit::ch_p('"') >> VersionNum >> '"');
|
|
chset_t VersionNumCh("A-Za-z0-9_.:-");
|
|
VersionNum = +(VersionNumCh);
|
|
doctypedecl = boost::spirit::str_p("<!DOCTYPE") >> *(Char - (chset_t('[') | '>')) >> !('[' >> *(Char - ']') >> ']') >> '>';
|
|
SDDecl = Spaces >> boost::spirit::str_p("standalone") >> Eq >> ((boost::spirit::ch_p('\'') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '\'')
|
|
| (boost::spirit::ch_p('"') >> (boost::spirit::str_p("yes") | boost::spirit::str_p("no")) >> '"'));
|
|
|
|
// odd bits
|
|
Eq = !Spaces >> '=' >> !Spaces;
|
|
Misc = Comment | Spaces | PI;
|
|
|
|
// Elements
|
|
element = STag >> (EmptyElemTag | (boost::spirit::str_p(">"))[boost::bind(&Garden<string_type, T0, T1>::closeElement, this, _1, _2)] >> content >> ETag);
|
|
STag = '<' >> (Name)[boost::bind(&Garden<string_type, T0, T1>::openElement, this, _1, _2)] >> *(Spaces >> Attribute) >> !Spaces;
|
|
Attribute = (Name)[boost::bind(&Garden<string_type, T0, T1>::attributeName, this, _1, _2)] >> Eq >> AttValue;
|
|
EmptyElemTag = (boost::spirit::str_p("/>"))[boost::bind(&Garden<string_type, T0, T1>::closeEmptyElement, this, _1, _2)];
|
|
ETag = (boost::spirit::str_p("</") >> (Name)[boost::bind(&Garden<string_type, T0, T1>::endElementName, this, _1, _2)] >> !Spaces >> '>')[boost::bind(&Garden<string_type, T0, T1>::endElement, this, _1, _2)];
|
|
|
|
AttValue = '"' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '"')) | Reference))[boost::bind(&Garden<string_type, T0, T1>::attributeValue, this, _1, _2)] >> '"'
|
|
| '\'' >> (*((boost::spirit::anychar_p - (chset_t('<') | '&' | '\'')) | Reference))[boost::bind(&Garden<string_type, T0, T1>::attributeValue, this, _1, _2)] >> '\'';
|
|
|
|
content = !CharData >> *((element | Reference | CDSect | Comment | PI) >> !CharData);
|
|
|
|
// Section 4.1 - Character and entity references
|
|
CharRef = boost::spirit::str_p("&#") >> (+boost::spirit::digit_p >> ';')[boost::bind(&Garden<string_type, T0, T1>::decimalCharacterRef, this, _1, _2)] |
|
|
boost::spirit::str_p("&#x") >> (+boost::spirit::xdigit_p >> ';')[boost::bind(&Garden<string_type, T0, T1>::hexCharacterRef, this, _1, _2)];
|
|
Reference = EntityRef | CharRef;
|
|
EntityRef = '&' >> (Name >> boost::spirit::ch_p(';'))[boost::bind(&Garden<string_type, T0, T1>::entityRef, this, _1, _2)];
|
|
|
|
EncodingDecl = Spaces >> boost::spirit::str_p("encoding") >> Eq >> (boost::spirit::ch_p('"') >> EncName >> '"' |
|
|
boost::spirit::ch_p('\'') >> EncName >> '\'');
|
|
chset_t EncNameCh = VersionNumCh - chset_t(':');
|
|
EncName = boost::spirit::alpha_p >> *(EncNameCh);
|
|
|
|
|
|
/////////////////
|
|
declaredEntities_.insert(std::make_pair<string_type, string_type>(string_adaptor::construct("lt"), string_adaptor::construct("<")));
|
|
declaredEntities_.insert(std::make_pair<string_type, string_type>(string_adaptor::construct("gt"), string_adaptor::construct(">")));
|
|
declaredEntities_.insert(std::make_pair<string_type, string_type>(string_adaptor::construct("amp"), string_adaptor::construct("&")));
|
|
declaredEntities_.insert(std::make_pair<string_type, string_type>(string_adaptor::construct("apos"), string_adaptor::construct("'")));
|
|
declaredEntities_.insert(std::make_pair<string_type, string_type>(string_adaptor::construct("quot"), string_adaptor::construct("\"")));
|
|
|
|
conversion_.insert(std::make_pair('0', 0));
|
|
conversion_.insert(std::make_pair('1', 1));
|
|
conversion_.insert(std::make_pair('2', 2));
|
|
conversion_.insert(std::make_pair('3', 3));
|
|
conversion_.insert(std::make_pair('4', 4));
|
|
conversion_.insert(std::make_pair('5', 5));
|
|
conversion_.insert(std::make_pair('6', 6));
|
|
conversion_.insert(std::make_pair('7', 7));
|
|
conversion_.insert(std::make_pair('8', 8));
|
|
conversion_.insert(std::make_pair('9', 9));
|
|
conversion_.insert(std::make_pair('a', 10));
|
|
conversion_.insert(std::make_pair('b', 11));
|
|
conversion_.insert(std::make_pair('c', 12));
|
|
conversion_.insert(std::make_pair('d', 13));
|
|
conversion_.insert(std::make_pair('e', 14));
|
|
conversion_.insert(std::make_pair('f', 15));
|
|
conversion_.insert(std::make_pair('A', 10));
|
|
conversion_.insert(std::make_pair('B', 11));
|
|
conversion_.insert(std::make_pair('C', 12));
|
|
conversion_.insert(std::make_pair('D', 13));
|
|
conversion_.insert(std::make_pair('E', 14));
|
|
conversion_.insert(std::make_pair('F', 15));
|
|
} // XMLparser
|
|
|
|
//////////////////////////////////////
|
|
// features
|
|
template<class string_type, class T0, class T1>
|
|
bool Garden<string_type, T0, T1>::getFeature(const string_type& name) const
|
|
{
|
|
throw SAXNotRecognizedException(string_adaptor::asStdString(name));
|
|
} // getFeature
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::setFeature(const string_type& name, bool value)
|
|
{
|
|
throw SAXNotRecognizedException(string_adaptor::asStdString(name));
|
|
} // setFeature
|
|
|
|
//////////////////////////////////////////
|
|
// parse
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::parse(InputSourceT& input)
|
|
{
|
|
InputSourceResolver is(input, string_adaptor());
|
|
if(is.resolve() == 0)
|
|
{
|
|
reportError("Could not resolve XML document", true);
|
|
return;
|
|
} // if(is.resolver() == 0)
|
|
|
|
// Turn of white space skipping on the stream
|
|
is.resolve()->unsetf(std::ios::skipws);
|
|
|
|
vector_t data(std::istream_iterator<char_t>(*is.resolve()), std::istream_iterator<char_t>());
|
|
|
|
iterator_t first = data.begin();
|
|
iterator_t last = data.end();
|
|
scanner_t scanner(first, last);
|
|
typedef typename boost::spirit::parser_result<rule_t, scanner_t>::type result_t;
|
|
|
|
if(contentHandler_)
|
|
contentHandler_->startDocument();
|
|
result_t r = document_.parse(scanner);
|
|
if(contentHandler_)
|
|
contentHandler_->endDocument();
|
|
|
|
if(!(r && first == last))
|
|
{
|
|
std::cout << string_adaptor::asStdString(input.getSystemId()) << " Fails Parsing\n" << std::endl;
|
|
for (int i = 0; i < 50; ++i)
|
|
{
|
|
std::cout << *first++;
|
|
}
|
|
std::cout << std::endl;
|
|
} // if ...
|
|
} // parse
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::openElement(iterator_t s, iterator_t e)
|
|
{
|
|
elements_.push(str(s, e));
|
|
attrs_.clear();
|
|
} // openElement
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::closeElement(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_)
|
|
contentHandler_->startElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string(), attrs_);
|
|
} // closeElement
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::closeEmptyElement(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_)
|
|
{
|
|
contentHandler_->startElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string(), attrs_);
|
|
contentHandler_->endElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string());
|
|
elements_.pop();
|
|
} // if ...
|
|
} // closeEmptyElement
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::endElementName(iterator_t s, iterator_t e)
|
|
{
|
|
string_type name = str(s, e);
|
|
if(name != elements_.top())
|
|
reportError("Expect end element " + string_adaptor::asStdString(elements_.top()), true);
|
|
} // endElementName
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::endElement(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_)
|
|
contentHandler_->endElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string());
|
|
elements_.pop();
|
|
} // endElement
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::attributeName(iterator_t s, iterator_t e)
|
|
{
|
|
currentAttr_ = typename AttributesImplT::Attr();
|
|
currentAttr_.localName_ = str(s, e);
|
|
} // attributeName
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::attributeValue(iterator_t s, iterator_t e)
|
|
{
|
|
currentAttr_.value_ = str(s, e);
|
|
currentAttr_.type_ = AttributeType<string_type, string_adaptor>::CDATA;
|
|
attrs_.addAttribute(currentAttr_);
|
|
} // attributeValue
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::elementContent(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_ && (s != e))
|
|
contentHandler_->characters(str(s, e));
|
|
} // Garden<string_type, T0, T1>::elementContent
|
|
|
|
// processing instructions
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::piTarget(iterator_t s, iterator_t e)
|
|
{
|
|
piTarget_ = str(s, e);
|
|
piData_ = string_adaptor::empty_string();
|
|
} // piTarget
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::piData(iterator_t s, iterator_t e)
|
|
{
|
|
piData_ = str(s, e);
|
|
} // piData
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::piEnd(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_)
|
|
contentHandler_->processingInstruction(piTarget_, piData_);
|
|
} // piEnd
|
|
|
|
//entity refs
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::entityRef(iterator_t s, iterator_t e)
|
|
{
|
|
if(contentHandler_)
|
|
{
|
|
string_type name(str(s, e, 1));
|
|
|
|
typedef typename std::map<string_type, string_type>::iterator entity_iterator;
|
|
|
|
entity_iterator ent = declaredEntities_.find(name);
|
|
if(ent != declaredEntities_.end())
|
|
{
|
|
contentHandler_->characters((*ent).second);
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
reportError("Undeclared entity " + string_adaptor::asStdString(name));
|
|
return;
|
|
} // if ...
|
|
|
|
contentHandler_->skippedEntity(name);
|
|
} // if ...
|
|
} // entityRef
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::decimalCharacterRef(iterator_t s, iterator_t e)
|
|
{
|
|
characterRef(s, e, 10);
|
|
} // decimalCharacterRef
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::hexCharacterRef(iterator_t s, iterator_t e)
|
|
{
|
|
characterRef(s, e, 16);
|
|
} // hexCharacterRef
|
|
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::characterRef(iterator_t s, iterator_t e, int base)
|
|
{
|
|
if(!contentHandler_)
|
|
return;
|
|
|
|
int val = 0;
|
|
char next = *s;
|
|
while(++s != e)
|
|
{
|
|
val *= base;
|
|
val += conversion_[next];
|
|
next = *s;
|
|
}
|
|
|
|
contentHandler_->characters(string_adaptor::construct("?"));//string_type(1, val));
|
|
} // characterRef
|
|
|
|
///////////////////////////////
|
|
template<class string_type, class T0, class T1>
|
|
void Garden<string_type, T0, T1>::reportError(const std::string& message, bool fatal)
|
|
{
|
|
if(!errorHandler_)
|
|
return;
|
|
|
|
SAX::SAXParseException<string_type> e(message);
|
|
|
|
if(fatal)
|
|
errorHandler_->fatalError(e);
|
|
else
|
|
errorHandler_->error(e);
|
|
} // reportError
|
|
|
|
} // namespace SAX
|
|
} // namespace Arabica
|
|
|
|
#endif
|