// arabica/include/SAX/parsers/saxgarden.hpp
//
// Garden - a SAX XMLReader whose XML grammar is written with Boost.Spirit rules.
#ifndef ARABICA_GARDEN_H
#define ARABICA_GARDEN_H
2002-06-21 13:16:28 +02:00
2007-09-05 00:55:47 +02:00
#include <SAX/ArabicaConfig.hpp>
#include <boost/version.hpp>
#if BOOST_VERSION >= 103800
#define BOOST_SPIRIT_USE_OLD_NAMESPACE 1
#include <boost/spirit/include/classic_core.hpp>
#include <boost/spirit/include/classic_chset.hpp>
#else
#include <boost/spirit/core.hpp>
#include <boost/spirit/utility/chset.hpp>
#endif
#include <boost/bind.hpp>
#include <iostream>
#include <iterator>
#include <map>
#include <stack>
#include <string>
#include <vector>
#include <SAX/XMLReader.hpp>
#include <SAX/SAXParseException.hpp>
#include <SAX/SAXNotRecognizedException.hpp>
#include <SAX/helpers/InputSourceResolver.hpp>
#include <SAX/helpers/AttributesImpl.hpp>
#include <Arabica/StringAdaptor.hpp>
#include <Arabica/getparam.hpp>
2002-06-21 13:16:28 +02:00
2007-09-05 11:49:18 +02:00
namespace Arabica
{
namespace SAX
{
2002-06-21 13:16:28 +02:00
/**
 * Garden is an XMLReaderInterface implementation whose XML grammar is
 * expressed as Boost.Spirit rules; the rules themselves are wired up in the
 * constructor.
 *
 * The grammar's semantic actions call the private member functions declared
 * below, which forward events to the registered ContentHandler and
 * ErrorHandler.  The EntityResolver and DTDHandler are stored and handed
 * back by their getters.  Declaration and lexical handlers are not
 * supported: their setters and getters throw SAXNotSupportedException.
 */
template<class string_type,
         class T0 = Arabica::nil_t,
         class T1 = Arabica::nil_t>
class Garden :
  public SAX::XMLReaderInterface<string_type,
           typename Arabica::get_string_adaptor<string_type, T0, T1>::type>
{
public:
  typedef SAX::XMLReaderInterface<string_type,
            typename Arabica::get_string_adaptor<string_type, T0, T1>::type> XMLReaderT;
  typedef typename XMLReaderT::string_adaptor string_adaptor;

  // handler and helper types, all parameterised on the same string/adaptor pair
  typedef EntityResolver<string_type, string_adaptor> EntityResolverT;
  typedef DTDHandler<string_type, string_adaptor> DTDHandlerT;
  typedef ContentHandler<string_type, string_adaptor> ContentHandlerT;
  typedef InputSource<string_type, string_adaptor> InputSourceT;
  typedef AttributesImpl<string_type, string_adaptor> AttributesImplT;
  typedef ErrorHandler<string_type, string_adaptor> ErrorHandlerT;
  typedef DeclHandler<string_type, string_adaptor> declHandlerT;
  typedef LexicalHandler<string_type, string_adaptor> lexicalHandlerT;

  Garden();

  /////////////////////////////////////////////////
  // Configuration - no features are recognised
  virtual bool getFeature(const string_type& name) const;
  virtual void setFeature(const string_type& name, bool value);

  /////////////////////////////////////////////////
  // Event handlers - held as non-owning pointers
  virtual void setEntityResolver(EntityResolverT& resolver)
  {
    entityResolver_ = &resolver;
  } // setEntityResolver
  virtual EntityResolverT* getEntityResolver() const
  {
    return entityResolver_;
  } // getEntityResolver

  virtual void setDTDHandler(DTDHandlerT& handler)
  {
    dtdHandler_ = &handler;
  } // setDTDHandler
  virtual DTDHandlerT* getDTDHandler() const
  {
    return dtdHandler_;
  } // getDTDHandler

  virtual void setContentHandler(ContentHandlerT& handler)
  {
    contentHandler_ = &handler;
  } // setContentHandler
  virtual ContentHandlerT* getContentHandler() const
  {
    return contentHandler_;
  } // getContentHandler

  virtual void setErrorHandler(ErrorHandlerT& handler)
  {
    errorHandler_ = &handler;
  } // setErrorHandler
  virtual ErrorHandlerT* getErrorHandler() const
  {
    return errorHandler_;
  } // getErrorHandler

  // unsupported handlers - every call throws
  virtual void setDeclHandler(declHandlerT& /* handler */)
  {
    throw SAXNotSupportedException("decl-handler");
  } // setDeclHandler
  virtual declHandlerT* getDeclHandler() const
  {
    throw SAXNotSupportedException("decl-handler");
  } // getDeclHandler
  virtual void setLexicalHandler(lexicalHandlerT& /* handler */)
  {
    throw SAXNotSupportedException("lexical-handler");
  } // setLexicalHandler
  virtual lexicalHandlerT* getLexicalHandler() const
  {
    throw SAXNotSupportedException("lexical-handler");
  } // getLexicalHandler

  /////////////////////////////////////////////////
  // Parsing
  virtual void parse(InputSourceT& input);

private:
  // hands message to the error handler, if there is one
  void reportError(const std::string& message, bool fatal = false);

  // the document is read into a vector and the grammar scans that buffer
  typedef typename string_adaptor::value_type char_t;
  typedef std::vector<char_t> vector_t;
  typedef typename vector_t::iterator iterator_t;
  typedef boost::spirit::scanner<iterator_t> scanner_t;
  typedef boost::spirit::rule<scanner_t> rule_t;

  // semantic actions - each receives the [s, e) range matched by its rule
  void openElement(iterator_t s, iterator_t e);
  void closeElement(iterator_t s, iterator_t e);
  void closeEmptyElement(iterator_t s, iterator_t e);
  void endElementName(iterator_t s, iterator_t e);
  void endElement(iterator_t s, iterator_t e);
  void attributeName(iterator_t s, iterator_t e);
  void attributeValue(iterator_t s, iterator_t e);
  void elementContent(iterator_t s, iterator_t e);
  void piTarget(iterator_t s, iterator_t e);
  void piData(iterator_t s, iterator_t e);
  void piEnd(iterator_t s, iterator_t e);
  void entityRef(iterator_t s, iterator_t e);
  void decimalCharacterRef(iterator_t s, iterator_t e);
  void hexCharacterRef(iterator_t s, iterator_t e);
  void characterRef(iterator_t s, iterator_t e, int base);

  // Start grammar definition
  rule_t prolog, element, Misc, Reference;
  rule_t CDSect, CDStart, CData, CDEnd;
  rule_t PI, PITarget, PIData;
  rule_t doctypedecl, XMLDecl, SDDecl, VersionInfo, EncodingDecl;
  rule_t VersionNum, Eq, EmptyElemTag, STag, content, ETag, Attribute;
  rule_t AttValue, CharData, Comment;
  rule_t CharRef, EntityRef, EncName, document_;
  rule_t Name, Comment1, Spaces;

  // builds a string from [s, e), leaving off the last trim characters
  string_type str(iterator_t s, iterator_t e, int trim = 0)
  {
    iterator_t stop = e - trim;
    return string_adaptor::construct(s, stop);
  } // str

  //////////////////////////////
  // member variables
  EntityResolverT* entityResolver_;
  DTDHandlerT* dtdHandler_;
  ContentHandlerT* contentHandler_;
  ErrorHandlerT* errorHandler_;

  // names of the elements that are currently open, innermost on top
  std::stack<string_type> elements_;

  // attributes gathered for the start tag being parsed
  AttributesImplT attrs_;
  typedef typename AttributesImplT::Attr Attr;
  Attr currentAttr_;

  // pieces of the processing instruction being parsed
  string_type piTarget_;
  string_type piData_;
  string_type entityRef_;

  // entity name -> replacement text
  std::map<string_type, string_type> declaredEntities_;
  // digit character -> numeric value, used for character references
  std::map<char, int> conversion_;
}; // parser
/**
 * Builds the parser: clears the handler pointers, defines every grammar
 * rule, and fills the predefined-entity and digit lookup tables.
 *
 * Rules refer to one another by reference, so a rule may be used before the
 * statement that defines it.  The chset_t character sets are plain values
 * and must be defined before they are used.
 */
template<class string_type, class T0, class T1>
Garden<string_type, T0, T1>::Garden() :
  entityResolver_(0),
  dtdHandler_(0),
  contentHandler_(0),
  errorHandler_(0)
{
  // shorthand used throughout the rule definitions
  namespace bs = boost::spirit;
  typedef Garden<string_type, T0, T1> self_t;
  typedef bs::chset<char_t> chset_t;
  typedef bs::chlit<char_t> chlit_t;

  // characters
  chset_t Char("\x9\xA\xD\x20-\xFF");
  chset_t SpaceChar("\x20\x9\xD\xA");
  Spaces = +(SpaceChar);

  // names
  chset_t Letter("\x41-\x5A\x61-\x7A\xC0-\xD6\xD8-\xF6\xF8-\xFF");
  chset_t Digit("0-9");
  chlit_t Extender('\xB7');
  chset_t NameChar = Letter | Digit | chset_t("._:-") | Extender;
  Name = (Letter | '_' | ':') >> *(NameChar);

  // the document as a whole
  document_ = prolog >> element >> *Misc;

  // character data - anything up to the next markup or reference
  chset_t CharDataChar (bs::anychar_p - (chset_t('<') | chset_t('&')));
  CharData = (*(CharDataChar - bs::str_p("]]>")))
               [boost::bind(&self_t::elementContent, this, _1, _2)];

  // Section 2.5 - Comments
  Comment = bs::str_p("<!--") >> Comment1 >> bs::str_p("-->");
  Comment1 = *((Char - bs::ch_p('-')) | (bs::ch_p('-') >> (Char - bs::ch_p('-'))));

  // Section 2.6 - Processing Instructions
  PI = bs::str_p("<?")
       >> (PITarget)[boost::bind(&self_t::piTarget, this, _1, _2)]
       >> !Spaces
       >> (PIData)[boost::bind(&self_t::piData, this, _1, _2)]
       >> (bs::str_p("?>"))[boost::bind(&self_t::piEnd, this, _1, _2)];
  PITarget = Name - bs::as_lower_d[bs::str_p("xml")];
  PIData = !(!Spaces >> (*(Char - bs::str_p("?>"))));

  // Section 2.7 - CDATA
  CDSect = CDStart
           >> (CData)[boost::bind(&self_t::elementContent, this, _1, _2)]
           >> CDEnd;
  CDStart = bs::str_p("<![CDATA[");
  CData = *(Char - bs::str_p("]]>"));
  CDEnd = bs::str_p("]]>");

  // bits before the root element
  prolog = !XMLDecl >> *Misc >> !(doctypedecl >> *Misc);
  XMLDecl = bs::str_p("<?xml") >> VersionInfo >> !EncodingDecl >> !SDDecl >> !Spaces >> bs::str_p("?>");
  VersionInfo = Spaces >> bs::str_p("version") >> Eq
                >> (bs::ch_p('\'') >> VersionNum >> '\''
                   | bs::ch_p('"') >> VersionNum >> '"');
  chset_t VersionNumCh("A-Za-z0-9_.:-");
  VersionNum = +(VersionNumCh);
  doctypedecl = bs::str_p("<!DOCTYPE")
                >> *(Char - (chset_t('[') | '>'))
                >> !('[' >> *(Char - ']') >> ']')
                >> '>';
  SDDecl = Spaces >> bs::str_p("standalone") >> Eq
           >> ((bs::ch_p('\'') >> (bs::str_p("yes") | bs::str_p("no")) >> '\'')
              | (bs::ch_p('"') >> (bs::str_p("yes") | bs::str_p("no")) >> '"'));

  // odd bits
  Eq = !Spaces >> '=' >> !Spaces;
  Misc = Comment | Spaces | PI;

  // Elements
  element = STag
            >> (EmptyElemTag
               | (bs::str_p(">"))[boost::bind(&self_t::closeElement, this, _1, _2)]
                 >> content >> ETag);
  STag = '<'
         >> (Name)[boost::bind(&self_t::openElement, this, _1, _2)]
         >> *(Spaces >> Attribute)
         >> !Spaces;
  Attribute = (Name)[boost::bind(&self_t::attributeName, this, _1, _2)] >> Eq >> AttValue;
  EmptyElemTag = (bs::str_p("/>"))[boost::bind(&self_t::closeEmptyElement, this, _1, _2)];
  ETag = (bs::str_p("</")
          >> (Name)[boost::bind(&self_t::endElementName, this, _1, _2)]
          >> !Spaces
          >> '>')[boost::bind(&self_t::endElement, this, _1, _2)];

  // attribute values, double or single quoted
  AttValue = '"' >> (*((bs::anychar_p - (chset_t('<') | '&' | '"')) | Reference))
                      [boost::bind(&self_t::attributeValue, this, _1, _2)] >> '"'
           | '\'' >> (*((bs::anychar_p - (chset_t('<') | '&' | '\'')) | Reference))
                      [boost::bind(&self_t::attributeValue, this, _1, _2)] >> '\'';

  content = !CharData >> *((element | Reference | CDSect | Comment | PI) >> !CharData);

  // Section 4.1 - Character and entity references
  CharRef = bs::str_p("&#") >> (+bs::digit_p >> ';')
                                 [boost::bind(&self_t::decimalCharacterRef, this, _1, _2)]
          | bs::str_p("&#x") >> (+bs::xdigit_p >> ';')
                                 [boost::bind(&self_t::hexCharacterRef, this, _1, _2)];
  Reference = EntityRef | CharRef;
  EntityRef = '&' >> (Name >> bs::ch_p(';'))[boost::bind(&self_t::entityRef, this, _1, _2)];

  // encoding declaration
  EncodingDecl = Spaces >> bs::str_p("encoding") >> Eq
                 >> (bs::ch_p('"') >> EncName >> '"'
                    | bs::ch_p('\'') >> EncName >> '\'');
  chset_t EncNameCh = VersionNumCh - chset_t(':');
  EncName = bs::alpha_p >> *(EncNameCh);

  /////////////////
  // the five predefined entities
  declaredEntities_[string_adaptor::construct("lt")] = string_adaptor::construct("<");
  declaredEntities_[string_adaptor::construct("gt")] = string_adaptor::construct(">");
  declaredEntities_[string_adaptor::construct("amp")] = string_adaptor::construct("&");
  declaredEntities_[string_adaptor::construct("apos")] = string_adaptor::construct("'");
  declaredEntities_[string_adaptor::construct("quot")] = string_adaptor::construct("\"");

  // digit values: '0'-'9', then 'a'-'f' and 'A'-'F' for 10-15
  for(int digit = 0; digit != 10; ++digit)
    conversion_[static_cast<char>('0' + digit)] = digit;
  for(int offset = 0; offset != 6; ++offset)
  {
    conversion_[static_cast<char>('a' + offset)] = 10 + offset;
    conversion_[static_cast<char>('A' + offset)] = 10 + offset;
  } // for ...
} // Garden
//////////////////////////////////////
// features
// No features are recognised, so every query throws
// SAXNotRecognizedException carrying the requested name.
template<class string_type, class T0, class T1>
bool Garden<string_type, T0, T1>::getFeature(const string_type& name) const
{
  const std::string unknownFeature = string_adaptor::asStdString(name);
  throw SAXNotRecognizedException(unknownFeature);
} // getFeature
// No features are recognised, so every attempt to set one throws
// SAXNotRecognizedException carrying the requested name.  The value is ignored.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::setFeature(const string_type& name, bool /* value */)
{
  const std::string unknownFeature = string_adaptor::asStdString(name);
  throw SAXNotRecognizedException(unknownFeature);
} // setFeature
//////////////////////////////////////////
// parse
/**
 * Parses the document supplied by input.
 *
 * The input is resolved to a stream, read into memory in full, and then
 * matched against the document_ rule.  startDocument/endDocument are sent to
 * the content handler (if set) around the match, whether or not it succeeds.
 *
 * If the input cannot be resolved a fatal error is reported and nothing is
 * parsed.  If the match fails, or stops before the end of the buffer, a
 * diagnostic and up to 50 characters from the stopping point are written to
 * std::cout.
 */
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::parse(InputSourceT& input)
{
  InputSourceResolver is(input, string_adaptor());
  if(is.resolve() == 0)
  {
    reportError("Could not resolve XML document", true);
    return;
  } // if(is.resolve() == 0)

  // An earlier parse that was abandoned part way through (for example because
  // a handler threw) can leave element names behind - start with a clean stack
  while(!elements_.empty())
    elements_.pop();

  // Turn off white space skipping on the stream, so every character is kept
  is.resolve()->unsetf(std::ios::skipws);
  vector_t data(std::istream_iterator<char_t>(*is.resolve()), std::istream_iterator<char_t>());

  // the scanner holds first by reference, so first advances as input is consumed
  iterator_t first = data.begin();
  iterator_t last = data.end();
  scanner_t scanner(first, last);
  typedef typename boost::spirit::parser_result<rule_t, scanner_t>::type result_t;

  if(contentHandler_)
    contentHandler_->startDocument();

  result_t r = document_.parse(scanner);

  if(contentHandler_)
    contentHandler_->endDocument();

  if(!(r && first == last))
  {
    std::cout << string_adaptor::asStdString(input.getSystemId()) << " Fails Parsing\n" << std::endl;
    // show where parsing stopped - at most 50 characters, and never past the
    // end of the buffer
    for(int shown = 0; shown < 50 && first != last; ++shown, ++first)
      std::cout << *first;
    std::cout << std::endl;
  } // if ...
} // parse
// Semantic action for the element name in a start tag (STag rule): makes
// [s, e) the current element and starts a fresh attribute list for it.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::openElement(iterator_t s, iterator_t e)
{
  const string_type elementName = str(s, e);
  elements_.push(elementName);
  attrs_.clear();
} // openElement
// Semantic action for the ">" that ends a start tag (element rule): reports
// startElement for the current element with the attributes collected so far.
// The namespace URI and qualified name are passed as empty strings.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::closeElement(iterator_t s, iterator_t e)
{
  if(!contentHandler_)
    return;

  contentHandler_->startElement(string_adaptor::empty_string(),
                                elements_.top(),
                                string_adaptor::empty_string(),
                                attrs_);
} // closeElement
// Semantic action for the "/>" that ends an empty-element tag (EmptyElemTag
// rule): reports startElement immediately followed by endElement for the
// current element, then removes it from the open-element stack.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::closeEmptyElement(iterator_t s, iterator_t e)
{
  if(contentHandler_)
  {
    contentHandler_->startElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string(), attrs_);
    contentHandler_->endElement(string_adaptor::empty_string(), elements_.top(), string_adaptor::empty_string());
  } // if ...

  // The element is finished whether or not anybody is listening.  Popping
  // only when a handler was set left empty elements on the stack, so a later
  // end tag was compared against the wrong name.
  elements_.pop();
} // closeEmptyElement
// Semantic action for the name inside an end tag (ETag rule): reports a
// fatal error if [s, e) is not the name of the innermost open element.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::endElementName(iterator_t s, iterator_t e)
{
  const string_type closing = str(s, e);
  const string_type& expected = elements_.top();

  if(closing != expected)
  {
    std::string complaint("Expect end element ");
    complaint += string_adaptor::asStdString(expected);
    reportError(complaint, true);
  } // if ...
} // endElementName
// Semantic action for a complete end tag (ETag rule): reports endElement for
// the innermost open element, then removes it from the stack.  The pop
// happens even when no content handler is set.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::endElement(iterator_t s, iterator_t e)
{
  if(contentHandler_)
  {
    contentHandler_->endElement(string_adaptor::empty_string(),
                                elements_.top(),
                                string_adaptor::empty_string());
  } // if ...

  elements_.pop();
} // endElement
// Semantic action for an attribute's name (Attribute rule): starts a new,
// blank attribute record and stores [s, e) as its local name.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::attributeName(iterator_t s, iterator_t e)
{
  currentAttr_ = Attr();
  currentAttr_.localName_ = str(s, e);
} // attributeName
// Semantic action for the text between an attribute's quotes (AttValue
// rule): completes the attribute started by attributeName, types it as
// CDATA, and adds it to the current element's attribute list.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::attributeValue(iterator_t s, iterator_t e)
{
  currentAttr_.type_ = AttributeType<string_type, string_adaptor>::CDATA;
  currentAttr_.value_ = str(s, e);

  attrs_.addAttribute(currentAttr_);
} // attributeValue
// Semantic action for character data and CDATA section bodies: passes
// [s, e) to the content handler as characters.  Empty ranges are not reported.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::elementContent(iterator_t s, iterator_t e)
{
  if(!contentHandler_ || s == e)
    return;

  contentHandler_->characters(str(s, e));
} // elementContent
2002-06-21 13:16:28 +02:00
// processing instructions
// Semantic action for a processing instruction's target (PI rule): stores
// [s, e) as the target and discards the data of any previous instruction.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::piTarget(iterator_t s, iterator_t e)
{
  piData_ = string_adaptor::empty_string();
  piTarget_ = str(s, e);
} // piTarget
// Semantic action for a processing instruction's data (PI rule): stores
// [s, e) until the closing "?>" is seen.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::piData(iterator_t s, iterator_t e)
{
  const string_type data = str(s, e);
  piData_ = data;
} // piData
// Semantic action for the "?>" that closes a processing instruction (PI
// rule): reports the stored target and data to the content handler.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::piEnd(iterator_t s, iterator_t e)
{
  if(!contentHandler_)
    return;

  contentHandler_->processingInstruction(piTarget_, piData_);
} // piEnd
//entity refs
/**
 * Semantic action for an entity reference (EntityRef rule).  [s, e) is the
 * entity name followed by the terminating ';'.
 *
 * A declared entity is reported to the content handler as characters holding
 * its replacement text.  An undeclared entity is reported to the error
 * handler as a non-fatal error and then to the content handler through
 * skippedEntity.
 */
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::entityRef(iterator_t s, iterator_t e)
{
  // trim the trailing ';' to leave just the entity name
  const string_type name(str(s, e, 1));

  typedef typename std::map<string_type, string_type>::const_iterator entity_iterator;
  const entity_iterator ent = declaredEntities_.find(name);

  if(ent != declaredEntities_.end())
  {
    if(contentHandler_)
      contentHandler_->characters(ent->second);
    return;
  } // if ...

  // The error goes to the error handler even when no content handler is set.
  // skippedEntity used to sit after a pair of returns and could never run.
  reportError("Undeclared entity " + string_adaptor::asStdString(name));
  if(contentHandler_)
    contentHandler_->skippedEntity(name);
} // entityRef
// Semantic action for the digits and ';' of a decimal character reference
// (CharRef rule, "&#" form).
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::decimalCharacterRef(iterator_t s, iterator_t e)
{
  const int decimal = 10;
  characterRef(s, e, decimal);
} // decimalCharacterRef
// Semantic action for the digits and ';' of a hexadecimal character
// reference (CharRef rule, "&#x" form).
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::hexCharacterRef(iterator_t s, iterator_t e)
{
  const int hexadecimal = 16;
  characterRef(s, e, hexadecimal);
} // hexCharacterRef
/**
 * Decodes a numeric character reference and reports it as characters.
 *
 * [s, e) holds the digits of the reference followed by the terminating ';',
 * and base is 10 or 16.  Tab, line feed, carriage return and the printable
 * 7-bit ASCII characters are reported as themselves.  Any other value is
 * reported as "?", because nothing here knows how string_type encodes
 * characters outside that range.
 *
 * Does nothing when no content handler is set.
 */
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::characterRef(iterator_t s, iterator_t e, int base)
{
  if(!contentHandler_)
    return;

  // nothing above this can be a character; once val passes it, stop
  // accumulating so a long run of digits cannot overflow the int
  const int maxCodePoint = 0x10FFFF;

  int val = 0;
  for(; s != e; ++s)
  {
    // find() rather than operator[] so an unexpected character cannot add an
    // entry to the table
    const std::map<char, int>::const_iterator digit = conversion_.find(static_cast<char>(*s));
    if(digit == conversion_.end())
      break; // reached the terminating ';'

    if(val <= maxCodePoint)
      val = (val * base) + digit->second;
  } // for ...

  const bool plainAscii = (val == 0x9) || (val == 0xA) || (val == 0xD) ||
                          (val >= 0x20 && val < 0x80);
  if(plainAscii)
  {
    // build the one character string through the same iterator-range
    // construction that str() uses everywhere else
    vector_t decoded(1, static_cast<char_t>(val));
    contentHandler_->characters(str(decoded.begin(), decoded.end()));
  }
  else
    contentHandler_->characters(string_adaptor::construct("?"));
} // characterRef
///////////////////////////////
// Wraps message in a SAXParseException and hands it to the error handler:
// fatalError when fatal is true, error otherwise.  Does nothing when no
// error handler is set.
template<class string_type, class T0, class T1>
void Garden<string_type, T0, T1>::reportError(const std::string& message, bool fatal)
{
  if(errorHandler_ == 0)
    return;

  SAX::SAXParseException<string_type> problem(message);

  if(!fatal)
    errorHandler_->error(problem);
  else
    errorHandler_->fatalError(problem);
} // reportError
} // namespace SAX
2007-09-05 11:49:18 +02:00
} // namespace Arabica
2002-06-21 13:16:28 +02:00
#endif