mirror of
https://github.com/jezhiggins/arabica
synced 2025-01-26 08:03:21 +01:00
collapsed tagsoup-port into mainline
This commit is contained in:
parent
1468d9f822
commit
9ea360f3ef
23 changed files with 6308 additions and 17 deletions
|
@ -27,6 +27,7 @@ AC_CONFIG_FILES([arabica.pc])
|
|||
AC_CONFIG_FILES([src/Makefile])
|
||||
AC_CONFIG_FILES([examples/Makefile])
|
||||
AC_CONFIG_FILES([examples/Utils/Makefile])
|
||||
AC_CONFIG_FILES([examples/Taggle/Makefile])
|
||||
AC_CONFIG_FILES([examples/SAX/Makefile])
|
||||
AC_CONFIG_FILES([examples/DOM/Makefile])
|
||||
AC_CONFIG_FILES([examples/XPath/Makefile])
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
SUBDIRS = SAX Utils
|
||||
SUBDIRS = SAX Taggle Utils
|
||||
if WANT_DOM
|
||||
SUBDIRS += DOM
|
||||
endif
|
||||
|
|
7
examples/Taggle/Makefile.am
Executable file
7
examples/Taggle/Makefile.am
Executable file
|
@ -0,0 +1,7 @@
|
|||
noinst_PROGRAMS = taggle
|
||||
|
||||
AM_CPPFLAGS = -I$(top_srcdir)/include @PARSER_HEADERS@ $(BOOST_CPPFLAGS)
|
||||
LIBARABICA = $(top_builddir)/src/libarabica.la
|
||||
|
||||
taggle_SOURCES = taggle.cpp
|
||||
taggle_LDADD = $(LIBARABICA)
|
60
examples/Taggle/sample.pyx
Executable file
60
examples/Taggle/sample.pyx
Executable file
|
@ -0,0 +1,60 @@
|
|||
(po
|
||||
Aid P01456
|
||||
(date
|
||||
Ayear 2002
|
||||
Amonth 6
|
||||
Aday 14
|
||||
)date
|
||||
(address
|
||||
Atype shipping
|
||||
(name
|
||||
-Frits Mendels
|
||||
)name
|
||||
(street
|
||||
-152 Cherry St
|
||||
)street
|
||||
(city
|
||||
-San Francisco
|
||||
)city
|
||||
(state
|
||||
-CA
|
||||
)state
|
||||
(zip
|
||||
-94045
|
||||
)zip
|
||||
)address
|
||||
(address
|
||||
Atype billing
|
||||
(name
|
||||
-Frits Mendels
|
||||
)name
|
||||
(street
|
||||
-PO Box 6789
|
||||
)street
|
||||
(city
|
||||
-San Francisco
|
||||
)city
|
||||
(state
|
||||
-CA
|
||||
)state
|
||||
(zip
|
||||
-94123-6798
|
||||
)zip
|
||||
)address
|
||||
(items
|
||||
(item
|
||||
Aquantity 1
|
||||
AproductCode R-273
|
||||
Adescription 14.4 Volt Cordless Drill
|
||||
AunitCost 198.95
|
||||
)item
|
||||
(item
|
||||
Aquantity 1
|
||||
AproductCode 16325
|
||||
Adescription 12 Piece Drill Bit Set
|
||||
AunitCost 14.95
|
||||
)item
|
||||
)items
|
||||
)po
|
||||
|
||||
|
54
examples/Taggle/taggle.cpp
Executable file
54
examples/Taggle/taggle.cpp
Executable file
|
@ -0,0 +1,54 @@
|
|||
#pragma warning(disable: 4250)
|
||||
|
||||
#include <iostream>
|
||||
#include <SAX/filter/Writer.hpp>
|
||||
#include <SAX/helpers/CatchErrorHandler.hpp>
|
||||
#include <Taggle/Taggle.hpp>
|
||||
#include <DOM/SAX2DOM/SAX2DOM.hpp>
|
||||
#include <DOM/io/Stream.hpp>
|
||||
#include <XPath/XPath.hpp>
|
||||
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
if(argc == 1)
|
||||
{
|
||||
std::cout << "taggle [file1] [file2] ... [filen]\n"
|
||||
<< " taggle reads arbitrary HTML, outputting it as well-formed XML\n";
|
||||
return 0;
|
||||
} // if(argc == 1)
|
||||
|
||||
Arabica::SAX::Taggle<std::string> parser;
|
||||
std::ostringstream sink;
|
||||
Arabica::SAX::Writer<std::string> writer(sink, 4);
|
||||
Arabica::SAX::CatchErrorHandler<std::string> eh;
|
||||
|
||||
writer.setParent(parser);
|
||||
writer.setErrorHandler(eh);
|
||||
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
std::string file(argv[i]);
|
||||
Arabica::SAX::InputSource<std::string> is;
|
||||
is.setSystemId(file);
|
||||
|
||||
if(file != "-")
|
||||
writer.parse(is);
|
||||
else
|
||||
{
|
||||
is.setSystemId("stdin");
|
||||
is.setByteStream(std::cin);
|
||||
|
||||
writer.parse(is);
|
||||
} // if(file != "-")
|
||||
|
||||
if(eh.errorsReported())
|
||||
{
|
||||
std::cerr << eh.errors() << std::endl;
|
||||
eh.reset();
|
||||
} // if ...
|
||||
|
||||
std::cout << sink.str();
|
||||
} // for ...
|
||||
|
||||
return 0;
|
||||
} // main
|
|
@ -20,11 +20,26 @@ namespace Arabica
|
|||
namespace SAX2DOM
|
||||
{
|
||||
|
||||
template<class stringT,
|
||||
class string_adaptorT = Arabica::default_string_adaptor<stringT>,
|
||||
class SAX_parser = Arabica::SAX::XMLReader<stringT, string_adaptorT> >
|
||||
class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
|
||||
template<class string_type, class T0, class T1>
|
||||
struct ParserTypes
|
||||
{
|
||||
typedef typename Arabica::get_param<Arabica::string_adaptor_tag,
|
||||
Arabica::default_string_adaptor<string_type>,
|
||||
T0,
|
||||
T1>::type string_adaptor;
|
||||
typedef typename Arabica::get_param<Arabica::SAX::XMLReaderInterface_tag,
|
||||
Arabica::SAX::XMLReader<string_type, string_adaptor>,
|
||||
T1,
|
||||
T0>::type SAX_parser_type;
|
||||
};
|
||||
|
||||
template<class stringT,
|
||||
class T0 = Arabica::nil_t,
|
||||
class T1 = Arabica::nil_t>
|
||||
class Parser : protected Arabica::SAX::DefaultHandler<stringT, typename ParserTypes<stringT, T0, T1>::string_adaptor>
|
||||
{
|
||||
typedef typename ParserTypes<stringT, T0, T1>::string_adaptor string_adaptorT;
|
||||
typedef typename ParserTypes<stringT, T0, T1>::SAX_parser_type SAX_parser_type;
|
||||
typedef Arabica::SAX::Attributes<stringT, string_adaptorT> AttributesT;
|
||||
typedef Arabica::SAX::EntityResolver<stringT, string_adaptorT> EntityResolverT;
|
||||
typedef Arabica::SAX::ErrorHandler<stringT, string_adaptorT> ErrorHandlerT;
|
||||
|
@ -88,7 +103,7 @@ class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
|
|||
inDTD_ = false;
|
||||
inEntity_ = 0;
|
||||
|
||||
SAX_parser parser;
|
||||
SAX_parser_type parser;
|
||||
parser.setContentHandler(*this);
|
||||
parser.setErrorHandler(*this);
|
||||
if(entityResolver_)
|
||||
|
@ -157,7 +172,7 @@ class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
|
|||
Arabica::SAX::AttributeTypes<stringT, string_adaptorT> attributeTypes_;
|
||||
|
||||
protected:
|
||||
void setParserFeatures(SAX_parser& parser) const
|
||||
void setParserFeatures(SAX_parser_type& parser) const
|
||||
{
|
||||
for(typename Features::const_iterator f = features_.begin(), e = features_.end(); f != e; ++f)
|
||||
try {
|
||||
|
|
|
@ -37,7 +37,7 @@ namespace DOM
|
|||
namespace StreamImpl
|
||||
{
|
||||
template<class stringT, class string_adaptorT, class charT, class traitsT>
|
||||
void streamChildren(std::basic_ostream<charT, traitsT>& stream, DOM::Node<stringT, string_adaptorT>& node)
|
||||
void streamChildren(std::basic_ostream<charT, traitsT>& stream, const DOM::Node<stringT, string_adaptorT>& node)
|
||||
{
|
||||
DOM::Node<stringT> child = node.getFirstChild();
|
||||
while(child != 0)
|
||||
|
@ -72,7 +72,7 @@ std::pair<bool, stringT> is_uri_declared(std::vector<std::map<stringT, stringT>
|
|||
|
||||
template<class stringT, class string_adaptorT, class charT, class traitsT>
|
||||
void check_and_output_node_name(std::basic_ostream<charT, traitsT>& stream,
|
||||
DOM::Node<stringT, string_adaptorT>& node,
|
||||
const DOM::Node<stringT, string_adaptorT>& node,
|
||||
std::vector<std::map<stringT, stringT> >* prefix_stack)
|
||||
{
|
||||
std::map<stringT, stringT>& current = *(prefix_stack->rbegin());
|
||||
|
@ -112,7 +112,7 @@ bool isXmlns(const stringT& str)
|
|||
|
||||
template<class stringT, class string_adaptorT, class charT, class traitsT>
|
||||
int prefix_mapper(std::basic_ostream<charT, traitsT>& stream,
|
||||
DOM::Node<stringT, string_adaptorT>& node)
|
||||
const DOM::Node<stringT, string_adaptorT>& node)
|
||||
{
|
||||
typedef Arabica::text::Unicode<charT> UnicodeT;
|
||||
|
||||
|
@ -189,7 +189,7 @@ int prefix_mapper(std::basic_ostream<charT, traitsT>& stream,
|
|||
|
||||
template<class stringT, class string_adaptorT, class charT, class traitsT>
|
||||
void prefix_mapper_pop(std::basic_ostream<charT, traitsT>& stream,
|
||||
DOM::Node<stringT, string_adaptorT> node,
|
||||
const DOM::Node<stringT, string_adaptorT>& node,
|
||||
int index,
|
||||
bool output)
|
||||
{
|
||||
|
@ -212,7 +212,7 @@ void prefix_mapper_pop(std::basic_ostream<charT, traitsT>& stream,
|
|||
template<class stringT, class string_adaptorT, class charT, class traitsT>
|
||||
std::basic_ostream<charT, traitsT>&
|
||||
operator<<(std::basic_ostream<charT, traitsT>& stream,
|
||||
DOM::Node<stringT, string_adaptorT>& node)
|
||||
const DOM::Node<stringT, string_adaptorT>& node)
|
||||
{
|
||||
typedef Arabica::text::Unicode<charT> UnicodeT;
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <string>
|
||||
|
||||
#include <SAX/ArabicaConfig.hpp>
|
||||
#include <Arabica/StringAdaptor.hpp>
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
@ -43,7 +44,7 @@ namespace SAX
|
|||
* @version 2.0
|
||||
* @see ContentHandler#setDocumentLocator
|
||||
*/
|
||||
template<class string_type, class string_adaptor>
|
||||
template<class string_type, class string_adaptor = Arabica::default_string_adaptor<string_type> >
|
||||
class Locator
|
||||
{
|
||||
public:
|
||||
|
|
|
@ -63,8 +63,10 @@ namespace SAX
|
|||
* @see helpers.ParserAdapter
|
||||
* @see helpers.XMLReaderAdapter
|
||||
*/
|
||||
class XMLReaderInterface_tag { };
|
||||
|
||||
template<class string_type, class T0, class T1>
|
||||
class XMLReaderInterface
|
||||
class XMLReaderInterface : public XMLReaderInterface_tag
|
||||
{
|
||||
public:
|
||||
typedef typename Arabica::get_param<Arabica::string_adaptor_tag,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include <SAX/ArabicaConfig.hpp>
|
||||
#include <SAX/Attributes.hpp>
|
||||
#include <stdexcept>
|
||||
#include <deque>
|
||||
|
||||
namespace Arabica
|
||||
|
@ -38,7 +39,7 @@ namespace SAX
|
|||
* <a href="mailto:jez@jezuk.co.uk">jez@jezuk.co.uk</a>
|
||||
* @version 2.0
|
||||
*/
|
||||
template<class string_type, class string_adaptor>
|
||||
template<class string_type, class string_adaptor = Arabica::default_string_adaptor<string_type> >
|
||||
class AttributesImpl : public Attributes<string_type, string_adaptor>
|
||||
{
|
||||
public:
|
||||
|
@ -64,6 +65,14 @@ public:
|
|||
|
||||
return *this;
|
||||
} // operator=
|
||||
bool operator==(const Attr& rhs) const
|
||||
{
|
||||
return (uri_ == rhs.uri_) &&
|
||||
(localName_ == rhs.localName_) &&
|
||||
(qName_ == rhs.qName_) &&
|
||||
(type_ == rhs.type_) &&
|
||||
(value_ == rhs.value_);
|
||||
} // operator==
|
||||
|
||||
string_type uri_;
|
||||
string_type localName_;
|
||||
|
@ -75,11 +84,21 @@ public:
|
|||
////////////////////////////////////////////////////////////////////
|
||||
// Constructors.
|
||||
AttributesImpl() { }
|
||||
AttributesImpl(const AttributesT& atts)
|
||||
AttributesImpl(const AttributesT& rhs)
|
||||
{
|
||||
setAttributes(atts);
|
||||
setAttributes(rhs);
|
||||
} // AttributesImpl
|
||||
|
||||
// Replace the contents of this list with a copy of another Attributes
// object.  Returns a reference to this object so assignments can be chained.
AttributesImpl& operator=(const AttributesT& rhs)
{
  // Assigning an object to itself must leave it untouched
  if(&rhs != this)
    setAttributes(rhs);
  return *this;
} // operator=
|
||||
|
||||
bool operator==(const AttributesImpl& rhs) const
|
||||
{
|
||||
return attributes_ == rhs.attributes_;
|
||||
} // operator==
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Implementation of SAX::Attributes.
|
||||
/**
|
||||
|
|
13
include/Taggle/Taggle.hpp
Normal file
13
include/Taggle/Taggle.hpp
Normal file
|
@ -0,0 +1,13 @@
|
|||
#ifndef ARABICA_TAGGLE_TAGGLE_HPP
|
||||
#define ARABICA_TAGGLE_TAGGLE_HPP
|
||||
|
||||
#include "impl/ScanHandler.hpp"
|
||||
#include "impl/ElementType.hpp"
|
||||
#include "impl/Element.hpp"
|
||||
#include "impl/Schema.hpp"
|
||||
#include "impl/html/HTMLModels.hpp"
|
||||
#include "impl/html/HTMLScanner.hpp"
|
||||
#include "impl/html/HTMLSchema.hpp"
|
||||
#include "impl/Parser.hpp"
|
||||
|
||||
#endif
|
304
include/Taggle/impl/Element.hpp
Executable file
304
include/Taggle/impl/Element.hpp
Executable file
|
@ -0,0 +1,304 @@
|
|||
#ifndef ARABICA_SAX_TAGSOUP_ELEMENT_HPP
|
||||
#define ARABICA_SAX_TAGSOUP_ELEMENT_HPP
|
||||
|
||||
#include <string>
|
||||
#include <SAX/helpers/AttributesImpl.hpp>
|
||||
#include "ElementType.hpp"
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
/**
|
||||
The internal representation of an actual element (not an element type).
|
||||
An Element has an element type, attributes, and a successor Element
|
||||
for use in constructing stacks and queues of Elements.
|
||||
@see ElementType
|
||||
@see AttributesImpl
|
||||
|
||||
Based on code from John Cowan's super TagSoup package
|
||||
*/
|
||||
class Element
|
||||
{
|
||||
private:
|
||||
ElementType* type_; // type of element
|
||||
AttributesImpl<std::string> atts_; // attributes of element
|
||||
const Element* next_; // successor of element
|
||||
bool preclosed_; // this element has been preclosed
|
||||
|
||||
public:
|
||||
static const Element Null;
|
||||
|
||||
Element() :
|
||||
type_(&ElementType::Null),
|
||||
atts_(),
|
||||
next_(0),
|
||||
preclosed_(false)
|
||||
{
|
||||
} // Element
|
||||
|
||||
Element(const Element& rhs):
|
||||
type_(rhs.type_),
|
||||
atts_(rhs.atts_),
|
||||
next_(0),
|
||||
preclosed_(rhs.preclosed_)
|
||||
{
|
||||
if(rhs.next_)
|
||||
next_ = new Element(*rhs.next_);
|
||||
} // Element
|
||||
|
||||
/**
|
||||
Return an Element from a specified ElementType.
|
||||
@param type The element type of the newly constructed element
|
||||
@param defaultAttributes True if default attributes are wanted
|
||||
*/
|
||||
Element(ElementType& type, bool defaultAttributes) :
|
||||
type_(&type),
|
||||
atts_(),
|
||||
next_(0),
|
||||
preclosed_(false)
|
||||
{
|
||||
if (defaultAttributes)
|
||||
atts_ = type.atts();
|
||||
} // Element
|
||||
|
||||
~Element()
|
||||
{
|
||||
if(next_ && (*next_ != Null))
|
||||
delete next_;
|
||||
} // ~Element
|
||||
|
||||
Element& operator=(const Element& rhs)
|
||||
{
|
||||
type_ = rhs.type_;
|
||||
atts_ = rhs.atts_;
|
||||
preclosed_ = rhs.preclosed_;
|
||||
if(next_ && (*next_ != Null))
|
||||
delete next_;
|
||||
if(rhs.next_)
|
||||
next_ = new Element(*rhs.next_);
|
||||
else
|
||||
next_ = 0;
|
||||
return *this;
|
||||
} // operator=
|
||||
|
||||
bool operator==(const Element& rhs) const
|
||||
{
|
||||
bool ok = (type_ == rhs.type_) &&
|
||||
(atts_ == rhs.atts_) &&
|
||||
(preclosed_ == rhs.preclosed_);
|
||||
if(!ok)
|
||||
return false;
|
||||
|
||||
if(!next_ && !rhs.next_)
|
||||
return true;
|
||||
|
||||
if((!next_ && rhs.next_) ||
|
||||
(next_ && !rhs.next_))
|
||||
return false;
|
||||
|
||||
return (*next_ == *rhs.next_);
|
||||
} // operator==
|
||||
|
||||
bool operator!=(const Element& rhs) const
|
||||
{
|
||||
return !(*this == rhs);
|
||||
} // operator!=
|
||||
|
||||
/**
|
||||
Return the element type.
|
||||
@return The element type.
|
||||
*/
|
||||
const ElementType& type() const
|
||||
{
|
||||
return *type_;
|
||||
} // type
|
||||
|
||||
/**
|
||||
Return the attributes as an AttributesImpl object.
|
||||
Returning an AttributesImpl makes the attributes mutable.
|
||||
@return The attributes
|
||||
@see AttributesImpl
|
||||
*/
|
||||
const AttributesImpl<std::string>& atts() const
|
||||
{
|
||||
return atts_;
|
||||
} // atts
|
||||
|
||||
/**
|
||||
Return the next element in an element stack or queue.
|
||||
@return The next element
|
||||
*/
|
||||
Element next() const
|
||||
{
|
||||
if(!next_)
|
||||
return Null;
|
||||
return *next_;
|
||||
} // next
|
||||
|
||||
/**
|
||||
Change the next element in an element stack or queue.
|
||||
@param next The new next element
|
||||
*/
|
||||
void setNext(const Element& next)
|
||||
{
|
||||
if(next_ && (*next_ != Null))
|
||||
delete next_;
|
||||
next_ = new Element(next);
|
||||
} // setNext
|
||||
|
||||
/**
|
||||
Return the name of the element's type.
|
||||
Convenience method.
|
||||
@return The element type name
|
||||
*/
|
||||
std::string name() const
|
||||
{
|
||||
return type_->name();
|
||||
} // name
|
||||
|
||||
/**
|
||||
Return the namespace name of the element's type.
|
||||
Convenience method.
|
||||
@return The element type namespace name
|
||||
*/
|
||||
std::string namespaceName() const
|
||||
{
|
||||
return type_->namespaceName();
|
||||
} // namespaceName
|
||||
|
||||
/**
|
||||
Return the local name of the element's type.
|
||||
Convenience method.
|
||||
@return The element type local name
|
||||
*/
|
||||
std::string localName() const
|
||||
{
|
||||
return type_->localName();
|
||||
} // localName
|
||||
|
||||
/**
|
||||
Return the content model vector of the element's type.
|
||||
Convenience method.
|
||||
@return The content model vector
|
||||
*/
|
||||
int model() const
|
||||
{
|
||||
return type_->model();
|
||||
} // model
|
||||
|
||||
/**
|
||||
Return the member-of vector of the element's type.
|
||||
Convenience method.
|
||||
@return The member-of vector
|
||||
*/
|
||||
int memberOf() const
|
||||
{
|
||||
return type_->memberOf();
|
||||
} // memberOf
|
||||
|
||||
/**
|
||||
Return the flags vector of the element's type.
|
||||
Convenience method.
|
||||
@return The flags vector
|
||||
*/
|
||||
int flags() const
|
||||
{
|
||||
return type_->flags();
|
||||
} // flags
|
||||
|
||||
/**
|
||||
Return the parent element type of the element's type.
|
||||
Convenience method.
|
||||
@return The parent element type
|
||||
*/
|
||||
ElementType& parent() const
|
||||
{
|
||||
return type_->parent();
|
||||
} // parent
|
||||
|
||||
/**
|
||||
Return true if the type of this element can contain the type of
|
||||
another element.
|
||||
Convenience method.
|
||||
@param other The other element
|
||||
*/
|
||||
bool canContain(const Element& other) const
|
||||
{
|
||||
return type_->canContain(*(other.type_));
|
||||
} // canContain
|
||||
|
||||
/**
|
||||
Set an attribute and its value into this element.
|
||||
@param name The attribute name (Qname)
|
||||
@param type The attribute type
|
||||
@param value The attribute value
|
||||
*/
|
||||
void setAttribute(const std::string& name, const std::string& type, const std::string& value)
|
||||
{
|
||||
type_->setAttribute(atts_, name, type, value);
|
||||
} // setAttribute
|
||||
|
||||
/**
|
||||
Make this element anonymous.
|
||||
Remove any <tt>id</tt> or <tt>name</tt> attribute present
|
||||
in the element's attributes.
|
||||
*/
|
||||
void anonymize()
|
||||
{
|
||||
for (int i = atts_.getLength() - 1; i >= 0; i--)
|
||||
{
|
||||
if((atts_.getType(i) == "ID") ||
|
||||
(atts_.getQName(i) == "name"))
|
||||
{
|
||||
atts_.removeAttribute(i);
|
||||
}
|
||||
} // for ...
|
||||
} // anonymize
|
||||
|
||||
/**
|
||||
Clean the attributes of this element.
|
||||
Attributes with null name (the name was ill-formed)
|
||||
or null value (the attribute was present in the element type but
|
||||
not in this actual element) are removed.
|
||||
*/
|
||||
void clean()
|
||||
{
|
||||
for (int i = atts_.getLength() - 1; i >= 0; i--)
|
||||
{
|
||||
const std::string& name = atts_.getLocalName(i);
|
||||
if (atts_.getValue(i) == "" || name == "" || name.length() == 0)
|
||||
{
|
||||
atts_.removeAttribute(i);
|
||||
continue;
|
||||
} // if ...
|
||||
} // for ...
|
||||
} // clean
|
||||
|
||||
/**
|
||||
Force this element to preclosed status, meaning that an end-tag has
|
||||
been seen but the element cannot yet be closed for structural reasons.
|
||||
*/
|
||||
void preclose()
|
||||
{
|
||||
preclosed_ = true;
|
||||
} // preclose
|
||||
|
||||
/**
|
||||
Return true if this element has been preclosed.
|
||||
*/
|
||||
bool isPreclosed() const
|
||||
{
|
||||
return preclosed_;
|
||||
} // isPreclosed
|
||||
}; // class Element
|
||||
|
||||
// Definition of the Element::Null sentinel: a default-constructed Element
// (type ElementType::Null, no attributes, no successor).
// NOTE(review): this is a non-inline definition inside a header; including
// the header from more than one translation unit would presumably produce
// duplicate-symbol link errors - confirm how the header is consumed.
const Element Element::Null;
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
|
||||
#endif
|
333
include/Taggle/impl/ElementType.hpp
Executable file
333
include/Taggle/impl/ElementType.hpp
Executable file
|
@ -0,0 +1,333 @@
|
|||
#ifndef ARABICA_SAX_ELEMENT_TYPE_HPP
|
||||
#define ARABICA_SAX_ELEMENT_TYPE_HPP
|
||||
|
||||
#include <SAX/helpers/AttributesImpl.hpp>
|
||||
#include <text/normalize_whitespace.hpp>
|
||||
#include "Schema.hpp"
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
|
||||
/**
|
||||
This class represents an element type in the schema.
|
||||
An element type has a name, a content model vector, a member-of vector,
|
||||
a flags vector, default attributes, and a schema to which it belongs.
|
||||
|
||||
Based on code from John Cowan's super TagSoup package
|
||||
@see Schema
|
||||
*/
|
||||
class ElementType
|
||||
{
|
||||
private:
|
||||
std::string name_; // element type name (Qname)
|
||||
std::string namespace_; // element type namespace name
|
||||
std::string localName_; // element type local name
|
||||
int model_; // bitmap: what the element contains
|
||||
int memberOf_; // bitmap: what element is contained in
|
||||
int flags_; // bitmap: element flags
|
||||
AttributesImpl<std::string> atts_; // default attributes
|
||||
ElementType* parent_; // parent of this element type
|
||||
Schema* schema_; // schema to which this belongs
|
||||
|
||||
public:
|
||||
static ElementType Null;
|
||||
|
||||
private:
|
||||
ElementType() :
|
||||
name_("<null>"),
|
||||
namespace_("<null>"),
|
||||
localName_("<null>"),
|
||||
model_(0),
|
||||
memberOf_(0),
|
||||
flags_(0),
|
||||
atts_(),
|
||||
parent_(0),
|
||||
schema_(0)
|
||||
{
|
||||
} // ElementType
|
||||
|
||||
/**
|
||||
Construct an ElementType:
|
||||
but it's better to use Schema.element() instead.
|
||||
The content model, member-of, and flags vectors are specified as ints.
|
||||
@param name The element type name
|
||||
@param model ORed-together bits representing the content models
|
||||
allowed in the content of this element type
|
||||
@param memberOf ORed-together bits representing the content models
|
||||
to which this element type belongs
|
||||
@param flags ORed-together bits representing the flags associated
|
||||
with this element type
|
||||
@param schema The schema with which this element type will be
|
||||
associated
|
||||
*/
|
||||
ElementType(const std::string& name, int model, int memberOf, int flags, Schema& schema) :
|
||||
name_(name),
|
||||
model_(model),
|
||||
memberOf_(memberOf),
|
||||
flags_(flags),
|
||||
schema_(&schema),
|
||||
namespace_(),
|
||||
localName_(),
|
||||
parent_(0)
|
||||
{
|
||||
namespace_ = namespaceName(name, false);
|
||||
localName_ = localName(name);
|
||||
} // ElementType
|
||||
|
||||
ElementType(const ElementType& rhs) :
|
||||
name_(rhs.name_),
|
||||
model_(rhs.model_),
|
||||
memberOf_(rhs.memberOf_),
|
||||
flags_(rhs.flags_),
|
||||
schema_(rhs.schema_),
|
||||
namespace_(rhs.namespace_),
|
||||
localName_(rhs.localName_),
|
||||
parent_(rhs.parent_)
|
||||
{
|
||||
} // ElementType
|
||||
|
||||
friend class SchemaImpl;
|
||||
|
||||
public:
|
||||
/**
|
||||
Return a namespace name from a Qname.
|
||||
The attribute flag tells us whether to return an empty namespace
|
||||
name if there is no prefix, or use the schema default instead.
|
||||
@param name The Qname
|
||||
@param attribute True if name is an attribute name
|
||||
@return The namespace name
|
||||
**/
|
||||
std::string namespaceName(const std::string& name, bool attribute) const
|
||||
{
|
||||
size_t colon = name.find(':');
|
||||
if (colon == std::string::npos)
|
||||
return attribute ? "" : schema_->getURI();
|
||||
|
||||
std::string prefix = name.substr(0, colon);
|
||||
if (prefix == "xml")
|
||||
return "http://www.w3.org/XML/1998/namespace";
|
||||
else
|
||||
return "urn:x-prefix:" + prefix;
|
||||
} // namespaceName
|
||||
|
||||
/**
|
||||
Return a local name from a Qname.
|
||||
@param name The Qname
|
||||
@return The local name
|
||||
**/
|
||||
std::string localName(const std::string& name) const
|
||||
{
|
||||
size_t colon = name.find(':');
|
||||
if (colon == std::string::npos)
|
||||
return name;
|
||||
else
|
||||
return name.substr(colon+1);
|
||||
} // localName
|
||||
|
||||
/**
|
||||
Returns the name of this element type.
|
||||
@return The name of the element type
|
||||
*/
|
||||
std::string name() const { return name_; }
|
||||
|
||||
/**
|
||||
Returns the namespace name of this element type.
|
||||
@return The namespace name of the element type
|
||||
*/
|
||||
std::string namespaceName() const { return namespace_; }
|
||||
|
||||
/**
|
||||
Returns the local name of this element type.
|
||||
@return The local name of the element type
|
||||
*/
|
||||
std::string localName() const { return localName_; }
|
||||
|
||||
/**
|
||||
Returns the content models of this element type.
|
||||
@return The content models of this element type as a vector of bits
|
||||
*/
|
||||
int model() const { return model_; }
|
||||
|
||||
/**
|
||||
Returns the content models to which this element type belongs.
|
||||
@return The content models to which this element type belongs as a
|
||||
vector of bits
|
||||
*/
|
||||
int memberOf() const { return memberOf_; }
|
||||
|
||||
/**
|
||||
Returns the flags associated with this element type.
|
||||
@return The flags associated with this element type as a vector of bits
|
||||
*/
|
||||
int flags() const { return flags_; }
|
||||
|
||||
/**
|
||||
Returns the default attributes associated with this element type.
|
||||
Attributes of type CDATA that don't have default values are
|
||||
typically not included. Other attributes without default values
|
||||
have an internal value of <tt>null</tt>.
|
||||
The return value is an AttributesImpl to allow the caller to mutate
|
||||
the attributes.
|
||||
*/
|
||||
const AttributesImpl<std::string>& atts() const { return atts_; }
|
||||
|
||||
/**
|
||||
Returns the parent element type of this element type.
|
||||
@return The parent element type
|
||||
*/
|
||||
ElementType& parent() const
|
||||
{
|
||||
return *parent_;
|
||||
} // parent
|
||||
|
||||
/**
|
||||
Returns the schema which this element type is associated with.
|
||||
@return The schema
|
||||
*/
|
||||
Schema& schema() const
|
||||
{
|
||||
return *schema_;
|
||||
} // schema
|
||||
|
||||
|
||||
/**
|
||||
Returns true if this element type can contain another element type.
|
||||
That is, if any of the models in this element's model vector
|
||||
match any of the models in the other element type's member-of
|
||||
vector.
|
||||
@param other The other element type
|
||||
*/
|
||||
bool canContain(const ElementType& other) const
|
||||
{
|
||||
return (model_ & other.memberOf_) != 0;
|
||||
} // canContain
|
||||
|
||||
|
||||
/**
|
||||
Sets an attribute and its value into an AttributesImpl object.
|
||||
Attempts to set a namespace declaration are ignored.
|
||||
@param atts The AttributesImpl object
|
||||
@param name The name (Qname) of the attribute
|
||||
@param type The type of the attribute
|
||||
@param value The value of the attribute
|
||||
*/
|
||||
void setAttribute(AttributesImpl<std::string>& atts,
|
||||
const std::string& name,
|
||||
const std::string& type,
|
||||
const std::string& value)
|
||||
{
|
||||
if (name == "xmlns" || name.find("xmlns:") == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::string namespaceN = namespaceName(name, true);
|
||||
std::string localN = localName(name);
|
||||
std::string actualType = type;
|
||||
std::string actualValue = value;
|
||||
|
||||
int i = atts.getIndex(name);
|
||||
if (i == -1)
|
||||
{
|
||||
if (actualType == "")
|
||||
actualType = "CDATA";
|
||||
if (actualType != "CDATA")
|
||||
actualValue = Arabica::text::normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);
|
||||
atts.addAttribute(namespaceN, localN, name, actualType, actualValue);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (actualType == "")
|
||||
actualType = atts.getType(i);
|
||||
if (actualType != ("CDATA"))
|
||||
actualValue = Arabica::text::normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);
|
||||
atts.setAttribute(i, namespaceN, localN, name, actualType, actualValue);
|
||||
}
|
||||
} // setAttribute
|
||||
|
||||
/**
|
||||
Sets an attribute and its value into this element type.
|
||||
@param name The name of the attribute
|
||||
@param type The type of the attribute
|
||||
@param value The value of the attribute
|
||||
*/
|
||||
void setAttribute(const std::string& name, const std::string& type, const std::string& value)
|
||||
{
|
||||
setAttribute(atts_, name, type, value);
|
||||
} // setAttribute
|
||||
|
||||
/**
|
||||
Sets the models of this element type.
|
||||
@param model The content models of this element type as a vector of bits
|
||||
*/
|
||||
void setModel(int model)
|
||||
{
|
||||
model_ = model;
|
||||
} // setModel
|
||||
|
||||
/**
|
||||
Sets the content models to which this element type belongs.
|
||||
@param memberOf The content models to which this element type belongs as a vector of bits
|
||||
*/
|
||||
void setMemberOf(int memberOf)
|
||||
{
|
||||
memberOf_ = memberOf;
|
||||
} // setMemberOf
|
||||
|
||||
/**
|
||||
Sets the flags of this element type.
|
||||
@param flags associated with this element type The flags as a vector of bits
|
||||
*/
|
||||
void setFlags(int flags)
|
||||
{
|
||||
flags_ = flags;
|
||||
} // setFlags
|
||||
|
||||
/**
|
||||
Sets the parent element type of this element type.
|
||||
@param parent The parent element type
|
||||
*/
|
||||
void setParent(ElementType& parent)
|
||||
{
|
||||
parent_ = &parent;
|
||||
} // setParent
|
||||
|
||||
bool operator==(const ElementType& rhs) const
|
||||
{
|
||||
return (name_ == rhs.name_) &&
|
||||
(namespace_ == rhs.namespace_) &&
|
||||
(localName_ == rhs.localName_) &&
|
||||
(model_ == rhs.model_) &&
|
||||
(memberOf_ == rhs.memberOf_) &&
|
||||
(flags_ == rhs.flags_) &&
|
||||
(parent_ == rhs.parent_) &&
|
||||
(schema_ == rhs.schema_);
|
||||
} // operator ==
|
||||
|
||||
ElementType& operator=(const ElementType& rhs)
|
||||
{
|
||||
name_ = rhs.name_;
|
||||
namespace_ = rhs.namespace_;
|
||||
localName_ = rhs.localName_;
|
||||
model_ = rhs.model_;
|
||||
memberOf_ = rhs.memberOf_;
|
||||
flags_ = rhs.flags_;
|
||||
atts_ = rhs.atts_;
|
||||
parent_ = rhs.parent_;
|
||||
schema_ = rhs.schema_;
|
||||
|
||||
return *this;
|
||||
} // operator=
|
||||
}; // class ElementType
|
||||
|
||||
// Definition of the ElementType::Null sentinel, built by the private default
// constructor (names "<null>", empty bitmaps, no parent, no schema).
// NOTE(review): this is a non-inline definition inside a header; including
// the header from more than one translation unit would presumably produce
// duplicate-symbol link errors - confirm how the header is consumed.
ElementType ElementType::Null;
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
|
||||
#endif
|
1391
include/Taggle/impl/Parser.hpp
Normal file
1391
include/Taggle/impl/Parser.hpp
Normal file
File diff suppressed because it is too large
Load diff
105
include/Taggle/impl/ScanHandler.hpp
Executable file
105
include/Taggle/impl/ScanHandler.hpp
Executable file
|
@ -0,0 +1,105 @@
|
|||
#ifndef ARABICA_SCAN_HANDLER_HPP
|
||||
#define ARABICA_SCAN_HANDLER_HPP
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
/**
   An interface that Scanners use to report events in the input stream.
   Every reporting method receives the text of the event as a string.

   This code is derived from John Cowan's splendid TagSoup package
*/
class ScanHandler
{
public:
  /**
     Virtual so that an implementation can be destroyed safely through a
     pointer or reference to this interface.
  **/
  virtual ~ScanHandler() { }

  /**
     Reports an attribute name without a value.
  **/
  virtual void adup(const std::string& buff) = 0;

  /**
     Reports an attribute name; a value will follow.
  **/
  virtual void aname(const std::string& buff) = 0;

  /**
     Reports an attribute value.
  **/
  virtual void aval(const std::string& buff) = 0;

  /**
     Reports the content of a CDATA section (not a CDATA element)
  **/
  virtual void cdsect(const std::string& buff) = 0;

  /**
     Reports a <!....> declaration - typically a DOCTYPE
  **/
  virtual void decl(const std::string& buff) = 0;

  /**
     Reports an entity reference or character reference.
  **/
  virtual void entity(const std::string& buff) = 0;

  /**
     Reports EOF.
  **/
  virtual void eof(const std::string& buff) = 0;

  /**
     Reports an end-tag.
  **/
  virtual void etag(const std::string& buff) = 0;

  /**
     Reports the general identifier (element type name) of a start-tag.
  **/
  virtual void gi(const std::string& buff) = 0;

  /**
     Reports character content.
  **/
  virtual void pcdata(const std::string& buff) = 0;

  /**
     Reports the data part of a processing instruction.
  **/
  virtual void pi(const std::string& buff) = 0;

  /**
     Reports the target part of a processing instruction.
  **/
  virtual void pitarget(const std::string& buff) = 0;

  /**
     Reports the close of a start-tag.
  **/
  virtual void stagc(const std::string& buff) = 0;

  /**
     Reports the close of an empty-tag.
  **/
  virtual void stage(const std::string& buff) = 0;

  /**
     Reports a comment.
  **/
  virtual void cmnt(const std::string& buff) = 0;

  /**
     Returns the value of the last entity or character reference reported.
  **/
  virtual int getEntity() = 0;
}; // class ScanHandler
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
|
||||
#endif
|
45
include/Taggle/impl/Scanner.hpp
Executable file
45
include/Taggle/impl/Scanner.hpp
Executable file
|
@ -0,0 +1,45 @@
|
|||
#ifndef ARABICA_SAX_SCANNER_HPP
|
||||
#define ARABICA_SAX_SCANNER_HPP
|
||||
|
||||
#include <istream>
|
||||
#include <string>
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
class ScanHandler;

/**
   Interface through which a parser drives a lexical scanner.

   Implementations read characters from a stream and report the lexical
   events they recognise to a ScanHandler.

   This code is derived from John Cowan's splendid TagSoup package
*/
class Scanner
{
public:
  /**
     Virtual so that a concrete scanner can be destroyed safely through a
     pointer or reference to this interface.
  **/
  virtual ~Scanner() { }

  /**
     Invoke a scanner.
     @param r A source of characters to scan
     @param h A ScanHandler to report events to
  **/
  virtual void scan(std::istream& r, ScanHandler& h) = 0;

  /**
     Reset the embedded locator.
     @param publicid The publicid of the source
     @param systemid The systemid of the source
  **/
  virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid) = 0;

  /**
     Signal to the scanner to start CDATA content mode.
  **/
  virtual void startCDATA() = 0;
}; // class Scanner
|
||||
|
||||
|
||||
} // namespace SAX
|
||||
} // namespace Arabica
|
||||
#endif
|
44
include/Taggle/impl/Schema.hpp
Normal file
44
include/Taggle/impl/Schema.hpp
Normal file
|
@ -0,0 +1,44 @@
|
|||
#ifndef ARABICA_SAX_TAGGLE_SCHEMA_HPP
|
||||
#define ARABICA_SAX_TAGGLE_SCHEMA_HPP
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
class ElementType;

/**
   Abstract class representing a TSSL schema.
   Actual TSSL schemas are compiled into concrete subclasses of this class.

   Based on code from John Cowan's super TagSoup package
**/
class Schema
{
public:
  // Content-model bit masks, used for the model / memberOf arguments of
  // elementType().  M_ANY has every bit set and M_ROOT is the top bit; both
  // are spelled so that no signed overflow or out-of-range literal is
  // involved, while keeping the values the original expressions produced.
  static const int M_ANY = ~0;
  static const int M_EMPTY = 0;
  static const int M_PCDATA = 1 << 30;
  static const int M_ROOT = static_cast<int>(1u << 31);

  // Element flag bits.
  // NOTE(review): presumably these are OR-ed into the flags argument of
  // elementType(); their exact meaning is defined by ElementType - confirm.
  static const int F_RESTART = 1;
  static const int F_CDATA = 2;
  static const int F_NOFORCE = 4;

  /** Add or replace an element type. */
  virtual void elementType(const std::string& name, int model, int memberOf, int flags) = 0;
  /** The root element type of the schema. */
  virtual ElementType& rootElementType() = 0;
  /** Declare parentName as the parent of the element called name. */
  virtual void parent(std::string name, std::string parentName) = 0;

  /** Look up an element type by name. */
  virtual ElementType& getElementType(const std::string& name) = 0;
  /** Look up the value of a named entity. */
  virtual int getEntity(const std::string& name) const = 0;
  /** The URI (namespace name) of the schema. */
  virtual const std::string& getURI() const = 0;
  /** The prefix of the schema. */
  virtual const std::string& getPrefix() const = 0;

  virtual ~Schema() { }
}; // class Schema
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
#endif
|
182
include/Taggle/impl/SchemaImpl.hpp
Normal file
182
include/Taggle/impl/SchemaImpl.hpp
Normal file
|
@ -0,0 +1,182 @@
|
|||
#ifndef ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
|
||||
#define ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include "ElementType.hpp"
|
||||
#include "Schema.hpp"
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
/**
|
||||
Abstract class representing a TSSL schema.
|
||||
Actual TSSL schemas are compiled into concrete subclasses of this class.
|
||||
|
||||
Based on code from John Cowan's super TagSoup package
|
||||
**/
|
||||
class SchemaImpl : public Schema
|
||||
{
|
||||
private:
|
||||
std::map<std::string, char> entities_;
|
||||
std::map<std::string, ElementType*> elementTypes_;
|
||||
|
||||
std::string URI_;
|
||||
std::string prefix_;
|
||||
ElementType* root_;
|
||||
|
||||
public:
|
||||
virtual ~SchemaImpl()
|
||||
{
|
||||
for(std::map<std::string, ElementType*>::iterator i = elementTypes_.begin(), ie = elementTypes_.end(); i != ie; ++i)
|
||||
delete i->second;
|
||||
} // ~SchemaImpl
|
||||
|
||||
/**
|
||||
Add or replace an element type for this schema.
|
||||
@param name Name (Qname) of the element
|
||||
@param model Models of the element's content as a vector of bits
|
||||
@param memberOf Models the element is a member of as a vector of bits
|
||||
@param flags Flags for the element
|
||||
**/
|
||||
void elementType(const std::string& name, int model, int memberOf, int flags)
|
||||
{
|
||||
ElementType* e = new ElementType(name, model, memberOf, flags, *this);
|
||||
std::string lname = lower_case(name);
|
||||
elementTypes_[lname] = e;
|
||||
if(memberOf == M_ROOT)
|
||||
root_ = elementTypes_[lname];
|
||||
} // elementType
|
||||
|
||||
/**
|
||||
Get the root element of this schema
|
||||
**/
|
||||
ElementType& rootElementType()
|
||||
{
|
||||
return *root_;
|
||||
} // rootElementType
|
||||
|
||||
/**
|
||||
Add or replace a default attribute for an element type in this schema.
|
||||
@param elemName Name (Qname) of the element type
|
||||
@param attrName Name (Qname) of the attribute
|
||||
@param type Type of the attribute
|
||||
@param value Default value of the attribute; null if no default
|
||||
**/
|
||||
void attribute(const std::string& elemName, const std::string& attrName, const std::string& type, const std::string& value)
|
||||
{
|
||||
ElementType& e = getElementType(elemName);
|
||||
if (e == ElementType::Null)
|
||||
{
|
||||
throw std::runtime_error("Attribute " + attrName +
|
||||
" specified for unknown element type " +
|
||||
elemName);
|
||||
}
|
||||
e.setAttribute(attrName, type, value);
|
||||
} // attribute
|
||||
|
||||
/**
|
||||
Specify natural parent of an element in this schema.
|
||||
@param name Name of the child element
|
||||
@param parentName Name of the parent element
|
||||
**/
|
||||
void parent(std::string name, std::string parentName)
|
||||
{
|
||||
ElementType& child = getElementType(name);
|
||||
ElementType& parent = getElementType(parentName);
|
||||
if (child == ElementType::Null)
|
||||
{
|
||||
throw std::runtime_error("No child " + name + " for parent " + parentName);
|
||||
}
|
||||
if (parent == ElementType::Null)
|
||||
{
|
||||
throw std::runtime_error("No parent " + parentName + " for child " + name);
|
||||
}
|
||||
child.setParent(parent);
|
||||
} // parent
|
||||
|
||||
/**
|
||||
Add to or replace a character entity in this schema.
|
||||
@param name Name of the entity
|
||||
@param value Value of the entity
|
||||
**/
|
||||
void entity(const std::string& name, int value)
|
||||
{
|
||||
entities_[name] = value;
|
||||
} // entity
|
||||
|
||||
/**
|
||||
Get an ElementType by name.
|
||||
@param name Name (Qname) of the element type
|
||||
@return The corresponding ElementType
|
||||
**/
|
||||
ElementType& getElementType(const std::string& name)
|
||||
{
|
||||
std::map<std::string, ElementType*>::iterator elemType = elementTypes_.find(lower_case(name));
|
||||
if(elemType == elementTypes_.end())
|
||||
return ElementType::Null;
|
||||
return *elemType->second;
|
||||
} // getElementType
|
||||
|
||||
/**
|
||||
Get an entity value by name.
|
||||
@param name Name of the entity
|
||||
@return The corresponding character, or 0 if none
|
||||
**/
|
||||
int getEntity(const std::string& name) const
|
||||
{
|
||||
std::map<std::string, char>::const_iterator ent = entities_.find(name);
|
||||
if(ent == entities_.end())
|
||||
return 0;
|
||||
return ent->second;
|
||||
} // getEntity
|
||||
|
||||
/**
|
||||
Return the URI (namespace name) of this schema.
|
||||
**/
|
||||
const std::string& getURI() const
|
||||
{
|
||||
return URI_;
|
||||
} // getURI
|
||||
|
||||
/**
|
||||
Return the prefix of this schema.
|
||||
**/
|
||||
const std::string& getPrefix() const
|
||||
{
|
||||
return prefix_;
|
||||
} // getPrefix
|
||||
|
||||
/**
|
||||
Change the URI (namespace name) of this schema.
|
||||
**/
|
||||
void setURI(std::string uri)
|
||||
{
|
||||
URI_ = uri;
|
||||
} // setURI
|
||||
|
||||
/**
|
||||
Change the prefix of this schema.
|
||||
**/
|
||||
void setPrefix(std::string prefix)
|
||||
{
|
||||
prefix_ = prefix;
|
||||
} // setPrefix
|
||||
|
||||
private:
|
||||
static std::string lower_case(const std::string& str)
|
||||
{
|
||||
std::string lower;
|
||||
std::transform(str.begin(), str.end(), std::back_inserter(lower), (int(*)(int))std::tolower);
|
||||
return lower;
|
||||
} // lower_case
|
||||
}; // class Schema
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
#endif
|
49
include/Taggle/impl/html/HTMLModels.hpp
Normal file
49
include/Taggle/impl/html/HTMLModels.hpp
Normal file
|
@ -0,0 +1,49 @@
|
|||
#ifndef ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
|
||||
#define ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
/**
   Generated bit masks naming the HTML content models.  Conceptually these
   belong to HTMLSchema, but they are generated into a class of their own
   for convenience.

   Each model occupies one bit, starting at bit 1.

   Based on code from John Cowan's super TagSoup package
*/
class HTMLModels
{
protected:
  // Start of model definitions
  static const int M_AREA        = 0x000002;
  static const int M_BLOCK       = 0x000004;
  static const int M_BLOCKINLINE = 0x000008;
  static const int M_BODY        = 0x000010;
  static const int M_CELL        = 0x000020;
  static const int M_COL         = 0x000040;
  static const int M_DEF         = 0x000080;
  static const int M_FORM        = 0x000100;
  static const int M_FRAME       = 0x000200;
  static const int M_HEAD        = 0x000400;
  static const int M_HTML        = 0x000800;
  static const int M_INLINE      = 0x001000;
  static const int M_LEGEND      = 0x002000;
  static const int M_LI          = 0x004000;
  static const int M_NOLINK      = 0x008000;
  static const int M_OPTION      = 0x010000;
  static const int M_OPTIONS     = 0x020000;
  static const int M_P           = 0x040000;
  static const int M_PARAM       = 0x080000;
  static const int M_TABLE       = 0x100000;
  static const int M_TABULAR     = 0x200000;
  static const int M_TR          = 0x400000;
}; // class HTMLModels
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
#endif
|
||||
|
704
include/Taggle/impl/html/HTMLScanner.hpp
Normal file
704
include/Taggle/impl/html/HTMLScanner.hpp
Normal file
|
@ -0,0 +1,704 @@
|
|||
#ifndef ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
|
||||
#define ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
|
||||
|
||||
#include <SAX/SAXException.hpp>
|
||||
#include <SAX/Locator.hpp>
|
||||
#include <XML/XMLCharacterClasses.hpp>
|
||||
#include "../Scanner.hpp"
|
||||
|
||||
namespace Arabica
|
||||
{
|
||||
|
||||
namespace SAX
|
||||
{
|
||||
|
||||
/**
|
||||
This class implements a table-driven scanner for HTML, allowing for lots of
|
||||
defects. It implements the Scanner interface, which accepts a Reader
|
||||
object to fetch characters from and a ScanHandler object to report lexical
|
||||
events to.
|
||||
|
||||
Based on code from John Cowan's super TagSoup package
|
||||
*/
|
||||
class HTMLScanner : public Scanner, public SAX::Locator<std::string>
|
||||
{
|
||||
private:
|
||||
// Start of state table
|
||||
static const int S_ANAME = 1;
|
||||
static const int S_APOS = 2;
|
||||
static const int S_AVAL = 3;
|
||||
static const int S_BB = 4;
|
||||
static const int S_BBC = 5;
|
||||
static const int S_BBCD = 6;
|
||||
static const int S_BBCDA = 7;
|
||||
static const int S_BBCDAT = 8;
|
||||
static const int S_BBCDATA = 9;
|
||||
static const int S_CDATA = 10;
|
||||
static const int S_CDATA2 = 11;
|
||||
static const int S_CDSECT = 12;
|
||||
static const int S_CDSECT1 = 13;
|
||||
static const int S_CDSECT2 = 14;
|
||||
static const int S_COM = 15;
|
||||
static const int S_COM2 = 16;
|
||||
static const int S_COM3 = 17;
|
||||
static const int S_COM4 = 18;
|
||||
static const int S_DECL = 19;
|
||||
static const int S_DECL2 = 20;
|
||||
static const int S_DONE = 21;
|
||||
static const int S_EMPTYTAG = 22;
|
||||
static const int S_ENT = 23;
|
||||
static const int S_EQ = 24;
|
||||
static const int S_ETAG = 25;
|
||||
static const int S_GI = 26;
|
||||
static const int S_NCR = 27;
|
||||
static const int S_PCDATA = 28;
|
||||
static const int S_PI = 29;
|
||||
static const int S_PITARGET = 30;
|
||||
static const int S_QUOT = 31;
|
||||
static const int S_STAGC = 32;
|
||||
static const int S_TAG = 33;
|
||||
static const int S_TAGWS = 34;
|
||||
static const int S_XNCR = 35;
|
||||
static const int A_ADUP = 1;
|
||||
static const int A_ADUP_SAVE = 2;
|
||||
static const int A_ADUP_STAGC = 3;
|
||||
static const int A_ANAME = 4;
|
||||
static const int A_ANAME_ADUP = 5;
|
||||
static const int A_ANAME_ADUP_STAGC = 6;
|
||||
static const int A_AVAL = 7;
|
||||
static const int A_AVAL_STAGC = 8;
|
||||
static const int A_CDATA = 9;
|
||||
static const int A_CMNT = 10;
|
||||
static const int A_DECL = 11;
|
||||
static const int A_EMPTYTAG = 12;
|
||||
static const int A_ENTITY = 13;
|
||||
static const int A_ENTITY_START = 14;
|
||||
static const int A_ETAG = 15;
|
||||
static const int A_GI = 16;
|
||||
static const int A_GI_STAGC = 17;
|
||||
static const int A_LT = 18;
|
||||
static const int A_LT_PCDATA = 19;
|
||||
static const int A_MINUS = 20;
|
||||
static const int A_MINUS2 = 21;
|
||||
static const int A_MINUS3 = 22;
|
||||
static const int A_PCDATA = 23;
|
||||
static const int A_PI = 24;
|
||||
static const int A_PITARGET = 25;
|
||||
static const int A_PITARGET_PI = 26;
|
||||
static const int A_SAVE = 27;
|
||||
static const int A_SKIP = 28;
|
||||
static const int A_SP = 29;
|
||||
static const int A_STAGC = 30;
|
||||
static const int A_UNGET = 31;
|
||||
static const int A_UNSAVE_PCDATA = 32;
|
||||
static const int statetable[];
|
||||
static const std::string debug_actionnames[];
|
||||
static const std::string debug_statenames[];
|
||||
// End of state table
|
||||
static const int WinCharMap[]; // Windows char map
|
||||
static const std::string hexLetters;
|
||||
|
||||
std::string publicId_; // Locator state
|
||||
std::string systemId_;
|
||||
int lastLine_;
|
||||
int lastColumn_;
|
||||
int currentLine_;
|
||||
int currentColumn_;
|
||||
|
||||
int state_; // Current state
|
||||
int nextState_; // Next state
|
||||
std::string outputBuffer_; // Output buffer
|
||||
|
||||
// Compensate for bug in PushbackReader that allows
|
||||
// pushing back EOF.
|
||||
//void unread(PushbackReader r, int c) throws IOException {
|
||||
// if (c != -1) r.unread(c);
|
||||
// }
|
||||
|
||||
public:
|
||||
HTMLScanner() :
|
||||
publicId_(),
|
||||
systemId_(),
|
||||
lastLine_(0),
|
||||
lastColumn_(0),
|
||||
currentLine_(0),
|
||||
currentColumn_(0),
|
||||
state_(0),
|
||||
nextState_(0),
|
||||
outputBuffer_()
|
||||
{
|
||||
outputBuffer_.reserve(200);
|
||||
} // HTMLScanner
|
||||
|
||||
// Locator implementation
|
||||
int getLineNumber() const
|
||||
{
|
||||
return lastLine_;
|
||||
} // getLineNumber
|
||||
|
||||
int getColumnNumber() const
|
||||
{
|
||||
return lastColumn_;
|
||||
} // getColumnNumber
|
||||
|
||||
std::string getPublicId() const
|
||||
{
|
||||
return publicId_;
|
||||
} // getPublicId
|
||||
|
||||
std::string getSystemId() const
|
||||
{
|
||||
return systemId_;
|
||||
} // getSystemId
|
||||
|
||||
|
||||
// Scanner implementation
|
||||
/**
|
||||
Reset document locator, supplying systemid and publicid.
|
||||
@param systemid System id
|
||||
@param publicid Public id
|
||||
*/
|
||||
virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid)
|
||||
{
|
||||
publicId_ = publicid;
|
||||
systemId_ = systemid;
|
||||
lastLine_ = lastColumn_ = currentLine_ = currentColumn_ = 0;
|
||||
} // resetDocumentLocator
|
||||
|
||||
/**
|
||||
Scan HTML source, reporting lexical events.
|
||||
@param r0 Reader that provides characters
|
||||
@param h ScanHandler that accepts lexical events.
|
||||
*/
|
||||
virtual void scan(std::istream& r, ScanHandler& h)
|
||||
{
|
||||
state_ = S_PCDATA;
|
||||
/* PushbackReader r;
|
||||
if (r0 instanceof PushbackReader) {
|
||||
r = (PushbackReader)r0;
|
||||
}
|
||||
else if (r0 instanceof BufferedReader) {
|
||||
r = new PushbackReader(r0);
|
||||
}
|
||||
else {
|
||||
r = new PushbackReader(new BufferedReader(r0));
|
||||
}
|
||||
*/
|
||||
// int firstChar = r.read(); // Remove any leading BOM
|
||||
// if (firstChar != '\uFEFF') unread(r, firstChar);
|
||||
|
||||
while (state_ != S_DONE)
|
||||
{
|
||||
int ch = r.get();
|
||||
|
||||
// Process control characters
|
||||
if (ch >= 0x80 && ch <= 0x9F)
|
||||
ch = WinCharMap[ch-0x80];
|
||||
|
||||
if (ch == '\r')
|
||||
{
|
||||
ch = r.get(); // expect LF next
|
||||
if (ch != '\n')
|
||||
{
|
||||
r.unget();
|
||||
ch = '\n';
|
||||
}
|
||||
}
|
||||
|
||||
if (ch == '\n')
|
||||
{
|
||||
++currentLine_;
|
||||
currentColumn_ = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
++currentColumn_;
|
||||
}
|
||||
|
||||
if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
|
||||
continue;
|
||||
|
||||
// Search state table
|
||||
int action = 0;
|
||||
for (int i = 0; statetable[i] != -1; i += 4)
|
||||
{
|
||||
if (state_ != statetable[i])
|
||||
{
|
||||
if (action != 0)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
if (statetable[i+1] == 0)
|
||||
{
|
||||
action = statetable[i+2];
|
||||
nextState_ = statetable[i+3];
|
||||
}
|
||||
else if (statetable[i+1] == ch)
|
||||
{
|
||||
action = statetable[i+2];
|
||||
nextState_ = statetable[i+3];
|
||||
break;
|
||||
}
|
||||
} // for ...
|
||||
|
||||
switch (action)
|
||||
{
|
||||
case 0:
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << "HTMLScanner can't cope with " << ch << " in state " << state_;
|
||||
throw std::runtime_error(os.str());
|
||||
}
|
||||
case A_ADUP:
|
||||
h.adup(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_ADUP_SAVE:
|
||||
h.adup(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
save(ch, h);
|
||||
break;
|
||||
case A_ADUP_STAGC:
|
||||
h.adup(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.stagc(outputBuffer_);
|
||||
break;
|
||||
case A_ANAME:
|
||||
h.aname(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_ANAME_ADUP:
|
||||
h.aname(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.adup(outputBuffer_);
|
||||
break;
|
||||
case A_ANAME_ADUP_STAGC:
|
||||
h.aname(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.adup(outputBuffer_);
|
||||
h.stagc(outputBuffer_);
|
||||
break;
|
||||
case A_AVAL:
|
||||
h.aval(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_AVAL_STAGC:
|
||||
h.aval(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.stagc(outputBuffer_);
|
||||
break;
|
||||
case A_CDATA:
|
||||
mark();
|
||||
// suppress the final "]]" in the buffer
|
||||
if (outputBuffer_.size() > 1)
|
||||
outputBuffer_.erase(outputBuffer_.size()-2);
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_ENTITY_START:
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
save(ch, h);
|
||||
break;
|
||||
case A_ENTITY:
|
||||
{
|
||||
mark();
|
||||
char ch1 = (char)ch;
|
||||
// System.out.println("Got " + ch1 + " in state " + ((state_ == S_ENT) ? "S_ENT" : ((state_ == S_NCR) ? "S_NCR" : "UNK")));
|
||||
if (state_ == S_ENT && ch1 == '#')
|
||||
{
|
||||
nextState_ = S_NCR;
|
||||
save(ch, h);
|
||||
break;
|
||||
}
|
||||
else if (state_ == S_NCR && (ch1 == 'x' || ch1 == 'X'))
|
||||
{
|
||||
nextState_ = S_XNCR;
|
||||
save(ch, h);
|
||||
break;
|
||||
}
|
||||
else if (state_ == S_ENT && XML::is_letter_or_digit(ch1))
|
||||
{
|
||||
save(ch, h);
|
||||
break;
|
||||
}
|
||||
else if (state_ == S_NCR && XML::is_digit(ch1))
|
||||
{
|
||||
save(ch, h);
|
||||
break;
|
||||
}
|
||||
else if (state_ == S_XNCR && (XML::is_digit(ch1) || hexLetters.find(ch1) != std::string::npos))
|
||||
{
|
||||
save(ch, h);
|
||||
break;
|
||||
}
|
||||
|
||||
// The whole entity reference has been collected
|
||||
h.entity(outputBuffer_.substr(1, outputBuffer_.size()-1));
|
||||
int ent = h.getEntity();
|
||||
if (ent != 0)
|
||||
{
|
||||
outputBuffer_.clear();
|
||||
if (ent >= 0x80 && ent <= 0x9F)
|
||||
{
|
||||
ent = WinCharMap[ent-0x80];
|
||||
}
|
||||
if (ent < 0x20)
|
||||
{
|
||||
// Control becomes space
|
||||
ent = 0x20;
|
||||
}
|
||||
else if (ent >= 0xD800 && ent <= 0xDFFF)
|
||||
{
|
||||
// Surrogates get dropped
|
||||
ent = 0;
|
||||
}
|
||||
else if (ent <= 0xFFFF)
|
||||
{
|
||||
// BMP character
|
||||
save(ent, h);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Astral converted to two surrogates
|
||||
ent -= 0x10000;
|
||||
save((ent>>10) + 0xD800, h);
|
||||
save((ent&0x3FF) + 0xDC00, h);
|
||||
}
|
||||
if (ch != ';')
|
||||
{
|
||||
r.unget();
|
||||
currentColumn_--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
r.unget();
|
||||
currentColumn_--;
|
||||
}
|
||||
nextState_ = S_PCDATA;
|
||||
} // case A_ENTITY:
|
||||
break;
|
||||
case A_ETAG:
|
||||
h.etag(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_DECL:
|
||||
h.decl(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_GI:
|
||||
h.gi(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_GI_STAGC:
|
||||
h.gi(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.stagc(outputBuffer_);
|
||||
break;
|
||||
case A_LT:
|
||||
mark();
|
||||
save('<', h);
|
||||
save(ch, h);
|
||||
break;
|
||||
case A_LT_PCDATA:
|
||||
mark();
|
||||
save('<', h);
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_PCDATA:
|
||||
mark();
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_CMNT:
|
||||
mark();
|
||||
h.cmnt(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_MINUS3:
|
||||
save('-', h);
|
||||
save(' ', h);
|
||||
break;
|
||||
case A_MINUS2:
|
||||
save('-', h);
|
||||
save(' ', h);
|
||||
// fall through into A_MINUS
|
||||
case A_MINUS:
|
||||
save('-', h);
|
||||
save(ch, h);
|
||||
break;
|
||||
case A_PI:
|
||||
mark();
|
||||
h.pi(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_PITARGET:
|
||||
h.pitarget(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_PITARGET_PI:
|
||||
h.pitarget(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.pi(outputBuffer_);
|
||||
break;
|
||||
case A_SAVE:
|
||||
save(ch, h);
|
||||
break;
|
||||
case A_SKIP:
|
||||
break;
|
||||
case A_SP:
|
||||
save(' ', h);
|
||||
break;
|
||||
case A_STAGC:
|
||||
h.stagc(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
case A_EMPTYTAG:
|
||||
mark();
|
||||
if (outputBuffer_.size() > 0)
|
||||
h.gi(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
h.stage(outputBuffer_);
|
||||
break;
|
||||
case A_UNGET:
|
||||
r.unget();
|
||||
currentColumn_--;
|
||||
break;
|
||||
case A_UNSAVE_PCDATA:
|
||||
if (outputBuffer_.size() > 0)
|
||||
outputBuffer_.erase(outputBuffer_.size()-1);
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Can't process state " + action);
|
||||
} // switch ...
|
||||
state_ = nextState_;
|
||||
} // while (state_ != S_DONE)
|
||||
h.eof("");
|
||||
} // scan
|
||||
|
||||
/**
|
||||
A callback for the ScanHandler that allows it to force
|
||||
the lexer state to CDATA content (no markup is recognized except
|
||||
the end of element.
|
||||
*/
|
||||
void startCDATA()
|
||||
{
|
||||
nextState_ = S_CDATA;
|
||||
} // startCDATA
|
||||
|
||||
private:
|
||||
/**
|
||||
* Mark the current scan position as a "point of interest" - start of a tag,
|
||||
* cdata, processing instruction etc.
|
||||
*/
|
||||
void mark()
|
||||
{
|
||||
lastColumn_ = currentColumn_;
|
||||
lastLine_ = currentLine_;
|
||||
} // mark
|
||||
|
||||
void save(int ch, ScanHandler& h)
|
||||
{
|
||||
if (outputBuffer_.size() >= outputBuffer_.capacity() - 20)
|
||||
{
|
||||
if (state_ == S_PCDATA || state_ == S_CDATA)
|
||||
{
|
||||
// Return a buffer-sized chunk of PCDATA
|
||||
h.pcdata(outputBuffer_);
|
||||
outputBuffer_.clear();
|
||||
}
|
||||
}
|
||||
outputBuffer_ += static_cast<char>(ch);
|
||||
} // save
|
||||
|
||||
static std::string nicechar(int in)
|
||||
{
|
||||
if (in == '\n')
|
||||
return "\\n";
|
||||
std::ostringstream os;
|
||||
if(in >= 32)
|
||||
os << '\'' << static_cast<char>(in) << '\'';
|
||||
else
|
||||
os << std::hex << std::showbase << in;
|
||||
return os.str();
|
||||
} // nicechar
|
||||
|
||||
HTMLScanner(const HTMLScanner&);
|
||||
bool operator==(const HTMLScanner&) const;
|
||||
HTMLScanner& operator=(const HTMLScanner&);
|
||||
}; // class HTMLScanner
|
||||
|
||||
const int HTMLScanner::statetable[] = {
|
||||
S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
|
||||
S_ANAME, '=', A_ANAME, S_AVAL,
|
||||
S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
|
||||
S_ANAME, 0, A_SAVE, S_ANAME,
|
||||
S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
|
||||
S_ANAME, ' ', A_ANAME, S_EQ,
|
||||
S_ANAME, '\n', A_ANAME, S_EQ,
|
||||
S_ANAME, '\t', A_ANAME, S_EQ,
|
||||
S_APOS, '\'', A_AVAL, S_TAGWS,
|
||||
S_APOS, 0, A_SAVE, S_APOS,
|
||||
S_APOS, -1, A_AVAL_STAGC, S_DONE,
|
||||
S_APOS, ' ', A_SP, S_APOS,
|
||||
S_APOS, '\n', A_SP, S_APOS,
|
||||
S_APOS, '\t', A_SP, S_APOS,
|
||||
S_AVAL, '\'', A_SKIP, S_APOS,
|
||||
S_AVAL, '"', A_SKIP, S_QUOT,
|
||||
S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
|
||||
S_AVAL, 0, A_SAVE, S_STAGC,
|
||||
S_AVAL, -1, A_AVAL_STAGC, S_DONE,
|
||||
S_AVAL, ' ', A_SKIP, S_AVAL,
|
||||
S_AVAL, '\n', A_SKIP, S_AVAL,
|
||||
S_AVAL, '\t', A_SKIP, S_AVAL,
|
||||
S_BB, 'C', A_SKIP, S_BBC,
|
||||
S_BB, 0, A_SKIP, S_DECL,
|
||||
S_BB, -1, A_SKIP, S_DONE,
|
||||
S_BBC, 'D', A_SKIP, S_BBCD,
|
||||
S_BBC, 0, A_SKIP, S_DECL,
|
||||
S_BBC, -1, A_SKIP, S_DONE,
|
||||
S_BBCD, 'A', A_SKIP, S_BBCDA,
|
||||
S_BBCD, 0, A_SKIP, S_DECL,
|
||||
S_BBCD, -1, A_SKIP, S_DONE,
|
||||
S_BBCDA, 'T', A_SKIP, S_BBCDAT,
|
||||
S_BBCDA, 0, A_SKIP, S_DECL,
|
||||
S_BBCDA, -1, A_SKIP, S_DONE,
|
||||
S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
|
||||
S_BBCDAT, 0, A_SKIP, S_DECL,
|
||||
S_BBCDAT, -1, A_SKIP, S_DONE,
|
||||
S_BBCDATA, '[', A_SKIP, S_CDSECT,
|
||||
S_BBCDATA, 0, A_SKIP, S_DECL,
|
||||
S_BBCDATA, -1, A_SKIP, S_DONE,
|
||||
S_CDATA, '<', A_SAVE, S_CDATA2,
|
||||
S_CDATA, 0, A_SAVE, S_CDATA,
|
||||
S_CDATA, -1, A_PCDATA, S_DONE,
|
||||
S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
|
||||
S_CDATA2, 0, A_SAVE, S_CDATA,
|
||||
S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
|
||||
S_CDSECT, ']', A_SAVE, S_CDSECT1,
|
||||
S_CDSECT, 0, A_SAVE, S_CDSECT,
|
||||
S_CDSECT, -1, A_SKIP, S_DONE,
|
||||
S_CDSECT1, ']', A_SAVE, S_CDSECT2,
|
||||
S_CDSECT1, 0, A_SAVE, S_CDSECT,
|
||||
S_CDSECT1, -1, A_SKIP, S_DONE,
|
||||
S_CDSECT2, '>', A_CDATA, S_PCDATA,
|
||||
S_CDSECT2, 0, A_SAVE, S_CDSECT,
|
||||
S_CDSECT2, -1, A_SKIP, S_DONE,
|
||||
S_COM, '-', A_SKIP, S_COM2,
|
||||
S_COM, 0, A_SAVE, S_COM2,
|
||||
S_COM, -1, A_CMNT, S_DONE,
|
||||
S_COM2, '-', A_SKIP, S_COM3,
|
||||
S_COM2, 0, A_SAVE, S_COM2,
|
||||
S_COM2, -1, A_CMNT, S_DONE,
|
||||
S_COM3, '-', A_SKIP, S_COM4,
|
||||
S_COM3, 0, A_MINUS, S_COM2,
|
||||
S_COM3, -1, A_CMNT, S_DONE,
|
||||
S_COM4, '-', A_MINUS3, S_COM4,
|
||||
S_COM4, '>', A_CMNT, S_PCDATA,
|
||||
S_COM4, 0, A_MINUS2, S_COM2,
|
||||
S_COM4, -1, A_CMNT, S_DONE,
|
||||
S_DECL, '-', A_SKIP, S_COM,
|
||||
S_DECL, '[', A_SKIP, S_BB,
|
||||
S_DECL, '>', A_SKIP, S_PCDATA,
|
||||
S_DECL, 0, A_SAVE, S_DECL2,
|
||||
S_DECL, -1, A_SKIP, S_DONE,
|
||||
S_DECL2, '>', A_DECL, S_PCDATA,
|
||||
S_DECL2, 0, A_SAVE, S_DECL2,
|
||||
S_DECL2, -1, A_SKIP, S_DONE,
|
||||
S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
|
||||
S_EMPTYTAG, 0, A_SAVE, S_ANAME,
|
||||
S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
|
||||
S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
|
||||
S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
|
||||
S_ENT, 0, A_ENTITY, S_ENT,
|
||||
S_ENT, -1, A_ENTITY, S_DONE,
|
||||
S_EQ, '=', A_SKIP, S_AVAL,
|
||||
S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
|
||||
S_EQ, 0, A_ADUP_SAVE, S_ANAME,
|
||||
S_EQ, -1, A_ADUP_STAGC, S_DONE,
|
||||
S_EQ, ' ', A_SKIP, S_EQ,
|
||||
S_EQ, '\n', A_SKIP, S_EQ,
|
||||
S_EQ, '\t', A_SKIP, S_EQ,
|
||||
S_ETAG, '>', A_ETAG, S_PCDATA,
|
||||
S_ETAG, 0, A_SAVE, S_ETAG,
|
||||
S_ETAG, -1, A_ETAG, S_DONE,
|
||||
S_ETAG, ' ', A_SKIP, S_ETAG,
|
||||
S_ETAG, '\n', A_SKIP, S_ETAG,
|
||||
S_ETAG, '\t', A_SKIP, S_ETAG,
|
||||
S_GI, '/', A_SKIP, S_EMPTYTAG,
|
||||
S_GI, '>', A_GI_STAGC, S_PCDATA,
|
||||
S_GI, 0, A_SAVE, S_GI,
|
||||
S_GI, -1, A_SKIP, S_DONE,
|
||||
S_GI, ' ', A_GI, S_TAGWS,
|
||||
S_GI, '\n', A_GI, S_TAGWS,
|
||||
S_GI, '\t', A_GI, S_TAGWS,
|
||||
S_NCR, 0, A_ENTITY, S_NCR,
|
||||
S_NCR, -1, A_ENTITY, S_DONE,
|
||||
S_PCDATA, '&', A_ENTITY_START, S_ENT,
|
||||
S_PCDATA, '<', A_PCDATA, S_TAG,
|
||||
S_PCDATA, 0, A_SAVE, S_PCDATA,
|
||||
S_PCDATA, -1, A_PCDATA, S_DONE,
|
||||
S_PI, '>', A_PI, S_PCDATA,
|
||||
S_PI, 0, A_SAVE, S_PI,
|
||||
S_PI, -1, A_PI, S_DONE,
|
||||
S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
|
||||
S_PITARGET, 0, A_SAVE, S_PITARGET,
|
||||
S_PITARGET, -1, A_PITARGET_PI, S_DONE,
|
||||
S_PITARGET, ' ', A_PITARGET, S_PI,
|
||||
S_PITARGET, '\n', A_PITARGET, S_PI,
|
||||
S_PITARGET, '\t', A_PITARGET, S_PI,
|
||||
S_QUOT, '"', A_AVAL, S_TAGWS,
|
||||
S_QUOT, 0, A_SAVE, S_QUOT,
|
||||
S_QUOT, -1, A_AVAL_STAGC, S_DONE,
|
||||
S_QUOT, ' ', A_SP, S_QUOT,
|
||||
S_QUOT, '\n', A_SP, S_QUOT,
|
||||
S_QUOT, '\t', A_SP, S_QUOT,
|
||||
S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
|
||||
S_STAGC, 0, A_SAVE, S_STAGC,
|
||||
S_STAGC, -1, A_AVAL_STAGC, S_DONE,
|
||||
S_STAGC, ' ', A_AVAL, S_TAGWS,
|
||||
S_STAGC, '\n', A_AVAL, S_TAGWS,
|
||||
S_STAGC, '\t', A_AVAL, S_TAGWS,
|
||||
S_TAG, '!', A_SKIP, S_DECL,
|
||||
S_TAG, '?', A_SKIP, S_PITARGET,
|
||||
S_TAG, '/', A_SKIP, S_ETAG,
|
||||
S_TAG, '<', A_SAVE, S_TAG,
|
||||
S_TAG, 0, A_SAVE, S_GI,
|
||||
S_TAG, -1, A_LT_PCDATA, S_DONE,
|
||||
S_TAG, ' ', A_LT, S_PCDATA,
|
||||
S_TAG, '\n', A_LT, S_PCDATA,
|
||||
S_TAG, '\t', A_LT, S_PCDATA,
|
||||
S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
|
||||
S_TAGWS, '>', A_STAGC, S_PCDATA,
|
||||
S_TAGWS, 0, A_SAVE, S_ANAME,
|
||||
S_TAGWS, -1, A_STAGC, S_DONE,
|
||||
S_TAGWS, ' ', A_SKIP, S_TAGWS,
|
||||
S_TAGWS, '\n', A_SKIP, S_TAGWS,
|
||||
S_TAGWS, '\t', A_SKIP, S_TAGWS,
|
||||
S_XNCR, 0, A_ENTITY, S_XNCR,
|
||||
S_XNCR, -1, A_ENTITY, S_DONE,
|
||||
-1, -1, -1, -1
|
||||
}; // HTMLScanner::statetable
|
||||
|
||||
const std::string HTMLScanner::debug_actionnames[] = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
|
||||
const std::string HTMLScanner::debug_statenames[] = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
|
||||
|
||||
const int HTMLScanner::WinCharMap[] = { // Windows chars map
|
||||
0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
|
||||
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
|
||||
0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
|
||||
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
|
||||
}; // HTMLScanner::WinCharMap
|
||||
|
||||
const std::string HTMLScanner::hexLetters = "abcdefABCDEF";
|
||||
|
||||
} // namespace SAX
|
||||
|
||||
} // namespace Arabica
|
||||
|
||||
#endif
|
||||
|
2955
include/Taggle/impl/html/HTMLSchema.hpp
Normal file
2955
include/Taggle/impl/html/HTMLSchema.hpp
Normal file
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,7 @@ namespace XML
|
|||
bool is_digit(wchar_t c);
|
||||
bool is_combining_char(wchar_t c);
|
||||
bool is_extender(wchar_t c);
|
||||
bool is_letter_or_digit(wchar_t c);
|
||||
} // namespace XML
|
||||
|
||||
} // namespace Arabica
|
||||
|
|
|
@ -268,4 +268,10 @@ bool Arabica::XML::is_extender(wchar_t c)
|
|||
((c >= 0x30FC) && (c <= 0x30FE));
|
||||
} // is_extender
|
||||
|
||||
bool Arabica::XML::is_letter_or_digit(wchar_t c)
|
||||
{
|
||||
return is_letter(c) || is_digit(c);
|
||||
} // is_letter_or_digit
|
||||
|
||||
|
||||
// end of file
|
||||
|
|
Loading…
Add table
Reference in a new issue