collapsed tagsoup-port into mainline

This commit is contained in:
jez 2008-04-07 15:37:47 +00:00
parent 1468d9f822
commit 9ea360f3ef
23 changed files with 6308 additions and 17 deletions

View file

@ -27,6 +27,7 @@ AC_CONFIG_FILES([arabica.pc])
AC_CONFIG_FILES([src/Makefile]) AC_CONFIG_FILES([src/Makefile])
AC_CONFIG_FILES([examples/Makefile]) AC_CONFIG_FILES([examples/Makefile])
AC_CONFIG_FILES([examples/Utils/Makefile]) AC_CONFIG_FILES([examples/Utils/Makefile])
AC_CONFIG_FILES([examples/Taggle/Makefile])
AC_CONFIG_FILES([examples/SAX/Makefile]) AC_CONFIG_FILES([examples/SAX/Makefile])
AC_CONFIG_FILES([examples/DOM/Makefile]) AC_CONFIG_FILES([examples/DOM/Makefile])
AC_CONFIG_FILES([examples/XPath/Makefile]) AC_CONFIG_FILES([examples/XPath/Makefile])

View file

@ -1,4 +1,4 @@
SUBDIRS = SAX Utils SUBDIRS = SAX Taggle Utils
if WANT_DOM if WANT_DOM
SUBDIRS += DOM SUBDIRS += DOM
endif endif

7
examples/Taggle/Makefile.am Executable file
View file

@ -0,0 +1,7 @@
noinst_PROGRAMS = taggle
AM_CPPFLAGS = -I$(top_srcdir)/include @PARSER_HEADERS@ $(BOOST_CPPFLAGS)
LIBARABICA = $(top_builddir)/src/libarabica.la
taggle_SOURCES = taggle.cpp
taggle_LDADD = $(LIBARABICA)

60
examples/Taggle/sample.pyx Executable file
View file

@ -0,0 +1,60 @@
(po
Aid P01456
(date
Ayear 2002
Amonth 6
Aday 14
)date
(address
Atype shipping
(name
-Frits Mendels
)name
(street
-152 Cherry St
)street
(city
-San Francisco
)city
(state
-CA
)state
(zip
-94045
)zip
)address
(address
Atype billing
(name
-Frits Mendels
)name
(street
-PO Box 6789
)street
(city
-San Francisco
)city
(state
-CA
)state
(zip
-94123-6798
)zip
)address
(items
(item
Aquantity 1
AproductCode R-273
Adescription 14.4 Volt Cordless Drill
AunitCost 198.95
)item
(item
Aquantity 1
AproductCode 16325
Adescription 12 Piece Drill Bit Set
AunitCost 14.95
)item
)items
)po

54
examples/Taggle/taggle.cpp Executable file
View file

@ -0,0 +1,54 @@
#pragma warning(disable: 4250)
#include <iostream>
#include <sstream>
#include <string>
#include <SAX/filter/Writer.hpp>
#include <SAX/helpers/CatchErrorHandler.hpp>
#include <Taggle/Taggle.hpp>
#include <DOM/SAX2DOM/SAX2DOM.hpp>
#include <DOM/io/Stream.hpp>
#include <XPath/XPath.hpp>
/**
  taggle example driver.

  Each command line argument names an input; "-" means standard input.
  Every input is run through a Taggle parser chained into a SAX Writer,
  and the text the Writer produced is copied to standard output.  Errors
  collected by the CatchErrorHandler are reported on standard error.

  @param argc Number of command line arguments, including the program name
  @param argv The command line arguments
  @return Always 0
*/
int main(int argc, const char* argv[])
{
  if(argc == 1)
  {
    std::cout << "taggle [file1] [file2] ... [filen]\n"
              << " taggle reads arbitrary HTML, outputting it as well-formed XML\n";
    return 0;
  } // if(argc == 1)

  Arabica::SAX::Taggle<std::string> parser;
  std::ostringstream sink;
  Arabica::SAX::Writer<std::string> writer(sink, 4);
  Arabica::SAX::CatchErrorHandler<std::string> eh;

  writer.setParent(parser);
  writer.setErrorHandler(eh);

  for(int i = 1; i < argc; ++i)
  {
    const std::string file(argv[i]);
    Arabica::SAX::InputSource<std::string> is;

    if(file == "-")
    {
      is.setSystemId("stdin");
      is.setByteStream(std::cin);
    }
    else
      is.setSystemId(file);

    writer.parse(is);

    if(eh.errorsReported())
    {
      std::cerr << eh.errors() << std::endl;
      eh.reset();
    } // if ...

    std::cout << sink.str();
    // The sink is shared by every iteration: empty it, otherwise the next
    // input's output would be printed after a repeat of this one.
    sink.str(std::string());
  } // for ...

  return 0;
} // main

View file

@ -20,11 +20,26 @@ namespace Arabica
namespace SAX2DOM namespace SAX2DOM
{ {
template<class stringT, template<class string_type, class T0, class T1>
class string_adaptorT = Arabica::default_string_adaptor<stringT>, struct ParserTypes
class SAX_parser = Arabica::SAX::XMLReader<stringT, string_adaptorT> >
class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
{ {
typedef typename Arabica::get_param<Arabica::string_adaptor_tag,
Arabica::default_string_adaptor<string_type>,
T0,
T1>::type string_adaptor;
typedef typename Arabica::get_param<Arabica::SAX::XMLReaderInterface_tag,
Arabica::SAX::XMLReader<string_type, string_adaptor>,
T1,
T0>::type SAX_parser_type;
};
template<class stringT,
class T0 = Arabica::nil_t,
class T1 = Arabica::nil_t>
class Parser : protected Arabica::SAX::DefaultHandler<stringT, typename ParserTypes<stringT, T0, T1>::string_adaptor>
{
typedef typename ParserTypes<stringT, T0, T1>::string_adaptor string_adaptorT;
typedef typename ParserTypes<stringT, T0, T1>::SAX_parser_type SAX_parser_type;
typedef Arabica::SAX::Attributes<stringT, string_adaptorT> AttributesT; typedef Arabica::SAX::Attributes<stringT, string_adaptorT> AttributesT;
typedef Arabica::SAX::EntityResolver<stringT, string_adaptorT> EntityResolverT; typedef Arabica::SAX::EntityResolver<stringT, string_adaptorT> EntityResolverT;
typedef Arabica::SAX::ErrorHandler<stringT, string_adaptorT> ErrorHandlerT; typedef Arabica::SAX::ErrorHandler<stringT, string_adaptorT> ErrorHandlerT;
@ -88,7 +103,7 @@ class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
inDTD_ = false; inDTD_ = false;
inEntity_ = 0; inEntity_ = 0;
SAX_parser parser; SAX_parser_type parser;
parser.setContentHandler(*this); parser.setContentHandler(*this);
parser.setErrorHandler(*this); parser.setErrorHandler(*this);
if(entityResolver_) if(entityResolver_)
@ -157,7 +172,7 @@ class Parser : protected Arabica::SAX::DefaultHandler<stringT, string_adaptorT>
Arabica::SAX::AttributeTypes<stringT, string_adaptorT> attributeTypes_; Arabica::SAX::AttributeTypes<stringT, string_adaptorT> attributeTypes_;
protected: protected:
void setParserFeatures(SAX_parser& parser) const void setParserFeatures(SAX_parser_type& parser) const
{ {
for(typename Features::const_iterator f = features_.begin(), e = features_.end(); f != e; ++f) for(typename Features::const_iterator f = features_.begin(), e = features_.end(); f != e; ++f)
try { try {

View file

@ -37,7 +37,7 @@ namespace DOM
namespace StreamImpl namespace StreamImpl
{ {
template<class stringT, class string_adaptorT, class charT, class traitsT> template<class stringT, class string_adaptorT, class charT, class traitsT>
void streamChildren(std::basic_ostream<charT, traitsT>& stream, DOM::Node<stringT, string_adaptorT>& node) void streamChildren(std::basic_ostream<charT, traitsT>& stream, const DOM::Node<stringT, string_adaptorT>& node)
{ {
DOM::Node<stringT> child = node.getFirstChild(); DOM::Node<stringT> child = node.getFirstChild();
while(child != 0) while(child != 0)
@ -72,7 +72,7 @@ std::pair<bool, stringT> is_uri_declared(std::vector<std::map<stringT, stringT>
template<class stringT, class string_adaptorT, class charT, class traitsT> template<class stringT, class string_adaptorT, class charT, class traitsT>
void check_and_output_node_name(std::basic_ostream<charT, traitsT>& stream, void check_and_output_node_name(std::basic_ostream<charT, traitsT>& stream,
DOM::Node<stringT, string_adaptorT>& node, const DOM::Node<stringT, string_adaptorT>& node,
std::vector<std::map<stringT, stringT> >* prefix_stack) std::vector<std::map<stringT, stringT> >* prefix_stack)
{ {
std::map<stringT, stringT>& current = *(prefix_stack->rbegin()); std::map<stringT, stringT>& current = *(prefix_stack->rbegin());
@ -112,7 +112,7 @@ bool isXmlns(const stringT& str)
template<class stringT, class string_adaptorT, class charT, class traitsT> template<class stringT, class string_adaptorT, class charT, class traitsT>
int prefix_mapper(std::basic_ostream<charT, traitsT>& stream, int prefix_mapper(std::basic_ostream<charT, traitsT>& stream,
DOM::Node<stringT, string_adaptorT>& node) const DOM::Node<stringT, string_adaptorT>& node)
{ {
typedef Arabica::text::Unicode<charT> UnicodeT; typedef Arabica::text::Unicode<charT> UnicodeT;
@ -189,7 +189,7 @@ int prefix_mapper(std::basic_ostream<charT, traitsT>& stream,
template<class stringT, class string_adaptorT, class charT, class traitsT> template<class stringT, class string_adaptorT, class charT, class traitsT>
void prefix_mapper_pop(std::basic_ostream<charT, traitsT>& stream, void prefix_mapper_pop(std::basic_ostream<charT, traitsT>& stream,
DOM::Node<stringT, string_adaptorT> node, const DOM::Node<stringT, string_adaptorT>& node,
int index, int index,
bool output) bool output)
{ {
@ -212,7 +212,7 @@ void prefix_mapper_pop(std::basic_ostream<charT, traitsT>& stream,
template<class stringT, class string_adaptorT, class charT, class traitsT> template<class stringT, class string_adaptorT, class charT, class traitsT>
std::basic_ostream<charT, traitsT>& std::basic_ostream<charT, traitsT>&
operator<<(std::basic_ostream<charT, traitsT>& stream, operator<<(std::basic_ostream<charT, traitsT>& stream,
DOM::Node<stringT, string_adaptorT>& node) const DOM::Node<stringT, string_adaptorT>& node)
{ {
typedef Arabica::text::Unicode<charT> UnicodeT; typedef Arabica::text::Unicode<charT> UnicodeT;

View file

@ -7,6 +7,7 @@
#include <string> #include <string>
#include <SAX/ArabicaConfig.hpp> #include <SAX/ArabicaConfig.hpp>
#include <Arabica/StringAdaptor.hpp>
namespace Arabica namespace Arabica
{ {
@ -43,7 +44,7 @@ namespace SAX
* @version 2.0 * @version 2.0
* @see ContentHandler#setDocumentLocator * @see ContentHandler#setDocumentLocator
*/ */
template<class string_type, class string_adaptor> template<class string_type, class string_adaptor = Arabica::default_string_adaptor<string_type> >
class Locator class Locator
{ {
public: public:

View file

@ -63,8 +63,10 @@ namespace SAX
* @see helpers.ParserAdapter * @see helpers.ParserAdapter
* @see helpers.XMLReaderAdapter * @see helpers.XMLReaderAdapter
*/ */
class XMLReaderInterface_tag { };
template<class string_type, class T0, class T1> template<class string_type, class T0, class T1>
class XMLReaderInterface class XMLReaderInterface : public XMLReaderInterface_tag
{ {
public: public:
typedef typename Arabica::get_param<Arabica::string_adaptor_tag, typedef typename Arabica::get_param<Arabica::string_adaptor_tag,

View file

@ -6,6 +6,7 @@
#include <SAX/ArabicaConfig.hpp> #include <SAX/ArabicaConfig.hpp>
#include <SAX/Attributes.hpp> #include <SAX/Attributes.hpp>
#include <stdexcept>
#include <deque> #include <deque>
namespace Arabica namespace Arabica
@ -38,7 +39,7 @@ namespace SAX
* <a href="mailto:jez@jezuk.co.uk">jez@jezuk.co.uk</a> * <a href="mailto:jez@jezuk.co.uk">jez@jezuk.co.uk</a>
* @version 2.0 * @version 2.0
*/ */
template<class string_type, class string_adaptor> template<class string_type, class string_adaptor = Arabica::default_string_adaptor<string_type> >
class AttributesImpl : public Attributes<string_type, string_adaptor> class AttributesImpl : public Attributes<string_type, string_adaptor>
{ {
public: public:
@ -64,6 +65,14 @@ public:
return *this; return *this;
} // operator= } // operator=
/**
 * Field-by-field equality.  The fields are checked in the order
 * uri, local name, qualified name, type, value, stopping at the
 * first pair that differs.
 */
bool operator==(const Attr& rhs) const
{
  if(!(uri_ == rhs.uri_))
    return false;
  if(!(localName_ == rhs.localName_))
    return false;
  if(!(qName_ == rhs.qName_))
    return false;
  if(!(type_ == rhs.type_))
    return false;
  return (value_ == rhs.value_);
} // operator==
string_type uri_; string_type uri_;
string_type localName_; string_type localName_;
@ -75,11 +84,21 @@ public:
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Constructors. // Constructors.
AttributesImpl() { } AttributesImpl() { }
AttributesImpl(const AttributesT& atts) AttributesImpl(const AttributesT& rhs)
{ {
setAttributes(atts); setAttributes(rhs);
} // AttributesImpl } // AttributesImpl
/**
 * Assign from any Attributes object by handing it to setAttributes().
 *
 * @param rhs The attributes to take over.
 * @return A reference to this object, so assignments can be chained.
 */
AttributesImpl& operator=(const AttributesT& rhs)
{
  setAttributes(rhs);
  // The return was missing; falling off the end of a function that
  // returns a reference is undefined behaviour.
  return *this;
} // operator=
/**
 * Equality of two attribute lists.
 *
 * @param rhs The list to compare against.
 * @return true when the two attributes_ containers compare equal.
 */
bool operator==(const AttributesImpl& rhs) const
{
  const bool sameContents = (attributes_ == rhs.attributes_);
  return sameContents;
} // operator==
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Implementation of SAX::Attributes. // Implementation of SAX::Attributes.
/** /**

13
include/Taggle/Taggle.hpp Normal file
View file

@ -0,0 +1,13 @@
#ifndef ARABICA_TAGGLE_TAGGLE_HPP
#define ARABICA_TAGGLE_TAGGLE_HPP
#include "impl/ScanHandler.hpp"
#include "impl/ElementType.hpp"
#include "impl/Element.hpp"
#include "impl/Schema.hpp"
#include "impl/html/HTMLModels.hpp"
#include "impl/html/HTMLScanner.hpp"
#include "impl/html/HTMLSchema.hpp"
#include "impl/Parser.hpp"
#endif

304
include/Taggle/impl/Element.hpp Executable file
View file

@ -0,0 +1,304 @@
#ifndef ARABICA_SAX_TAGSOUP_ELEMENT_HPP
#define ARABICA_SAX_TAGSOUP_ELEMENT_HPP
#include <string>
#include <SAX/helpers/AttributesImpl.hpp>
#include "ElementType.hpp"
namespace Arabica
{
namespace SAX
{
/**
The internal representation of an actual element (not an element type).
An Element has an element type, attributes, and a successor Element
for use in constructing stacks and queues of Elements.
@see ElementType
@see AttributesImpl
Based on code from John Cowan's super TagSoup package
*/
class Element
{
private:
  ElementType* type_;                 // type of element
  AttributesImpl<std::string> atts_;  // attributes of element
  const Element* next_;               // successor of element: an owned heap copy, or 0 when there is none
  bool preclosed_;                    // this element has been preclosed

public:
  static const Element Null;

  /**
     Construct an element of type ElementType::Null with no attributes
     and no successor.
  */
  Element() :
    type_(&ElementType::Null),
    atts_(),
    next_(0),
    preclosed_(false)
  {
  } // Element

  /**
     Copy an element.  The successor chain is duplicated, not shared.
     @param rhs The element to copy
  */
  Element(const Element& rhs):
    type_(rhs.type_),
    atts_(rhs.atts_),
    next_(0),
    preclosed_(rhs.preclosed_)
  {
    if(rhs.next_)
      next_ = new Element(*rhs.next_);
  } // Element

  /**
     Return an Element from a specified ElementType.
     @param type The element type of the newly constructed element
     @param defaultAttributes True if default attributes are wanted
  */
  Element(ElementType& type, bool defaultAttributes) :
    type_(&type),
    atts_(),
    next_(0),
    preclosed_(false)
  {
    if (defaultAttributes)
      atts_ = type.atts();
  } // Element

  /**
     Destroy the element together with its successor chain.
  */
  ~Element()
  {
    // next_ only ever holds 0 or a pointer obtained from new, so it is
    // always ours to delete.  (Comparing *next_ with Null first leaked
    // every successor that merely compared equal to Null.)
    delete next_;
  } // ~Element

  /**
     Assign from another element, duplicating its successor chain.
     @param rhs The element to copy
     @return This element
  */
  Element& operator=(const Element& rhs)
  {
    if(this == &rhs)
      return *this;

    // Duplicate the incoming chain before releasing our own, so the source
    // is still intact while it is being copied.
    const Element* chain = rhs.next_ ? new Element(*rhs.next_) : 0;

    type_ = rhs.type_;
    atts_ = rhs.atts_;
    preclosed_ = rhs.preclosed_;

    delete next_;
    next_ = chain;

    return *this;
  } // operator=

  /**
     Elements are equal when they share the same ElementType object, have
     equal attributes and preclosed status, and their successor chains are
     either both absent or equal.
  */
  bool operator==(const Element& rhs) const
  {
    const bool sameHead = (type_ == rhs.type_) &&
                          (atts_ == rhs.atts_) &&
                          (preclosed_ == rhs.preclosed_);
    if(!sameHead)
      return false;

    if(!next_ || !rhs.next_)
      return next_ == rhs.next_;  // true only when both are absent

    return (*next_ == *rhs.next_);
  } // operator==

  bool operator!=(const Element& rhs) const
  {
    return !(*this == rhs);
  } // operator!=

  /**
     Return the element type.
     @return The element type.
  */
  const ElementType& type() const
  {
    return *type_;
  } // type

  /**
     Return the attributes as an AttributesImpl object.
     The reference is const; use setAttribute(), anonymize() or clean()
     to change the attributes.
     @return The attributes
     @see AttributesImpl
  */
  const AttributesImpl<std::string>& atts() const
  {
    return atts_;
  } // atts

  /**
     Return the next element in an element stack or queue.
     @return A copy of the next element, or a copy of Element::Null
             if there is no next element
  */
  Element next() const
  {
    if(!next_)
      return Null;
    return *next_;
  } // next

  /**
     Change the next element in an element stack or queue.
     The element stores its own copy of the argument.
     @param next The new next element
  */
  void setNext(const Element& next)
  {
    // Copy first, release second: next may be this element itself, in which
    // case the old chain must still exist while the copy is taken.
    const Element* replacement = new Element(next);
    delete next_;
    next_ = replacement;
  } // setNext

  /**
     Return the name of the element's type.
     Convenience method.
     @return The element type name
  */
  std::string name() const
  {
    return type_->name();
  } // name

  /**
     Return the namespace name of the element's type.
     Convenience method.
     @return The element type namespace name
  */
  std::string namespaceName() const
  {
    return type_->namespaceName();
  } // namespaceName

  /**
     Return the local name of the element's type.
     Convenience method.
     @return The element type local name
  */
  std::string localName() const
  {
    return type_->localName();
  } // localName

  /**
     Return the content model vector of the element's type.
     Convenience method.
     @return The content model vector
  */
  int model() const
  {
    return type_->model();
  } // model

  /**
     Return the member-of vector of the element's type.
     Convenience method.
     @return The member-of vector
  */
  int memberOf() const
  {
    return type_->memberOf();
  } // memberOf

  /**
     Return the flags vector of the element's type.
     Convenience method.
     @return The flags vector
  */
  int flags() const
  {
    return type_->flags();
  } // flags

  /**
     Return the parent element type of the element's type.
     Convenience method.
     @return The parent element type
  */
  ElementType& parent() const
  {
    return type_->parent();
  } // parent

  /**
     Return true if the type of this element can contain the type of
     another element.
     Convenience method.
     @param other The other element
  */
  bool canContain(const Element& other) const
  {
    return type_->canContain(*(other.type_));
  } // canContain

  /**
     Set an attribute and its value into this element.
     @param name The attribute name (Qname)
     @param type The attribute type
     @param value The attribute value
  */
  void setAttribute(const std::string& name, const std::string& type, const std::string& value)
  {
    type_->setAttribute(atts_, name, type, value);
  } // setAttribute

  /**
     Make this element anonymous.
     Remove every attribute whose type is <tt>ID</tt> or whose
     qualified name is <tt>name</tt>.
  */
  void anonymize()
  {
    // walk backwards so that removing an attribute does not shift the
    // indexes still to be visited
    for (int i = atts_.getLength() - 1; i >= 0; i--)
    {
      if((atts_.getType(i) == "ID") ||
         (atts_.getQName(i) == "name"))
      {
        atts_.removeAttribute(i);
      }
    } // for ...
  } // anonymize

  /**
     Clean the attributes of this element.
     Attributes with an empty local name (the name was ill-formed)
     or an empty value (the attribute was present in the element type but
     not in this actual element) are removed.
  */
  void clean()
  {
    // walk backwards so that removing an attribute does not shift the
    // indexes still to be visited
    for (int i = atts_.getLength() - 1; i >= 0; i--)
    {
      const std::string& name = atts_.getLocalName(i);
      if (atts_.getValue(i).empty() || name.empty())
        atts_.removeAttribute(i);
    } // for ...
  } // clean

  /**
     Force this element to preclosed status, meaning that an end-tag has
     been seen but the element cannot yet be closed for structural reasons.
  */
  void preclose()
  {
    preclosed_ = true;
  } // preclose

  /**
     Return true if this element has been preclosed.
  */
  bool isPreclosed() const
  {
    return preclosed_;
  } // isPreclosed
}; // class Element

// NOTE(review): this is a non-inline definition of a static data member in a
// header; including the header from more than one translation unit of the
// same program will give duplicate definitions - confirm how it is used.
const Element Element::Null;
} // namespace SAX
} // namespace Arabica
#endif

View file

@ -0,0 +1,333 @@
#ifndef ARABICA_SAX_ELEMENT_TYPE_HPP
#define ARABICA_SAX_ELEMENT_TYPE_HPP
#include <SAX/helpers/AttributesImpl.hpp>
#include <text/normalize_whitespace.hpp>
#include "Schema.hpp"
namespace Arabica
{
namespace SAX
{
/**
This class represents an element type in the schema.
An element type has a name, a content model vector, a member-of vector,
a flags vector, default attributes, and a schema to which it belongs.
Based on code from John Cowan's super TagSoup package
@see Schema
*/
class ElementType
{
private:
  std::string name_;                  // element type name (Qname)
  std::string namespace_;             // element type namespace name
  std::string localName_;             // element type local name
  int model_;                         // bitmap: what the element contains
  int memberOf_;                      // bitmap: what element is contained in
  int flags_;                         // bitmap: element flags
  AttributesImpl<std::string> atts_;  // default attributes
  ElementType* parent_;               // parent of this element type, 0 if none has been set
  Schema* schema_;                    // schema to which this belongs, 0 for ElementType::Null

public:
  static ElementType Null;

private:
  /**
     Construct the null element type: every name is "<null>", every
     bitmap is 0, and there is no parent and no schema.
  */
  ElementType() :
    name_("<null>"),
    namespace_("<null>"),
    localName_("<null>"),
    model_(0),
    memberOf_(0),
    flags_(0),
    atts_(),
    parent_(0),
    schema_(0)
  {
  } // ElementType

  /**
     Construct an ElementType:
     but it's better to use Schema::elementType() instead.
     The content model, member-of, and flags vectors are specified as ints.
     @param name The element type name
     @param model ORed-together bits representing the content models
       allowed in the content of this element type
     @param memberOf ORed-together bits representing the content models
       to which this element type belongs
     @param flags ORed-together bits representing the flags associated
       with this element type
     @param schema The schema with which this element type will be
       associated
  */
  ElementType(const std::string& name, int model, int memberOf, int flags, Schema& schema) :
    name_(name),
    namespace_(),
    localName_(),
    model_(model),
    memberOf_(memberOf),
    flags_(flags),
    atts_(),
    parent_(0),
    schema_(&schema)
  {
    // namespaceName() reads schema_, so the derived names are filled in
    // here, once every member has been initialised
    namespace_ = namespaceName(name, false);
    localName_ = localName(name);
  } // ElementType

  /**
     Copy an element type, including its default attributes.
     The parent and schema pointers are copied, not what they point to.
  */
  ElementType(const ElementType& rhs) :
    name_(rhs.name_),
    namespace_(rhs.namespace_),
    localName_(rhs.localName_),
    model_(rhs.model_),
    memberOf_(rhs.memberOf_),
    flags_(rhs.flags_),
    atts_(rhs.atts_),       // was left out, so copies lost their default attributes
    parent_(rhs.parent_),
    schema_(rhs.schema_)
  {
  } // ElementType

  friend class SchemaImpl;

public:
  /**
     Return a namespace name from a Qname.
     The attribute flag tells us whether to return an empty namespace
     name if there is no prefix, or use the schema default instead.
     The <tt>xml</tt> prefix maps to the XML namespace; any other prefix
     maps to <tt>urn:x-prefix:</tt> followed by the prefix.
     @param name The Qname
     @param attribute True if name is an attribute name
     @return The namespace name
  **/
  std::string namespaceName(const std::string& name, bool attribute) const
  {
    const size_t colon = name.find(':');
    if (colon == std::string::npos)
      return attribute ? "" : schema_->getURI();

    const std::string prefix = name.substr(0, colon);
    if (prefix == "xml")
      return "http://www.w3.org/XML/1998/namespace";
    return "urn:x-prefix:" + prefix;
  } // namespaceName

  /**
     Return a local name from a Qname.
     @param name The Qname
     @return The part after the first colon, or the whole name if it
             contains no colon
  **/
  std::string localName(const std::string& name) const
  {
    const size_t colon = name.find(':');
    if (colon == std::string::npos)
      return name;
    return name.substr(colon+1);
  } // localName

  /**
     Returns the name of this element type.
     @return The name of the element type
  */
  std::string name() const { return name_; }

  /**
     Returns the namespace name of this element type.
     @return The namespace name of the element type
  */
  std::string namespaceName() const { return namespace_; }

  /**
     Returns the local name of this element type.
     @return The local name of the element type
  */
  std::string localName() const { return localName_; }

  /**
     Returns the content models of this element type.
     @return The content models of this element type as a vector of bits
  */
  int model() const { return model_; }

  /**
     Returns the content models to which this element type belongs.
     @return The content models to which this element type belongs as a
             vector of bits
  */
  int memberOf() const { return memberOf_; }

  /**
     Returns the flags associated with this element type.
     @return The flags associated with this element type as a vector of bits
  */
  int flags() const { return flags_; }

  /**
     Returns the default attributes associated with this element type.
     The reference is const; use setAttribute() to change the defaults.
  */
  const AttributesImpl<std::string>& atts() const { return atts_; }

  /**
     Returns the parent element type of this element type.
     @return The parent element type, or ElementType::Null if no parent
             has been set
  */
  ElementType& parent() const
  {
    // parent_ stays 0 until setParent() is called; hand back the Null
    // type instead of dereferencing a null pointer
    if(!parent_)
      return Null;
    return *parent_;
  } // parent

  /**
     Returns the schema which this element type is associated with.
     Must not be called on ElementType::Null, which has no schema.
     @return The schema
  */
  Schema& schema() const
  {
    return *schema_;
  } // schema

  /**
     Returns true if this element type can contain another element type.
     That is, if any of the models in this element's model vector
     match any of the models in the other element type's member-of
     vector.
     @param other The other element type
  */
  bool canContain(const ElementType& other) const
  {
    return (model_ & other.memberOf_) != 0;
  } // canContain

  /**
     Sets an attribute and its value into an AttributesImpl object.
     Attempts to set a namespace declaration are ignored.
     An empty type means "CDATA" for a new attribute and "keep the
     current type" for an existing one.  Values of any type other than
     CDATA have their whitespace normalized.
     @param atts The AttributesImpl object
     @param name The name (Qname) of the attribute
     @param type The type of the attribute
     @param value The value of the attribute
  */
  void setAttribute(AttributesImpl<std::string>& atts,
                    const std::string& name,
                    const std::string& type,
                    const std::string& value)
  {
    if (name == "xmlns" || name.find("xmlns:") == 0)
      return;

    const std::string namespaceN = namespaceName(name, true);
    const std::string localN = localName(name);
    std::string actualType = type;
    std::string actualValue = value;

    const int i = atts.getIndex(name);
    const bool isNew = (i == -1);

    if (actualType == "")
      actualType = isNew ? std::string("CDATA") : std::string(atts.getType(i));
    if (actualType != "CDATA")
      actualValue = Arabica::text::normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);

    if (isNew)
      atts.addAttribute(namespaceN, localN, name, actualType, actualValue);
    else
      atts.setAttribute(i, namespaceN, localN, name, actualType, actualValue);
  } // setAttribute

  /**
     Sets an attribute and its value into this element type.
     @param name The name of the attribute
     @param type The type of the attribute
     @param value The value of the attribute
  */
  void setAttribute(const std::string& name, const std::string& type, const std::string& value)
  {
    setAttribute(atts_, name, type, value);
  } // setAttribute

  /**
     Sets the models of this element type.
     @param model The content models of this element type as a vector of bits
  */
  void setModel(int model)
  {
    model_ = model;
  } // setModel

  /**
     Sets the content models to which this element type belongs.
     @param memberOf The content models to which this element type belongs as a vector of bits
  */
  void setMemberOf(int memberOf)
  {
    memberOf_ = memberOf;
  } // setMemberOf

  /**
     Sets the flags of this element type.
     @param flags The flags associated with this element type as a vector of bits
  */
  void setFlags(int flags)
  {
    flags_ = flags;
  } // setFlags

  /**
     Sets the parent element type of this element type.
     Only the address is stored; the parent must outlive this object.
     @param parent The parent element type
  */
  void setParent(ElementType& parent)
  {
    parent_ = &parent;
  } // setParent

  /**
     Compares names, bitmaps, and the parent and schema pointers.
     The default attributes are not part of the comparison.
  */
  bool operator==(const ElementType& rhs) const
  {
    return (name_ == rhs.name_) &&
           (namespace_ == rhs.namespace_) &&
           (localName_ == rhs.localName_) &&
           (model_ == rhs.model_) &&
           (memberOf_ == rhs.memberOf_) &&
           (flags_ == rhs.flags_) &&
           (parent_ == rhs.parent_) &&
           (schema_ == rhs.schema_);
  } // operator ==

  /**
     Member-wise assignment; the parent and schema pointers are copied,
     not what they point to.
  */
  ElementType& operator=(const ElementType& rhs)
  {
    name_ = rhs.name_;
    namespace_ = rhs.namespace_;
    localName_ = rhs.localName_;
    model_ = rhs.model_;
    memberOf_ = rhs.memberOf_;
    flags_ = rhs.flags_;
    atts_ = rhs.atts_;
    parent_ = rhs.parent_;
    schema_ = rhs.schema_;
    return *this;
  } // operator=
}; // class ElementType

// NOTE(review): this is a non-inline definition of a static data member in a
// header; including the header from more than one translation unit of the
// same program will give duplicate definitions - confirm how it is used.
ElementType ElementType::Null;
} // namespace SAX
} // namespace Arabica
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,105 @@
#ifndef ARABICA_SCAN_HANDLER_HPP
#define ARABICA_SCAN_HANDLER_HPP
#include <string>
namespace Arabica
{
namespace SAX
{
/**
An interface that Scanners use to report events in the input stream.
This code is derived from John Cowan's splendid TagSoup package
*/
class ScanHandler
{
public:
  /**
     Virtual so that an implementation can be destroyed safely through
     a ScanHandler pointer or reference.
  **/
  virtual ~ScanHandler() { }

  /**
     Reports an attribute name without a value.
  **/
  virtual void adup(const std::string& buff) = 0;

  /**
     Reports an attribute name; a value will follow.
  **/
  virtual void aname(const std::string& buff) = 0;

  /**
     Reports an attribute value.
  **/
  virtual void aval(const std::string& buff) = 0;

  /**
     Reports the content of a CDATA section (not a CDATA element)
  **/
  virtual void cdsect(const std::string& buff) = 0;

  /**
     Reports a <!....> declaration - typically a DOCTYPE
  **/
  virtual void decl(const std::string& buff) = 0;

  /**
     Reports an entity reference or character reference.
  **/
  virtual void entity(const std::string& buff) = 0;

  /**
     Reports EOF.
  **/
  virtual void eof(const std::string& buff) = 0;

  /**
     Reports an end-tag.
  **/
  virtual void etag(const std::string& buff) = 0;

  /**
     Reports the general identifier (element type name) of a start-tag.
  **/
  virtual void gi(const std::string& buff) = 0;

  /**
     Reports character content.
  **/
  virtual void pcdata(const std::string& buff) = 0;

  /**
     Reports the data part of a processing instruction.
  **/
  virtual void pi(const std::string& buff) = 0;

  /**
     Reports the target part of a processing instruction.
  **/
  virtual void pitarget(const std::string& buff) = 0;

  /**
     Reports the close of a start-tag.
  **/
  virtual void stagc(const std::string& buff) = 0;

  /**
     Reports the close of an empty-tag.
  **/
  virtual void stage(const std::string& buff) = 0;

  /**
     Reports a comment.
  **/
  virtual void cmnt(const std::string& buff) = 0;

  /**
     Returns the value of the last entity or character reference reported.
  **/
  virtual int getEntity() = 0;
}; // class ScanHandler
} // namespace SAX
} // namespace Arabica
#endif

45
include/Taggle/impl/Scanner.hpp Executable file
View file

@ -0,0 +1,45 @@
#ifndef ARABICA_SAX_SCANNER_HPP
#define ARABICA_SAX_SCANNER_HPP
#include <istream>
#include <string>
namespace Arabica
{
namespace SAX
{
class ScanHandler;

/**
  An interface allowing Parser to invoke scanners.

  This code is derived from John Cowan's splendid TagSoup package
*/
class Scanner
{
public:
  /**
     Virtual so that an implementation can be destroyed safely through
     a Scanner pointer or reference.
  **/
  virtual ~Scanner() { }

  /**
     Invoke a scanner.
     @param r A source of characters to scan
     @param h A ScanHandler to report events to
  **/
  virtual void scan(std::istream& r, ScanHandler& h) = 0;

  /**
     Reset the embedded locator.
     @param publicid The publicid of the source
     @param systemid The systemid of the source
  **/
  virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid) = 0;

  /**
     Signal to the scanner to start CDATA content mode.
  **/
  virtual void startCDATA() = 0;
}; // Scanner
} // namespace SAX
} // namespace Arabica
#endif

View file

@ -0,0 +1,44 @@
#ifndef ARABICA_SAX_TAGGLE_SCHEMA_HPP
#define ARABICA_SAX_TAGGLE_SCHEMA_HPP
namespace Arabica
{
namespace SAX
{
class ElementType;

/**
Interface to a TSSL schema: the element types it defines, their
parent relationships, its character entities, and its namespace URI
and prefix.

Based on code from John Cowan's super TagSoup package
**/
class Schema
{
public:
  // Content model bits, used in the model and memberOf bitmaps.
  static const int M_ANY = ~0;        // every model bit set
  static const int M_EMPTY = 0;       // no model bit set
  static const int M_PCDATA = 1 << 30;
  // The top bit, built from an unsigned shift so that no signed shift
  // into the sign bit is needed.
  static const int M_ROOT = static_cast<int>(1u << 31);

  // Element type flag bits.
  static const int F_RESTART = 1;
  static const int F_CDATA = 2;
  static const int F_NOFORCE = 4;

  /**
     Add or replace an element type.
     @param name Name (Qname) of the element
     @param model Models of the element's content as a vector of bits
     @param memberOf Models the element is a member of as a vector of bits
     @param flags Flags for the element
  **/
  virtual void elementType(const std::string& name, int model, int memberOf, int flags) = 0;

  /** Get the root element type of this schema. **/
  virtual ElementType& rootElementType() = 0;

  /**
     Specify the natural parent of an element.
     @param name Name of the child element
     @param parentName Name of the parent element
  **/
  virtual void parent(std::string name, std::string parentName) = 0;

  /** Get an element type by name (Qname). **/
  virtual ElementType& getElementType(const std::string& name) = 0;

  /** Get an entity value by name. **/
  virtual int getEntity(const std::string& name) const = 0;

  /** Return the URI (namespace name) of this schema. **/
  virtual const std::string& getURI() const = 0;

  /** Return the prefix of this schema. **/
  virtual const std::string& getPrefix() const = 0;

  virtual ~Schema() { }
}; // class Schema
} // namespace SAX
} // namespace Arabica
#endif

View file

@ -0,0 +1,182 @@
#ifndef ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
#define ARABICA_SAX_TAGGLE_SCHEMAIMPL_HPP
#include <map>
#include <string>
#include <algorithm>
#include <cctype>
#include "ElementType.hpp"
#include "Schema.hpp"
namespace Arabica
{
namespace SAX
{
/**
Abstract class representing a TSSL schema.
Actual TSSL schemas are compiled into concrete subclasses of this class.
Based on code from John Cowan's super TagSoup package
**/
class SchemaImpl : public Schema
{
private:
std::map<std::string, char> entities_;
std::map<std::string, ElementType*> elementTypes_;
std::string URI_;
std::string prefix_;
ElementType* root_;
public:
virtual ~SchemaImpl()
{
for(std::map<std::string, ElementType*>::iterator i = elementTypes_.begin(), ie = elementTypes_.end(); i != ie; ++i)
delete i->second;
} // ~SchemaImpl
/**
Add or replace an element type for this schema.
@param name Name (Qname) of the element
@param model Models of the element's content as a vector of bits
@param memberOf Models the element is a member of as a vector of bits
@param flags Flags for the element
**/
void elementType(const std::string& name, int model, int memberOf, int flags)
{
ElementType* e = new ElementType(name, model, memberOf, flags, *this);
std::string lname = lower_case(name);
elementTypes_[lname] = e;
if(memberOf == M_ROOT)
root_ = elementTypes_[lname];
} // elementType
/**
Get the root element of this schema
**/
ElementType& rootElementType()
{
return *root_;
} // rootElementType
/**
Add or replace a default attribute for an element type in this schema.
@param elemName Name (Qname) of the element type
@param attrName Name (Qname) of the attribute
@param type Type of the attribute
@param value Default value of the attribute; null if no default
**/
void attribute(const std::string& elemName, const std::string& attrName, const std::string& type, const std::string& value)
{
ElementType& e = getElementType(elemName);
if (e == ElementType::Null)
{
throw std::runtime_error("Attribute " + attrName +
" specified for unknown element type " +
elemName);
}
e.setAttribute(attrName, type, value);
} // attribute
/**
Specify natural parent of an element in this schema.
@param name Name of the child element
@param parentName Name of the parent element
**/
void parent(std::string name, std::string parentName)
{
ElementType& child = getElementType(name);
ElementType& parent = getElementType(parentName);
if (child == ElementType::Null)
{
throw std::runtime_error("No child " + name + " for parent " + parentName);
}
if (parent == ElementType::Null)
{
throw std::runtime_error("No parent " + parentName + " for child " + name);
}
child.setParent(parent);
} // parent
/**
Add to or replace a character entity in this schema.
@param name Name of the entity
@param value Value of the entity
**/
void entity(const std::string& name, int value)
{
entities_[name] = value;
} // entity
/**
Get an ElementType by name.
@param name Name (Qname) of the element type
@return The corresponding ElementType
**/
ElementType& getElementType(const std::string& name)
{
std::map<std::string, ElementType*>::iterator elemType = elementTypes_.find(lower_case(name));
if(elemType == elementTypes_.end())
return ElementType::Null;
return *elemType->second;
} // getElementType
/**
Get an entity value by name.
@param name Name of the entity
@return The corresponding character, or 0 if none
**/
int getEntity(const std::string& name) const
{
std::map<std::string, char>::const_iterator ent = entities_.find(name);
if(ent == entities_.end())
return 0;
return ent->second;
} // getEntity
/**
Return the URI (namespace name) of this schema.
**/
const std::string& getURI() const
{
return URI_;
} // getURI
/**
Return the prefix of this schema.
**/
const std::string& getPrefix() const
{
return prefix_;
} // getPrefix
/**
Change the URI (namespace name) of this schema.
**/
void setURI(std::string uri)
{
URI_ = uri;
} // setURI
/**
Change the prefix of this schema.
**/
void setPrefix(std::string prefix)
{
prefix_ = prefix;
} // setPrefix
private:
static std::string lower_case(const std::string& str)
{
std::string lower;
std::transform(str.begin(), str.end(), std::back_inserter(lower), (int(*)(int))std::tolower);
return lower;
} // lower_case
}; // class Schema
} // namespace SAX
} // namespace Arabica
#endif

View file

@ -0,0 +1,49 @@
#ifndef ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
#define ARABICA_SAX_TAGGLE_HTML_MODELS_HPP
namespace Arabica
{
namespace SAX
{
/**
  Generated bit-mask constants naming the HTML content models.
  Logically these belong to HTMLSchema, but it is more convenient to
  generate them into a separate class that users derive from.
  Each constant has exactly one bit set, so models can be OR-ed together.

  Based on code from John Cowan's super TagSoup package
*/
class HTMLModels
{
protected:
  // Start of model definitions
  static const int M_AREA        = 0x000002;  // bit 1
  static const int M_BLOCK       = 0x000004;  // bit 2
  static const int M_BLOCKINLINE = 0x000008;  // bit 3
  static const int M_BODY        = 0x000010;  // bit 4
  static const int M_CELL        = 0x000020;  // bit 5
  static const int M_COL         = 0x000040;  // bit 6
  static const int M_DEF         = 0x000080;  // bit 7
  static const int M_FORM        = 0x000100;  // bit 8
  static const int M_FRAME       = 0x000200;  // bit 9
  static const int M_HEAD        = 0x000400;  // bit 10
  static const int M_HTML        = 0x000800;  // bit 11
  static const int M_INLINE      = 0x001000;  // bit 12
  static const int M_LEGEND      = 0x002000;  // bit 13
  static const int M_LI          = 0x004000;  // bit 14
  static const int M_NOLINK      = 0x008000;  // bit 15
  static const int M_OPTION      = 0x010000;  // bit 16
  static const int M_OPTIONS     = 0x020000;  // bit 17
  static const int M_P           = 0x040000;  // bit 18
  static const int M_PARAM       = 0x080000;  // bit 19
  static const int M_TABLE       = 0x100000;  // bit 20
  static const int M_TABULAR     = 0x200000;  // bit 21
  static const int M_TR          = 0x400000;  // bit 22
}; // class HTMLModels
} // namespace SAX
} // namespace Arabica
#endif

View file

@ -0,0 +1,704 @@
#ifndef ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
#define ARABICA_SAX_TAGGLE_HTML_SCANNER_HPP
#include <SAX/SAXException.hpp>
#include <SAX/Locator.hpp>
#include <XML/XMLCharacterClasses.hpp>
#include "../Scanner.hpp"
namespace Arabica
{
namespace SAX
{
/**
This class implements a table-driven scanner for HTML, allowing for lots of
defects. It implements the Scanner interface, which accepts a Reader
object to fetch characters from and a ScanHandler object to report lexical
events to.
Based on code from John Cowan's super TagSoup package
*/
class HTMLScanner : public Scanner, public SAX::Locator<std::string>
{
private:
// Start of state table
// S_* are the scanner states: they appear in the first (current state) and
// fourth (next state) columns of statetable and are held in state_ /
// nextState_.  debug_statenames lists their names in the same numeric order.
static const int S_ANAME = 1;
static const int S_APOS = 2;
static const int S_AVAL = 3;
static const int S_BB = 4;
static const int S_BBC = 5;
static const int S_BBCD = 6;
static const int S_BBCDA = 7;
static const int S_BBCDAT = 8;
static const int S_BBCDATA = 9;
static const int S_CDATA = 10;
static const int S_CDATA2 = 11;
static const int S_CDSECT = 12;
static const int S_CDSECT1 = 13;
static const int S_CDSECT2 = 14;
static const int S_COM = 15;
static const int S_COM2 = 16;
static const int S_COM3 = 17;
static const int S_COM4 = 18;
static const int S_DECL = 19;
static const int S_DECL2 = 20;
static const int S_DONE = 21;
static const int S_EMPTYTAG = 22;
static const int S_ENT = 23;
static const int S_EQ = 24;
static const int S_ETAG = 25;
static const int S_GI = 26;
static const int S_NCR = 27;
static const int S_PCDATA = 28;
static const int S_PI = 29;
static const int S_PITARGET = 30;
static const int S_QUOT = 31;
static const int S_STAGC = 32;
static const int S_TAG = 33;
static const int S_TAGWS = 34;
static const int S_XNCR = 35;
// A_* are the actions: they appear in the third column of statetable and are
// dispatched by the switch in scan().  debug_actionnames lists their names
// in the same numeric order.  0 is not an action; scan() treats it as
// "no transition found".
static const int A_ADUP = 1;
static const int A_ADUP_SAVE = 2;
static const int A_ADUP_STAGC = 3;
static const int A_ANAME = 4;
static const int A_ANAME_ADUP = 5;
static const int A_ANAME_ADUP_STAGC = 6;
static const int A_AVAL = 7;
static const int A_AVAL_STAGC = 8;
static const int A_CDATA = 9;
static const int A_CMNT = 10;
static const int A_DECL = 11;
static const int A_EMPTYTAG = 12;
static const int A_ENTITY = 13;
static const int A_ENTITY_START = 14;
static const int A_ETAG = 15;
static const int A_GI = 16;
static const int A_GI_STAGC = 17;
static const int A_LT = 18;
static const int A_LT_PCDATA = 19;
static const int A_MINUS = 20;
static const int A_MINUS2 = 21;
static const int A_MINUS3 = 22;
static const int A_PCDATA = 23;
static const int A_PI = 24;
static const int A_PITARGET = 25;
static const int A_PITARGET_PI = 26;
static const int A_SAVE = 27;
static const int A_SKIP = 28;
static const int A_SP = 29;
static const int A_STAGC = 30;
static const int A_UNGET = 31;
static const int A_UNSAVE_PCDATA = 32;
// Transition table: rows of four ints (state, character, action, next state),
// ended by a row of -1s.
static const int statetable[];
static const std::string debug_actionnames[];
static const std::string debug_statenames[];
// End of state table
// Replacement values for 0x80..0x9F; scan() indexes it with (value - 0x80).
static const int WinCharMap[]; // Windows char map
// The letters accepted, besides digits, while collecting a hexadecimal
// character reference (state S_XNCR).
static const std::string hexLetters;
std::string publicId_; // Locator state
std::string systemId_;
// Position saved by the last mark() call; this is what getLineNumber() and
// getColumnNumber() report.
int lastLine_;
int lastColumn_;
// Position of the character scan() is currently processing.
int currentLine_;
int currentColumn_;
int state_; // Current state
// Chosen by the table lookup in scan(); an action or startCDATA() may
// overwrite it before it becomes state_.
int nextState_; // Next state
std::string outputBuffer_; // Output buffer
// Left over from the Java original (not compiled):
// Compensate for bug in PushbackReader that allows
// pushing back EOF.
//void unread(PushbackReader r, int c) throws IOException {
// if (c != -1) r.unread(c);
// }
public:
/**
  Create a scanner with empty ids, all positions and states at zero, and
  room for 200 characters set aside in the output buffer.
*/
HTMLScanner() :
  lastLine_(0),
  lastColumn_(0),
  currentLine_(0),
  currentColumn_(0),
  state_(0),
  nextState_(0)
{
  // publicId_, systemId_ and outputBuffer_ default-construct to empty strings
  outputBuffer_.reserve(200);
} // HTMLScanner
// Locator implementation
// Line recorded by the most recent mark() call, not the line currently
// being read; 0 after construction or resetDocumentLocator().
int getLineNumber() const
{
return lastLine_;
} // getLineNumber
// Column recorded by the most recent mark() call, not the column currently
// being read; 0 after construction or resetDocumentLocator().
int getColumnNumber() const
{
return lastColumn_;
} // getColumnNumber
// Public id last passed to resetDocumentLocator(); empty until then.
std::string getPublicId() const
{
return publicId_;
} // getPublicId
// System id last passed to resetDocumentLocator(); empty until then.
std::string getSystemId() const
{
return systemId_;
} // getSystemId
// Scanner implementation
/**
  Reset the document locator: remember the new ids and rewind both the
  marked and the current position to line 0, column 0.
  @param publicid Public id
  @param systemid System id
*/
virtual void resetDocumentLocator(const std::string& publicid, const std::string& systemid)
{
  publicId_ = publicid;
  systemId_ = systemid;

  currentLine_ = 0;
  currentColumn_ = 0;
  lastLine_ = 0;
  lastColumn_ = 0;
} // resetDocumentLocator
/**
  Scan HTML source, reporting lexical events.

  Characters are read from r one at a time.  For each character the state
  table is searched for the row matching the current state and character;
  the row's action is carried out (usually by calling h) and the row's next
  state becomes current.  Scanning stops when the state becomes S_DONE, at
  which point h.eof("") is called.

  @param r Stream that provides characters
  @param h ScanHandler that accepts lexical events.
  @throws std::runtime_error if the table has no transition for the current
          state and character, or names an action this function does not
          implement
*/
virtual void scan(std::istream& r, ScanHandler& h)
{
  state_ = S_PCDATA;

  // The Java original wrapped its Reader in a PushbackReader and dropped a
  // leading BOM (U+FEFF).  This port reads the stream directly and does no
  // BOM handling.

  while (state_ != S_DONE)
  {
    int ch = r.get();

    // Process control characters
    if (ch >= 0x80 && ch <= 0x9F)
      ch = WinCharMap[ch-0x80];

    // CR and CR LF are both reported as a single '\n'
    if (ch == '\r')
    {
      ch = r.get(); // expect LF next
      if (ch != '\n')
      {
        r.unget();
        ch = '\n';
      }
    }

    if (ch == '\n')
    {
      ++currentLine_;
      currentColumn_ = 0;
    }
    else
    {
      ++currentColumn_;
    }

    // Silently drop control characters other than newline and tab;
    // -1 (end of input) has to reach the table.
    if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
      continue;

    // Search state table
    // The rows of one state are contiguous.  A row whose character is 0 is
    // the state's default: it is remembered, but the search goes on in case
    // a later row of the same state names ch exactly.  Once something has
    // been found, reaching the rows of another state ends the search.
    int action = 0;
    for (int i = 0; statetable[i] != -1; i += 4)
    {
      if (state_ != statetable[i])
      {
        if (action != 0)
          break;
        continue;
      }
      if (statetable[i+1] == 0)
      {
        action = statetable[i+2];
        nextState_ = statetable[i+3];
      }
      else if (statetable[i+1] == ch)
      {
        action = statetable[i+2];
        nextState_ = statetable[i+3];
        break;
      }
    } // for ...

    switch (action)
    {
      case 0:
        {
          std::ostringstream os;
          os << "HTMLScanner can't cope with " << ch << " in state " << state_;
          throw std::runtime_error(os.str());
        }
      case A_ADUP:
        h.adup(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_ADUP_SAVE:
        h.adup(outputBuffer_);
        outputBuffer_.clear();
        save(ch, h);
        break;
      case A_ADUP_STAGC:
        h.adup(outputBuffer_);
        outputBuffer_.clear();
        h.stagc(outputBuffer_);
        break;
      case A_ANAME:
        h.aname(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_ANAME_ADUP:
        h.aname(outputBuffer_);
        outputBuffer_.clear();
        h.adup(outputBuffer_);
        break;
      case A_ANAME_ADUP_STAGC:
        h.aname(outputBuffer_);
        outputBuffer_.clear();
        h.adup(outputBuffer_);
        h.stagc(outputBuffer_);
        break;
      case A_AVAL:
        h.aval(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_AVAL_STAGC:
        h.aval(outputBuffer_);
        outputBuffer_.clear();
        h.stagc(outputBuffer_);
        break;
      case A_CDATA:
        mark();
        // suppress the final "]]" in the buffer
        if (outputBuffer_.size() > 1)
          outputBuffer_.erase(outputBuffer_.size()-2);
        h.pcdata(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_ENTITY_START:
        h.pcdata(outputBuffer_);
        outputBuffer_.clear();
        save(ch, h);
        break;
      case A_ENTITY:
        {
          mark();
          char ch1 = (char)ch;
          // System.out.println("Got " + ch1 + " in state " + ((state_ == S_ENT) ? "S_ENT" : ((state_ == S_NCR) ? "S_NCR" : "UNK")));
          // While ch can still be part of the reference, just collect it.
          if (state_ == S_ENT && ch1 == '#')
          {
            nextState_ = S_NCR;
            save(ch, h);
            break;
          }
          else if (state_ == S_NCR && (ch1 == 'x' || ch1 == 'X'))
          {
            nextState_ = S_XNCR;
            save(ch, h);
            break;
          }
          else if (state_ == S_ENT && XML::is_letter_or_digit(ch1))
          {
            save(ch, h);
            break;
          }
          else if (state_ == S_NCR && XML::is_digit(ch1))
          {
            save(ch, h);
            break;
          }
          else if (state_ == S_XNCR && (XML::is_digit(ch1) || hexLetters.find(ch1) != std::string::npos))
          {
            save(ch, h);
            break;
          }

          // The whole entity reference has been collected
          // (the buffer's first character is skipped when handing it over)
          h.entity(outputBuffer_.substr(1, outputBuffer_.size()-1));
          int ent = h.getEntity();
          if (ent != 0)
          {
            outputBuffer_.clear();
            if (ent >= 0x80 && ent <= 0x9F)
            {
              ent = WinCharMap[ent-0x80];
            }
            if (ent < 0x20)
            {
              // Control becomes space
              // NOTE(review): the space is assigned but never saved, so the
              // reference currently produces no output at all - confirm
              // whether a save(ent, h) is missing here.
              ent = 0x20;
            }
            else if (ent >= 0xD800 && ent <= 0xDFFF)
            {
              // Surrogates get dropped
              ent = 0;
            }
            else if (ent <= 0xFFFF)
            {
              // BMP character
              save(ent, h);
            }
            else
            {
              // Astral converted to two surrogates
              ent -= 0x10000;
              save((ent>>10) + 0xD800, h);
              save((ent&0x3FF) + 0xDC00, h);
            }
            // a terminating ';' is consumed; anything else is read again
            if (ch != ';')
            {
              r.unget();
              currentColumn_--;
            }
          }
          else
          {
            // Unknown entity: the collected text stays in the buffer and
            // the terminating character is read again as ordinary text.
            r.unget();
            currentColumn_--;
          }
          nextState_ = S_PCDATA;
        } // case A_ENTITY:
        break;
      case A_ETAG:
        h.etag(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_DECL:
        h.decl(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_GI:
        h.gi(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_GI_STAGC:
        h.gi(outputBuffer_);
        outputBuffer_.clear();
        h.stagc(outputBuffer_);
        break;
      case A_LT:
        mark();
        save('<', h);
        save(ch, h);
        break;
      case A_LT_PCDATA:
        mark();
        save('<', h);
        h.pcdata(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_PCDATA:
        mark();
        h.pcdata(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_CMNT:
        mark();
        h.cmnt(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_MINUS3:
        save('-', h);
        save(' ', h);
        break;
      case A_MINUS2:
        save('-', h);
        save(' ', h);
        // fall through into A_MINUS
      case A_MINUS:
        save('-', h);
        save(ch, h);
        break;
      case A_PI:
        mark();
        h.pi(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_PITARGET:
        h.pitarget(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_PITARGET_PI:
        h.pitarget(outputBuffer_);
        outputBuffer_.clear();
        h.pi(outputBuffer_);
        break;
      case A_SAVE:
        save(ch, h);
        break;
      case A_SKIP:
        break;
      case A_SP:
        save(' ', h);
        break;
      case A_STAGC:
        h.stagc(outputBuffer_);
        outputBuffer_.clear();
        break;
      case A_EMPTYTAG:
        mark();
        if (outputBuffer_.size() > 0)
          h.gi(outputBuffer_);
        outputBuffer_.clear();
        h.stage(outputBuffer_);
        break;
      case A_UNGET:
        r.unget();
        currentColumn_--;
        break;
      case A_UNSAVE_PCDATA:
        if (outputBuffer_.size() > 0)
          outputBuffer_.erase(outputBuffer_.size()-1);
        h.pcdata(outputBuffer_);
        outputBuffer_.clear();
        break;
      default:
        {
          // Build the message with a stream: adding an int to a string
          // literal offsets the pointer instead of appending the number.
          std::ostringstream os;
          os << "Can't process state " << action;
          throw std::runtime_error(os.str());
        }
    } // switch ...

    state_ = nextState_;
  } // while (state_ != S_DONE)

  h.eof("");
} // scan
/**
A callback for the ScanHandler that allows it to force
the lexer state to CDATA content (no markup is recognized except
the end of element).
This only sets the pending next state (S_CDATA); scan() makes it the current
state once the action in progress has finished.
*/
void startCDATA()
{
nextState_ = S_CDATA;
} // startCDATA
private:
/**
 * Remember the current scan position as a "point of interest" - the start
 * of a tag, cdata, processing instruction etc.  The remembered position is
 * the one the Locator accessors report.
 */
void mark()
{
  lastLine_ = currentLine_;
  lastColumn_ = currentColumn_;
} // mark
/**
 * Append a character to the output buffer.
 * When the buffer is within 20 characters of its capacity and the scanner
 * is in character data (S_PCDATA or S_CDATA), the text gathered so far is
 * first handed to the handler as a chunk of pcdata and the buffer emptied.
 * In any other state the buffer is simply left to grow.
 * @param ch Character to append; it is narrowed to a single char
 * @param h  Handler that receives the flushed chunk
 */
void save(int ch, ScanHandler& h)
{
  const bool nearlyFull = outputBuffer_.size() >= outputBuffer_.capacity() - 20;
  const bool inCharacterData = (state_ == S_PCDATA) || (state_ == S_CDATA);

  if (nearlyFull && inCharacterData)
  {
    // Return a buffer-sized chunk of PCDATA
    h.pcdata(outputBuffer_);
    outputBuffer_.clear();
  }

  outputBuffer_ += static_cast<char>(ch);
} // save
/**
 * Render a character code for diagnostic output: a newline as the two
 * characters \n, codes of 32 and above as the quoted character, and
 * anything below 32 as a hexadecimal number.
 */
static std::string nicechar(int in)
{
  std::ostringstream text;
  if (in == '\n')
    text << "\\n";
  else if (in < 32)
    text << std::hex << std::showbase << in;
  else
    text << '\'' << static_cast<char>(in) << '\'';
  return text.str();
} // nicechar
// Copying and comparing scanners is not supported: these are declared in the
// private section and no definition appears in this header.
HTMLScanner(const HTMLScanner&);
bool operator==(const HTMLScanner&) const;
HTMLScanner& operator=(const HTMLScanner&);
}; // class HTMLScanner
// Transition table driving HTMLScanner::scan().
// Every row is four ints: current state, character, action, next state.
//  - a character of 0 marks the state's default row, used when no other row
//    of that state names the character read;
//  - a character of -1 is the row used at end of input;
//  - the final row of -1s ends the table.
// scan() depends on all rows of a state being adjacent, so keep them
// together when editing.
// NOTE(review): this is a non-inline definition in a header; including the
// header from more than one translation unit would define it twice -
// confirm how the header is meant to be used.
const int HTMLScanner::statetable[] = {
S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
S_ANAME, '=', A_ANAME, S_AVAL,
S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
S_ANAME, 0, A_SAVE, S_ANAME,
S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
S_ANAME, ' ', A_ANAME, S_EQ,
S_ANAME, '\n', A_ANAME, S_EQ,
S_ANAME, '\t', A_ANAME, S_EQ,
S_APOS, '\'', A_AVAL, S_TAGWS,
S_APOS, 0, A_SAVE, S_APOS,
S_APOS, -1, A_AVAL_STAGC, S_DONE,
S_APOS, ' ', A_SP, S_APOS,
S_APOS, '\n', A_SP, S_APOS,
S_APOS, '\t', A_SP, S_APOS,
S_AVAL, '\'', A_SKIP, S_APOS,
S_AVAL, '"', A_SKIP, S_QUOT,
S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
S_AVAL, 0, A_SAVE, S_STAGC,
S_AVAL, -1, A_AVAL_STAGC, S_DONE,
S_AVAL, ' ', A_SKIP, S_AVAL,
S_AVAL, '\n', A_SKIP, S_AVAL,
S_AVAL, '\t', A_SKIP, S_AVAL,
S_BB, 'C', A_SKIP, S_BBC,
S_BB, 0, A_SKIP, S_DECL,
S_BB, -1, A_SKIP, S_DONE,
S_BBC, 'D', A_SKIP, S_BBCD,
S_BBC, 0, A_SKIP, S_DECL,
S_BBC, -1, A_SKIP, S_DONE,
S_BBCD, 'A', A_SKIP, S_BBCDA,
S_BBCD, 0, A_SKIP, S_DECL,
S_BBCD, -1, A_SKIP, S_DONE,
S_BBCDA, 'T', A_SKIP, S_BBCDAT,
S_BBCDA, 0, A_SKIP, S_DECL,
S_BBCDA, -1, A_SKIP, S_DONE,
S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
S_BBCDAT, 0, A_SKIP, S_DECL,
S_BBCDAT, -1, A_SKIP, S_DONE,
S_BBCDATA, '[', A_SKIP, S_CDSECT,
S_BBCDATA, 0, A_SKIP, S_DECL,
S_BBCDATA, -1, A_SKIP, S_DONE,
S_CDATA, '<', A_SAVE, S_CDATA2,
S_CDATA, 0, A_SAVE, S_CDATA,
S_CDATA, -1, A_PCDATA, S_DONE,
S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
S_CDATA2, 0, A_SAVE, S_CDATA,
S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
S_CDSECT, ']', A_SAVE, S_CDSECT1,
S_CDSECT, 0, A_SAVE, S_CDSECT,
S_CDSECT, -1, A_SKIP, S_DONE,
S_CDSECT1, ']', A_SAVE, S_CDSECT2,
S_CDSECT1, 0, A_SAVE, S_CDSECT,
S_CDSECT1, -1, A_SKIP, S_DONE,
S_CDSECT2, '>', A_CDATA, S_PCDATA,
S_CDSECT2, 0, A_SAVE, S_CDSECT,
S_CDSECT2, -1, A_SKIP, S_DONE,
S_COM, '-', A_SKIP, S_COM2,
S_COM, 0, A_SAVE, S_COM2,
S_COM, -1, A_CMNT, S_DONE,
S_COM2, '-', A_SKIP, S_COM3,
S_COM2, 0, A_SAVE, S_COM2,
S_COM2, -1, A_CMNT, S_DONE,
S_COM3, '-', A_SKIP, S_COM4,
S_COM3, 0, A_MINUS, S_COM2,
S_COM3, -1, A_CMNT, S_DONE,
S_COM4, '-', A_MINUS3, S_COM4,
S_COM4, '>', A_CMNT, S_PCDATA,
S_COM4, 0, A_MINUS2, S_COM2,
S_COM4, -1, A_CMNT, S_DONE,
S_DECL, '-', A_SKIP, S_COM,
S_DECL, '[', A_SKIP, S_BB,
S_DECL, '>', A_SKIP, S_PCDATA,
S_DECL, 0, A_SAVE, S_DECL2,
S_DECL, -1, A_SKIP, S_DONE,
S_DECL2, '>', A_DECL, S_PCDATA,
S_DECL2, 0, A_SAVE, S_DECL2,
S_DECL2, -1, A_SKIP, S_DONE,
S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
S_EMPTYTAG, 0, A_SAVE, S_ANAME,
S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
S_ENT, 0, A_ENTITY, S_ENT,
S_ENT, -1, A_ENTITY, S_DONE,
S_EQ, '=', A_SKIP, S_AVAL,
S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
S_EQ, 0, A_ADUP_SAVE, S_ANAME,
S_EQ, -1, A_ADUP_STAGC, S_DONE,
S_EQ, ' ', A_SKIP, S_EQ,
S_EQ, '\n', A_SKIP, S_EQ,
S_EQ, '\t', A_SKIP, S_EQ,
S_ETAG, '>', A_ETAG, S_PCDATA,
S_ETAG, 0, A_SAVE, S_ETAG,
S_ETAG, -1, A_ETAG, S_DONE,
S_ETAG, ' ', A_SKIP, S_ETAG,
S_ETAG, '\n', A_SKIP, S_ETAG,
S_ETAG, '\t', A_SKIP, S_ETAG,
S_GI, '/', A_SKIP, S_EMPTYTAG,
S_GI, '>', A_GI_STAGC, S_PCDATA,
S_GI, 0, A_SAVE, S_GI,
S_GI, -1, A_SKIP, S_DONE,
S_GI, ' ', A_GI, S_TAGWS,
S_GI, '\n', A_GI, S_TAGWS,
S_GI, '\t', A_GI, S_TAGWS,
S_NCR, 0, A_ENTITY, S_NCR,
S_NCR, -1, A_ENTITY, S_DONE,
S_PCDATA, '&', A_ENTITY_START, S_ENT,
S_PCDATA, '<', A_PCDATA, S_TAG,
S_PCDATA, 0, A_SAVE, S_PCDATA,
S_PCDATA, -1, A_PCDATA, S_DONE,
S_PI, '>', A_PI, S_PCDATA,
S_PI, 0, A_SAVE, S_PI,
S_PI, -1, A_PI, S_DONE,
S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
S_PITARGET, 0, A_SAVE, S_PITARGET,
S_PITARGET, -1, A_PITARGET_PI, S_DONE,
S_PITARGET, ' ', A_PITARGET, S_PI,
S_PITARGET, '\n', A_PITARGET, S_PI,
S_PITARGET, '\t', A_PITARGET, S_PI,
S_QUOT, '"', A_AVAL, S_TAGWS,
S_QUOT, 0, A_SAVE, S_QUOT,
S_QUOT, -1, A_AVAL_STAGC, S_DONE,
S_QUOT, ' ', A_SP, S_QUOT,
S_QUOT, '\n', A_SP, S_QUOT,
S_QUOT, '\t', A_SP, S_QUOT,
S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
S_STAGC, 0, A_SAVE, S_STAGC,
S_STAGC, -1, A_AVAL_STAGC, S_DONE,
S_STAGC, ' ', A_AVAL, S_TAGWS,
S_STAGC, '\n', A_AVAL, S_TAGWS,
S_STAGC, '\t', A_AVAL, S_TAGWS,
S_TAG, '!', A_SKIP, S_DECL,
S_TAG, '?', A_SKIP, S_PITARGET,
S_TAG, '/', A_SKIP, S_ETAG,
S_TAG, '<', A_SAVE, S_TAG,
S_TAG, 0, A_SAVE, S_GI,
S_TAG, -1, A_LT_PCDATA, S_DONE,
S_TAG, ' ', A_LT, S_PCDATA,
S_TAG, '\n', A_LT, S_PCDATA,
S_TAG, '\t', A_LT, S_PCDATA,
S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
S_TAGWS, '>', A_STAGC, S_PCDATA,
S_TAGWS, 0, A_SAVE, S_ANAME,
S_TAGWS, -1, A_STAGC, S_DONE,
S_TAGWS, ' ', A_SKIP, S_TAGWS,
S_TAGWS, '\n', A_SKIP, S_TAGWS,
S_TAGWS, '\t', A_SKIP, S_TAGWS,
S_XNCR, 0, A_ENTITY, S_XNCR,
S_XNCR, -1, A_ENTITY, S_DONE,
-1, -1, -1, -1
}; // HTMLScanner::statetable
// Names of the A_* action constants, positioned so that the index equals the
// constant's value; index 0 (no action) is the empty string.
const std::string HTMLScanner::debug_actionnames[] = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
// Names of the S_* state constants, positioned so that the index equals the
// constant's value; index 0 (no state) is the empty string.
const std::string HTMLScanner::debug_statenames[] = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
// Replacement values for the 32 codes 0x80..0x9F, in order: entry i replaces
// code 0x80 + i.  scan() applies it both to characters read from the stream
// and to entity values.  0xFFFD fills the slots that have no replacement.
// NOTE(review): presumably the Windows-1252 to Unicode mapping - confirm.
const int HTMLScanner::WinCharMap[] = { // Windows chars map
0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
}; // HTMLScanner::WinCharMap
// Letters that, together with the decimal digits, scan() accepts as part of
// a hexadecimal character reference.
const std::string HTMLScanner::hexLetters = "abcdefABCDEF";
} // namespace SAX
} // namespace Arabica
#endif

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,7 @@ namespace XML
bool is_digit(wchar_t c); bool is_digit(wchar_t c);
bool is_combining_char(wchar_t c); bool is_combining_char(wchar_t c);
bool is_extender(wchar_t c); bool is_extender(wchar_t c);
bool is_letter_or_digit(wchar_t c);
} // namespace XML } // namespace XML
} // namespace Arabica } // namespace Arabica

View file

@ -268,4 +268,10 @@ bool Arabica::XML::is_extender(wchar_t c)
((c >= 0x30FC) && (c <= 0x30FE)); ((c >= 0x30FC) && (c <= 0x30FE));
} // is_extender } // is_extender
// True when c is accepted by is_letter() or, failing that, by is_digit().
bool Arabica::XML::is_letter_or_digit(wchar_t c)
{
  if(is_letter(c))
    return true;
  return is_digit(c);
} // is_letter_or_digit
// end of file // end of file