diff --git a/src/Makefile.am b/src/Makefile.am
index cd3f21a5..ac73dcca 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -13,15 +13,14 @@ cc_sources = arabica.cpp \
              Utils/utf16utf8codecvt.cpp \
              Utils/utf8iso88591codecvt.cpp \
              Utils/utf8ucs2codecvt.cpp \
+             Utils/uri.cpp \
              XML/XMLCharacterClasses.cpp
 
-library_includedir=$(includedir)
-
-INCLUDES = -I$(top_srcdir)/include $(PARSER_HEADERS) $(BOOST_CPPFLAGS)
+AM_CPPFLAGS = -I$(top_srcdir)/include @PARSER_HEADERS@ $(BOOST_CPPFLAGS)
 
 lib_LTLIBRARIES = libarabica.la
 
 libarabica_la_SOURCES= $(cc_sources)
-libarabica_la_LDFLAGS= $(PARSER_LIBS)
+libarabica_la_LDFLAGS= @PARSER_LIBS@
 
 
diff --git a/src/SAX/helpers/InputSourceResolver.cpp b/src/SAX/helpers/InputSourceResolver.cpp
index 9cd3d62e..3a6940ac 100644
--- a/src/SAX/helpers/InputSourceResolver.cpp
+++ b/src/SAX/helpers/InputSourceResolver.cpp
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include
+#include
 
 InputSourceResolver::InputSourceResolver(const SAX::InputSource& inputSource) :
   deleteStream_(false),
@@ -30,11 +32,11 @@ void InputSourceResolver::open(const std::string& publicId,
     return;
   }
 
-  // does it look like a URL?
-  std::string::size_type colonIndex = systemId.find("://");
-  if(colonIndex != std::string::npos)
+  // does it look like a URI?
+  Arabica::io::URI url(systemId);
+  if(!url.scheme().empty())
   {
-    URIResolver res = findResolver(systemId.substr(0, colonIndex));
+    URIResolver res = findResolver(url.scheme());
     if(res)
       byteStream_ = res(systemId);
     if(byteStream_)
@@ -45,7 +47,7 @@ void InputSourceResolver::open(const std::string& publicId,
   } // if ...
 
   // try and open it as a file
-  std::ifstream* ifs = new std::ifstream(systemId.c_str());
+  std::ifstream* ifs = new std::ifstream(url.path().c_str());
   if(ifs->is_open())
   {
     deleteStream_ = true;
@@ -85,10 +87,10 @@ InputSourceResolver::URIResolver InputSourceResolver::findResolver(std::string m
 
 namespace
 {
-  std::istream* fileResolver(const std::string& fileURL)
+  std::istream* fileResolver(const std::string& fileURI)
   {
-    int colon = fileURL.find("://");
-    std::string fileName = fileURL.substr(colon+3);
+    Arabica::io::URI url(fileURI);
+    std::string fileName = url.path();
 
     std::ifstream* ifs = new std::ifstream(fileName.c_str());
     if(ifs->is_open())
@@ -113,7 +115,7 @@ namespace
 
   static bool fileReg = InputSourceResolver::registerResolver("file", fileResolver);
 
-  std::istream* httpResolver(const std::string& httpURL)
+  std::istream* httpResolver(const std::string& httpURI)
   {
 #ifdef ARABICA_USE_WINSOCK
     WORD wVersionRequested;
@@ -126,23 +128,16 @@ namespace
       return 0;
 #endif
 
-    int colon1 = httpURL.find("://");
-    colon1 += 3;
-    //int colon2 = httpURL.find("://", colon1);
-    int slash1 = httpURL.find("/", colon1);
+    Arabica::io::URI url(httpURI);
 
-    std::string hostName = httpURL.substr(colon1, slash1 - (colon1));
-    std::string path = httpURL.substr(slash1);
-
-    Arabica::socketstream* ifs = new Arabica::socketstream(hostName.c_str(), 80);
+    Arabica::socketstream* ifs = new Arabica::socketstream(url.host().c_str(), std::atoi(url.port().c_str()));
     if(!ifs->is_open())
       return 0;
 
-    *ifs << "GET " << path << " HTTP/1.0" << std::endl;
-    *ifs << "Host: " << hostName << std::endl;
+    *ifs << "GET " << url.path() << " HTTP/1.0" << std::endl;
+    *ifs << "Host: " << url.host() << std::endl;
     *ifs << "Connection: close" << std::endl;
     *ifs << std::endl;
-
     char buffer[1024];
     do
     {
diff --git a/src/SAX/wrappers/saxlibxml2.cpp b/src/SAX/wrappers/saxlibxml2.cpp
index 59ed5425..97492b5c 100644
--- a/src/SAX/wrappers/saxlibxml2.cpp
+++ b/src/SAX/wrappers/saxlibxml2.cpp
@@ -64,6 +64,12 @@ void lwit_processingInstruction(void *user_data, const xmlChar* target, const xm
   p->SAXprocessingInstruction(target, data);
 } // lwit_processingInstruction
 
+void lwit_comment(void *user_data, const xmlChar* comment)
+{
+  libxml2_base* p = reinterpret_cast<libxml2_base*>(user_data);
+  p->SAXcomment(comment);
+} // lwit_comment
+
 void lwit_warning(void *user_data, const char* fmt, ...)
 {
   va_list arg;
@@ -168,7 +174,7 @@ static xmlSAXHandler saxHandler = {
   lwit_characters, // charactersSAXFunc characters;
   lwit_ignorableWhitespace, // ignorableWhitespaceSAXFunc ignorableWhitespace;
   lwit_processingInstruction, // processingInstructionSAXFunc processingInstruction;
-  0, // commentSAXFunc comment;
+  lwit_comment, // commentSAXFunc comment;
   lwit_warning, // warningSAXFunc warning;
   lwit_error, // errorSAXFunc error;
   lwit_fatalError, // fatalErrorSAXFunc fatalError;
diff --git a/src/Utils/base64codecvt.cpp b/src/Utils/base64codecvt.cpp
index 549365ee..2838af92 100644
--- a/src/Utils/base64codecvt.cpp
+++ b/src/Utils/base64codecvt.cpp
@@ -82,7 +82,7 @@ std::codecvt_base::result base64codecvt::do_in(std::mbstate_t& state,
   from_next = from;
   to_next = to;
 
-  while((from_next != from_end) && (to != to_limit))
+  while((from_next != from_end) && (to_next != to_limit))
   {
     char b = *from_next++;
     size_t i = base64_charset.find(b);
diff --git a/src/Utils/uri.cpp b/src/Utils/uri.cpp
new file mode 100644
index 00000000..28db6f10
--- /dev/null
+++ b/src/Utils/uri.cpp
@@ -0,0 +1,174 @@
+
+#include
+#include
+
+using namespace Arabica::io;
+
+namespace {
+  const std::string ZERO = "0";
+  const std::string PORT_EIGHTY = "80";
+  const std::string PORT_443 = "443";
+
+  // Default port for a scheme, as a decimal string; "0" when the scheme is empty or not listed here.
+  const std::string& wellKnownPort(const std::string& scheme)
+  {
+    if(scheme.empty())
+      return ZERO;
+
+    if(scheme == "http")
+      return PORT_EIGHTY;
+    if(scheme == "https")
+      return PORT_443;
+
+    return ZERO;
+  } // wellKnownPort
+} // namespace
+
+// Builds a URI by parsing uri.
+URI::URI(const std::string& uri)
+{
+  parse(uri);
+} // URI
+
+// Builds the URI that relativeUrl refers to when resolved against base.
+URI::URI(const URI& base, const std::string& relativeUrl) :
+  scheme_(base.scheme_),
+  host_(base.host_),
+  path_(base.path_),
+  port_(base.port_)
+{
+  URI relUrl(relativeUrl);
+  absolutise(relUrl);
+} // URI
+
+// The explicit port if one was parsed, otherwise the scheme's well-known port.
+const std::string& URI::port() const
+{
+  if(port_.empty())
+    return wellKnownPort(scheme_);
+  return port_;
+} // port()
+
+// Reassembles scheme://host[:port]path, leaving out whichever parts are empty.
+std::string URI::as_string() const
+{
+  std::string str;
+  if(!scheme_.empty())
+    str.append(scheme_).append("://");
+  if(!host_.empty())
+  {
+    str.append(host_);
+    if(!port_.empty())
+      str.append(":").append(port_);
+  }
+  str.append(path_);
+  return str;
+} // as_string
+
+
+// Splits uri into scheme_, host_, port_ and path_.
+// Input without a scheme is stored whole in path_; whatever follows the
+// authority (query and fragment included) also ends up in path_.
+void URI::parse(const std::string& uri)
+{
+  // I'd like to use something a bit stronger - http://code.google.com/p/uri-grammar/
+  // but that would put a Boost Spirit dependence right in the core, which I'm not prepared to do at the moment
+
+  // The first ':' only ends a scheme if it is not the first character and comes
+  // before any '/', '?' or '#'. Otherwise it belongs to a relative reference
+  // such as "./this:that" (RFC 3986 s4.2) and the whole string is a path.
+  const std::string::size_type d = uri.find_first_of(":/?#");
+  if(d == std::string::npos || d == 0 || uri[d] != ':')
+  {
+    path_ = uri;
+    return;
+  } // if ...
+
+  scheme_ = uri.substr(0, d);
+
+  std::string::const_iterator u = uri.begin() + d + 1;
+  std::string::const_iterator ue = uri.end();
+
+  // check both characters exist before reading them - "foo:" and "foo:/"
+  // would otherwise dereference the end iterator
+  if((ue - u) >= 2 && *u == '/' && *(u+1) == '/')
+  {
+    u += 2;
+    parseAuthority(u, ue);
+  } // if ...
+
+  path_.append(u, ue);
+} // parse
+
+// Reads the authority ("host" or "host:port") that starts at u into host_ and port_,
+// and leaves u on the first character after it. The authority ends at the next
+// '/', '?' or '#', or at ue when there is none (eg "http://example.com").
+// Userinfo ("user@") and bracketed IPv6 literals are not recognised.
+void URI::parseAuthority(std::string::const_iterator& u, std::string::const_iterator& ue)
+{
+  static const char terminators[] = { '/', '?', '#' };
+  std::string::const_iterator authorityEnd =
+    std::find_first_of(u, ue, terminators, terminators + sizeof(terminators));
+
+  std::string::const_iterator colon = std::find(u, authorityEnd, ':');
+  host_.append(u, colon);
+
+  if(colon != authorityEnd)
+    port_.append(colon+1, authorityEnd);
+
+  u = authorityEnd;
+} // parseAuthority
+
+// Resolves relative against this URI, which acts as the base, and keeps the result in this URI.
+void URI::absolutise(URI& relative)
+{
+  // a reference with its own scheme is already absolute - take it over as is
+  if(!relative.scheme().empty())
+  {
+    swap(relative);
+    return;
+  }
+
+  // an empty reference means the base itself (RFC 3986 s5.2.2); also keeps path_[0] below in bounds
+  if(relative.path_.empty())
+    return;
+
+  if(relative.path_[0] == '/')
+    path_ = relative.path_;
+  else
+    combinePath(relative.path_);
+} // absolutise
+
+// Appends relPath to the directory part of path_, then collapses "/./" and "/../" segments.
+void URI::combinePath(const std::string& relPath)
+{
+  if(path_.empty())
+  {
+    // a base with a host but no path merges as "/" + relPath (RFC 3986 s5.2.3)
+    if(!host_.empty())
+      path_ = "/";
+  }
+  else if(*(path_.rbegin()) != '/')
+    path_.erase(path_.rfind('/')+1); // drop the last segment; if there is no '/', npos+1 wraps to 0 and clears path_
+
+  path_.append(relPath);
+
+  // remove "." segments first so that ".." never steps back over one
+  std::string::size_type dot = path_.find("/./");
+  while(dot != std::string::npos)
+  {
+    path_.erase(dot, 2);
+    dot = path_.find("/./", dot);
+  }
+
+  std::string::size_type dots = path_.find("/../");
+  while(dots != std::string::npos)
+  {
+    const std::string::size_type preceding_slash = (dots > 0) ? path_.rfind('/', dots-1) : 0;
+    if(preceding_slash == std::string::npos)
+      path_.erase(0, dots+4); // relative path such as "dir/../x": remove "dir/../" entirely
+    else
+      path_.erase(preceding_slash, dots+3-preceding_slash);
+    dots = path_.find("/../");
+  } // while
+} // combinePath
\ No newline at end of file