2007-07-19 19:01:57 +02:00
|
|
|
#ifndef ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP
|
|
|
|
#define ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP
|
|
|
|
|
2007-09-10 19:52:04 +02:00
|
|
|
#include <Arabica/StringAdaptor.hpp>
|
2007-09-05 00:55:47 +02:00
|
|
|
#include <XML/XMLCharacterClasses.hpp>
|
2007-09-10 19:24:17 +02:00
|
|
|
#include <text/UnicodeCharacters.hpp>
|
2007-07-19 19:01:57 +02:00
|
|
|
|
|
|
|
namespace Arabica
|
|
|
|
{
|
2007-09-10 19:24:17 +02:00
|
|
|
namespace text
|
2007-07-19 19:01:57 +02:00
|
|
|
{
|
|
|
|
|
|
|
|
template<class string_type, class string_adaptor>
|
2008-04-28 11:13:49 +02:00
|
|
|
inline string_type normalize_whitespace(const string_type& ch)
|
2007-07-19 19:01:57 +02:00
|
|
|
{
|
|
|
|
std::string value = string_adaptor::asStdString(ch);
|
|
|
|
std::string stripped = normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);
|
|
|
|
return string_adaptor::construct_from_utf8(stripped.c_str());
|
|
|
|
} // normalize_whitespace
|
|
|
|
|
|
|
|
template<>
|
2008-04-28 11:13:49 +02:00
|
|
|
inline std::string normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(const std::string& ch)
|
2007-07-19 19:01:57 +02:00
|
|
|
{
|
|
|
|
std::string value(ch);
|
|
|
|
std::string::const_iterator i = value.begin(), ie = value.end();
|
|
|
|
std::string::iterator p = value.begin(), pe = value.end();
|
|
|
|
|
|
|
|
// string leading space
|
|
|
|
while((i != ie) && (Arabica::XML::is_space(static_cast<char>(*i))))
|
|
|
|
++i;
|
|
|
|
|
|
|
|
while(i != ie)
|
|
|
|
{
|
|
|
|
while((i != ie) && (!Arabica::XML::is_space(static_cast<char>(*i))))
|
|
|
|
*p++ = *i++;
|
|
|
|
while((i != ie) && (Arabica::XML::is_space(static_cast<char>(*i))))
|
|
|
|
++i;
|
|
|
|
if(i != ie)
|
2007-09-10 19:24:17 +02:00
|
|
|
*p++ = Arabica::text::Unicode<char>::SPACE;
|
2007-07-19 19:01:57 +02:00
|
|
|
} // while ...
|
|
|
|
if(p != pe)
|
|
|
|
value.erase(p, pe);
|
|
|
|
|
|
|
|
return value;
|
|
|
|
} // normalize_whitespace
|
|
|
|
|
2007-09-10 19:24:17 +02:00
|
|
|
} // namespace text
|
2007-07-19 19:01:57 +02:00
|
|
|
} // namespace Arabica
|
|
|
|
#endif
|
|
|
|
|