arabica/include/text/normalize_whitespace.hpp

#ifndef ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP
#define ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP

#include <Arabica/StringAdaptor.hpp>
#include <XML/XMLCharacterClasses.hpp>
#include <text/UnicodeCharacters.hpp>

namespace Arabica
{
namespace text
{

template<class string_type, class string_adaptor>
inline string_type normalize_whitespace(const string_type& ch)
{
  std::string value = string_adaptor::asStdString(ch);
  std::string stripped = normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);
  return string_adaptor::construct_from_utf8(stripped.c_str());
} // normalize_whitespace

// std::string specialisation: does the actual normalisation work.
//
// Characters are classified with Arabica::XML::is_space.  The returned
// string contains the non-whitespace characters of ch in their original
// order, where
//   - leading whitespace is removed,
//   - trailing whitespace is removed,
//   - every interior run of whitespace is replaced by a single
//     Arabica::text::Unicode<char>::SPACE.
// An input that is empty or consists only of whitespace yields an empty
// string.
template<>
inline std::string normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(const std::string& ch)
{
  std::string result;
  result.reserve(ch.size());   // the output is never longer than the input

  // Set when whitespace has been seen after at least one emitted character;
  // the separator is only written once another non-space character follows,
  // which is what drops trailing whitespace.
  bool separator_pending = false;

  for(std::string::const_iterator c = ch.begin(), end = ch.end(); c != end; ++c)
  {
    if(Arabica::XML::is_space(static_cast<char>(*c)))
    {
      // whitespace before any output is leading space and is discarded
      separator_pending = !result.empty();
      continue;
    } // if ...

    if(separator_pending)
    {
      result += Arabica::text::Unicode<char>::SPACE;
      separator_pending = false;
    } // if ...
    result += *c;
  } // for ...

  return result;
} // normalize_whitespace

} // namespace text
} // namespace Arabica
#endif
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`#ifndef ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP`
			`#define ARABICA_UTILS_NORMALIZE_WHITESPACE_HPP`

renamed include/Utils to include/Arabica 2007-09-10 19:52:04 +02:00			`#include <Arabica/StringAdaptor.hpp>`
renamed all .h to .hpp 2007-09-05 00:55:47 +02:00			`#include <XML/XMLCharacterClasses.hpp>`
more namespace and file moving shenanigans 2007-09-10 19:24:17 +02:00			`#include <text/UnicodeCharacters.hpp>`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00
			`namespace Arabica`
			`{`
more namespace and file moving shenanigans 2007-09-10 19:24:17 +02:00			`namespace text`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`{`

			`template<class string_type, class string_adaptor>`
marked functions as inline as requested, although not entirely sure if it should be necessary :) 2008-04-28 11:13:49 +02:00			`inline string_type normalize_whitespace(const string_type& ch)`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`{`
			`std::string value = string_adaptor::asStdString(ch);`
			`std::string stripped = normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(value);`
			`return string_adaptor::construct_from_utf8(stripped.c_str());`
			`} // normalize_whitespace`

			`template<>`
marked functions as inline as requested, although not entirely sure if it should be necessary :) 2008-04-28 11:13:49 +02:00			`inline std::string normalize_whitespace<std::string, Arabica::default_string_adaptor<std::string> >(const std::string& ch)`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`{`
			`std::string value(ch);`
			`std::string::const_iterator i = value.begin(), ie = value.end();`
			`std::string::iterator p = value.begin(), pe = value.end();`

			`// string leading space`
			`while((i != ie) && (Arabica::XML::is_space(static_cast<char>(*i))))`
			`++i;`

			`while(i != ie)`
			`{`
			`while((i != ie) && (!Arabica::XML::is_space(static_cast<char>(*i))))`
			`*p++ = *i++;`
			`while((i != ie) && (Arabica::XML::is_space(static_cast<char>(*i))))`
			`++i;`
			`if(i != ie)`
more namespace and file moving shenanigans 2007-09-10 19:24:17 +02:00			`*p++ = Arabica::text::Unicode<char>::SPACE;`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`} // while ...`
			`if(p != pe)`
			`value.erase(p, pe);`

			`return value;`
			`} // normalize_whitespace`

more namespace and file moving shenanigans 2007-09-10 19:24:17 +02:00			`} // namespace text`
merged with mangle-dev branch 2007-07-19 19:01:57 +02:00			`} // namespace Arabica`
			`#endif`