libraries/libuchardet: Updated for version 0.0.5.

Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
2024-10-04 07:54:46 +02:00 · 2016-07-24 21:33:01 -05:00 · 2016-07-24 21:33:01 -05:00 · cea1efabbd
commit cea1efabbd
parent 1151ce1229
6 changed files with 168 additions and 17 deletions
--- a/libraries/libuchardet/README
+++ b/libraries/libuchardet/README
@ -1,8 +1,8 @@
 libuchardet (encoding detector library)

-uchardet is a C language binding of the original C++ implementation of the
-universal charset detection library by Mozilla.
+uchardet is a C language binding of the original C++
+implementation of the universal charset detection library by Mozilla.

-uchardet is an encoding detector library, which takes a sequence of bytes
-in an unknown character encoding without any additional information, and attempts
-to determine the encoding of the text.
+uchardet is an encoding detector library, which takes a sequence of
+bytes in an unknown character encoding without any additional
+information, and attempts to determine the encoding of the text.
--- a/libraries/libuchardet/libuchardet.SlackBuild
+++ b/libraries/libuchardet/libuchardet.SlackBuild
@ -1,7 +1,7 @@
 #!/bin/sh
-#
+
 # Slackware build script for libuchardet.
-#
+
 # Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil.
 # All rights reserved.
 #
@ -23,7 +23,7 @@
 #  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 PRGNAM=libuchardet
-VERSION=${VERSION:-0.0.1}
+VERSION=${VERSION:-0.0.5}
 BUILD=${BUILD:-1}
 TAG=${TAG:-_SBo}

@ -57,12 +57,13 @@ fi
 set -e

 SRCNAM="uchardet"
+SRCVER="v0.0.5"

 rm -rf $PKG
 mkdir -p $TMP $PKG $OUTPUT
 rm -rf $TMP/$PRGNAM-$VERSION
 cd $TMP
-tar xvf $CWD/$SRCNAM-$VERSION.tar.gz
+tar xvf $CWD/$SRCVER.tar.gz
 mv $SRCNAM-$VERSION $PRGNAM-$VERSION
 cd $PRGNAM-$VERSION
 chown -R root:root .
@ -72,12 +73,16 @@ find -L . \
 \( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \
  -o -perm 440 -o -perm 400 \) -exec chmod 644 {} \;

+patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch
+patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch
+
 cmake \
  -DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \
  -DCMAKE_INSTALL_PREFIX=/usr \
  -DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \
  .
 make
+#make test
 make install DESTDIR=$PKG

 find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \
@ -89,7 +94,7 @@ for i in $( find $PKG/usr/man -type l ) ; do ln -s $( readlink $i ).gz $i.gz ; r
 rm -rf $PKG/usr/share

 mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION
-cp -a AUTHORS COPYING INSTALL $PKG/usr/doc/$PRGNAM-$VERSION
+cp -a AUTHORS COPYING INSTALL README.md $PKG/usr/doc/$PRGNAM-$VERSION
 cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM-SlackBuild

 mkdir -p $PKG/install
--- a/libraries/libuchardet/libuchardet.info
+++ b/libraries/libuchardet/libuchardet.info
@ -1,8 +1,8 @@
 PRGNAM="libuchardet"
-VERSION="0.0.1"
+VERSION="0.0.5"
 HOMEPAGE="https://github.com/BYVoid/uchardet"
-DOWNLOAD="http://uchardet.googlecode.com/files/uchardet-0.0.1.tar.gz"
-MD5SUM="9c17f0aca38c66c95d400691a9160b1b"
+DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5.tar.gz"
+MD5SUM="2421993e7b098366bd008d81385150b6"
 DOWNLOAD_x86_64=""
 MD5SUM_x86_64=""
 REQUIRES=""
--- a/libraries/libuchardet/slack-desc
+++ b/libraries/libuchardet/slack-desc
@ -8,10 +8,10 @@
           |-----handy-ruler------------------------------------------------------|
 libuchardet: libuchardet (encoding detector library)
 libuchardet:
-libuchardet: uchardet uchardet is a C language binding of the original C++ 
-libuchardet: implementation of the universal charset detection library by Mozilla. 
-libuchardet: uchardet is an encoding detector library, which takes a sequence of 
-libuchardet: bytes in an unknown character encoding without any additional 
+libuchardet: uchardet is a C language binding of the original C++
+libuchardet: implementation of the universal charset detection library by Mozilla.
+libuchardet: uchardet is an encoding detector library, which takes a sequence of
+libuchardet: bytes in an unknown character encoding without any additional
 libuchardet: information, and attempts to determine the encoding of the text.
 libuchardet:
 libuchardet: Home page: https://github.com/BYVoid/uchardet/
--- a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
+++ b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
@ -0,0 +1,116 @@
+commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
+Author: Jehan <jehan@girinstud.io>
+Date:   Sat Dec 5 21:04:20 2015 +0100
+
+    Nearly-ASCII text with NBSP is still not ASCII.
+    
+    There is no "exception" in encoding. The non-breaking space 0xA0 is not
+    ASCII, and therefore returning "ASCII" will later create issues (for
+    instance trying to re-encode with iconv produces an error).
+    This was obviously an explicit decision in original code (according to
+    code comments), probably tied to specificity of the original program from
+    Mozilla. Now we want strict detection.
+    I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
+    exception" (note that I could have returned any ISO-8859 charsets since
+    they all have this character in common).
+
+diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
+index ab8bae0..ff06b9d 100644
+--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
+@@ -47,6 +47,7 @@
+
+ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
+ {
+  mNbspFound = PR_FALSE;
+   mDone = PR_FALSE;
+   mBestGuess = -1;   //illegal value as signal
+   mInTag = PR_FALSE;
+@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
+ void
+ nsUniversalDetector::Reset()
+ {
+  mNbspFound = PR_FALSE;
+   mDone = PR_FALSE;
+   mBestGuess = -1;   //illegal value as signal
+   mInTag = PR_FALSE;
+@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+   PRUint32 i;
+   for (i = 0; i < aLen; i++)
+   {
+-    /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
+    /* If every other character is ASCII or 0xA0, we don't run charset
+     * probers.
+      * 0xA0 (NBSP in a few charset) is apparently a rare exception
+-     * of non-ASCII character contained in ASCII text. */
+     * of non-ASCII character often contained in nearly-ASCII text. */
+     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
+     {
+       /* We got a non-ASCII byte (high-byte) */
+@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+     }
+     else
+     {
+-      //ok, just pure ascii so far
+-      if ( ePureAscii == mInputState &&
+-        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
+      /* Just pure ASCII or NBSP so far. */
+      if (aBuf[i] == '\xA0')
+       {
+-        //found escape character or HZ "~{"
+        /* ASCII with the only exception of NBSP seems quite common.
+         * I doubt it is really necessary to train a model here, so let's
+         * just make an exception.
+         */
+          mNbspFound = PR_TRUE;
+      }
+      else if (mInputState == ePureAscii &&
+               (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
+      {
+        /* We found an escape character or HZ "~{". */
+         mInputState = eEscAscii;
+       }
+       mLastChar = aBuf[i];
+@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+       mDone = PR_TRUE;
+       mDetectedCharset = mEscCharSetProber->GetCharSetName();
+     }
+    else if (mNbspFound)
+    {
+      mDetectedCharset = "ISO-8859-1";
+    }
+     else
+     {
+       /* ASCII with the ESC character (or the sequence "~{") is still
+@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+     break;
+
+   default:
+-    /* Pure ASCII */
+-    mDetectedCharset = "ASCII";
+    if (mNbspFound)
+    {
+      /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
+       * (though it could have been any ISO-8859 encoding). */
+      mDetectedCharset = "ISO-8859-1";
+    }
+    else
+    {
+      /* Pure ASCII */
+      mDetectedCharset = "ASCII";
+    }
+     break;
+   }
+   return NS_OK;
+diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
+index 4d9b460..9f0a4b1 100644
+--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
+@@ -72,6 +72,7 @@ protected:
+    virtual void Report(const char* aCharset) = 0;
+    virtual void Reset();
+    nsInputState  mInputState;
+   PRBool  mNbspFound;
+    PRBool  mDone;
+    PRBool  mInTag;
+    PRBool  mStart;
--- a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch
+++ b/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch
@ -0,0 +1,30 @@
+commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55
+Author: Jehan <jehan@girinstud.io>
+Date:   Tue Dec 15 21:40:16 2015 +0100
+
+    app: package name wrong in CMakeLists.txt.
+    
+    Probably coming from a copy-paste error when the build system was
+    originally created.
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 0b65c49..4f279e1 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -1,6 +1,6 @@
+ ######## Project settings
+ cmake_minimum_required(VERSION 2.8)
+-set (PACKAGE_NAME opencc)
+set (PACKAGE_NAME uchardet)
+ project (${PACKAGE_NAME} CXX C)
+ enable_testing()
+
+@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
+ 	set (DIR_ETC ${SYSCONF_INSTALL_DIR})
+ endif (DEFINED SYSCONF_INSTALL_DIR)
+
+-set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
+set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet)
+ set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
+
+ ######## Configuration