mirror of
https://github.com/Ponce/slackbuilds
synced 2024-10-04 07:54:46 +02:00
libraries/libuchardet: Updated for version 0.0.5.
Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
This commit is contained in:
parent
1151ce1229
commit
cea1efabbd
6 changed files with 168 additions and 17 deletions
|
@ -1,8 +1,8 @@
|
|||
libuchardet (encoding detector library)
|
||||
|
||||
uchardet is a C language binding of the original C++ implementation of the
|
||||
universal charset detection library by Mozilla.
|
||||
uchardet is a C language binding of the original C++
|
||||
implementation of the universal charset detection library by Mozilla.
|
||||
|
||||
uchardet is an encoding detector library, which takes a sequence of bytes
|
||||
in an unknown character encoding without any additional information, and attempts
|
||||
to determine the encoding of the text.
|
||||
uchardet is an encoding detector library, which takes a sequence of
|
||||
bytes in an unknown character encoding without any additional
|
||||
information, and attempts to determine the encoding of the text.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/sh
|
||||
#
|
||||
|
||||
# Slackware build script for libuchardet.
|
||||
#
|
||||
|
||||
# Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil.
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -23,7 +23,7 @@
|
|||
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
PRGNAM=libuchardet
|
||||
VERSION=${VERSION:-0.0.1}
|
||||
VERSION=${VERSION:-0.0.5}
|
||||
BUILD=${BUILD:-1}
|
||||
TAG=${TAG:-_SBo}
|
||||
|
||||
|
@ -57,12 +57,13 @@ fi
|
|||
set -e
|
||||
|
||||
SRCNAM="uchardet"
|
||||
SRCVER="v0.0.5"
|
||||
|
||||
rm -rf $PKG
|
||||
mkdir -p $TMP $PKG $OUTPUT
|
||||
rm -rf $TMP/$PRGNAM-$VERSION
|
||||
cd $TMP
|
||||
tar xvf $CWD/$SRCNAM-$VERSION.tar.gz
|
||||
tar xvf $CWD/$SRCVER.tar.gz
|
||||
mv $SRCNAM-$VERSION $PRGNAM-$VERSION
|
||||
cd $PRGNAM-$VERSION
|
||||
chown -R root:root .
|
||||
|
@ -72,12 +73,16 @@ find -L . \
|
|||
\( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \
|
||||
-o -perm 440 -o -perm 400 \) -exec chmod 644 {} \;
|
||||
|
||||
patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch
|
||||
patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch
|
||||
|
||||
cmake \
|
||||
-DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \
|
||||
-DCMAKE_INSTALL_PREFIX=/usr \
|
||||
-DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \
|
||||
.
|
||||
make
|
||||
#make test
|
||||
make install DESTDIR=$PKG
|
||||
|
||||
find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \
|
||||
|
@ -89,7 +94,7 @@ for i in $( find $PKG/usr/man -type l ) ; do ln -s $( readlink $i ).gz $i.gz ; r
|
|||
rm -rf $PKG/usr/share
|
||||
|
||||
mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION
|
||||
cp -a AUTHORS COPYING INSTALL $PKG/usr/doc/$PRGNAM-$VERSION
|
||||
cp -a AUTHORS COPYING INSTALL README.md $PKG/usr/doc/$PRGNAM-$VERSION
|
||||
cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM-SlackBuild
|
||||
|
||||
mkdir -p $PKG/install
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
PRGNAM="libuchardet"
|
||||
VERSION="0.0.1"
|
||||
VERSION="0.0.5"
|
||||
HOMEPAGE="https://github.com/BYVoid/uchardet"
|
||||
DOWNLOAD="http://uchardet.googlecode.com/files/uchardet-0.0.1.tar.gz"
|
||||
MD5SUM="9c17f0aca38c66c95d400691a9160b1b"
|
||||
DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5.tar.gz"
|
||||
MD5SUM="2421993e7b098366bd008d81385150b6"
|
||||
DOWNLOAD_x86_64=""
|
||||
MD5SUM_x86_64=""
|
||||
REQUIRES=""
|
||||
|
|
|
@ -8,10 +8,10 @@
|
|||
|-----handy-ruler------------------------------------------------------|
|
||||
libuchardet: libuchardet (encoding detector library)
|
||||
libuchardet:
|
||||
libuchardet: uchardet is a C language binding of the original C++
|
||||
libuchardet: implementation of the universal charset detection library by Mozilla.
|
||||
libuchardet: uchardet is an encoding detector library, which takes a sequence of
|
||||
libuchardet: bytes in an unknown character encoding without any additional
|
||||
libuchardet: uchardet is a C language binding of the original C++
|
||||
libuchardet: implementation of the universal charset detection library by Mozilla.
|
||||
libuchardet: uchardet is an encoding detector library, which takes a sequence of
|
||||
libuchardet: bytes in an unknown character encoding without any additional
|
||||
libuchardet: information, and attempts to determine the encoding of the text.
|
||||
libuchardet:
|
||||
libuchardet: Home page: https://github.com/BYVoid/uchardet/
|
||||
|
|
116
libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
Normal file
116
libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch
Normal file
|
@ -0,0 +1,116 @@
|
|||
commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
|
||||
Author: Jehan <jehan@girinstud.io>
|
||||
Date: Sat Dec 5 21:04:20 2015 +0100
|
||||
|
||||
Nearly-ASCII text with NBSP is still not ASCII.
|
||||
|
||||
There is no "exception" in encoding. The non-breaking space 0xA0 is not
|
||||
ASCII, and therefore returning "ASCII" will later create issues (for
|
||||
instance trying to re-encode with iconv produces an error).
|
||||
This was obviously an explicit decision in original code (according to
|
||||
code comments), probably tied to specificity of the original program from
|
||||
Mozilla. Now we want strict detection.
|
||||
I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
|
||||
exception" (note that I could have returned any ISO-8859 charsets since
|
||||
they all have this character in common).
|
||||
|
||||
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
|
||||
index ab8bae0..ff06b9d 100644
|
||||
--- a/src/nsUniversalDetector.cpp
|
||||
+++ b/src/nsUniversalDetector.cpp
|
||||
@@ -47,6 +47,7 @@
|
||||
|
||||
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
{
|
||||
+ mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
|
||||
void
|
||||
nsUniversalDetector::Reset()
|
||||
{
|
||||
+ mNbspFound = PR_FALSE;
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
mInTag = PR_FALSE;
|
||||
@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
PRUint32 i;
|
||||
for (i = 0; i < aLen; i++)
|
||||
{
|
||||
- /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
|
||||
+ /* If every other character is ASCII or 0xA0, we don't run charset
|
||||
+ * probers.
|
||||
* 0xA0 (NBSP in a few charset) is apparently a rare exception
|
||||
- * of non-ASCII character contained in ASCII text. */
|
||||
+ * of non-ASCII character often contained in nearly-ASCII text. */
|
||||
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
|
||||
{
|
||||
/* We got a non-ASCII byte (high-byte) */
|
||||
@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
}
|
||||
else
|
||||
{
|
||||
- //ok, just pure ascii so far
|
||||
- if ( ePureAscii == mInputState &&
|
||||
- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
|
||||
+ /* Just pure ASCII or NBSP so far. */
|
||||
+ if (aBuf[i] == '\xA0')
|
||||
{
|
||||
- //found escape character or HZ "~{"
|
||||
+ /* ASCII with the only exception of NBSP seems quite common.
|
||||
+ * I doubt it is really necessary to train a model here, so let's
|
||||
+ * just make an exception.
|
||||
+ */
|
||||
+ mNbspFound = PR_TRUE;
|
||||
+ }
|
||||
+ else if (mInputState == ePureAscii &&
|
||||
+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
|
||||
+ {
|
||||
+ /* We found an escape character or HZ "~{". */
|
||||
mInputState = eEscAscii;
|
||||
}
|
||||
mLastChar = aBuf[i];
|
||||
@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
||||
}
|
||||
+ else if (mNbspFound)
|
||||
+ {
|
||||
+ mDetectedCharset = "ISO-8859-1";
|
||||
+ }
|
||||
else
|
||||
{
|
||||
/* ASCII with the ESC character (or the sequence "~{") is still
|
||||
@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||
break;
|
||||
|
||||
default:
|
||||
- /* Pure ASCII */
|
||||
- mDetectedCharset = "ASCII";
|
||||
+ if (mNbspFound)
|
||||
+ {
|
||||
+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
||||
+ * (though it could have been any ISO-8859 encoding). */
|
||||
+ mDetectedCharset = "ISO-8859-1";
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ /* Pure ASCII */
|
||||
+ mDetectedCharset = "ASCII";
|
||||
+ }
|
||||
break;
|
||||
}
|
||||
return NS_OK;
|
||||
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
|
||||
index 4d9b460..9f0a4b1 100644
|
||||
--- a/src/nsUniversalDetector.h
|
||||
+++ b/src/nsUniversalDetector.h
|
||||
@@ -72,6 +72,7 @@ protected:
|
||||
virtual void Report(const char* aCharset) = 0;
|
||||
virtual void Reset();
|
||||
nsInputState mInputState;
|
||||
+ PRBool mNbspFound;
|
||||
PRBool mDone;
|
||||
PRBool mInTag;
|
||||
PRBool mStart;
|
|
@ -0,0 +1,30 @@
|
|||
commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55
|
||||
Author: Jehan <jehan@girinstud.io>
|
||||
Date: Tue Dec 15 21:40:16 2015 +0100
|
||||
|
||||
app: package name wrong in CMakeLists.txt.
|
||||
|
||||
Probably coming from a copy-paste error when the build system was
|
||||
originally created.
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 0b65c49..4f279e1 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -1,6 +1,6 @@
|
||||
######## Project settings
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
-set (PACKAGE_NAME opencc)
|
||||
+set (PACKAGE_NAME uchardet)
|
||||
project (${PACKAGE_NAME} CXX C)
|
||||
enable_testing()
|
||||
|
||||
@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR)
|
||||
set (DIR_ETC ${SYSCONF_INSTALL_DIR})
|
||||
endif (DEFINED SYSCONF_INSTALL_DIR)
|
||||
|
||||
-set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
|
||||
+set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet)
|
||||
set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
|
||||
|
||||
######## Configuration
|
Loading…
Reference in a new issue