mirror of
https://github.com/Ponce/slackbuilds
synced 2024-11-22 19:44:21 +01:00
cea1efabbd
Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
116 lines
4 KiB
Diff
116 lines
4 KiB
Diff
commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
|
|
Author: Jehan <jehan@girinstud.io>
|
|
Date: Sat Dec 5 21:04:20 2015 +0100
|
|
|
|
Nearly-ASCII text with NBSP is still not ASCII.
|
|
|
|
There is no "exception" in encoding. The non-breaking space 0xA0 is not
|
|
ASCII, and therefore returning "ASCII" will later create issues (for
|
|
instance trying to re-encode with iconv produces an error).
|
|
This was obviously an explicit decision in the original code (according to
|
|
code comments), probably tied to the specificity of the original program from
|
|
Mozilla. Now we want strict detection.
|
|
I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
|
|
exception" (note that I could have returned any ISO-8859 charset since
|
|
they all have this character in common).
|
|
|
|
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
|
|
index ab8bae0..ff06b9d 100644
|
|
--- a/src/nsUniversalDetector.cpp
|
|
+++ b/src/nsUniversalDetector.cpp
|
|
@@ -47,6 +47,7 @@
|
|
|
|
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
|
{
|
|
+ mNbspFound = PR_FALSE;
|
|
mDone = PR_FALSE;
|
|
mBestGuess = -1; //illegal value as signal
|
|
mInTag = PR_FALSE;
|
|
@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
|
|
void
|
|
nsUniversalDetector::Reset()
|
|
{
|
|
+ mNbspFound = PR_FALSE;
|
|
mDone = PR_FALSE;
|
|
mBestGuess = -1; //illegal value as signal
|
|
mInTag = PR_FALSE;
|
|
@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|
PRUint32 i;
|
|
for (i = 0; i < aLen; i++)
|
|
{
|
|
- /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
|
|
+ /* If every other character is ASCII or 0xA0, we don't run charset
|
|
+ * probers.
|
|
* 0xA0 (NBSP in a few charset) is apparently a rare exception
|
|
- * of non-ASCII character contained in ASCII text. */
|
|
+ * of non-ASCII character often contained in nearly-ASCII text. */
|
|
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
|
|
{
|
|
/* We got a non-ASCII byte (high-byte) */
|
|
@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|
}
|
|
else
|
|
{
|
|
- //ok, just pure ascii so far
|
|
- if ( ePureAscii == mInputState &&
|
|
- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
|
|
+ /* Just pure ASCII or NBSP so far. */
|
|
+ if (aBuf[i] == '\xA0')
|
|
{
|
|
- //found escape character or HZ "~{"
|
|
+ /* ASCII with the only exception of NBSP seems quite common.
|
|
+ * I doubt it is really necessary to train a model here, so let's
|
|
+ * just make an exception.
|
|
+ */
|
|
+ mNbspFound = PR_TRUE;
|
|
+ }
|
|
+ else if (mInputState == ePureAscii &&
|
|
+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
|
|
+ {
|
|
+ /* We found an escape character or HZ "~{". */
|
|
mInputState = eEscAscii;
|
|
}
|
|
mLastChar = aBuf[i];
|
|
@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|
mDone = PR_TRUE;
|
|
mDetectedCharset = mEscCharSetProber->GetCharSetName();
|
|
}
|
|
+ else if (mNbspFound)
|
|
+ {
|
|
+ mDetectedCharset = "ISO-8859-1";
|
|
+ }
|
|
else
|
|
{
|
|
/* ASCII with the ESC character (or the sequence "~{") is still
|
|
@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|
break;
|
|
|
|
default:
|
|
- /* Pure ASCII */
|
|
- mDetectedCharset = "ASCII";
|
|
+ if (mNbspFound)
|
|
+ {
|
|
+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
|
|
+ * (though it could have been any ISO-8859 encoding). */
|
|
+ mDetectedCharset = "ISO-8859-1";
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* Pure ASCII */
|
|
+ mDetectedCharset = "ASCII";
|
|
+ }
|
|
break;
|
|
}
|
|
return NS_OK;
|
|
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
|
|
index 4d9b460..9f0a4b1 100644
|
|
--- a/src/nsUniversalDetector.h
|
|
+++ b/src/nsUniversalDetector.h
|
|
@@ -72,6 +72,7 @@ protected:
|
|
virtual void Report(const char* aCharset) = 0;
|
|
virtual void Reset();
|
|
nsInputState mInputState;
|
|
+ PRBool mNbspFound;
|
|
PRBool mDone;
|
|
PRBool mInTag;
|
|
PRBool mStart;
|