Aegisub/universalchardet/nsUniversalDetector.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nscore.h"

#include "nsUniversalDetector.h"

#include "nsMBCSGroupProber.h"
#include "nsSBCSGroupProber.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
#include "nsError.h"

// Builds a detector in its initial state: no charset detected, input assumed
// to be pure ASCII so far, and every prober pointer cleared to nsnull.
nsUniversalDetector::nsUniversalDetector()
{
  // Prober slots start out empty.
  for (PRUint32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
    mCharSetProbers[slot] = nsnull;
  mEscCharSetProber = nsnull;

  // Detection result bookkeeping.
  mDetectedCharset = nsnull;
  mBestGuess = -1;   // illegal value as signal
  mDone = PR_FALSE;

  // Input scanning state.
  mInputState = ePureAscii;
  mLastChar = '\0';
  mStart = PR_TRUE;
  mGotData = PR_FALSE;
  mInTag = PR_FALSE;
}

// Releases every prober this detector still holds.
nsUniversalDetector::~nsUniversalDetector()
{
  // Applying delete to a null pointer is a no-op, so slots that were never
  // filled need no explicit test.
  for (PRInt32 idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx)
    delete mCharSetProbers[idx];

  delete mEscCharSetProber;
}

void 
nsUniversalDetector::Reset()
{
  mDone = PR_FALSE;
  mBestGuess = -1;   //illegal value as signal
  mInTag = PR_FALSE;

  mStart = PR_TRUE;
  mDetectedCharset = nsnull;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';

  if (mEscCharSetProber)
    mEscCharSetProber->Reset();

  PRUint32 i;
  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    if (mCharSetProbers[i])
      mCharSetProbers[i]->Reset();
}

//---------------------------------------------------------------------
// NOTE(review): SHORTCUT_THRESHOLD is not referenced by any function visible
// here; presumably kept for parity with upstream or used elsewhere - confirm.
#define SHORTCUT_THRESHOLD      (float)0.95
// DataEnd() reports the best prober's charset only when that prober's
// confidence is strictly greater than MINIMUM_THRESHOLD.
#define MINIMUM_THRESHOLD      (float)0.20

nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
  if(mDone) 
    return NS_OK;

  if (aLen > 0)
    mGotData = PR_TRUE;

  //If the data starts with BOM, we know it is UTF
  if (mStart)
  {
    mStart = PR_FALSE;
    if (aLen > 3)
      switch (aBuf[0])
        {
        case '\xEF':
          if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
            // EF BB BF  UTF-8 encoded BOM
            mDetectedCharset = "UTF-8";
        break;
        case '\xFE':
          if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
            // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
            mDetectedCharset = "X-ISO-10646-UCS-4-3412";
          else if ('\xFF' == aBuf[1])
            // FE FF  UTF-16, big endian BOM
            mDetectedCharset = "UTF-16BE";
        break;
        case '\x00':
          if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
            // 00 00 FE FF  UTF-32, big-endian BOM
            mDetectedCharset = "UTF-32BE";
          else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
            // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
            mDetectedCharset = "X-ISO-10646-UCS-4-2143";
        break;
        case '\xFF':
          if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
            // FF FE 00 00  UTF-32, little-endian BOM
            mDetectedCharset = "UTF-32LE";
          else if ('\xFE' == aBuf[1])
            // FF FE  UTF-16, little endian BOM
            mDetectedCharset = "UTF-16LE";
        break;
      }  // switch

      if (mDetectedCharset)
      {
        mDone = PR_TRUE;
        return NS_OK;
      }
  }
  
  PRUint32 i;
  for (i = 0; i < aLen; i++)
  {
    //other than 0xa0, if every othe character is ascii, the page is ascii
    if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
    {
      //we got a non-ascii byte (high-byte)
      if (mInputState != eHighbyte)
      {
        //adjust state
        mInputState = eHighbyte;

        //kill mEscCharSetProber if it is active
        if (mEscCharSetProber) {
          delete mEscCharSetProber;
          mEscCharSetProber = nsnull;
        }

        //start multibyte and singlebyte charset prober
        if (nsnull == mCharSetProbers[0])
          mCharSetProbers[0] = new nsMBCSGroupProber;
        if (nsnull == mCharSetProbers[1])
          mCharSetProbers[1] = new nsSBCSGroupProber;
        if (nsnull == mCharSetProbers[2])
          mCharSetProbers[2] = new nsLatin1Prober; 

        if ((nsnull == mCharSetProbers[0]) ||
            (nsnull == mCharSetProbers[1]) ||
            (nsnull == mCharSetProbers[2]))
            return NS_ERROR_OUT_OF_MEMORY;
      }
    }
    else
    {
      //ok, just pure ascii so far
      if ( ePureAscii == mInputState &&
        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
      {
        //found escape character or HZ "~{"
        mInputState = eEscAscii;
      }
      mLastChar = aBuf[i];
    }
  }

  nsProbingState st;
  switch (mInputState)
  {
  case eEscAscii:
    if (nsnull == mEscCharSetProber) {
      mEscCharSetProber = new nsEscCharSetProber;
      if (nsnull == mEscCharSetProber)
        return NS_ERROR_OUT_OF_MEMORY;
    }
    st = mEscCharSetProber->HandleData(aBuf, aLen);
    if (st == eFoundIt)
    {
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
    }
    break;
  case eHighbyte:
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    {
      st = mCharSetProbers[i]->HandleData(aBuf, aLen);
      if (st == eFoundIt) 
      {
        mDone = PR_TRUE;
        mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
        return NS_OK;
      } 
    }
    break;

  default:  //pure ascii
    ;//do nothing here
  }
  return NS_OK;
}


//---------------------------------------------------------------------
void nsUniversalDetector::DataEnd()
{
  if (!mGotData)
  {
    // we haven't got any data yet, return immediately 
    // caller program sometimes call DataEnd before anything has been sent to detector
    return;
  }

  if (mDetectedCharset)
  {
    mDone = PR_TRUE;
    Report(mDetectedCharset);
    return;
  }
  
  switch (mInputState)
  {
  case eHighbyte:
    {
      float proberConfidence;
      float maxProberConfidence = (float)0.0;
      PRInt32 maxProber = 0;

      for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      {
        proberConfidence = mCharSetProbers[i]->GetConfidence();
        if (proberConfidence > maxProberConfidence)
        {
          maxProberConfidence = proberConfidence;
          maxProber = i;
        }
      }
      //do not report anything because we are not confident of it, that's in fact a negative answer
      if (maxProberConfidence > MINIMUM_THRESHOLD)
        Report(mCharSetProbers[maxProber]->GetCharSetName());
    }
    break;
  case eEscAscii:
    break;
  default:
    ;
  }
  return;
}
Instead of falling back to your local charset, Aegisub will now use the "universalchardet" library (the one used by the Mozilla project) to autodetect the character set of non-Unicode files. Originally committed to SVN as r1028 (2007-04-08 08:01:41 +02:00).
			`/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* *** BEGIN LICENSE BLOCK ***`
			`* Version: MPL 1.1/GPL 2.0/LGPL 2.1`
			`*`
			`* The contents of this file are subject to the Mozilla Public License Version`
			`* 1.1 (the "License"); you may not use this file except in compliance with`
			`* the License. You may obtain a copy of the License at`
			`* http://www.mozilla.org/MPL/`
			`*`
			`* Software distributed under the License is distributed on an "AS IS" basis,`
			`* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License`
			`* for the specific language governing rights and limitations under the`
			`* License.`
			`*`
			`* The Original Code is Mozilla Universal charset detector code.`
			`*`
			`* The Initial Developer of the Original Code is`
			`* Netscape Communications Corporation.`
			`* Portions created by the Initial Developer are Copyright (C) 2001`
			`* the Initial Developer. All Rights Reserved.`
			`*`
			`* Contributor(s):`
			`* Shy Shalom <shooshX@gmail.com>`
			`*`
			`* Alternatively, the contents of this file may be used under the terms of`
			`* either the GNU General Public License Version 2 or later (the "GPL"), or`
			`* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),`
			`* in which case the provisions of the GPL or the LGPL are applicable instead`
			`* of those above. If you wish to allow use of your version of this file only`
			`* under the terms of either the GPL or the LGPL, and not to allow others to`
			`* use your version of this file under the terms of the MPL, indicate your`
			`* decision by deleting the provisions above and replace them with the notice`
			`* and other provisions required by the GPL or the LGPL. If you do not delete`
			`* the provisions above, a recipient may use your version of this file under`
			`* the terms of any one of the MPL, the GPL or the LGPL.`
			`*`
			`* *** END LICENSE BLOCK *** */`

			`#include "nscore.h"`

			`#include "nsUniversalDetector.h"`

			`#include "nsMBCSGroupProber.h"`
			`#include "nsSBCSGroupProber.h"`
			`#include "nsEscCharsetProber.h"`
			`#include "nsLatin1Prober.h"`
			`#include "nsError.h"`

			`nsUniversalDetector::nsUniversalDetector()`
			`{`
			`mDone = PR_FALSE;`
			`mBestGuess = -1; //illegal value as signal`
			`mInTag = PR_FALSE;`
			`mEscCharSetProber = nsnull;`

			`mStart = PR_TRUE;`
			`mDetectedCharset = nsnull;`
			`mGotData = PR_FALSE;`
			`mInputState = ePureAscii;`
			`mLastChar = '\0';`

			`PRUint32 i;`
			`for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)`
			`mCharSetProbers[i] = nsnull;`
			`}`

			`nsUniversalDetector::~nsUniversalDetector()`
			`{`
			`for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)`
			`if (mCharSetProbers[i])`
			`delete mCharSetProbers[i];`
			`if (mEscCharSetProber)`
			`delete mEscCharSetProber;`
			`}`

			`void`
			`nsUniversalDetector::Reset()`
			`{`
			`mDone = PR_FALSE;`
			`mBestGuess = -1; //illegal value as signal`
			`mInTag = PR_FALSE;`

			`mStart = PR_TRUE;`
			`mDetectedCharset = nsnull;`
			`mGotData = PR_FALSE;`
			`mInputState = ePureAscii;`
			`mLastChar = '\0';`

			`if (mEscCharSetProber)`
			`mEscCharSetProber->Reset();`

			`PRUint32 i;`
			`for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)`
			`if (mCharSetProbers[i])`
			`mCharSetProbers[i]->Reset();`
			`}`

			`//---------------------------------------------------------------------`
			`#define SHORTCUT_THRESHOLD (float)0.95`
			`#define MINIMUM_THRESHOLD (float)0.20`

			`nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)`
			`{`
			`if(mDone)`
			`return NS_OK;`

			`if (aLen > 0)`
			`mGotData = PR_TRUE;`

			`//If the data starts with BOM, we know it is UTF`
			`if (mStart)`
			`{`
			`mStart = PR_FALSE;`
			`if (aLen > 3)`
			`switch (aBuf[0])`
			`{`
			`case '\xEF':`
			`if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))`
			`// EF BB BF UTF-8 encoded BOM`
			`mDetectedCharset = "UTF-8";`
			`break;`
			`case '\xFE':`
			`if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))`
			`// FE FF 00 00 UCS-4, unusual octet order BOM (3412)`
			`mDetectedCharset = "X-ISO-10646-UCS-4-3412";`
			`else if ('\xFF' == aBuf[1])`
			`// FE FF UTF-16, big endian BOM`
			`mDetectedCharset = "UTF-16BE";`
			`break;`
			`case '\x00':`
			`if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))`
			`// 00 00 FE FF UTF-32, big-endian BOM`
			`mDetectedCharset = "UTF-32BE";`
			`else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))`
			`// 00 00 FF FE UCS-4, unusual octet order BOM (2143)`
			`mDetectedCharset = "X-ISO-10646-UCS-4-2143";`
			`break;`
			`case '\xFF':`
			`if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))`
			`// FF FE 00 00 UTF-32, little-endian BOM`
			`mDetectedCharset = "UTF-32LE";`
			`else if ('\xFE' == aBuf[1])`
			`// FF FE UTF-16, little endian BOM`
			`mDetectedCharset = "UTF-16LE";`
			`break;`
			`} // switch`

			`if (mDetectedCharset)`
			`{`
			`mDone = PR_TRUE;`
			`return NS_OK;`
			`}`
			`}`

			`PRUint32 i;`
			`for (i = 0; i < aLen; i++)`
			`{`
			`//other than 0xa0, if every othe character is ascii, the page is ascii`
			`if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP`
			`{`
			`//we got a non-ascii byte (high-byte)`
			`if (mInputState != eHighbyte)`
			`{`
			`//adjust state`
			`mInputState = eHighbyte;`

			`//kill mEscCharSetProber if it is active`
			`if (mEscCharSetProber) {`
			`delete mEscCharSetProber;`
			`mEscCharSetProber = nsnull;`
			`}`

			`//start multibyte and singlebyte charset prober`
			`if (nsnull == mCharSetProbers[0])`
			`mCharSetProbers[0] = new nsMBCSGroupProber;`
			`if (nsnull == mCharSetProbers[1])`
			`mCharSetProbers[1] = new nsSBCSGroupProber;`
			`if (nsnull == mCharSetProbers[2])`
			`mCharSetProbers[2] = new nsLatin1Prober;`

			`if ((nsnull == mCharSetProbers[0]) \|\|`
			`(nsnull == mCharSetProbers[1]) \|\|`
			`(nsnull == mCharSetProbers[2]))`
			`return NS_ERROR_OUT_OF_MEMORY;`
			`}`
			`}`
			`else`
			`{`
			`//ok, just pure ascii so far`
			`if ( ePureAscii == mInputState &&`
			`(aBuf[i] == '\033' \|\| (aBuf[i] == '{' && mLastChar == '~')) )`
			`{`
			`//found escape character or HZ "~{"`
			`mInputState = eEscAscii;`
			`}`
			`mLastChar = aBuf[i];`
			`}`
			`}`

			`nsProbingState st;`
			`switch (mInputState)`
			`{`
			`case eEscAscii:`
			`if (nsnull == mEscCharSetProber) {`
			`mEscCharSetProber = new nsEscCharSetProber;`
			`if (nsnull == mEscCharSetProber)`
			`return NS_ERROR_OUT_OF_MEMORY;`
			`}`
			`st = mEscCharSetProber->HandleData(aBuf, aLen);`
			`if (st == eFoundIt)`
			`{`
			`mDone = PR_TRUE;`
			`mDetectedCharset = mEscCharSetProber->GetCharSetName();`
			`}`
			`break;`
			`case eHighbyte:`
			`for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)`
			`{`
			`st = mCharSetProbers[i]->HandleData(aBuf, aLen);`
			`if (st == eFoundIt)`
			`{`
			`mDone = PR_TRUE;`
			`mDetectedCharset = mCharSetProbers[i]->GetCharSetName();`
			`return NS_OK;`
			`}`
			`}`
			`break;`

			`default: //pure ascii`
			`;//do nothing here`
			`}`
			`return NS_OK;`
			`}`


			`//---------------------------------------------------------------------`
			`void nsUniversalDetector::DataEnd()`
			`{`
			`if (!mGotData)`
			`{`
			`// we haven't got any data yet, return immediately`
			`// caller program sometimes call DataEnd before anything has been sent to detector`
			`return;`
			`}`

			`if (mDetectedCharset)`
			`{`
			`mDone = PR_TRUE;`
			`Report(mDetectedCharset);`
			`return;`
			`}`

			`switch (mInputState)`
			`{`
			`case eHighbyte:`
			`{`
			`float proberConfidence;`
			`float maxProberConfidence = (float)0.0;`
			`PRInt32 maxProber = 0;`

			`for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)`
			`{`
			`proberConfidence = mCharSetProbers[i]->GetConfidence();`
			`if (proberConfidence > maxProberConfidence)`
			`{`
			`maxProberConfidence = proberConfidence;`
			`maxProber = i;`
			`}`
			`}`
			`//do not report anything because we are not confident of it, that's in fact a negative answer`
			`if (maxProberConfidence > MINIMUM_THRESHOLD)`
			`Report(mCharSetProbers[maxProber]->GetCharSetName());`
			`}`
			`break;`
			`case eEscAscii:`
			`break;`
			`default:`
			`;`
			`}`
			`return;`
			`}`