2007-04-08 08:01:41 +02:00
// Copyright (c) 2007, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the Aegisub Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
2009-07-29 07:43:02 +02:00
// Aegisub Project http://www.aegisub.org/
2007-04-08 08:01:41 +02:00
//
2009-07-29 07:43:02 +02:00
// $Id$
/// @file charset_detect.cpp
/// @brief Wrapper around text encoding detection library
/// @ingroup utility
///
2007-04-08 08:01:41 +02:00
///////////
// Headers
2009-01-04 07:31:48 +01:00
# include "config.h"
2008-01-21 00:37:44 +01:00
# ifdef WITH_UNIVCHARDET
2008-03-20 22:58:21 +01:00
# include <wx/wxprec.h>
2007-04-08 08:01:41 +02:00
# include "text_file_reader.h"
2008-03-21 03:22:00 +01:00
# include "charset_detect.h"
2008-01-17 19:35:06 +01:00
# include "../universalchardet/nsCharSetProber.h"
2008-03-06 21:49:04 +01:00
# include <wx/intl.h>
2008-01-17 19:35:06 +01:00
# include <wx/choicdlg.h>
2007-04-08 08:01:41 +02:00
2008-01-20 08:24:04 +01:00
/// @brief One candidate character set and the confidence recorded for it
struct CharDetResult {
	/// Confidence value recorded for this candidate
	float confidence;
	/// Name of the candidate character set
	wxString name;

	/// Ordering is deliberately inverted: a result with HIGHER confidence
	/// compares as "less", so an ascending sort puts the most confident
	/// candidate first. Const-correct so it also works on const objects.
	bool operator<(const CharDetResult &par) const { return confidence > par.confidence; }
};
2007-04-08 08:01:41 +02:00
////////////////
// Get encoding
/// @brief Detect the character set of a file, asking the user when ambiguous
/// @param filename Path of the file to inspect
/// @return The stored detection result, or the name the user picked when more
///         than one candidate was collected
/// @throws const wxChar* (string literal) if the file cannot be opened or the
///         user cancels the selection dialog
///
wxString CharSetDetect::GetEncoding(wxString filename) {
	// Open in binary mode so the raw bytes are handed to the detector
	std::ifstream file;
#ifdef __WINDOWS__
	file.open(filename.wc_str(), std::ios::in | std::ios::binary);
#else
	file.open(wxFNCONV(filename), std::ios::in | std::ios::binary);
#endif
	if (!file.is_open()) {
		throw _T(" Failed opening file for reading. ");
	}

	// Feed the file in 512-byte chunks until done() reports true or input ends.
	// good() is used instead of !eof() so that a read error (which never sets
	// eofbit) terminates the loop instead of spinning forever.
	while (file.good() && !done()) {
		char buffer[512];
		file.read(buffer, sizeof(buffer));
		const size_t bytesRead = static_cast<size_t>(file.gcount());
		HandleData(buffer, bytesRead);
	}

	// Flag as finished
	DataEnd();

	// Grab every result obtained
	const wxString local = wxLocale::GetSystemEncodingName();
	std::list<CharDetResult> results;
	bool gotLocal = false;
	for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
		if (!mCharSetProbers[i]) continue;

		const int probes = mCharSetProbers[i]->GetProbeCount();
		for (int j = 0; j < probes; j++) {
			const float conf = mCharSetProbers[i]->GetConfidence(j);
			wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j), wxConvUTF8);
			const bool isLocal = (curName == local);

			// Only bother with those whose confidence is above 1%,
			// but always keep the system's local encoding
			if (conf > 0.01f || isLocal) {
				results.push_back(CharDetResult());
				results.back().name = curName;
				results.back().confidence = conf;
				// Remember that the local encoding is already listed so it
				// is not appended a second time below
				if (isLocal) gotLocal = true;
			}
		}
	}

	// If you got more than one valid result, ask the user which one to use
	if (results.size() > 1) {
		// Add the local encoding as a zero-confidence fallback if no prober reported it
		if (!gotLocal) {
			results.push_back(CharDetResult());
			results.back().name = local;
			results.back().confidence = 0;
		}

		// Sort by confidence (CharDetResult::operator< puts highest first)
		results.sort();

		// Build the list shown to the user. `picked` holds the bare names and
		// stays index-parallel with `choices`; it also filters out duplicates.
		const size_t maxChoices = 20;
		wxArrayString choices;
		wxArrayString picked;
		for (std::list<CharDetResult>::iterator cur = results.begin(); cur != results.end(); ++cur) {
			const wxString &name = cur->name;
			if (picked.Index(name) != wxNOT_FOUND) continue;
			picked.Add(name);

			// Generate the display string
			wxString choiceStr;
			if (cur->confidence > 0.0f) choiceStr = wxString::Format(_T(" %f%% - "), cur->confidence * 100.0f);
			else choiceStr = _T(" Unknown - ");
			choiceStr += name;
			if (name == local) choiceStr += _T(" (local) ");
			choices.Add(choiceStr);

			// Cap the dialog at 20 entries
			if (choices.GetCount() == maxChoices) break;
		}

		// Get choice from user
		const int choice = wxGetSingleChoiceIndex(_(" Aegisub could not narrow down the character set to a single one. \n Please pick one below: "), _(" Choose character set "), choices);
		if (choice == -1) throw _T(" Canceled ");

		// Retrieve the name through `picked`, which matches the indices of the
		// list the user saw; indexing `results` directly would select the wrong
		// entry whenever a duplicate name was skipped before the chosen one.
		result = picked[choice];
	}

	// Return whatever it got
	return result;
}
//////////
// Report
void CharSetDetect : : Report ( const char * aCharset ) {
// Store the result reported
result = wxString ( aCharset , wxConvUTF8 ) ;
2007-04-08 08:01:41 +02:00
}
2008-01-21 00:37:44 +01:00
# endif // WITH_UNIVCHARDET
2009-07-29 07:43:02 +02:00