Switch to using libaegisub for character set detection. There are some bugs here, but it seems to be more consistent than the previous version. Two issues remain. First, when the character set is unknown we need to provide a full list to choose from. Second, if the file is detected as US-ASCII we need to report a parse error to the user if we run into problems; right now we load the file without complaint and disable a lot of options, with no message to the user.

Originally committed to SVN as r4370.
2010-05-29 02:25:19 +00:00 · 2010-05-29 02:25:19 +00:00 · e7b859b9f7
parent 9d854b69f3
commit e7b859b9f7
3 changed files with 32 additions and 172 deletions
--- a/aegisub/src/charset_detect.cpp
+++ b/aegisub/src/charset_detect.cpp
@ -1,4 +1,4 @@
-// Copyright (c) 2007, Rodrigo Braz Monteiro
+// Copyright (c) 2010, Amar Takhar
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@ -39,7 +39,6 @@
 // Headers
 #include "config.h"

-#ifdef WITH_UNIVCHARDET
 #ifndef AGI_PRE
 #include <fstream>
 #include <list>
@ -49,132 +48,47 @@
 #include <wx/intl.h>
 #endif

-#include "../universalchardet/nsCharSetProber.h"
+#include <libaegisub/charset.h>
+
 #include "charset_detect.h"
 #include "text_file_reader.h"
+#include "compat.h"



-
-/// DOCME
-struct CharDetResult {
-
-	/// DOCME
-	float confidence;
-
-	/// DOCME
-	wxString name;
-
-
-	/// @brief DOCME
-	/// @param par 
-	/// @return 
-	///
-	bool operator < (CharDetResult &par) { return confidence > par.confidence; }
-};
-
-
 /// @brief Get encoding 
 /// @param filename 
 /// @return 
 ///
 wxString CharSetDetect::GetEncoding(wxString filename) {
-	std::ifstream file;
-#ifdef __WINDOWS__
-	file.open(filename.wc_str(),std::ios::in | std::ios::binary);
-#else
-	file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
-#endif
-	if (!file.is_open()) {
-		throw _T("Failed opening file for reading.");
+	wxLogDebug("Filename: %s", filename);
+	bool unknown = 0;
+
+	agi::charset::CharsetListDetected list;
+	agi::charset::CharsetListDetected::const_iterator i_lst;
+
+	try {
+		agi::charset::DetectAll(STD_STR(filename), list);
+    } catch (const agi::charset::UnknownCharset&) {
+		unknown = 1;
 	}

-	// Loop through it until it finds interesting lines
-	while (!file.eof() && !done()) {
-		char buffer[512];
-		file.read(buffer, 512);
-		size_t bytesRead = file.gcount();
-		HandleData(buffer, bytesRead);
-	}
-
-	// Flag as finished
-	DataEnd();
-
-	// Grab every result obtained
-	wxString local = wxLocale::GetSystemEncodingName();
-	std::list<CharDetResult> results;
-	bool gotLocal = false;
-	for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
-		if (mCharSetProbers[i]) {
-			float conf = mCharSetProbers[i]->GetConfidence();
-
-			// Only bother with those whose confidence is at least 1%
-			wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
-			if (conf > 0.01f || curName == local) {
-				results.push_back(CharDetResult());
-				results.back().name = curName;
-				results.back().confidence = conf;
-			}
-		}
-	}
-
-	// If you got more than one valid result, ask the user which he wants
-	if (results.size() > 1) {
-		// Add local
-		if (!gotLocal) {
-			results.push_back(CharDetResult());
-			results.back().name = local;
-			results.back().confidence = 0;
-		}
-
-		// Sort by confidence
-		results.sort();
+	/// @todo If the charset is unknown we need to display a complete list of character sets.
+	if (list.size() > 1) {

 		// Get choice from user
 		wxArrayString choices;
-		wxArrayString picked;
-		int i = 0;
-		for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++) {
-			wxString name = (*cur).name;
-			if (picked.Index(name) == wxNOT_FOUND) {
-				picked.Add(name);

-				// Generate name
-				wxString choiceStr;
-				if ((*cur).confidence > 0.0f) choiceStr = wxString::Format(_T("%f%% - "),(*cur).confidence*100.0f);
-				else choiceStr = _T("Unknown - ");
-				choiceStr += name;
-				if (name == local) choiceStr += _T(" (local)");
-
-				// Insert
-				choices.Add(choiceStr);
-				i++;
-				if (i == 20) break;
-			}
+		for (i_lst = list.begin(); i_lst != list.end(); ++i_lst) {
+			choices.Add(lagi_wxString(i_lst->second));
 		}
+
 		int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
 		if (choice == -1) throw _T("Canceled");
-
-		// Retrieve name
-		i = 0;
-		for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++,i++) {
-			if (i == choice) result = (*cur).name;
-		}
+		return choices.Item(choice);
 	}

-	// Return whatever it got
-	return result;
+	i_lst = list.begin();
+	return i_lst->second;
 }

-
-/// @brief Report 
-/// @param aCharset 
-///
-void CharSetDetect::Report(const char* aCharset) {
-	// Store the result reported
-	result = wxString(aCharset,wxConvUTF8);
-}
-
-#endif // WITH_UNIVCHARDET
-
-
--- a/aegisub/src/charset_detect.h
+++ b/aegisub/src/charset_detect.h
@ -1,4 +1,4 @@
-// Copyright (c) 2007, Rodrigo Braz Monteiro
+// Copyright (c) 2010, Amar Takhar
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@ -34,34 +34,17 @@
 /// @ingroup utility
 ///

-
-
-///////////
-// Headers
-#include "../universalchardet/nscore.h"
-#include "../universalchardet/nsUniversalDetector.h"
-#include "../universalchardet/nsMBCSGroupProber.h"
-
-
 /// DOCME
 /// @class CharSetDetect
-/// @brief DOCME
-///
-/// DOCME
-class CharSetDetect : public nsUniversalDetector {
+/// @brief Detect character set of a file
+class CharSetDetect {
 private:
-
-	/// DOCME
+	/// Character set
 	wxString result;
-	void Report(const char* aCharset);

 public:
-	CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
+	/// @brief Get character set name.
+	/// @param filename File to check
+	/// @return Character set name
 	wxString GetEncoding(wxString filename);
-
-	/// @brief DOCME
-	///
-	PRBool done() const { return mDone; }
 };
-
-
--- a/aegisub/src/text_file_reader.cpp
+++ b/aegisub/src/text_file_reader.cpp
@ -74,49 +74,12 @@ TextFileReader::~TextFileReader() {
 }

 wxString TextFileReader::GetEncoding(wxString const& filename) {
-	// Prepare
-	unsigned char b[4];
-	memset(b, 0, sizeof(b));

-	// Read four bytes from file
-	std::ifstream ifile;
-#ifdef __WINDOWS__
-	ifile.open(filename.wc_str());
-#else
-	ifile.open(wxFNCONV(filename));
-#endif
-	if (!ifile.is_open()) {
-		return L"unknown";
-	}
-	ifile.read(reinterpret_cast<char *>(b),4);
-	ifile.close();
-
-	// Try to get the byte order mark from them
-	if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
-	else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
-	else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
-	else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
-	else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
-	else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
-
-	// Try to guess UTF-16
-	else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
-	else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
-
-	// If any of the first four bytes are under 0x20 (the first printable character),
-	// except for 9-13 range, assume binary
-	for (int i=0;i<4;i++) {
-		if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
-	}
-
-#ifdef WITH_UNIVCHARDET
 	// Use universalchardet library to detect charset
 	CharSetDetect det;
-	return det.GetEncoding(filename);
-#else
-	// Fall back to local
-	return L"local";
-#endif
+	wxString str(det.GetEncoding(filename));
+	wxLogDebug("Encoding: %s", str);
+	return str;
 }

 wchar_t TextFileReader::GetWChar() {