From f733297499f72ea11c166f6be23751a7b002c51c Mon Sep 17 00:00:00 2001 From: wangqr Date: Sat, 18 May 2019 20:10:32 -0400 Subject: [PATCH] Rewrite encoding detection Now feeds all data to uchardet, when uchardet is available. The file size limit is removed. When uchardet is not available, we look for UTF-8 BOM. This should make loading UTF-8-BOM files faster. Because Aegisub always save file in UTF-8-BOM, this should also guarentee Aegisub will load large (>100MB) file saved by itself. See Aegisub/Aegisub#110 --- libaegisub/common/charset.cpp | 46 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/libaegisub/common/charset.cpp b/libaegisub/common/charset.cpp index 1addb90ee..5ca248b9e 100644 --- a/libaegisub/common/charset.cpp +++ b/libaegisub/common/charset.cpp @@ -29,37 +29,33 @@ namespace agi { namespace charset { std::string Detect(agi::fs::path const& file) { agi::read_file_mapping fp(file); +#ifdef WITH_UCHARDET + agi::scoped_holder ud(uchardet_new(), uchardet_delete); + for (uint64_t offset = 0; offset < fp.size(); ) { + auto read = std::min(65536, fp.size() - offset); + auto buf = fp.read(offset, read); + uchardet_handle_data(ud, buf, read); + offset += read; + } + uchardet_data_end(ud); + std::string encoding = uchardet_get_charset(ud); + return encoding.empty() ? "binary" : encoding; +#else + + // Look for utf-8 BOM + if (fp.size() >= 3) { + const char* buf = fp.read(0, 3); + if (!strncmp(buf, "\xef\xbb\xbf", 3)) + return "utf-8"; + } + // If it's over 100 MB it's either binary or big enough that we won't // be able to do anything useful with it anyway if (fp.size() > 100 * 1024 * 1024) return "binary"; uint64_t binaryish = 0; - -#ifdef WITH_UCHARDET - agi::scoped_holder ud(uchardet_new(), uchardet_delete); - for (uint64_t offset = 0; offset < fp.size(); ) { - auto read = std::min(4096, fp.size() - offset); - auto buf = fp.read(offset, read); - uchardet_handle_data(ud, buf, read); - uchardet_data_end(ud); - if (*uchardet_get_charset(ud)) - return uchardet_get_charset(ud); - - offset += read; - - // A dumb heuristic to detect binary files - for (size_t i = 0; i < read; ++i) { - if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t')) - ++binaryish; - } - - if (binaryish > offset / 8) - return "binary"; - } - return uchardet_get_charset(ud); -#else - auto read = std::min(4096, fp.size()); + auto read = std::min(65536, fp.size()); auto buf = fp.read(0, read); for (size_t i = 0; i < read; ++i) { if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))