Use read_file_mapping for charset detection

This commit is contained in:
Thomas Goyne 2014-03-21 08:06:41 -07:00
parent cb1f04481a
commit 3d21f00356
1 changed files with 10 additions and 16 deletions

View File

@ -18,9 +18,8 @@
#include "libaegisub/charset.h" #include "libaegisub/charset.h"
#include "libaegisub/io.h" #include "libaegisub/file_mapping.h"
#include <fstream>
#include <string> #include <string>
#ifndef _WIN32 #ifndef _WIN32
@ -48,35 +47,30 @@ public:
: nsUniversalDetector(NS_FILTER_ALL) : nsUniversalDetector(NS_FILTER_ALL)
{ {
{ {
std::unique_ptr<std::ifstream> fp(agi::io::Open(file, true)); agi::read_file_mapping fp(file);
// If it's over 100 MB it's either binary or big enough that we won't // If it's over 100 MB it's either binary or big enough that we won't
// be able to do anything useful with it anyway // be able to do anything useful with it anyway
fp->seekg(0, std::ios::end); if (fp.size() > 100 * 1024 * 1024) {
if (fp->tellg() > 100 * 1024 * 1024) {
list.emplace_back(1.f, "binary"); list.emplace_back(1.f, "binary");
return; return;
} }
fp->seekg(0, std::ios::beg);
std::streamsize binaryish = 0; uint64_t binaryish = 0;
std::streamsize bytes = 0; for (uint64_t offset = 0; !mDone && offset < fp.size(); ) {
auto read = std::min<uint64_t>(4096, fp.size() - offset);
while (!mDone && *fp) { auto buf = fp.read(offset, read);
char buf[4096];
fp->read(buf, sizeof(buf));
std::streamsize read = fp->gcount();
HandleData(buf, (PRUint32)read); HandleData(buf, (PRUint32)read);
offset += read;
// A dumb heuristic to detect binary files // A dumb heuristic to detect binary files
if (!mDone) { if (!mDone) {
bytes += read; for (size_t i = 0; i < read; ++i) {
for (std::streamsize i = 0; i < read; ++i) {
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t')) if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
++binaryish; ++binaryish;
} }
if (binaryish > bytes / 8) { if (binaryish > offset / 8) {
list.emplace_back(1.f, "binary"); list.emplace_back(1.f, "binary");
return; return;
} }