Add some dumb heuristics to detect binary files in the charset detector to avoid taking forever feeding hundreds of MB through it. Closes #1438.

Originally committed to SVN as r6405.
This commit is contained in:
Thomas Goyne 2012-01-31 04:03:55 +00:00
parent d68a395499
commit 983ffc1e83
1 changed files with 33 additions and 10 deletions

View File

@ -20,11 +20,8 @@
#include "charset_ucd.h"
#ifndef LAGI_PRE
#include <memory>
#endif
#include "libaegisub/io.h"
#include "libaegisub/scoped_ptr.h"
#include "../../universalchardet/nsCharSetProber.h"
@ -34,13 +31,39 @@ namespace agi {
UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) {
{
std::auto_ptr<std::ifstream> fp(io::Open(file, true));
agi::scoped_ptr<std::ifstream> fp(io::Open(file, true));
while (!mDone && !fp->eof()) {
char buf[512];
fp->read(buf, 512);
size_t bytes = (size_t)fp->gcount();
HandleData(buf, (PRUint32)bytes);
// If it's over 100 MB it's either binary or big enough that we won't
// be able to do anything useful with it anyway
fp->seekg(0, std::ios::end);
if (fp->tellg() > 100 * 1024 * 1024) {
list.insert(CLDPair(1.f, "binary"));
return;
}
fp->seekg(0, std::ios::beg);
std::streamsize binaryish = 0;
std::streamsize bytes = 0;
while (!mDone && *fp) {
char buf[4096];
fp->read(buf, sizeof(buf));
std::streamsize read = fp->gcount();
HandleData(buf, (PRUint32)read);
// A dumb heuristic to detect binary files
if (!mDone) {
bytes += read;
for (std::streamsize i = 0; i < read; ++i) {
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
++binaryish;
}
if (binaryish > bytes / 8) {
list.insert(CLDPair(1.f, "binary"));
return;
}
}
}
}