Don't bother storing a single charset in a std::string; instead, insert it into the std::map and fix Single() to return the first element. This keeps things simple and also ensures that DetectAll() will always return at least one element, which wasn't being done before.

Originally committed to SVN as r4369.
This commit is contained in:
Amar Takhar 2010-05-28 13:08:00 +00:00
parent 6736f5e292
commit 9d854b69f3
2 changed files with 18 additions and 12 deletions

View File

@ -30,7 +30,6 @@
namespace agi {
namespace charset {
UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
{
std::ifstream *fp;
@ -47,7 +46,7 @@ UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
DataEnd();
if (mDetectedCharset) {
charset.assign(mDetectedCharset);
list.insert(CLDPair(1, mDetectedCharset));
} else {
switch (mInputState) {
@ -56,32 +55,39 @@ UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
if (mCharSetProbers[i]) {
float conf = mCharSetProbers[i]->GetConfidence();
if (conf > 0.01f) {
list.insert(std::pair<float, std::string>(conf, mCharSetProbers[i]->GetCharSetName()));
list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName()));
}
}
}
if (!list.empty()) {
CharsetListDetected::const_iterator i_lst = list.begin();
charset.assign(i_lst->second);
}
break;
}
case ePureAscii:
charset.assign("US-ASCII");
list.insert(CLDPair(1, "US-ASCII"));
break;
default:
throw UnknownCharset("Unknown chararacter set.");
}
if ((list.empty() && (mInputState == eHighbyte)) || charset.empty())
if (list.empty() && (mInputState == eHighbyte))
throw UnknownCharset("Unknown chararacter set.");
} // if mDetectedCharset else
}
/// @brief Return the character set held by the first entry of the detected list.
/// @return Name of the character set stored in the first element of `list`.
/// @throws UnknownCharset if `list` is empty.
///
/// NOTE(review): the first element is whatever `list.begin()` yields. If
/// CharsetListDetected is a std::map keyed by confidence with the default
/// ordering, that would be the lowest-confidence entry -- confirm against
/// the typedef that this matches the intended "highest confidence" result.
std::string UCDetect::Single() {
	if (!list.empty()) {
		// Hand back the name half of the first (confidence, name) entry.
		return list.begin()->second;
	}

	/// @todo Add a debug log here since this shouldn't happen.
	throw UnknownCharset("Unknown chararacter set.");
}
} // namespace util
} // namespace agi

View File

@ -29,8 +29,8 @@ namespace agi {
class UCDetect : public nsUniversalDetector {
/// Character set
std::string charset;
/// For insertion into CharsetListDetected
typedef std::pair<float, std::string> CLDPair;
/// List of detected character sets.
CharsetListDetected list;
@ -50,7 +50,7 @@ public:
/// @brief Return a single character set (highest confidence)
/// @return Character set
std::string Single() { return charset; }
std::string Single();
};
} // namespace util