2005-04-02 12:57:46 +02:00
|
|
|
/*
|
|
|
|
Copyright (C) 2004-2005 Cory Nelson
|
|
|
|
|
|
|
|
This software is provided 'as-is', without any express or implied
|
|
|
|
warranty. In no event will the authors be held liable for any damages
|
|
|
|
arising from the use of this software.
|
|
|
|
|
|
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
|
|
including commercial applications, and to alter it and redistribute it
|
|
|
|
freely, subject to the following restrictions:
|
|
|
|
|
|
|
|
1. The origin of this software must not be misrepresented; you must not
|
|
|
|
claim that you wrote the original software. If you use this software
|
|
|
|
in a product, an acknowledgment in the product documentation would be
|
|
|
|
appreciated but is not required.
|
|
|
|
2. Altered source versions must be plainly marked as such, and must not be
|
|
|
|
misrepresented as being the original software.
|
|
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
|
|
*/
|
|
|
|
|
|
|
|
// namespaces added by Arvid Norberg
|
|
|
|
|
|
|
|
#ifndef __UTF8_H__
|
|
|
|
#define __UTF8_H__
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <iterator>
|
|
|
|
#include <stdexcept>
|
2007-03-02 02:16:59 +01:00
|
|
|
#include <cwchar>
|
2005-04-02 12:57:46 +02:00
|
|
|
|
|
|
|
namespace libtorrent {
|
|
|
|
namespace detail {
|
|
|
|
|
|
|
|
// Reads one UTF-8 continuation byte (bit pattern 10xxxxxx) at iter, advances
// iter past it and returns the byte's six payload bits.
//
// Throws std::runtime_error("incomplete UTF-8 sequence") when iter == last and
// std::runtime_error("invalid UTF-8 sequence") when the byte is not a
// continuation byte. iter is not advanced when either exception is thrown.
template<typename InputIterator>
wchar_t decode_utf8_mb(InputIterator &iter, InputIterator last)
{
	if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");
	if (((*iter) & 0xc0) != 0x80) throw std::runtime_error("invalid UTF-8 sequence");

	return static_cast<wchar_t>((*iter++) & 0x3f);
}

// Decodes a single UTF-8 encoded code point starting at iter and advances
// iter past every byte that was consumed.
//
// iter  - in/out: position of the lead byte; on success it ends up one past
//         the last byte of the sequence.
// last  - end of the input range.
//
// Returns the decoded code point as a wchar_t.
//
// Throws std::runtime_error when
//   * the range is empty or the sequence is cut short,
//   * a continuation byte is malformed,
//   * the sequence is an overlong encoding (a code point stored in more bytes
//     than necessary, e.g. C0 AF for '/'), or
//   * the lead byte starts a sequence this function cannot represent: a stray
//     continuation byte, a 4-byte sequence on platforms whose wchar_t is only
//     16 bits wide, or a lead byte of 0xF8 and above.
// When a multi-byte sequence fails part-way, iter has already been advanced
// past the bytes that were read.
//
// Encoded surrogates (U+D800..U+DFFF) are deliberately still accepted, so text
// produced by encode_wchar() from 16-bit wchar_t strings keeps round-tripping.
template<typename InputIterator>
wchar_t decode_utf8(InputIterator &iter, InputIterator last)
{
	if (iter == last) throw std::runtime_error("incomplete UTF-8 sequence");

	// Accumulate in a type that is guaranteed to hold 21 bits, regardless of
	// how wide wchar_t is on this platform.
	unsigned long ret;

	if (((*iter) & 0x80) == 0) // one byte
	{
		ret = *iter++;
	}
	else if (((*iter) & 0xe0) == 0xc0) // two bytes
	{
		unsigned long byte1 = (*iter++) & 0x1f;
		unsigned long byte2 = decode_utf8_mb(iter, last);
		ret = (byte1 << 6) | byte2;

		// anything below U+0080 must use the one byte form
		if (ret < 0x80) throw std::runtime_error("invalid UTF-8 sequence");
	}
	else if (((*iter) & 0xf0) == 0xe0) // three bytes
	{
		unsigned long byte1 = (*iter++) & 0x0f;
		unsigned long byte2 = decode_utf8_mb(iter, last);
		unsigned long byte3 = decode_utf8_mb(iter, last);
		ret = (byte1 << 12) | (byte2 << 6) | byte3;

		// anything below U+0800 must use a shorter form
		if (ret < 0x800) throw std::runtime_error("invalid UTF-8 sequence");
	}
	else if (sizeof(wchar_t) > 2 && ((*iter) & 0xf8) == 0xf0) // four bytes
	{
		// Only reachable when a single wchar_t can hold the code point.
		// TODO: support surrogate pairs for 16 bit wchar_t
		unsigned long byte1 = (*iter++) & 0x07;
		unsigned long byte2 = decode_utf8_mb(iter, last);
		unsigned long byte3 = decode_utf8_mb(iter, last);
		unsigned long byte4 = decode_utf8_mb(iter, last);
		ret = (byte1 << 18) | (byte2 << 12) | (byte3 << 6) | byte4;

		// reject overlong forms and values past the end of Unicode
		if (ret < 0x10000 || ret > 0x10FFFF)
			throw std::runtime_error("invalid UTF-8 sequence");
	}
	else throw std::runtime_error("UTF-8 not convertable to UTF-16");

	return static_cast<wchar_t>(ret);
}
|
|
|
|
|
|
|
|
// Decodes the UTF-8 bytes in [first, last) one code point at a time and writes
// each decoded wchar_t through dest. Returns dest advanced past the last
// element written. Exceptions thrown by decode_utf8() propagate to the caller;
// elements written before the failure stay written.
template<typename InputIterator, typename OutputIterator>
OutputIterator utf8_wchar(InputIterator first, InputIterator last, OutputIterator dest)
{
	// decode_utf8() advances first itself, so only dest is stepped here
	while (first != last)
	{
		*dest = decode_utf8(first, last);
		++dest;
	}
	return dest;
}
|
|
|
|
|
|
|
|
// Encodes the single wide character at *iter as UTF-8 and writes the
// resulting bytes through dest, advancing dest once per byte written.
//
// iter  - points at the character to encode; it is only dereferenced.
// dest  - in/out: output iterator receiving the encoded bytes as char.
//
// Values up to 0x7F produce one byte, up to 0x7FF two bytes, up to 0xFFFF
// three bytes and up to 0x10FFFF four bytes. The four byte form can only be
// reached when the character type is wider than 16 bits. Values above
// 0x10FFFF produce no output at all.
//
// NOTE(review): surrogate halves are encoded individually as three byte
// sequences, because only one character is visible here. Where the character
// type is signed, negative values take the first branch and are truncated to
// a single byte.
template<typename InputIterator, typename OutputIterator>
void encode_wchar(InputIterator iter, OutputIterator &dest)
{
	typedef typename std::iterator_traits<InputIterator>::value_type char_type;

	// read the source character once instead of on every comparison
	const char_type c = *iter;

	if(c <= 0x007F)
	{
		*dest = (char)c;
		++dest;
	}
	else if(c <= 0x07FF)
	{
		*dest = (char)(0xC0 | ((c & 0x07C0) >> 6));
		++dest;

		*dest = (char)(0x80 | (c & 0x003F));
		++dest;
	}
	else if(c <= 0xFFFF)
	{
		*dest = (char)(0xE0 | ((c & 0xF000) >> 12));
		++dest;

		*dest = (char)(0x80 | ((c & 0x0FC0) >> 6));
		++dest;

		*dest = (char)(0x80 | (c & 0x003F));
		++dest;
	}
	else if(c <= 0x10FFFF)
	{
		// supplementary planes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		*dest = (char)(0xF0 | ((c & 0x1C0000) >> 18));
		++dest;

		*dest = (char)(0x80 | ((c & 0x03F000) >> 12));
		++dest;

		*dest = (char)(0x80 | ((c & 0x000FC0) >> 6));
		++dest;

		*dest = (char)(0x80 | (c & 0x00003F));
		++dest;
	}
}
|
|
|
|
|
|
|
|
// Encodes every wide character in [first, last) as UTF-8 by handing each one
// to encode_wchar(), which writes the bytes through dest and advances it.
// Returns dest advanced past the last byte written.
template<typename InputIterator, typename OutputIterator>
OutputIterator wchar_utf8(InputIterator first, InputIterator last, OutputIterator dest)
{
	while (first != last)
	{
		encode_wchar(first, dest);
		++first;
	}
	return dest;
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2005-08-04 18:19:28 +02:00
|
|
|
inline void utf8_wchar(const std::string &utf8, std::wstring &wide)
|
2005-08-04 00:51:21 +02:00
|
|
|
{
|
2005-04-02 12:57:46 +02:00
|
|
|
wide.clear();
|
2007-03-02 02:16:59 +01:00
|
|
|
detail::utf8_wchar(utf8.begin(), utf8.end(), std::back_inserter(wide));
|
2005-04-02 12:57:46 +02:00
|
|
|
}
|
|
|
|
|
2005-08-04 18:19:28 +02:00
|
|
|
inline std::wstring utf8_wchar(const std::string &str)
|
2005-08-04 00:51:21 +02:00
|
|
|
{
|
2005-04-02 12:57:46 +02:00
|
|
|
std::wstring ret;
|
|
|
|
utf8_wchar(str, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-08-04 18:19:28 +02:00
|
|
|
inline void wchar_utf8(const std::wstring &wide, std::string &utf8)
|
2005-08-04 00:51:21 +02:00
|
|
|
{
|
2005-04-02 12:57:46 +02:00
|
|
|
utf8.clear();
|
2007-03-02 02:16:59 +01:00
|
|
|
detail::wchar_utf8(wide.begin(), wide.end(), std::back_inserter(utf8));
|
2005-04-02 12:57:46 +02:00
|
|
|
}
|
|
|
|
|
2005-08-04 18:19:28 +02:00
|
|
|
inline std::string wchar_utf8(const std::wstring &str)
|
2005-08-04 00:51:21 +02:00
|
|
|
{
|
2005-04-02 12:57:46 +02:00
|
|
|
std::string ret;
|
|
|
|
wchar_utf8(str, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|