diff options
Diffstat (limited to 'lurker/common/CharsetEscape.cpp')
-rw-r--r-- | lurker/common/CharsetEscape.cpp | 238 |
1 files changed, 238 insertions, 0 deletions
diff --git a/lurker/common/CharsetEscape.cpp b/lurker/common/CharsetEscape.cpp new file mode 100644 index 0000000..b51c3b2 --- /dev/null +++ b/lurker/common/CharsetEscape.cpp @@ -0,0 +1,238 @@ +/* $Id: CharsetEscape.cpp 1649 2009-10-19 14:35:01Z terpstra $ + * + * CharsetEscape.cpp - A stream manipulator-like thing for charset conversion + * + * Copyright (C) 2002 - Wesley W. Terpstra + * + * License: GPL + * + * Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define _FILE_OFFSET_BITS 64 + +#include <mimelib/string.h> +#include <mimelib/utility.h> + +#include <cerrno> +#if __GNUC__ == 2 +#include <strstream> +#else +#include <sstream> +#endif + +#include "CharsetEscape.h" + +CharsetEscape::CharsetEscape(const char* charset) + : ic(iconv_open("UTF-8", charset)) +{ +} + +CharsetEscape::~CharsetEscape() +{ + if (valid()) iconv_close(ic); +} + +void iconv_bug_kill_nulls(char* ob, size_t is) +{ + while (is != 0) + { + if (*ob == '\0') *ob = '?'; + ++ob; + --is; + } +} + +void CharsetEscape::write(ostream& o, const char* ib, size_t is) +{ + if (!valid()) + { // when not valid, just keep ascii chars + + while (1) + { + const char* s; + const char* e; + + for (s = ib, e = s + is; s != e; ++s) + { // if it moves, kill it! + if ((*s < 0x20 || *s >= 0x7f) && + (*s != '\n' && *s != '\t')) + { + break; + } + } + + // write out what we have + if (s != ib) o.write(ib, long(s - ib)); + + is -= long(s - ib); + ib = s; + + if (!is) return; + + // skip the offensive byte + ++ib; + --is; + o << '?'; + } + } + + char buf[8096]; + + char* ob = &buf[0]; + size_t os = sizeof(buf); + + // We forcibly type-cast iconv b/c it has different types on some + // platforms, but the difference is only in the const. + while (((size_t (*)(iconv_t, const char **, size_t*, char**, size_t*))&iconv) + (ic, &ib, &is, &ob, &os) == (size_t)-1) + { + if (errno == EILSEQ) + { + // Output some stuff + iconv_bug_kill_nulls(buf, sizeof(buf) - os); + o.write(buf, sizeof(buf) - os); + + ob = &buf[0]; + os = sizeof(buf); + + // skip a broken byte + ++ib; + --is; + o << "?"; + } + else if (errno == EINVAL) + { + // Incomplete data + break; + } + else + { // E2BIG + iconv_bug_kill_nulls(buf, sizeof(buf) - os); + o.write(buf, sizeof(buf) - os); + + ob = &buf[0]; + os = sizeof(buf); + } + } + + // success, write out tail. + iconv_bug_kill_nulls(buf, sizeof(buf) - os); + o.write(buf, sizeof(buf) - os); +} + +string CharsetEscape::write(const char* ib, size_t is) +{ +#if __GNUC__ == 2 + strstream out; +#else + std::stringstream out; +#endif + write(out, ib, is); + +#if __GNUC__ == 2 + char* tmpstr = out.str(); + string ret(tmpstr, out.rdbuf()->pcount()); + free(tmpstr); + return ret; +#else + return out.str(); +#endif +} + +// Transform any =?charset?encoding?str?= stuff in the string to utf-8 +string decode_header( + const string& str, + const char* default_coding) +{ +#if __GNUC__ == 2 + strstream out; +#else + std::stringstream out; +#endif + + CharsetEscape code(default_coding); + bool sawCodeWord = false; + + string::size_type b = 0, c, e, s, n; + while ((c = str.find("=?", b)) != string::npos) + { + if ((e = str.find('?', c+2)) != string::npos && + (s = str.find('?', e+1)) != string::npos && + s == e + 2 && + (n = str.find("?=", s+1)) != string::npos && + str.find_first_of(" \t", c) > b) + { // valid escape + if (!sawCodeWord || // guaranteed not npos: (c has = ) + str.find_first_not_of(" \r\n\t", b) < c) + { + code.write(out, str.c_str() + b, c - b); + } + + sawCodeWord = true; + + c += 2; + string charset(str, c, e - c); + char encoding = str[e+1]; + s += 1; + DwString in(str.c_str() + s, n-s); + DwString decode; + b = n+2; + + if (encoding == 'Q' || encoding == 'q') + { + // Convert also all '_' to ' ' + size_t x = 0; + while ((x = in.find_first_of("_", x)) != DwString::npos) + { + in[x] = ' '; + ++x; + } + + DwDecodeQuotedPrintable(in, decode); + } + else if (encoding == 'B' || encoding == 'b') + { + DwDecodeBase64(in, decode); + } + else + { + decode = "<--corrupt-->"; + } + + CharsetEscape subcode(charset.c_str()); + subcode.write(out, decode.c_str(), decode.length()); + } + else + { // not valid escape + code.write(out, str.c_str() + b, c+2 - b); + b = c+2; + + sawCodeWord = false; + } + } + + code.write(out, str.c_str() + b, str.length() - b); + +#if __GNUC__ == 2 + char* tmpstr = out.str(); + string ret(tmpstr, out.rdbuf()->pcount()); + free(tmpstr); + return ret; +#else + return out.str(); +#endif +} |