summaryrefslogtreecommitdiffstats
path: root/lurker/common/CharsetEscape.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lurker/common/CharsetEscape.cpp')
-rw-r--r--lurker/common/CharsetEscape.cpp238
1 files changed, 238 insertions, 0 deletions
diff --git a/lurker/common/CharsetEscape.cpp b/lurker/common/CharsetEscape.cpp
new file mode 100644
index 0000000..b51c3b2
--- /dev/null
+++ b/lurker/common/CharsetEscape.cpp
@@ -0,0 +1,238 @@
+/* $Id: CharsetEscape.cpp 1649 2009-10-19 14:35:01Z terpstra $
+ *
+ * CharsetEscape.cpp - A stream manipulator-like thing for charset conversion
+ *
+ * Copyright (C) 2002 - Wesley W. Terpstra
+ *
+ * License: GPL
+ *
+ * Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define _FILE_OFFSET_BITS 64
+
+#include <mimelib/string.h>
+#include <mimelib/utility.h>
+
+#include <cerrno>
+#if __GNUC__ == 2
+#include <strstream>
+#else
+#include <sstream>
+#endif
+
+#include "CharsetEscape.h"
+
+// Open an iconv conversion descriptor translating from `charset` into UTF-8.
+// The result of iconv_open is stored unchecked; write() and the destructor
+// consult valid() before using it.
+CharsetEscape::CharsetEscape(const char* charset)
+    : ic(iconv_open("UTF-8", charset)) {}
+
+// Close the iconv descriptor, but only when valid() is true.
+CharsetEscape::~CharsetEscape()
+{
+    if (!valid()) return;
+
+    iconv_close(ic);
+}
+
+// Overwrite every NUL byte in the first `is` bytes at `ob` with '?'.
+// Bytes at or beyond ob + is are left untouched.
+void iconv_bug_kill_nulls(char* ob, size_t is)
+{
+    for (char* const stop = ob + is; ob != stop; ++ob)
+    {
+        if (*ob == '\0') *ob = '?';
+    }
+}
+
+// Convert `is` bytes starting at `ib` to UTF-8 and append the result to `o`.
+//
+// When valid() is false no conversion is attempted: bytes 0x20..0x7e, '\n'
+// and '\t' are copied through and every other byte is replaced by '?'.
+//
+// Otherwise the data is pushed through iconv in chunks of sizeof(buf):
+//  - EILSEQ: the converted output so far is flushed, the offending input
+//            byte is skipped and a '?' is written in its place;
+//  - EINVAL: the input ends inside a multibyte sequence; the partial
+//            sequence is dropped;
+//  - E2BIG:  the output buffer is full; it is flushed and conversion resumes;
+//  - any other errno: conversion stops (previously this case was handled as
+//            E2BIG and could loop forever without making progress).
+// NUL bytes in the converted output are replaced by '?' before writing.
+void CharsetEscape::write(ostream& o, const char* ib, size_t is)
+{
+    if (!valid())
+    {   // when not valid, just keep ascii chars
+        const char* const end = ib + is;
+
+        while (ib != end)
+        {
+            // find the longest run of acceptable bytes
+            const char* run = ib;
+            while (run != end &&
+                   (*run == '\n' || *run == '\t' ||
+                    (*run >= 0x20 && *run < 0x7f)))
+            {
+                ++run;
+            }
+
+            // write out what we have
+            if (run != ib) o.write(ib, long(run - ib));
+            if (run == end) return;
+
+            // replace the offensive byte and carry on after it
+            o << '?';
+            ib = run + 1;
+        }
+
+        return;
+    }
+
+    char buf[8096];
+
+    char* ob = &buf[0];
+    size_t os = sizeof(buf);
+
+    // We forcibly type-cast iconv b/c it has different types on some
+    // platforms, but the difference is only in the const.
+    while (((size_t (*)(iconv_t, const char **, size_t*, char**, size_t*))&iconv)
+           (ic, &ib, &is, &ob, &os) == (size_t)-1)
+    {
+        // Capture errno before any stream operation gets a chance to change it.
+        const int err = errno;
+
+        // Incomplete multibyte sequence at the end of the input: give up on it.
+        if (err == EINVAL) break;
+
+        // Anything other than EILSEQ / E2BIG is unexpected and gives no
+        // guarantee that a retry makes progress, so stop instead of spinning.
+        if (err != EILSEQ && err != E2BIG) break;
+
+        const size_t produced = sizeof(buf) - os;
+
+        // "Buffer too small" with an empty buffer cannot be cured by flushing.
+        if (err == E2BIG && produced == 0) break;
+
+        // Output some stuff and start over with an empty buffer
+        iconv_bug_kill_nulls(buf, produced);
+        o.write(buf, produced);
+
+        ob = &buf[0];
+        os = sizeof(buf);
+
+        if (err == EILSEQ)
+        {   // skip a broken byte
+            ++ib;
+            --is;
+            o << "?";
+        }
+    }
+
+    // done (or gave up): write out the tail.
+    iconv_bug_kill_nulls(buf, sizeof(buf) - os);
+    o.write(buf, sizeof(buf) - os);
+}
+
+// Convenience overload: run the stream-based write() into an in-memory
+// stream and return everything it produced as a string.
+string CharsetEscape::write(const char* ib, size_t is)
+{
+#if __GNUC__ == 2
+    strstream buffer;
+    write(buffer, ib, is);
+
+    // str() hands back the raw character array; its length comes from pcount().
+    char* raw = buffer.str();
+    string result(raw, buffer.rdbuf()->pcount());
+    free(raw);
+    return result;
+#else
+    std::stringstream buffer;
+    write(buffer, ib, is);
+    return buffer.str();
+#endif
+}
+
+// Transform any =?charset?encoding?str?= words (RFC 2047 encoded-word
+// syntax) found in `str` to UTF-8 and return the resulting string.
+//
+//  - Text outside encoded words is converted with a CharsetEscape built
+//    from `default_coding`.
+//  - Encoding 'Q'/'q' is decoded as quoted-printable after turning every
+//    '_' into ' '; 'B'/'b' is decoded as base64; any other encoding letter
+//    yields the literal text "<--corrupt-->".
+//  - Text consisting only of " \r\n\t" between two directly adjacent
+//    encoded words is dropped.
+//  - A "=?" that does not start a well-formed word is copied through as
+//    ordinary text.
+string decode_header(
+    const string& str,
+    const char* default_coding)
+{
+#if __GNUC__ == 2
+    strstream out;
+#else
+    std::stringstream out;
+#endif
+
+    CharsetEscape code(default_coding);
+    const char* const text = str.c_str();
+
+    string::size_type pending = 0;   // first input byte not yet emitted
+    bool sawCodeWord = false;        // previous token was an encoded word
+
+    for (;;)
+    {
+        const string::size_type wordStart = str.find("=?", pending);
+        if (wordStart == string::npos) break;
+
+        // Locate the three remaining delimiters: ?encoding, ?text and ?=
+        string::size_type charsetEnd  = str.find('?', wordStart + 2);
+        string::size_type encodingEnd = string::npos;
+        string::size_type wordEnd     = string::npos;
+        bool wellFormed = false;
+
+        if (charsetEnd != string::npos)
+        {
+            encodingEnd = str.find('?', charsetEnd + 1);
+
+            // the encoding must be exactly one character long
+            if (encodingEnd != string::npos && encodingEnd == charsetEnd + 2)
+            {
+                wordEnd = str.find("?=", encodingEnd + 1);
+
+                // NOTE(review): the whitespace test below starts at the '='
+                // of "=?" and wordStart >= pending, so it looks like it can
+                // never fail; possibly it was meant to compare against the
+                // end of the word. Kept as is to preserve behaviour.
+                wellFormed = wordEnd != string::npos &&
+                             str.find_first_of(" \t", wordStart) > pending;
+            }
+        }
+
+        if (!wellFormed)
+        {   // not valid escape: emit everything up to and including "=?"
+            code.write(out, text + pending, wordStart + 2 - pending);
+            pending = wordStart + 2;
+            sawCodeWord = false;
+            continue;
+        }
+
+        // valid escape
+        // Emit the text in front of the word, unless it is nothing but
+        // whitespace separating two encoded words. The search cannot return
+        // npos here because str[wordStart] is '='.
+        if (!sawCodeWord ||
+            str.find_first_not_of(" \r\n\t", pending) < wordStart)
+        {
+            code.write(out, text + pending, wordStart - pending);
+        }
+        sawCodeWord = true;
+
+        const string::size_type charsetStart = wordStart + 2;
+        const string charset(str, charsetStart, charsetEnd - charsetStart);
+        const char encoding = str[charsetEnd + 1];
+
+        const string::size_type payload = encodingEnd + 1;
+        DwString in(text + payload, wordEnd - payload);
+        DwString decode;
+
+        // continue scanning right after the closing "?="
+        pending = wordEnd + 2;
+
+        switch (encoding)
+        {
+        case 'Q':
+        case 'q':
+        {
+            // Convert also all '_' to ' '
+            size_t at = 0;
+            for (at = in.find_first_of("_", at);
+                 at != DwString::npos;
+                 at = in.find_first_of("_", at + 1))
+            {
+                in[at] = ' ';
+            }
+
+            DwDecodeQuotedPrintable(in, decode);
+            break;
+        }
+        case 'B':
+        case 'b':
+            DwDecodeBase64(in, decode);
+            break;
+        default:
+            decode = "<--corrupt-->";
+            break;
+        }
+
+        // Convert the decoded bytes from the word's own charset
+        CharsetEscape subcode(charset.c_str());
+        subcode.write(out, decode.c_str(), decode.length());
+    }
+
+    // whatever follows the last "=?" is ordinary text
+    code.write(out, text + pending, str.length() - pending);
+
+#if __GNUC__ == 2
+    char* raw = out.str();
+    string result(raw, out.rdbuf()->pcount());
+    free(raw);
+    return result;
+#else
+    return out.str();
+#endif
+}