summaryrefslogtreecommitdiffstats
path: root/mimelib/token.cpp
diff options
context:
space:
mode:
authorKaz Kylheku <kaz@kylheku.com>2013-10-06 10:33:05 -0700
committerKaz Kylheku <kaz@kylheku.com>2013-10-06 10:33:05 -0700
commitb479c22c713be413b9135be8f1d4e108d33f17f6 (patch)
treeaaaae4d4fcb5df6ef6414a6e825e8f4e4730f6ee /mimelib/token.cpp
downloadlurker-b479c22c713be413b9135be8f1d4e108d33f17f6.tar.gz
lurker-b479c22c713be413b9135be8f1d4e108d33f17f6.tar.bz2
lurker-b479c22c713be413b9135be8f1d4e108d33f17f6.zip
lurker-2.3
mimelib-3.1.1
Diffstat (limited to 'mimelib/token.cpp')
-rw-r--r--mimelib/token.cpp539
1 files changed, 539 insertions, 0 deletions
diff --git a/mimelib/token.cpp b/mimelib/token.cpp
new file mode 100644
index 0000000..3134e0a
--- /dev/null
+++ b/mimelib/token.cpp
@@ -0,0 +1,539 @@
+//=============================================================================
+// File: token.cpp
+// Contents: Definitions for DwTokenizer, DwRfc822Tokenizer
+// Maintainer: Doug Sauder <dwsauder@fwb.gulf.net>
+// WWW: http://www.fwb.gulf.net/~dwsauder/mimepp.html
+// $Revision: 1.10 $
+// $Date: 2002/04/22 10:01:28 $
+//
+// Copyright (c) 1996, 1997 Douglas W. Sauder
+// All rights reserved.
+//
+// IN NO EVENT SHALL DOUGLAS W. SAUDER BE LIABLE TO ANY PARTY FOR DIRECT,
+// INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+// THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF DOUGLAS W. SAUDER
+// HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// DOUGLAS W. SAUDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT
+// NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
+// BASIS, AND DOUGLAS W. SAUDER HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
+// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+//
+//=============================================================================
+
+#define DW_IMPLEMENTATION
+
+#include <mimelib/config.h>
+#include <mimelib/debug.h>
+#include <assert.h>
+#include <ctype.h>
+#include <mimelib/string.h>
+#include <mimelib/token.h>
+
+
+std::ostream* DwTokenizer::mDebugOut = 0;
+
+
+DwTokenizer::DwTokenizer(const DwString& aStr)
+ : mString(aStr)
+{
+ mTokenStart = 0;
+ mTokenLength = 0;
+ mNextStart = 0;
+ mTkType = eTkError;
+}
+
+
+DwTokenizer::DwTokenizer(const char* aCStr)
+ : mString(aCStr)
+{
+ mTokenStart = 0;
+ mTokenLength = 0;
+ mNextStart = 0;
+ mTkType = eTkError;
+}
+
+
+DwTokenizer::~DwTokenizer()
+{
+}
+
+
+void DwTokenizer::StripDelimiters()
+{
+ if (mTokenLength < 2) return;
+ // const ref -- avoids copy on write when using operator[]
+ const DwString& token = mToken;
+ switch (mTkType) {
+ case eTkQuotedString:
+ if (token[0] == '"') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == '"') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ case eTkDomainLiteral:
+ if (token[0] == '[') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == ']') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ case eTkComment:
+ if (token[0] == '(') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == ')') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ }
+}
+
+
+void DwTokenizer::ParseQuotedString()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == '"') {
+ // End of quoted string
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+void DwTokenizer::ParseComment()
+{
+ size_t pos = mTokenStart;
+ int level = 1;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == ')') {
+ --level;
+ if (level == 0) {
+ // End of comment
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+ else if (mString[pos] == '(') {
+ ++level;
+ }
+ }
+}
+
+
+void DwTokenizer::ParseDomainLiteral()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == ']') {
+ // End of domain literal
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+void DwTokenizer::PrintToken(std::ostream* aOut)
+{
+ if (!aOut) return;
+ const char* type = 0;
+ switch (mTkType) {
+ case eTkError:
+ type = "error ";
+ break;
+ case eTkNull:
+ type = "null ";
+ break;
+ case eTkSpecial:
+ type = "special ";
+ break;
+ case eTkAtom:
+ type = "atom ";
+ break;
+ case eTkComment:
+ type = "comment ";
+ break;
+ case eTkQuotedString:
+ type = "quoted string ";
+ break;
+ case eTkDomainLiteral:
+ type = "domain literal ";
+ break;
+ case eTkTspecial:
+ type = "tspecial ";
+ break;
+ case eTkToken:
+ type = "token ";
+ break;
+ default:
+ type = "unknown ";
+ break;
+ }
+ *aOut << type << mToken << '\n';
+}
+
+
+#define isspecial(c) ((c)=='('||(c)==')'||(c)=='<'||(c)=='>'||(c)=='@'\
+ ||(c)==','||(c)==';'||(c)==':'||(c)=='\\'||(c)=='"'||(c)=='.'\
+ ||(c)=='['||(c)==']')
+
+
+DwRfc822Tokenizer::DwRfc822Tokenizer(const DwString& aStr)
+ : DwTokenizer(aStr)
+{
+ ParseToken();
+}
+
+
+DwRfc822Tokenizer::DwRfc822Tokenizer(const char* aCStr)
+ : DwTokenizer(aCStr)
+{
+ ParseToken();
+}
+
+
+DwRfc822Tokenizer::~DwRfc822Tokenizer()
+{
+}
+
+
+int DwRfc822Tokenizer::Restart()
+{
+ mNextStart = 0;
+ ParseToken();
+ return mTkType;
+}
+
+
+int DwRfc822Tokenizer::operator ++ ()
+{
+ ParseToken();
+ return mTkType;
+}
+
+
+void DwRfc822Tokenizer::ParseToken()
+{
+ // Assume the field body has already been extracted. That is, we don't
+ // have to watch for the end of the field body or folding. We just
+ // treat any CRs or LFs as white space.
+ mTokenStart = mNextStart;
+ mTokenLength = 0;
+ mTkType = eTkNull;
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ // Skip leading space. Also, since control chars are not permitted
+ // in atoms, skip these, too.
+ while (1) {
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ if (!isspace(mString[mTokenStart]) && !iscntrl(mString[mTokenStart]))
+ break;
+ ++mTokenStart;
+ }
+ char ch = mString[mTokenStart];
+ // Quoted string
+ if (ch == '"') {
+ mTkType = eTkQuotedString;
+ ParseQuotedString();
+ }
+ // Comment
+ else if (ch == '(') {
+ mTkType = eTkComment;
+ ParseComment();
+ }
+ // Domain literal
+ else if (ch == '[') {
+ mTkType = eTkDomainLiteral;
+ ParseDomainLiteral();
+ }
+ // Special
+ else if (isspecial(ch)) {
+ mTkType = eTkSpecial;
+ mTokenLength = 1;
+ mToken = mString.substr(mTokenStart, 1);
+ mNextStart = mTokenStart + 1;
+ }
+ // Atom
+ else {
+ mTkType = eTkAtom;
+ ParseAtom();
+ }
+ if (mDebugOut) PrintToken(mDebugOut);
+}
+
+
+void DwRfc822Tokenizer::ParseAtom()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ char ch = (pos < mString.length()) ? mString[pos] : (char) 0;
+ if (pos >= mString.length()
+ || isspace(ch)
+ || iscntrl(ch)
+ || isspecial(ch)) {
+
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+#define istspecial(c) ((c)=='('||(c)==')'||(c)=='<'||(c)=='>'||(c)=='@'\
+ ||(c)==','||(c)==';'||(c)==':'||(c)=='\\'||(c)=='"'||(c)=='/'\
+ ||(c)=='['||(c)==']'||(c)=='?'||(c)=='=')
+
+
+DwRfc1521Tokenizer::DwRfc1521Tokenizer(const DwString& aStr)
+ : DwTokenizer(aStr)
+{
+ ParseToken();
+}
+
+
+DwRfc1521Tokenizer::DwRfc1521Tokenizer(const char* aCStr)
+ : DwTokenizer(aCStr)
+{
+ ParseToken();
+}
+
+
+DwRfc1521Tokenizer::~DwRfc1521Tokenizer()
+{
+}
+
+
+int DwRfc1521Tokenizer::Restart()
+{
+ mNextStart = 0;
+ ParseToken();
+ return mTkType;
+}
+
+
+int DwRfc1521Tokenizer::operator ++ ()
+{
+ ParseToken();
+ return mTkType;
+}
+
+
+void DwRfc1521Tokenizer::ParseToken()
+{
+ // Assume the field body has already been extracted. That is, we don't
+ // have to watch for the end of the field body or folding. We just
+ // treat any CRs or LFs as white space.
+ mTokenStart = mNextStart;
+ mTokenLength = 0;
+ mTkType = eTkNull;
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ // Skip leading space. Also, since control chars are not permitted
+ // in atoms, skip these, too.
+ while (1) {
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ if (!isspace(mString[mTokenStart]) && !iscntrl(mString[mTokenStart]))
+ break;
+ ++mTokenStart;
+ }
+ char ch = mString[mTokenStart];
+ // Quoted string
+ if (ch == '"') {
+ mTkType = eTkQuotedString;
+ ParseQuotedString();
+ }
+ // Comment
+ else if (ch == '(') {
+ mTkType = eTkComment;
+ ParseComment();
+ }
+ // Domain literal
+ else if (ch == '[') {
+ mTkType = eTkDomainLiteral;
+ ParseDomainLiteral();
+ }
+ // Special
+ else if (istspecial(ch)) {
+ mTkType = eTkTspecial;
+ mTokenLength = 1;
+ mToken = mString.substr(mTokenStart, 1);
+ mNextStart = mTokenStart + 1;
+ }
+ // Atom
+ else {
+ mTkType = eTkToken;
+ ParseAtom();
+ }
+ if (mDebugOut) PrintToken(mDebugOut);
+}
+
+
+void DwRfc1521Tokenizer::ParseAtom()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ char ch = (pos < mString.length()) ? mString[pos] : (char) 0;
+ if (pos >= mString.length()
+ || isspace(ch)
+ || iscntrl(ch)
+ || istspecial(ch)) {
+
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+DwTokenString::DwTokenString(const DwString& aStr)
+ : mString(aStr)
+{
+ mTokensStart = 0;
+ mTokensLength = 0;
+}
+
+
+DwTokenString::~DwTokenString()
+{
+}
+
+
+void DwTokenString::SetFirst(const DwTokenizer& aTkzr)
+{
+ switch (aTkzr.Type()) {
+ case eTkError:
+ case eTkNull:
+ mTokensStart = aTkzr.mTokenStart;
+ mTokensLength = 0;
+ break;
+ case eTkComment:
+ case eTkDomainLiteral:
+ case eTkQuotedString:
+ case eTkSpecial:
+ case eTkAtom:
+ case eTkTspecial:
+ case eTkToken:
+ mTokensStart = aTkzr.mTokenStart;
+ mTokensLength = aTkzr.mTokenLength;
+ break;
+ }
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}
+
+
+void DwTokenString::SetLast(const DwTokenizer& aTkzr)
+{
+ assert(aTkzr.mTokenStart >= mTokensStart);
+ if (aTkzr.mTokenStart < mTokensStart) return;
+ mTokensLength = aTkzr.mTokenStart + aTkzr.mTokenLength - mTokensStart;
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}
+
+
+void DwTokenString::ExtendTo(const DwTokenizer& aTkzr)
+{
+ assert(aTkzr.mTokenStart >= mTokensStart);
+ if (aTkzr.mTokenStart < mTokensStart) return;
+ mTokensLength = aTkzr.mTokenStart - mTokensStart;
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}