diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2013-10-06 10:33:05 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2013-10-06 10:33:05 -0700 |
commit | b479c22c713be413b9135be8f1d4e108d33f17f6 (patch) | |
tree | aaaae4d4fcb5df6ef6414a6e825e8f4e4730f6ee /mimelib/token.cpp | |
download | lurker-b479c22c713be413b9135be8f1d4e108d33f17f6.tar.gz lurker-b479c22c713be413b9135be8f1d4e108d33f17f6.tar.bz2 lurker-b479c22c713be413b9135be8f1d4e108d33f17f6.zip |
lurker-2.3
mimelib-3.1.1
Diffstat (limited to 'mimelib/token.cpp')
-rw-r--r-- | mimelib/token.cpp | 539 |
1 files changed, 539 insertions, 0 deletions
diff --git a/mimelib/token.cpp b/mimelib/token.cpp new file mode 100644 index 0000000..3134e0a --- /dev/null +++ b/mimelib/token.cpp @@ -0,0 +1,539 @@ +//============================================================================= +// File: token.cpp +// Contents: Definitions for DwTokenizer, DwRfc822Tokenizer +// Maintainer: Doug Sauder <dwsauder@fwb.gulf.net> +// WWW: http://www.fwb.gulf.net/~dwsauder/mimepp.html +// $Revision: 1.10 $ +// $Date: 2002/04/22 10:01:28 $ +// +// Copyright (c) 1996, 1997 Douglas W. Sauder +// All rights reserved. +// +// IN NO EVENT SHALL DOUGLAS W. SAUDER BE LIABLE TO ANY PARTY FOR DIRECT, +// INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +// THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF DOUGLAS W. SAUDER +// HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// DOUGLAS W. SAUDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT +// NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" +// BASIS, AND DOUGLAS W. SAUDER HAS NO OBLIGATION TO PROVIDE MAINTENANCE, +// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +// +//============================================================================= + +#define DW_IMPLEMENTATION + +#include <mimelib/config.h> +#include <mimelib/debug.h> +#include <assert.h> +#include <ctype.h> +#include <mimelib/string.h> +#include <mimelib/token.h> + + +std::ostream* DwTokenizer::mDebugOut = 0; + + +DwTokenizer::DwTokenizer(const DwString& aStr) + : mString(aStr) +{ + mTokenStart = 0; + mTokenLength = 0; + mNextStart = 0; + mTkType = eTkError; +} + + +DwTokenizer::DwTokenizer(const char* aCStr) + : mString(aCStr) +{ + mTokenStart = 0; + mTokenLength = 0; + mNextStart = 0; + mTkType = eTkError; +} + + +DwTokenizer::~DwTokenizer() +{ +} + + +void DwTokenizer::StripDelimiters() +{ + if (mTokenLength < 2) return; + // const ref -- avoids copy on write when using operator[] + const DwString& token = mToken; + switch (mTkType) { + case eTkQuotedString: + if (token[0] == '"') { + mToken = mToken.substr(1); + ++mTokenStart; + --mTokenLength; + } + if (mTokenLength > 0 && token[mTokenLength-1] == '"') { + mToken = mToken.substr(0, mTokenLength-1); + --mTokenLength; + } + break; + case eTkDomainLiteral: + if (token[0] == '[') { + mToken = mToken.substr(1); + ++mTokenStart; + --mTokenLength; + } + if (mTokenLength > 0 && token[mTokenLength-1] == ']') { + mToken = mToken.substr(0, mTokenLength-1); + --mTokenLength; + } + break; + case eTkComment: + if (token[0] == '(') { + mToken = mToken.substr(1); + ++mTokenStart; + --mTokenLength; + } + if (mTokenLength > 0 && token[mTokenLength-1] == ')') { + mToken = mToken.substr(0, mTokenLength-1); + --mTokenLength; + } + break; + } +} + + +void DwTokenizer::ParseQuotedString() +{ + size_t pos = mTokenStart; + while (1) { + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + else if (mString[pos] == '\\') { + // Quoted character + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + } + else if (mString[pos] == '"') { + // End of quoted string + ++pos; + mTokenLength = pos - mTokenStart; + mToken = mString.substr(mTokenStart, mTokenLength); + mNextStart = pos; + break; + } + } +} + + +void DwTokenizer::ParseComment() +{ + size_t pos = mTokenStart; + int level = 1; + while (1) { + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + else if (mString[pos] == '\\') { + // Quoted character + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + } + else if (mString[pos] == ')') { + --level; + if (level == 0) { + // End of comment + ++pos; + mTokenLength = pos - mTokenStart; + mToken = mString.substr(mTokenStart, mTokenLength); + mNextStart = pos; + break; + } + } + else if (mString[pos] == '(') { + ++level; + } + } +} + + +void DwTokenizer::ParseDomainLiteral() +{ + size_t pos = mTokenStart; + while (1) { + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + else if (mString[pos] == '\\') { + // Quoted character + ++pos; + if (pos >= mString.length()) { + // Ran out of string + mTokenLength = 0; + mToken = ""; + mNextStart = pos; + mTkType = eTkError; + break; + } + } + else if (mString[pos] == ']') { + // End of domain literal + ++pos; + mTokenLength = pos - mTokenStart; + mToken = mString.substr(mTokenStart, mTokenLength); + mNextStart = pos; + break; + } + } +} + + +void DwTokenizer::PrintToken(std::ostream* aOut) +{ + if (!aOut) return; + const char* type = 0; + switch (mTkType) { + case eTkError: + type = "error "; + break; + case eTkNull: + type = "null "; + break; + case eTkSpecial: + type = "special "; + break; + case eTkAtom: + type = "atom "; + break; + case eTkComment: + type = "comment "; + break; + case eTkQuotedString: + type = "quoted string "; + break; + case eTkDomainLiteral: + type = "domain literal "; + break; + case eTkTspecial: + type = "tspecial "; + break; + case eTkToken: + type = "token "; + break; + default: + type = "unknown "; + break; + } + *aOut << type << mToken << '\n'; +} + + +#define isspecial(c) ((c)=='('||(c)==')'||(c)=='<'||(c)=='>'||(c)=='@'\ + ||(c)==','||(c)==';'||(c)==':'||(c)=='\\'||(c)=='"'||(c)=='.'\ + ||(c)=='['||(c)==']') + + +DwRfc822Tokenizer::DwRfc822Tokenizer(const DwString& aStr) + : DwTokenizer(aStr) +{ + ParseToken(); +} + + +DwRfc822Tokenizer::DwRfc822Tokenizer(const char* aCStr) + : DwTokenizer(aCStr) +{ + ParseToken(); +} + + +DwRfc822Tokenizer::~DwRfc822Tokenizer() +{ +} + + +int DwRfc822Tokenizer::Restart() +{ + mNextStart = 0; + ParseToken(); + return mTkType; +} + + +int DwRfc822Tokenizer::operator ++ () +{ + ParseToken(); + return mTkType; +} + + +void DwRfc822Tokenizer::ParseToken() +{ + // Assume the field body has already been extracted. That is, we don't + // have to watch for the end of the field body or folding. We just + // treat any CRs or LFs as white space. + mTokenStart = mNextStart; + mTokenLength = 0; + mTkType = eTkNull; + if (mTokenStart >= mString.length()) { + return; + } + // Skip leading space. Also, since control chars are not permitted + // in atoms, skip these, too. + while (1) { + if (mTokenStart >= mString.length()) { + return; + } + if (!isspace(mString[mTokenStart]) && !iscntrl(mString[mTokenStart])) + break; + ++mTokenStart; + } + char ch = mString[mTokenStart]; + // Quoted string + if (ch == '"') { + mTkType = eTkQuotedString; + ParseQuotedString(); + } + // Comment + else if (ch == '(') { + mTkType = eTkComment; + ParseComment(); + } + // Domain literal + else if (ch == '[') { + mTkType = eTkDomainLiteral; + ParseDomainLiteral(); + } + // Special + else if (isspecial(ch)) { + mTkType = eTkSpecial; + mTokenLength = 1; + mToken = mString.substr(mTokenStart, 1); + mNextStart = mTokenStart + 1; + } + // Atom + else { + mTkType = eTkAtom; + ParseAtom(); + } + if (mDebugOut) PrintToken(mDebugOut); +} + + +void DwRfc822Tokenizer::ParseAtom() +{ + size_t pos = mTokenStart; + while (1) { + ++pos; + char ch = (pos < mString.length()) ? mString[pos] : (char) 0; + if (pos >= mString.length() + || isspace(ch) + || iscntrl(ch) + || isspecial(ch)) { + + mTokenLength = pos - mTokenStart; + mToken = mString.substr(mTokenStart, mTokenLength); + mNextStart = pos; + break; + } + } +} + + +#define istspecial(c) ((c)=='('||(c)==')'||(c)=='<'||(c)=='>'||(c)=='@'\ + ||(c)==','||(c)==';'||(c)==':'||(c)=='\\'||(c)=='"'||(c)=='/'\ + ||(c)=='['||(c)==']'||(c)=='?'||(c)=='=') + + +DwRfc1521Tokenizer::DwRfc1521Tokenizer(const DwString& aStr) + : DwTokenizer(aStr) +{ + ParseToken(); +} + + +DwRfc1521Tokenizer::DwRfc1521Tokenizer(const char* aCStr) + : DwTokenizer(aCStr) +{ + ParseToken(); +} + + +DwRfc1521Tokenizer::~DwRfc1521Tokenizer() +{ +} + + +int DwRfc1521Tokenizer::Restart() +{ + mNextStart = 0; + ParseToken(); + return mTkType; +} + + +int DwRfc1521Tokenizer::operator ++ () +{ + ParseToken(); + return mTkType; +} + + +void DwRfc1521Tokenizer::ParseToken() +{ + // Assume the field body has already been extracted. That is, we don't + // have to watch for the end of the field body or folding. We just + // treat any CRs or LFs as white space. + mTokenStart = mNextStart; + mTokenLength = 0; + mTkType = eTkNull; + if (mTokenStart >= mString.length()) { + return; + } + // Skip leading space. Also, since control chars are not permitted + // in atoms, skip these, too. + while (1) { + if (mTokenStart >= mString.length()) { + return; + } + if (!isspace(mString[mTokenStart]) && !iscntrl(mString[mTokenStart])) + break; + ++mTokenStart; + } + char ch = mString[mTokenStart]; + // Quoted string + if (ch == '"') { + mTkType = eTkQuotedString; + ParseQuotedString(); + } + // Comment + else if (ch == '(') { + mTkType = eTkComment; + ParseComment(); + } + // Domain literal + else if (ch == '[') { + mTkType = eTkDomainLiteral; + ParseDomainLiteral(); + } + // Special + else if (istspecial(ch)) { + mTkType = eTkTspecial; + mTokenLength = 1; + mToken = mString.substr(mTokenStart, 1); + mNextStart = mTokenStart + 1; + } + // Atom + else { + mTkType = eTkToken; + ParseAtom(); + } + if (mDebugOut) PrintToken(mDebugOut); +} + + +void DwRfc1521Tokenizer::ParseAtom() +{ + size_t pos = mTokenStart; + while (1) { + ++pos; + char ch = (pos < mString.length()) ? mString[pos] : (char) 0; + if (pos >= mString.length() + || isspace(ch) + || iscntrl(ch) + || istspecial(ch)) { + + mTokenLength = pos - mTokenStart; + mToken = mString.substr(mTokenStart, mTokenLength); + mNextStart = pos; + break; + } + } +} + + +DwTokenString::DwTokenString(const DwString& aStr) + : mString(aStr) +{ + mTokensStart = 0; + mTokensLength = 0; +} + + +DwTokenString::~DwTokenString() +{ +} + + +void DwTokenString::SetFirst(const DwTokenizer& aTkzr) +{ + switch (aTkzr.Type()) { + case eTkError: + case eTkNull: + mTokensStart = aTkzr.mTokenStart; + mTokensLength = 0; + break; + case eTkComment: + case eTkDomainLiteral: + case eTkQuotedString: + case eTkSpecial: + case eTkAtom: + case eTkTspecial: + case eTkToken: + mTokensStart = aTkzr.mTokenStart; + mTokensLength = aTkzr.mTokenLength; + break; + } + mTokens = mString.substr(mTokensStart, mTokensLength); +} + + +void DwTokenString::SetLast(const DwTokenizer& aTkzr) +{ + assert(aTkzr.mTokenStart >= mTokensStart); + if (aTkzr.mTokenStart < mTokensStart) return; + mTokensLength = aTkzr.mTokenStart + aTkzr.mTokenLength - mTokensStart; + mTokens = mString.substr(mTokensStart, mTokensLength); +} + + +void DwTokenString::ExtendTo(const DwTokenizer& aTkzr) +{ + assert(aTkzr.mTokenStart >= mTokensStart); + if (aTkzr.mTokenStart < mTokensStart) return; + mTokensLength = aTkzr.mTokenStart - mTokensStart; + mTokens = mString.substr(mTokensStart, mTokensLength); +} |