/*
 * File: tokenscanner.h
 * --------------------
 * This file exports a TokenScanner class that divides a string into
 * individual logical units called tokens.
 */

/*************************************************************************/
/* Stanford Portable Library                                             */
/* Copyright (c) 2014 by Eric Roberts <eroberts@cs.stanford.edu>         */
/*                                                                       */
/* This program is free software: you can redistribute it and/or modify  */
/* it under the terms of the GNU General Public License as published by  */
/* the Free Software Foundation, either version 3 of the License, or     */
/* (at your option) any later version.                                   */
/*                                                                       */
/* This program is distributed in the hope that it will be useful,       */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
/* GNU General Public License for more details.                          */
/*                                                                       */
/* You should have received a copy of the GNU General Public License     */
/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
/*************************************************************************/

/*
 * Each preprocessor directive below must sit on its own line: anything
 * that follows a directive on the same physical line is treated as part
 * of that directive, which would silently drop the guard definition, the
 * includes, and any declarations sharing the line.
 */

#ifndef _tokenscanner_h
#define _tokenscanner_h

#include <iostream>
#include <string>

/*
 * Type: TokenType
 * ---------------
 * This enumerated type defines the values of the getTokenType method.
 * The enumerators take the default values 0 (SEPARATOR) through
 * 4 (OPERATOR), in declaration order.
 */

enum TokenType { SEPARATOR, WORD, NUMBER, STRING, OPERATOR };

/*
 * Class: TokenScanner
 * -------------------
 * This class divides a string into individual tokens.  The typical use of
 * the TokenScanner class is illustrated by the following pattern, which
 * reads the tokens in the string variable input:
 *
 *    TokenScanner scanner(input);
 *    while (scanner.hasMoreTokens()) {
 *       string token = scanner.nextToken();
 *       ... process the token ...
 *    }
 *
 * The TokenScanner class exports several additional methods that give
 * clients more control over its behavior.  Those methods are described
 * individually in the documentation.
 *
 * NOTE(review): the class declares a destructor and stores raw pointers
 * (isp, savedTokens, operators) but declares no copy constructor or
 * assignment operator, so the compiler-generated member-wise copies
 * apply.  Whether copying a scanner is safe depends on the implementation
 * file, which is not visible from this header -- confirm before passing
 * or returning scanners by value.
 */

class TokenScanner {

public:

/*
 * Constructor: TokenScanner
 * Usage: TokenScanner scanner;
 *        TokenScanner scanner(str);
 *        TokenScanner scanner(infile);
 * ------------------------------------
 * Initializes a scanner object.  The initial token stream comes from the
 * specified string or input stream, if supplied.  The default constructor
 * creates a scanner with an empty token stream.
 *
 * The single-argument forms are not marked explicit, so a std::string or
 * a std::istream converts implicitly to a TokenScanner.
 */

   TokenScanner();
   TokenScanner(std::string str);
   TokenScanner(std::istream & infile);

/*
 * Destructor: ~TokenScanner
 * -------------------------
 * Deallocates the storage associated with this scanner.  The destructor
 * is virtual, so a scanner may be destroyed through a base-class pointer.
 */

   virtual ~TokenScanner();

/*
 * Method: setInput
 * Usage: scanner.setInput(str);
 *        scanner.setInput(infile);
 * --------------------------------
 * Sets the token stream for this scanner to the specified string or input
 * stream.  Any previous token stream is discarded.
 */

   void setInput(std::string str);
   void setInput(std::istream & infile);

/*
 * Method: hasMoreTokens
 * Usage: if (scanner.hasMoreTokens()) ...
 * ---------------------------------------
 * Returns true if there are additional tokens for this scanner to read.
 * The method is not const, so it cannot be called on a const scanner.
 */

   bool hasMoreTokens();

/*
 * Method: nextToken
 * Usage: token = scanner.nextToken();
 * -----------------------------------
 * Returns the next token from this scanner.  If nextToken is called when
 * no tokens are available, it returns the empty string.
 */

   std::string nextToken();

/*
 * Method: saveToken
 * Usage: scanner.saveToken(token);
 * --------------------------------
 * Pushes the specified token back into this scanner's input stream.  On
 * the next call to nextToken, the scanner will return the saved token
 * without reading any additional characters from the token stream.
 */

   void saveToken(std::string token);

/*
 * Method: getPosition
 * Usage: int pos = scanner.getPosition();
 * ---------------------------------------
 * Returns the current position of the scanner in the input stream.  If
 * saveToken has been called, this position corresponds to the beginning
 * of the saved token.  If saveToken is called more than once, getPosition
 * returns -1.
 */

   int getPosition() const;

/*
 * Method: ignoreWhitespace
 * Usage: scanner.ignoreWhitespace();
 * ----------------------------------
 * Tells the scanner to ignore whitespace characters.  By default, the
 * nextToken method treats whitespace characters (typically spaces and
 * tabs) just like any other punctuation mark and returns them as
 * single-character tokens.  Calling
 *
 *    scanner.ignoreWhitespace();
 *
 * changes this behavior so that the scanner ignores whitespace characters.
 */

   void ignoreWhitespace();

/*
 * Method: ignoreComments
 * Usage: scanner.ignoreComments();
 * --------------------------------
 * Tells the scanner to ignore comments.  The scanner package recognizes
 * both the slash-star and slash-slash comment format from the C-based
 * family of languages.  Calling
 *
 *    scanner.ignoreComments();
 *
 * sets the scanner to ignore comments.
 */

   void ignoreComments();

/*
 * Method: scanNumbers
 * Usage: scanner.scanNumbers();
 * -----------------------------
 * Controls how the scanner treats tokens that begin with a digit.  By
 * default, the nextToken method treats numbers and letters identically
 * and therefore does not provide any special processing for numbers.
 * Calling
 *
 *    scanner.scanNumbers();
 *
 * changes this behavior so that nextToken returns the longest substring
 * that can be interpreted as a real number.
 */

   void scanNumbers();

/*
 * Method: scanStrings
 * Usage: scanner.scanStrings();
 * -----------------------------
 * Controls how the scanner treats tokens enclosed in quotation marks.  By
 * default, quotation marks (either single or double) are treated just
 * like any other punctuation character.  Calling
 *
 *    scanner.scanStrings();
 *
 * changes this assumption so that nextToken returns a single token
 * consisting of all characters through the matching quotation mark.  The
 * quotation marks are returned as part of the scanned token so that
 * clients can differentiate strings from other token types.
 */

   void scanStrings();

/*
 * Method: addWordCharacters
 * Usage: scanner.addWordCharacters(str);
 * --------------------------------------
 * Adds the characters in str to the set of characters legal in a WORD
 * token.  For example, calling addWordCharacters("_") adds the underscore
 * to the set of characters that are accepted as part of a word.
 */

   void addWordCharacters(std::string str);

/*
 * Method: isWordCharacter
 * Usage: if (scanner.isWordCharacter(ch)) ...
 * -------------------------------------------
 * Returns true if the character is valid in a word.
 */

   bool isWordCharacter(char ch) const;

/*
 * Method: addOperator
 * Usage: scanner.addOperator(op);
 * -------------------------------
 * Defines a new multicharacter operator.  Whenever you call nextToken
 * when the input stream contains operator characters, the scanner returns
 * the longest possible operator string that can be read at that point.
 */

   void addOperator(std::string op);

/*
 * Method: verifyToken
 * Usage: scanner.verifyToken(expected);
 * -------------------------------------
 * Reads the next token and makes sure it matches the string expected.  If
 * it does not, verifyToken throws an error.
 */

   void verifyToken(std::string expected);

/*
 * Method: getTokenType
 * Usage: TokenType type = scanner.getTokenType(token);
 * ----------------------------------------------------
 * Returns the type of this token.  This type will match one of the
 * enumerated type constants declared by TokenType: SEPARATOR, WORD,
 * NUMBER, STRING, or OPERATOR.
 *
 * NOTE(review): earlier documentation also listed EOF as a possible
 * result, but TokenType declares no such enumerator.  How an empty
 * (end-of-input) token is classified cannot be determined from this
 * header -- confirm against the implementation before relying on it.
 */

   TokenType getTokenType(std::string token) const;

/*
 * Method: getChar
 * Usage: int ch = scanner.getChar();
 * ----------------------------------
 * Reads the next character from the scanner input stream.
 */

   int getChar();

/*
 * Method: ungetChar
 * Usage: scanner.ungetChar(ch);
 * -----------------------------
 * Pushes the character ch back into the scanner stream.  The character
 * must match the one that was read.
 */

   void ungetChar(int ch);

/*
 * Method: getStringValue
 * Usage: string str = scanner.getStringValue(token);
 * --------------------------------------------------
 * Returns the string value of a token.  This value is formed by removing
 * any surrounding quotation marks and replacing escape sequences by the
 * appropriate characters.
 */

   std::string getStringValue(std::string token) const;

/* Private section */

/**********************************************************************/
/* Note: Everything below this point in the file is logically part    */
/* of the implementation and should not be of interest to clients.    */
/**********************************************************************/

private:

/*
 * Private type: StringCell
 * ------------------------
 * This type is used to construct linked lists of cells, which are used to
 * represent both the stack of saved tokens and the set of defined
 * operators.  These types cannot use the Stack and Lexicon classes
 * directly because tokenscanner.h is an extremely low-level interface,
 * and doing so would create circular dependencies in the .h files.
 */

   struct StringCell {
      std::string str;
      StringCell *link;
   };

/*
 * Private type: NumberScannerState
 * --------------------------------
 * Names the states of a number-scanning state machine.  Presumably used
 * by scanNumber; the transitions live in the implementation file and are
 * not visible from this header.
 */

   enum NumberScannerState {
      INITIAL_STATE,
      BEFORE_DECIMAL_POINT,
      AFTER_DECIMAL_POINT,
      STARTING_EXPONENT,
      FOUND_EXPONENT_SIGN,
      SCANNING_EXPONENT,
      FINAL_STATE
   };

/* Instance variables */

   std::string buffer;              /* The original argument string */
   std::istream *isp;               /* The input stream for tokens  */
   bool stringInputFlag;            /* Flag indicating string input */
   bool ignoreWhitespaceFlag;       /* Scanner ignores whitespace   */
   bool ignoreCommentsFlag;         /* Scanner ignores comments     */
   bool scanNumbersFlag;            /* Scanner parses numbers       */
   bool scanStringsFlag;            /* Scanner parses strings       */
   std::string wordChars;           /* Additional word characters   */
   StringCell *savedTokens;         /* Stack of saved tokens        */
   StringCell *operators;           /* List of multichar operators  */

/* Private method prototypes */

   void initScanner();
   void skipSpaces();
   std::string scanWord();
   std::string scanNumber();
   std::string scanString();
   bool isOperator(std::string op);
   bool isOperatorPrefix(std::string op);

};

#endif