/* * File: lexicon.h * --------------- * This file exports the Lexicon class, which is a compact structure for * storing a list of words. */ /*************************************************************************/ /* Stanford Portable Library */ /* Copyright (c) 2014 by Eric Roberts <eroberts@cs.stanford.edu> */ /* */ /* This program is free software: you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation, either version 3 of the License, or */ /* (at your option) any later version. */ /* */ /* This program is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* */ /* You should have received a copy of the GNU General Public License */ /* along with this program. If not, see <http://www.gnu.org/licenses/>. */ /*************************************************************************/ #ifndef _lexicon_h #define _lexicon_h #include <string> #include "set.h" #include "stack.h"/* * Class: Lexicon * -------------- * This class is used to represent a lexicon, or word list. The main * difference between a lexicon and a dictionary is that a lexicon does not * provide any mechanism for storing definitions; the lexicon contains only * words, with no associated information. It is therefore similar to a set * of strings, but with a more space-efficient internal representation. * The Lexicon class supports efficient lookup operations for words and * prefixes. * * As an example of the use of the Lexicon class, the following program * lists all the two-letter words in the lexicon stored in * EnglishWords.dat: * * int main() { * Lexicon english("EnglishWords.dat"); * foreach (string word in english) { * if (word.length() == 2) { * cout << word << endl; * } * } * return 0; * } */ #include <cctype> class Lexicon { public:/* * Constructor: Lexicon * Usage: Lexicon lex; * Lexicon lex(filename); * ----------------------------- * Initializes a new lexicon. The default constructor creates an empty * lexicon. The second form reads in the contents of the lexicon from the * specified data file. The data file must be in one of two formats: (1) a * space-efficient precompiled binary format or (2) a text file containing * one word per line. The Stanford library distribution includes a binary * lexicon file named English.dat containing a list of words in English. * The standard code pattern to initialize that lexicon looks like this: * * Lexicon english("English.dat"); */ Lexicon(); Lexicon(std::string filename);/* * Destructor: ~Lexicon * -------------------- * The destructor deallocates any storage associated with the lexicon. */ virtual ~Lexicon();/* * Method: size * Usage: int n = lex.size(); * -------------------------- * Returns the number of words contained in the lexicon. */ int size() const;/* * Method: isEmpty * Usage: if (lex.isEmpty()) ... * ----------------------------- * Returns true if the lexicon contains no words. */ bool isEmpty() const;/* * Method: clear * Usage: lex.clear(); * ------------------- * Removes all words from the lexicon. */ void clear();/* * Method: add * Usage: lex.add(word); * --------------------- * Adds the specified word to the lexicon. */ void add(std::string word);/* * Method: addWordsFromFile * Usage: lex.addWordsFromFile(filename); * -------------------------------------- * Reads the file and adds all of its words to the lexicon. */ void addWordsFromFile(std::string filename);/* * Method: contains * Usage: if (lex.contains(word)) ... * ---------------------------------- * Returns true if word is contained in the lexicon. In the Lexicon class, * the case of letters is ignored, so "Zoo" is the same as "ZOO" or "zoo". */ bool contains(std::string word) const;/* * Method: containsPrefix * Usage: if (lex.containsPrefix(prefix)) ... * ------------------------------------------ * Returns true if any words in the lexicon begin with prefix. Like * containsWord, this method ignores the case of letters so that "MO" is a * prefix of "monkey" or "Monday". */ bool containsPrefix(std::string prefix) const;/* * Method: mapAll * Usage: lexicon.mapAll(fn); * -------------------------- * Calls the specified function on each word in the lexicon. */ void mapAll(void (*fn)(std::string)) const; void mapAll(void (*fn)(const std::string &)) const; template <typename FunctorType> void mapAll(FunctorType fn) const;/* * Additional Lexicon operations * ----------------------------- * In addition to the methods listed in this interface, the Lexicon class * supports the following operations: * * - Deep copying for the copy constructor and assignment operator * - Iteration using the range-based for statement and STL iterators * * All iteration is guaranteed to proceed in alphabetical order. All words * in the lexicon are stored in lowercase. */ /* Private section */ /**********************************************************************/ /* Note: Everything below this point in the file is logically part */ /* of the implementation and should not be of interest to clients. */ /**********************************************************************/ private: #ifdef _MSC_VER #define LITTLE_ENDIAN 1 #define BYTE_ORDER LITTLE_ENDIAN #endif struct Edge { #if defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN unsigned long letter:5; unsigned long lastEdge:1; unsigned long accept:1; unsigned long unused:1; unsigned long children:24; #else unsigned long children:24; unsigned long unused:1; unsigned long accept:1; unsigned long lastEdge:1; unsigned long letter:5; #endif }; Edge *edges, *start; int numEdges, numDawgWords; Set<std::string> otherWords; public:/* * Deep copying support * -------------------- * This copy constructor and operator= are defined to make a deep copy, * making it possible to pass/return lexicons by value and assign from one * lexicon to another. The entire contents of the lexicon, including all * words, are copied. Making copies is generally avoided because of the * expense and thus, lexicons are typically passed by reference. When a * copy is needed, these operations are supported. */ Lexicon(const Lexicon & src); Lexicon & operator=(const Lexicon & src);/* * Iterator support * ---------------- * The classes in the StanfordCPPLib collection implement input iterators * so that they work symmetrically with respect to the corresponding STL * classes. */ class iterator : public std::iterator<std::input_iterator_tag,std::string> { private: const Lexicon *lp; int index; std::string currentDawgPrefix; std::string currentSetWord; std::string tmpWord; Edge *edgePtr; Stack<Edge *> stack; Set<std::string>::iterator setIterator; Set<std::string>::iterator setEnd; void advanceToNextWordInDawg(); void advanceToNextWordInSet(); void advanceToNextEdge(); public: iterator() { this->lp = NULL; } iterator(const Lexicon *lp, bool endFlag) { this->lp = lp; if (endFlag) { index = lp->size(); } else { index = 0; edgePtr = NULL; setIterator = lp->otherWords.begin(); setEnd = lp->otherWords.end(); currentDawgPrefix = ""; currentSetWord = ""; advanceToNextWordInDawg(); advanceToNextWordInSet(); } } iterator(const iterator & it) { lp = it.lp; index = it.index; currentDawgPrefix = it.currentDawgPrefix; currentSetWord = it.currentSetWord; edgePtr = it.edgePtr; stack = it.stack; setIterator = it.setIterator; } iterator & operator++() { if (edgePtr == NULL) { advanceToNextWordInSet(); } else { if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { advanceToNextWordInDawg(); } else { advanceToNextWordInSet(); } } index++; return *this; } iterator operator++(int) { iterator copy(*this); operator++(); return copy; } bool operator==(const iterator & rhs) { return lp == rhs.lp && index == rhs.index; } bool operator!=(const iterator & rhs) { return !(*this == rhs); } std::string operator*() { if (edgePtr == NULL) return currentSetWord; if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { return currentDawgPrefix + lp->ordToChar(edgePtr->letter); } else { return currentSetWord; } } std::string *operator->() { if (edgePtr == NULL) return ¤tSetWord; if (currentSetWord == "" || currentDawgPrefix < currentSetWord) { tmpWord = currentDawgPrefix + lp->ordToChar(edgePtr->letter); return &tmpWord; } else { return ¤tSetWord; } } }; iterator begin() const { return iterator(this, false); } iterator end() const { return iterator(this, true); } private: Edge *findEdgeForChar(Edge *children, char ch) const; Edge *traceToLastEdge(const std::string & s) const; void readBinaryFile(std::string filename); void deepCopy(const Lexicon & src); int countDawgWords(Edge *start) const; unsigned int charToOrd(char ch) const { return ((unsigned int)(tolower(ch) - 'a' + 1)); } char ordToChar(unsigned int ord) const { return ((char)(ord - 1 + 'a')); } }; template <typename FunctorType> void Lexicon::mapAll(FunctorType fn) const { foreach (std::string word in *this) { fn(word); } } #endif