diff options
author | John Fultz <jfultz@wolfram.com> | 2015-08-24 04:45:27 -0500 |
---|---|---|
committer | John Fultz <jfultz@wolfram.com> | 2015-08-24 04:45:46 -0500 |
commit | 1f7b8ef6f96e1d5a2c50565a0f52cc633215e485 (patch) | |
tree | 11f406677824d20924748225ab7eb129ba929cd0 | |
parent | 8c7ffef1b6c669592e979fb6038dd634df7f95fc (diff) |
Version the GADDAGs.
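This is basically the same thing I just did to the DAWG files,
now done to GADDAGs: the file starts with a format version byte.
Also, add hashing (a 16-byte hash is written after the version byte),
and make sure a GADDAG only loads if its hash matches
that of the DAWG file loaded for the same lexicon.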
-rw-r--r-- | lexiconparameters.cpp | 52 | ||||
-rw-r--r-- | lexiconparameters.h | 15 | ||||
-rw-r--r-- | quacker/settings.cpp | 19 | ||||
-rw-r--r-- | quackleio/dawgfactory.cpp | 1 | ||||
-rw-r--r-- | quackleio/gaddagfactory.cpp | 61 | ||||
-rw-r--r-- | quackleio/gaddagfactory.h | 23 | ||||
-rw-r--r-- | quackletest.cpp | 4 | ||||
-rw-r--r-- | test/testharness.cpp | 6 |
8 files changed, 117 insertions, 64 deletions
diff --git a/lexiconparameters.cpp b/lexiconparameters.cpp index ca09fa5..e014048 100644 --- a/lexiconparameters.cpp +++ b/lexiconparameters.cpp @@ -19,13 +19,14 @@ #include <iostream> #include <fstream> + #include "datamanager.h" #include "lexiconparameters.h" #include "uv.h" using namespace Quackle; -class Quackle::V0DawgInterpreter : public DawgInterpreter +class Quackle::V0LexiconInterpreter : public LexiconInterpreter { virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) @@ -39,6 +40,17 @@ class Quackle::V0DawgInterpreter : public DawgInterpreter } } + virtual void loadGaddag(ifstream &file, LexiconParameters &lexparams) + { + int i = 0; + file.unget(); + while (!file.eof()) + { + file.read((char*)(lexparams.m_gaddag) + i, 4); + i += 4; + } + } + virtual void dawgAt(const unsigned char *dawg, int index, unsigned int &p, Letter &letter, bool &t, bool &lastchild, bool &british, int &playability) const { index *= 7; @@ -55,7 +67,7 @@ class Quackle::V0DawgInterpreter : public DawgInterpreter virtual int versionNumber() const { return 0; } }; -class Quackle::V1DawgInterpreter : public DawgInterpreter +class Quackle::V1LexiconInterpreter : public LexiconInterpreter { virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) @@ -72,6 +84,24 @@ class Quackle::V1DawgInterpreter : public DawgInterpreter } } + virtual void loadGaddag(ifstream &file, LexiconParameters &lexparams) + { + char hash[16]; + file.read(hash, sizeof(hash)); + if (memcmp(hash, lexparams.m_hash, sizeof(hash))) + { + lexparams.unloadGaddag(); // don't use a mismatched gaddag + return; + } + + int i = 0; + while (!file.eof()) + { + file.read((char*)(lexparams.m_gaddag) + i, 4); + i += 4; + } + } + virtual void dawgAt(const unsigned char *dawg, int index, unsigned int &p, Letter &letter, bool &t, bool &lastchild, bool &british, int &playability) const { index *= 7; @@ -108,14 +138,14 @@ void LexiconParameters::unloadAll() void LexiconParameters::unloadDawg() { delete[] 
m_dawg; - m_dawg = 0; + m_dawg = NULL; delete m_interpreter; } void LexiconParameters::unloadGaddag() { delete[] m_gaddag; - m_gaddag = 0; + m_gaddag = NULL; } void LexiconParameters::loadDawg(const string &filename) @@ -133,10 +163,10 @@ void LexiconParameters::loadDawg(const string &filename) switch(versionByte) { case 0: - m_interpreter = new V0DawgInterpreter(); + m_interpreter = new V0LexiconInterpreter(); break; case 1: - m_interpreter = new V1DawgInterpreter(); + m_interpreter = new V1LexiconInterpreter(); break; default: UVcout << "couldn't open dawg " << filename.c_str() << endl; @@ -160,14 +190,12 @@ void LexiconParameters::loadGaddag(const string &filename) return; } + char versionByte = file.get(); + if (versionByte != m_interpreter->versionNumber()) + return; m_gaddag = new unsigned char[40000000]; - int i = 0; - while (!file.eof()) - { - file.read((char*)(m_gaddag) + i, 4); - i += 4; - } + m_interpreter->loadGaddag(file, *this); } string LexiconParameters::findDictionaryFile(const string &lexicon) diff --git a/lexiconparameters.h b/lexiconparameters.h index 4b6369d..04ad4e7 100644 --- a/lexiconparameters.h +++ b/lexiconparameters.h @@ -25,22 +25,23 @@ namespace Quackle { -class DawgInterpreter +class LexiconInterpreter { public: virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) = 0; + virtual void loadGaddag(ifstream &file, LexiconParameters &lexparams) = 0; virtual void dawgAt(const unsigned char *dawg, int index, unsigned int &p, Letter &letter, bool &t, bool &lastchild, bool &british, int &playability) const = 0; virtual int versionNumber() const = 0; - virtual ~DawgInterpreter() {}; + virtual ~LexiconInterpreter() {}; }; -class V0DawgInterpreter; -class V1DawgInterpreter; +class V0LexiconInterpreter; +class V1LexiconInterpreter; class LexiconParameters { - friend class Quackle::V0DawgInterpreter; - friend class Quackle::V1DawgInterpreter; + friend class Quackle::V0LexiconInterpreter; + friend class Quackle::V1LexiconInterpreter; 
public: LexiconParameters(); @@ -79,7 +80,7 @@ protected: unsigned char *m_dawg; unsigned char *m_gaddag; string m_lexiconName; - DawgInterpreter *m_interpreter; + LexiconInterpreter *m_interpreter; char m_hash[16]; int m_wordcount; }; diff --git a/quacker/settings.cpp b/quacker/settings.cpp index 3c42a39..362e916 100644 --- a/quacker/settings.cpp +++ b/quacker/settings.cpp @@ -207,16 +207,6 @@ void Settings::setQuackleToUseLexiconName(const string &lexiconName) { QUACKLE_LEXICON_PARAMETERS->setLexiconName(lexiconName); - string gaddagFile = Quackle::LexiconParameters::findDictionaryFile(lexiconName + ".gaddag"); - - if (gaddagFile.empty()) - { - UVcout << "Gaddag for lexicon '" << lexiconName << "' does not exist." << endl; - QUACKLE_LEXICON_PARAMETERS->unloadGaddag(); - } - else - QUACKLE_LEXICON_PARAMETERS->loadGaddag(gaddagFile); - string dawgFile = Quackle::LexiconParameters::findDictionaryFile(lexiconName + ".dawg"); if (dawgFile.empty()) { @@ -226,6 +216,15 @@ void Settings::setQuackleToUseLexiconName(const string &lexiconName) else QUACKLE_LEXICON_PARAMETERS->loadDawg(dawgFile); + string gaddagFile = Quackle::LexiconParameters::findDictionaryFile(lexiconName + ".gaddag"); + if (gaddagFile.empty()) + { + UVcout << "Gaddag for lexicon '" << lexiconName << "' does not exist." 
<< endl; + QUACKLE_LEXICON_PARAMETERS->unloadGaddag(); + } + else + QUACKLE_LEXICON_PARAMETERS->loadGaddag(gaddagFile); + QUACKLE_STRATEGY_PARAMETERS->initialize(lexiconName); } } diff --git a/quackleio/dawgfactory.cpp b/quackleio/dawgfactory.cpp index 6fb5be0..74b4346 100644 --- a/quackleio/dawgfactory.cpp +++ b/quackleio/dawgfactory.cpp @@ -138,6 +138,7 @@ void DawgFactory::writeIndex(const QString& filename) bytes[1] = (m_encodableWords & 0x0000FF00) >> 8; bytes[2] = (m_encodableWords & 0x000000FF); + out.put(1); // DAWG format version 1 out.write(m_hash.charptr, sizeof(m_hash.charptr)); out.write((char*)bytes, 3); diff --git a/quackleio/gaddagfactory.cpp b/quackleio/gaddagfactory.cpp index e2c726d..7f666cb 100644 --- a/quackleio/gaddagfactory.cpp +++ b/quackleio/gaddagfactory.cpp @@ -19,6 +19,7 @@ #include <iostream> #include <QtCore> +#include <QCryptographicHash> #include "gaddagfactory.h" #include "util.h" @@ -27,18 +28,20 @@ GaddagFactory::GaddagFactory(const QString& alphabetFile) { QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters; flexure->load(alphabetFile); - alphas = flexure; + m_alphas = flexure; // So the separator is sorted to last. 
- root.t = false; - root.c = QUACKLE_NULL_MARK; // "_" - root.pointer = 0; - root.lastchild = true; + m_root.t = false; + m_root.c = QUACKLE_NULL_MARK; // "_" + m_root.pointer = 0; + m_root.lastchild = true; + + m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0; } GaddagFactory::~GaddagFactory() { - delete alphas; + delete m_alphas; } bool GaddagFactory::pushWord(const QString& word) @@ -46,10 +49,14 @@ bool GaddagFactory::pushWord(const QString& word) UVString originalString = QuackleIO::Util::qstringToString(word); UVString leftover; - Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover); + Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover); if (leftover.empty()) { ++m_encodableWords; + hashWord(encodedWord); + // FIXME: This hash will fail if duplicate words are passed in. + // But testing for duplicate words isn't so easy without keeping + // an entirely separate list. for (unsigned i = 1; i <= encodedWord.length(); i++) { @@ -64,7 +71,7 @@ bool GaddagFactory::pushWord(const QString& word) for (unsigned j = i; j < encodedWord.length(); j++) newword.push_back(encodedWord[j]); } - gaddagizedWords.push_back(newword); + m_gaddagizedWords.push_back(newword); } return true; } @@ -73,26 +80,40 @@ bool GaddagFactory::pushWord(const QString& word) return false; } +void GaddagFactory::hashWord(const Quackle::LetterString &word) +{ + QCryptographicHash wordhash(QCryptographicHash::Md5); + wordhash.addData(word.constData(), word.length()); + QByteArray wordhashbytes = wordhash.result(); + m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0]; + m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1]; + m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2]; + m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3]; +} + void GaddagFactory::generate() { - Quackle::WordList::const_iterator wordsEnd = gaddagizedWords.end(); - for 
(Quackle::WordList::const_iterator wordsIt = gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) - root.pushWord(*wordsIt); + Quackle::WordList::const_iterator wordsEnd = m_gaddagizedWords.end(); + for (Quackle::WordList::const_iterator wordsIt = m_gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) + m_root.pushWord(*wordsIt); // for (const auto& words : gaddaggizedWords) - // root.pushWord(words); + // m_root.pushWord(words); } -void GaddagFactory::writeIndex(const QString& fname) +void GaddagFactory::writeIndex(const QString &fname) { - nodelist.push_back(&root); + m_nodelist.push_back(&m_root); - root.print(nodelist); + m_root.print(m_nodelist); ofstream out(QuackleIO::Util::qstringToStdString(fname).c_str(), ios::out | ios::binary); - for (size_t i = 0; i < nodelist.size(); i++) + out.put(1); // GADDAG format version 1 + out.write(m_hash.charptr, sizeof(m_hash.charptr)); + + for (size_t i = 0; i < m_nodelist.size(); i++) { - unsigned int p = (unsigned int)(nodelist[i]->pointer); + unsigned int p = (unsigned int)(m_nodelist[i]->pointer); if (p != 0) p -= i; // offset indexing @@ -102,14 +123,14 @@ void GaddagFactory::writeIndex(const QString& fname) unsigned char n3 = (p & 0x000000FF) >> 0; unsigned char n4; - n4 = nodelist[i]->c; + n4 = m_nodelist[i]->c; if (n4 == internalSeparatorRepresentation) n4 = QUACKLE_NULL_MARK; - if (nodelist[i]->t) + if (m_nodelist[i]->t) n4 |= 64; - if (nodelist[i]->lastchild) + if (m_nodelist[i]->lastchild) n4 |= 128; bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4; diff --git a/quackleio/gaddagfactory.h b/quackleio/gaddagfactory.h index 9eb8d72..2d21192 100644 --- a/quackleio/gaddagfactory.h +++ b/quackleio/gaddagfactory.h @@ -30,13 +30,14 @@ public: GaddagFactory(const QString& alphabetFile); ~GaddagFactory(); - int wordCount() const { return gaddagizedWords.size(); }; - int nodeCount() const { return nodelist.size(); }; + int wordCount() const { return m_gaddagizedWords.size(); }; + int nodeCount() const { 
return m_nodelist.size(); }; int encodableWords() const { return m_encodableWords; }; int unencodableWords() const { return m_unencodableWords; }; bool pushWord(const QString& word); - void sortWords() { sort(gaddagizedWords.begin(), gaddagizedWords.end()); }; + void hashWord(const Quackle::LetterString &word); + void sortWords() { sort(m_gaddagizedWords.begin(), m_gaddagizedWords.end()); }; void generate(); void writeIndex(const QString& fname); @@ -49,17 +50,19 @@ private: int pointer; bool lastchild; void pushWord(const Quackle::LetterString& word); - void print(vector< Node* >& nodelist); + void print(vector< Node* >& m_nodelist); }; int m_encodableWords; int m_unencodableWords; - Quackle::WordList gaddagizedWords; - vector< Node* > nodelist; - Quackle::AlphabetParameters *alphas; - Node root; - - + Quackle::WordList m_gaddagizedWords; + vector< Node* > m_nodelist; + Quackle::AlphabetParameters *m_alphas; + Node m_root; + union { + char charptr[16]; + int32_t int32ptr[4]; + } m_hash; }; #endif diff --git a/quackletest.cpp b/quackletest.cpp index e69c2cb..7ea5d10 100644 --- a/quackletest.cpp +++ b/quackletest.cpp @@ -47,7 +47,7 @@ int main() dataManager.setAppDataDirectory("data"); dataManager.lexiconParameters()->loadDawg(Quackle::LexiconParameters::findDictionaryFile("twl06.dawg")); - dataManager.lexiconParameters()->loadGaddag(Quackle::LexiconParameters::findDictionaryFile("twl06.gaddag")); + dataManager.lexiconParameters()->loadGaddag(Quackle::LexiconParameters::findDictionaryFile("twl06.gaddag")); dataManager.strategyParameters()->initialize("twl06"); dataManager.setBoardParameters(new Quackle::EnglishBoard()); @@ -58,7 +58,7 @@ int main() const int gameCnt = 1000; //const int gameCnt = 1; for (int game = 0; game < gameCnt; ++game) { - testGame(); + testGame(); } return 0; diff --git a/test/testharness.cpp b/test/testharness.cpp index 683443f..3f390c1 100644 --- a/test/testharness.cpp +++ b/test/testharness.cpp @@ -207,13 +207,13 @@ void 
TestHarness::startUp() m_dataManager.setBoardParameters(new ScrabbleBoard()); - m_dataManager.lexiconParameters()->loadGaddag(Quackle::LexiconParameters::findDictionaryFile(QuackleIO::Util::qstringToStdString(m_lexicon + ".gaddag"))); + m_dataManager.lexiconParameters()->loadDawg(Quackle::LexiconParameters::findDictionaryFile(QuackleIO::Util::qstringToStdString(m_lexicon + ".dawg"))); UVcout << "."; - m_dataManager.lexiconParameters()->loadDawg(Quackle::LexiconParameters::findDictionaryFile(QuackleIO::Util::qstringToStdString(m_lexicon + ".dawg"))); + m_dataManager.lexiconParameters()->loadGaddag(Quackle::LexiconParameters::findDictionaryFile(QuackleIO::Util::qstringToStdString(m_lexicon + ".gaddag"))); + UVcout << "."; m_dataManager.strategyParameters()->initialize(QuackleIO::Util::qstringToStdString(m_lexicon)); - UVcout << "."; UVcout << endl; |