diff options
author | John Fultz <jfultz@wolfram.com> | 2015-08-24 04:45:27 -0500 |
---|---|---|
committer | John Fultz <jfultz@wolfram.com> | 2015-08-24 04:45:46 -0500 |
commit | 1f7b8ef6f96e1d5a2c50565a0f52cc633215e485 (patch) | |
tree | 11f406677824d20924748225ab7eb129ba929cd0 /quackleio | |
parent | 8c7ffef1b6c669592e979fb6038dd634df7f95fc (diff) |
Version the GADDAGs.
Basically the same thing I just did to the DAWG files,
now done to GADDAGs. Also, add hashing, and
make sure GADDAGs only load if their hash matches
that of the DAWG files.
Diffstat (limited to 'quackleio')
-rw-r--r-- | quackleio/dawgfactory.cpp | 1 | ||||
-rw-r--r-- | quackleio/gaddagfactory.cpp | 61 | ||||
-rw-r--r-- | quackleio/gaddagfactory.h | 23 |
3 files changed, 55 insertions, 30 deletions
diff --git a/quackleio/dawgfactory.cpp b/quackleio/dawgfactory.cpp index 6fb5be0..74b4346 100644 --- a/quackleio/dawgfactory.cpp +++ b/quackleio/dawgfactory.cpp @@ -138,6 +138,7 @@ void DawgFactory::writeIndex(const QString& filename) bytes[1] = (m_encodableWords & 0x0000FF00) >> 8; bytes[2] = (m_encodableWords & 0x000000FF); + out.put(1); // DAWG format version 1 out.write(m_hash.charptr, sizeof(m_hash.charptr)); out.write((char*)bytes, 3); diff --git a/quackleio/gaddagfactory.cpp b/quackleio/gaddagfactory.cpp index e2c726d..7f666cb 100644 --- a/quackleio/gaddagfactory.cpp +++ b/quackleio/gaddagfactory.cpp @@ -19,6 +19,7 @@ #include <iostream> #include <QtCore> +#include <QCryptographicHash> #include "gaddagfactory.h" #include "util.h" @@ -27,18 +28,20 @@ GaddagFactory::GaddagFactory(const QString& alphabetFile) { QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters; flexure->load(alphabetFile); - alphas = flexure; + m_alphas = flexure; // So the separator is sorted to last. - root.t = false; - root.c = QUACKLE_NULL_MARK; // "_" - root.pointer = 0; - root.lastchild = true; + m_root.t = false; + m_root.c = QUACKLE_NULL_MARK; // "_" + m_root.pointer = 0; + m_root.lastchild = true; + + m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0; } GaddagFactory::~GaddagFactory() { - delete alphas; + delete m_alphas; } bool GaddagFactory::pushWord(const QString& word) @@ -46,10 +49,14 @@ bool GaddagFactory::pushWord(const QString& word) UVString originalString = QuackleIO::Util::qstringToString(word); UVString leftover; - Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover); + Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover); if (leftover.empty()) { ++m_encodableWords; + hashWord(encodedWord); + // FIXME: This hash will fail if duplicate words are passed in. + // But testing for duplicate words isn't so easy without keeping + // an entirely separate list. for (unsigned i = 1; i <= encodedWord.length(); i++) { @@ -64,7 +71,7 @@ bool GaddagFactory::pushWord(const QString& word) for (unsigned j = i; j < encodedWord.length(); j++) newword.push_back(encodedWord[j]); } - gaddagizedWords.push_back(newword); + m_gaddagizedWords.push_back(newword); } return true; } @@ -73,26 +80,40 @@ bool GaddagFactory::pushWord(const QString& word) return false; } +void GaddagFactory::hashWord(const Quackle::LetterString &word) +{ + QCryptographicHash wordhash(QCryptographicHash::Md5); + wordhash.addData(word.constData(), word.length()); + QByteArray wordhashbytes = wordhash.result(); + m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0]; + m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1]; + m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2]; + m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3]; +} + void GaddagFactory::generate() { - Quackle::WordList::const_iterator wordsEnd = gaddagizedWords.end(); - for (Quackle::WordList::const_iterator wordsIt = gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) - root.pushWord(*wordsIt); + Quackle::WordList::const_iterator wordsEnd = m_gaddagizedWords.end(); + for (Quackle::WordList::const_iterator wordsIt = m_gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) + m_root.pushWord(*wordsIt); // for (const auto& words : gaddaggizedWords) - // root.pushWord(words); + // m_root.pushWord(words); } -void GaddagFactory::writeIndex(const QString& fname) +void GaddagFactory::writeIndex(const QString &fname) { - nodelist.push_back(&root); + m_nodelist.push_back(&m_root); - root.print(nodelist); + m_root.print(m_nodelist); ofstream out(QuackleIO::Util::qstringToStdString(fname).c_str(), ios::out | ios::binary); - for (size_t i = 0; i < nodelist.size(); i++) + out.put(1); // GADDAG format version 1 + out.write(m_hash.charptr, sizeof(m_hash.charptr)); + + for (size_t i = 0; i < m_nodelist.size(); i++) { - unsigned int p = (unsigned int)(nodelist[i]->pointer); + unsigned int p = (unsigned int)(m_nodelist[i]->pointer); if (p != 0) p -= i; // offset indexing @@ -102,14 +123,14 @@ void GaddagFactory::writeIndex(const QString& fname) unsigned char n3 = (p & 0x000000FF) >> 0; unsigned char n4; - n4 = nodelist[i]->c; + n4 = m_nodelist[i]->c; if (n4 == internalSeparatorRepresentation) n4 = QUACKLE_NULL_MARK; - if (nodelist[i]->t) + if (m_nodelist[i]->t) n4 |= 64; - if (nodelist[i]->lastchild) + if (m_nodelist[i]->lastchild) n4 |= 128; bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4; diff --git a/quackleio/gaddagfactory.h b/quackleio/gaddagfactory.h index 9eb8d72..2d21192 100644 --- a/quackleio/gaddagfactory.h +++ b/quackleio/gaddagfactory.h @@ -30,13 +30,14 @@ public: GaddagFactory(const QString& alphabetFile); ~GaddagFactory(); - int wordCount() const { return gaddagizedWords.size(); }; - int nodeCount() const { return nodelist.size(); }; + int wordCount() const { return m_gaddagizedWords.size(); }; + int nodeCount() const { return m_nodelist.size(); }; int encodableWords() const { return m_encodableWords; }; int unencodableWords() const { return m_unencodableWords; }; bool pushWord(const QString& word); - void sortWords() { sort(gaddagizedWords.begin(), gaddagizedWords.end()); }; + void hashWord(const Quackle::LetterString &word); + void sortWords() { sort(m_gaddagizedWords.begin(), m_gaddagizedWords.end()); }; void generate(); void writeIndex(const QString& fname); @@ -49,17 +50,19 @@ private: int pointer; bool lastchild; void pushWord(const Quackle::LetterString& word); - void print(vector< Node* >& nodelist); + void print(vector< Node* >& m_nodelist); }; int m_encodableWords; int m_unencodableWords; - Quackle::WordList gaddagizedWords; - vector< Node* > nodelist; - Quackle::AlphabetParameters *alphas; - Node root; - - + Quackle::WordList m_gaddagizedWords; + vector< Node* > m_nodelist; + Quackle::AlphabetParameters *m_alphas; + Node m_root; + union { + char charptr[16]; + int32_t int32ptr[4]; + } m_hash; }; #endif |