From 1f7b8ef6f96e1d5a2c50565a0f52cc633215e485 Mon Sep 17 00:00:00 2001 From: John Fultz Date: Mon, 24 Aug 2015 04:45:27 -0500 Subject: Version the GADDAGs. Basically the same thing I just did to the DAWG files, now done to GADDAGs. Also, add hashing, and make sure GADDAGs only load if their hash matches that of the DAWG files. --- quackleio/gaddagfactory.cpp | 61 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'quackleio/gaddagfactory.cpp') diff --git a/quackleio/gaddagfactory.cpp b/quackleio/gaddagfactory.cpp index e2c726d..7f666cb 100644 --- a/quackleio/gaddagfactory.cpp +++ b/quackleio/gaddagfactory.cpp @@ -19,6 +19,7 @@ #include #include +#include #include "gaddagfactory.h" #include "util.h" @@ -27,18 +28,20 @@ GaddagFactory::GaddagFactory(const QString& alphabetFile) { QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters; flexure->load(alphabetFile); - alphas = flexure; + m_alphas = flexure; // So the separator is sorted to last. - root.t = false; - root.c = QUACKLE_NULL_MARK; // "_" - root.pointer = 0; - root.lastchild = true; + m_root.t = false; + m_root.c = QUACKLE_NULL_MARK; // "_" + m_root.pointer = 0; + m_root.lastchild = true; + + m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0; } GaddagFactory::~GaddagFactory() { - delete alphas; + delete m_alphas; } bool GaddagFactory::pushWord(const QString& word) @@ -46,10 +49,14 @@ bool GaddagFactory::pushWord(const QString& word) UVString originalString = QuackleIO::Util::qstringToString(word); UVString leftover; - Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover); + Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover); if (leftover.empty()) { ++m_encodableWords; + hashWord(encodedWord); + // FIXME: This hash will fail if duplicate words are passed in. + // But testing for duplicate words isn't so easy without keeping + // an entirely separate list. for (unsigned i = 1; i <= encodedWord.length(); i++) { @@ -64,7 +71,7 @@ bool GaddagFactory::pushWord(const QString& word) for (unsigned j = i; j < encodedWord.length(); j++) newword.push_back(encodedWord[j]); } - gaddagizedWords.push_back(newword); + m_gaddagizedWords.push_back(newword); } return true; } @@ -73,26 +80,40 @@ bool GaddagFactory::pushWord(const QString& word) return false; } +void GaddagFactory::hashWord(const Quackle::LetterString &word) +{ + QCryptographicHash wordhash(QCryptographicHash::Md5); + wordhash.addData(word.constData(), word.length()); + QByteArray wordhashbytes = wordhash.result(); + m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0]; + m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1]; + m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2]; + m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3]; +} + void GaddagFactory::generate() { - Quackle::WordList::const_iterator wordsEnd = gaddagizedWords.end(); - for (Quackle::WordList::const_iterator wordsIt = gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) - root.pushWord(*wordsIt); + Quackle::WordList::const_iterator wordsEnd = m_gaddagizedWords.end(); + for (Quackle::WordList::const_iterator wordsIt = m_gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt) + m_root.pushWord(*wordsIt); // for (const auto& words : gaddaggizedWords) - // root.pushWord(words); + // m_root.pushWord(words); } -void GaddagFactory::writeIndex(const QString& fname) +void GaddagFactory::writeIndex(const QString &fname) { - nodelist.push_back(&root); + m_nodelist.push_back(&m_root); - root.print(nodelist); + m_root.print(m_nodelist); ofstream out(QuackleIO::Util::qstringToStdString(fname).c_str(), ios::out | ios::binary); - for (size_t i = 0; i < nodelist.size(); i++) + out.put(1); // GADDAG format version 1 + out.write(m_hash.charptr, sizeof(m_hash.charptr)); + + for (size_t i = 0; i < m_nodelist.size(); i++) { - unsigned int p = (unsigned int)(nodelist[i]->pointer); + unsigned int p = (unsigned int)(m_nodelist[i]->pointer); if (p != 0) p -= i; // offset indexing @@ -102,14 +123,14 @@ void GaddagFactory::writeIndex(const QString& fname) unsigned char n3 = (p & 0x000000FF) >> 0; unsigned char n4; - n4 = nodelist[i]->c; + n4 = m_nodelist[i]->c; if (n4 == internalSeparatorRepresentation) n4 = QUACKLE_NULL_MARK; - if (nodelist[i]->t) + if (m_nodelist[i]->t) n4 |= 64; - if (nodelist[i]->lastchild) + if (m_nodelist[i]->lastchild) n4 |= 128; bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4; -- cgit v1.2.3