summary | refs | log | tree | commit | diff
path: root/quackleio/gaddagfactory.cpp
diff options
context:
space:
mode:
author: John Fultz <jfultz@wolfram.com> 2015-08-24 04:45:27 -0500
committer: John Fultz <jfultz@wolfram.com> 2015-08-24 04:45:46 -0500
commit: 1f7b8ef6f96e1d5a2c50565a0f52cc633215e485 (patch)
tree: 11f406677824d20924748225ab7eb129ba929cd0 /quackleio/gaddagfactory.cpp
parent: 8c7ffef1b6c669592e979fb6038dd634df7f95fc (diff)
Version the GADDAGs.
Basically the same thing I just did to the DAWG files, now done to GADDAGs. Also, add hashing, and make sure GADDAGs only load if their hash matches that of the DAWG files.
Diffstat (limited to 'quackleio/gaddagfactory.cpp')
-rw-r--r-- quackleio/gaddagfactory.cpp 61
1 file changed, 41 insertions, 20 deletions
diff --git a/quackleio/gaddagfactory.cpp b/quackleio/gaddagfactory.cpp
index e2c726d..7f666cb 100644
--- a/quackleio/gaddagfactory.cpp
+++ b/quackleio/gaddagfactory.cpp
@@ -19,6 +19,7 @@
#include <iostream>
#include <QtCore>
+#include <QCryptographicHash>
#include "gaddagfactory.h"
#include "util.h"
@@ -27,18 +28,20 @@ GaddagFactory::GaddagFactory(const QString& alphabetFile)
{
QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters;
flexure->load(alphabetFile);
- alphas = flexure;
+ m_alphas = flexure;
// So the separator is sorted to last.
- root.t = false;
- root.c = QUACKLE_NULL_MARK; // "_"
- root.pointer = 0;
- root.lastchild = true;
+ m_root.t = false;
+ m_root.c = QUACKLE_NULL_MARK; // "_"
+ m_root.pointer = 0;
+ m_root.lastchild = true;
+
+ m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0;
}
GaddagFactory::~GaddagFactory()
{
- delete alphas;
+ delete m_alphas;
}
bool GaddagFactory::pushWord(const QString& word)
@@ -46,10 +49,14 @@ bool GaddagFactory::pushWord(const QString& word)
UVString originalString = QuackleIO::Util::qstringToString(word);
UVString leftover;
- Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover);
+ Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover);
if (leftover.empty())
{
++m_encodableWords;
+ hashWord(encodedWord);
+ // FIXME: This hash will fail if duplicate words are passed in, since
+ // XORing the same word digest in twice cancels it out. But testing for
+ // duplicates isn't so easy without keeping an entirely separate list.
for (unsigned i = 1; i <= encodedWord.length(); i++)
{
@@ -64,7 +71,7 @@ bool GaddagFactory::pushWord(const QString& word)
for (unsigned j = i; j < encodedWord.length(); j++)
newword.push_back(encodedWord[j]);
}
- gaddagizedWords.push_back(newword);
+ m_gaddagizedWords.push_back(newword);
}
return true;
}
@@ -73,26 +80,40 @@ bool GaddagFactory::pushWord(const QString& word)
return false;
}
+void GaddagFactory::hashWord(const Quackle::LetterString &word)
+{
+ QCryptographicHash wordhash(QCryptographicHash::Md5);
+ wordhash.addData(word.constData(), word.length());
+ QByteArray wordhashbytes = wordhash.result();
+ m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0];
+ m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1];
+ m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2];
+ m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3];
+}
+
void GaddagFactory::generate()
{
- Quackle::WordList::const_iterator wordsEnd = gaddagizedWords.end();
- for (Quackle::WordList::const_iterator wordsIt = gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt)
- root.pushWord(*wordsIt);
+ Quackle::WordList::const_iterator wordsEnd = m_gaddagizedWords.end();
+ for (Quackle::WordList::const_iterator wordsIt = m_gaddagizedWords.begin(); wordsIt != wordsEnd; ++wordsIt)
+ m_root.pushWord(*wordsIt);
// for (const auto& words : gaddaggizedWords)
- // root.pushWord(words);
+ // m_root.pushWord(words);
}
-void GaddagFactory::writeIndex(const QString& fname)
+void GaddagFactory::writeIndex(const QString &fname)
{
- nodelist.push_back(&root);
+ m_nodelist.push_back(&m_root);
- root.print(nodelist);
+ m_root.print(m_nodelist);
ofstream out(QuackleIO::Util::qstringToStdString(fname).c_str(), ios::out | ios::binary);
- for (size_t i = 0; i < nodelist.size(); i++)
+ out.put(1); // GADDAG format version 1
+ out.write(m_hash.charptr, sizeof(m_hash.charptr));
+
+ for (size_t i = 0; i < m_nodelist.size(); i++)
{
- unsigned int p = (unsigned int)(nodelist[i]->pointer);
+ unsigned int p = (unsigned int)(m_nodelist[i]->pointer);
if (p != 0)
p -= i; // offset indexing
@@ -102,14 +123,14 @@ void GaddagFactory::writeIndex(const QString& fname)
unsigned char n3 = (p & 0x000000FF) >> 0;
unsigned char n4;
- n4 = nodelist[i]->c;
+ n4 = m_nodelist[i]->c;
if (n4 == internalSeparatorRepresentation)
n4 = QUACKLE_NULL_MARK;
- if (nodelist[i]->t)
+ if (m_nodelist[i]->t)
n4 |= 64;
- if (nodelist[i]->lastchild)
+ if (m_nodelist[i]->lastchild)
n4 |= 128;
bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4;