diff options
-rw-r--r-- | fixedstring.h | 1 | ||||
-rw-r--r-- | lexiconparameters.cpp | 29 | ||||
-rw-r--r-- | lexiconparameters.h | 9 | ||||
-rw-r--r-- | quackleio/dawgfactory.cpp | 144 | ||||
-rw-r--r-- | quackleio/dawgfactory.h | 26 |
5 files changed, 125 insertions, 84 deletions
diff --git a/fixedstring.h b/fixedstring.h index 46d1011..e8db0bf 100644 --- a/fixedstring.h +++ b/fixedstring.h @@ -54,6 +54,7 @@ class FixedLengthString size_type size() const { return length(); } void clear() { m_end = m_data; } void push_back(char c); + const char* constData() const { return m_data; } int compare(const FixedLengthString& s) const; diff --git a/lexiconparameters.cpp b/lexiconparameters.cpp index 9826ca3..ca09fa5 100644 --- a/lexiconparameters.cpp +++ b/lexiconparameters.cpp @@ -25,15 +25,16 @@ using namespace Quackle; -class V0DawgInterpreter : public DawgInterpreter +class Quackle::V0DawgInterpreter : public DawgInterpreter { - virtual void loadDawg(ifstream &file, unsigned char *dawg) + virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) { int i = 0; + file.unget(); // version 0 doesn't have a version byte...it's just the node byte which is always set to 0 while (!file.eof()) { - file.read((char*)(dawg) + i, 7); + file.read((char*)(lexparams.m_dawg) + i, 7); i += 7; } } @@ -54,15 +55,19 @@ class V0DawgInterpreter : public DawgInterpreter virtual int versionNumber() const { return 0; } }; -class V1DawgInterpreter : public DawgInterpreter +class Quackle::V1DawgInterpreter : public DawgInterpreter { - virtual void loadDawg(ifstream &file, unsigned char *dawg) + virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) { int i = 0; + unsigned char bytes[3]; + file.read(lexparams.m_hash, sizeof(lexparams.m_hash)); + file.read((char*)bytes, 3); + lexparams.m_wordcount = (bytes[0] << 16) | (bytes[1] << 8) | bytes[2]; while (!file.eof()) { - file.read((char*)(dawg) + i, 7); + file.read((char*)(lexparams.m_dawg) + i, 7); i += 7; } } @@ -73,10 +78,10 @@ class V1DawgInterpreter : public DawgInterpreter p = (dawg[index] << 16) + (dawg[index + 1] << 8) + (dawg[index + 2]); letter = dawg[index + 3]; - t = (letter & 32) != 0; - lastchild = (letter & 64) != 0; + t = (p != 0); + lastchild = ((letter & 64) != 0); british = !(letter & 128); - letter = (letter & 31) + QUACKLE_FIRST_LETTER; + letter = (letter & 63) + QUACKLE_FIRST_LETTER; playability = (dawg[index + 4] << 16) + (dawg[index + 5] << 8) + (dawg[index + 6]); } @@ -84,8 +89,9 @@ class V1DawgInterpreter : public DawgInterpreter }; LexiconParameters::LexiconParameters() - : m_dawg(0), m_gaddag(0) + : m_dawg(NULL), m_gaddag(NULL), m_interpreter(NULL), m_wordcount(0) { + memset(m_hash, 0, sizeof(m_hash)); } LexiconParameters::~LexiconParameters() @@ -124,7 +130,6 @@ void LexiconParameters::loadDawg(const string &filename) } char versionByte = file.get(); - file.unget(); switch(versionByte) { case 0: @@ -140,7 +145,7 @@ void LexiconParameters::loadDawg(const string &filename) m_dawg = new unsigned char[7000000]; - m_interpreter->loadDawg(file, m_dawg); + m_interpreter->loadDawg(file, *this); } void LexiconParameters::loadGaddag(const string &filename) diff --git a/lexiconparameters.h b/lexiconparameters.h index 4c77cd1..4b6369d 100644 --- a/lexiconparameters.h +++ b/lexiconparameters.h @@ -28,15 +28,20 @@ namespace Quackle class DawgInterpreter { public: - virtual void loadDawg(ifstream &file, unsigned char *dawg) = 0; + virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) = 0; virtual void dawgAt(const unsigned char *dawg, int index, unsigned int &p, Letter &letter, bool &t, bool &lastchild, bool &british, int &playability) const = 0; virtual int versionNumber() const = 0; virtual ~DawgInterpreter() {}; }; +class V0DawgInterpreter; +class V1DawgInterpreter; class LexiconParameters { + friend class Quackle::V0DawgInterpreter; + friend class Quackle::V1DawgInterpreter; + public: LexiconParameters(); ~LexiconParameters(); @@ -75,6 +80,8 @@ protected: unsigned char *m_gaddag; string m_lexiconName; DawgInterpreter *m_interpreter; + char m_hash[16]; + int m_wordcount; }; } diff --git a/quackleio/dawgfactory.cpp b/quackleio/dawgfactory.cpp index 8a37766..6fb5be0 100644 --- a/quackleio/dawgfactory.cpp +++ b/quackleio/dawgfactory.cpp @@ -19,6 +19,7 @@ #include <iostream> #include <QtCore> +#include <QCryptographicHash> #include "dawgfactory.h" #include "util.h" @@ -28,19 +29,20 @@ DawgFactory::DawgFactory(const QString& alphabetFile) { QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters; flexure->load(alphabetFile); - alphas = flexure; - - root.t = false; - root.insmallerdict = false; - root.playability = 0; - root.c = QUACKLE_BLANK_MARK; - root.pointer = 0; - root.lastchild = true; + m_alphas = flexure; + + m_root.insmallerdict = false; + m_root.playability = 0; + m_root.c = QUACKLE_BLANK_MARK; + m_root.pointer = 0; + m_root.lastchild = true; + + m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0; } DawgFactory::~DawgFactory() { - delete alphas; + delete m_alphas; } bool DawgFactory::pushWord(const QString& word, bool inSmaller, int playability) @@ -48,36 +50,52 @@ bool DawgFactory::pushWord(const QString& word, bool inSmaller, int playability) UVString originalString = QuackleIO::Util::qstringToString(word); UVString leftover; - Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover); + Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover); if (leftover.empty()) { - ++m_encodableWords; - root.pushWord(encodedWord, inSmaller, playability); - return true; + if (m_root.pushWord(encodedWord, inSmaller, playability)) + { + ++m_encodableWords; + hashWord(encodedWord); + return true; + } + ++m_duplicateWords; + return false; } ++m_unencodableWords; return false; } +void DawgFactory::hashWord(const Quackle::LetterString &word) +{ + QCryptographicHash wordhash(QCryptographicHash::Md5); + wordhash.addData(word.constData(), word.length()); + QByteArray wordhashbytes = wordhash.result(); + m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0]; + m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1]; + m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2]; + m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3]; +} + void DawgFactory::generate() { const int bucketcount = 2000; vector< int > bucket[bucketcount]; - nodelist.clear(); - nodelist.push_back(&root); - root.print(nodelist); + m_nodelist.clear(); + m_nodelist.push_back(&m_root); + m_root.print(m_nodelist); - nodelist[0]->letterSum(); + m_nodelist[0]->letterSum(); - for (unsigned int i = 0; i < nodelist.size(); i++) + for (unsigned int i = 0; i < m_nodelist.size(); i++) { - bucket[nodelist[i]->sum % bucketcount].push_back(i); - nodelist[i]->pointer = 0; - nodelist[i]->written = false; - nodelist[i]->deleted = false; - nodelist[i]->cloneof = NULL; + bucket[m_nodelist[i]->sum % bucketcount].push_back(i); + m_nodelist[i]->pointer = 0; + m_nodelist[i]->written = false; + m_nodelist[i]->deleted = false; + m_nodelist[i]->cloneof = NULL; } for (int b = 0; b < bucketcount; b++) @@ -86,19 +104,19 @@ void DawgFactory::generate() continue; for (vector<int>::iterator it = bucket[b].begin(); it != bucket[b].end() - 1; it++) { - if (!nodelist[(*it)]->deleted) + if (!m_nodelist[(*it)]->deleted) { for (vector<int>::iterator jt = it + 1; jt != bucket[b].end(); jt++) { - if (!nodelist[(*jt)]->deleted) + if (!m_nodelist[(*jt)]->deleted) { // cout << "Comparing " << (*it) << " and " << (*jt) << endl; - if (nodelist[(*it)]->equals(nodelist[(*jt)][0])) + if (m_nodelist[(*it)]->equals(m_nodelist[(*jt)][0])) { //cout << "Hey! " << (*it) << " == " << (*jt) << endl; // ones[l].erase(jt); - nodelist[(*jt)]->deleted = true; - nodelist[(*jt)]->cloneof = nodelist[(*it)]; + m_nodelist[(*jt)]->deleted = true; + m_nodelist[(*jt)]->cloneof = m_nodelist[(*it)]; } } } @@ -106,51 +124,53 @@ void DawgFactory::generate() } } - nodelist.clear(); - nodelist.push_back(&root); - root.print(nodelist); + m_nodelist.clear(); + m_nodelist.push_back(&m_root); + m_root.print(m_nodelist); } void DawgFactory::writeIndex(const QString& filename) { ofstream out(QuackleIO::Util::qstringToStdString(filename).c_str(), ios::out | ios::binary); + unsigned char bytes[7]; + + bytes[0] = (m_encodableWords & 0x00FF0000) >> 16; + bytes[1] = (m_encodableWords & 0x0000FF00) >> 8; + bytes[2] = (m_encodableWords & 0x000000FF); - for (unsigned int i = 0; i < nodelist.size(); i++) { - //cout << nodelist[i]->c << " " << nodelist[i]->pointer << " " << nodelist[i]->t << " " << nodelist[i]->lastchild << endl; - Node* n = nodelist[i]; + out.write(m_hash.charptr, sizeof(m_hash.charptr)); + out.write((char*)bytes, 3); + + for (unsigned int i = 0; i < m_nodelist.size(); i++) { + //cout << m_nodelist[i]->c << " " << m_nodelist[i]->pointer << " " << m_nodelist[i]->t << " " << m_nodelist[i]->lastchild << endl; + Node* n = m_nodelist[i]; unsigned int p; - if (nodelist[i]->deleted) + if (m_nodelist[i]->deleted) { - p = (unsigned int)(nodelist[i]->cloneof->pointer); - // n = nodelist[i]->cloneof; + p = (unsigned int)(m_nodelist[i]->cloneof->pointer); + // n = m_nodelist[i]->cloneof; } else - p = (unsigned int)(nodelist[i]->pointer); + p = (unsigned int)(m_nodelist[i]->pointer); - char bytes[7]; - unsigned char n1 = (p & 0x00FF0000) >> 16; - unsigned char n2 = (p & 0x0000FF00) >> 8; - unsigned char n3 = (p & 0x000000FF); - unsigned char n4 = n->c - QUACKLE_FIRST_LETTER; + bytes[0] = (p & 0x00FF0000) >> 16; + bytes[1] = (p & 0x0000FF00) >> 8; + bytes[2] = (p & 0x000000FF); + bytes[3] = n->c - QUACKLE_FIRST_LETTER; unsigned int pb = n->playability; - unsigned char n5 = (pb & 0x00FF0000) >> 16; - unsigned char n6 = (pb & 0x0000FF00) >> 8; - unsigned char n7 = (pb & 0x000000FF); + bytes[4] = (pb & 0x00FF0000) >> 16; + bytes[5] = (pb & 0x0000FF00) >> 8; + bytes[6] = (pb & 0x000000FF); - if (n->t) { - n4 |= 32; - } if (n->lastchild) { - n4 |= 64; + bytes[3] |= 64; } if (n->insmallerdict) { - n4 |= 128; + bytes[3] |= 128; } - bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4; - bytes[4] = n5; bytes[5] = n6; bytes[6] = n7; - out.write(bytes, 7); + out.write((char*)bytes, 7); } } @@ -193,11 +213,13 @@ void DawgFactory::Node::print(vector< Node* >& nodelist) } -void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmaller, int pb) +// returns true if the word was actually added...false if it's a duplicate. +bool DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmaller, int pb) { + bool added; if (word.length() == 0) { - t = true; - playability = pb; + added = (playability == 0); + playability = (pb == 0) ? 1 : pb; // word terminators nodes are marked by nonzero playability in the v1 DAWG format insmallerdict = inSmaller; } else { @@ -217,7 +239,6 @@ void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmall if (index == -1) { Node n; n.c = first; - n.t = false; n.playability = 0; n.insmallerdict = false; n.pointer = 0; @@ -226,12 +247,13 @@ void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmall index = children.size() - 1; } - children[index].pushWord(rest, inSmaller, pb); + added = children[index].pushWord(rest, inSmaller, pb); } sumexplored = false; deleted = false; written = false; + return added; } @@ -243,8 +265,6 @@ bool DawgFactory::Node::equals(const Node &n) const return false; if (children.size() != n.children.size()) return false; - if (t != n.t) - return false; if (insmallerdict != n.insmallerdict) return false; if (sum != n.sum) @@ -259,7 +279,7 @@ bool DawgFactory::Node::equals(const Node &n) const int DawgFactory::Node::wordCount() const { - int wordCount = (t ? 0 : 1); + int wordCount = ((playability == 0) ? 0 : 1); for (size_t i = 0; i < children.size(); i++) wordCount += children[i].wordCount(); return wordCount; diff --git a/quackleio/dawgfactory.h b/quackleio/dawgfactory.h index b2cfb76..13837c4 100644 --- a/quackleio/dawgfactory.h +++ b/quackleio/dawgfactory.h @@ -29,29 +29,30 @@ public: DawgFactory(const QString& alphabetFile); ~DawgFactory(); - int wordCount() const { return root.wordCount(); }; - int nodeCount() const { return nodelist.size(); }; + int wordCount() const { return m_root.wordCount(); }; + int nodeCount() const { return m_nodelist.size(); }; int encodableWords() const { return m_encodableWords; }; int unencodableWords() const { return m_unencodableWords; }; + int duplicateWords() const { return m_duplicateWords; }; bool pushWord(const QString& word, bool inSmaller, int playability); + void hashWord(const Quackle::LetterString &word); void generate(); void writeIndex(const QString& fname); private: class Node { public: - void pushWord(const Quackle::LetterString& word, bool inSmaller, int pb); - void print(vector< Node* >& nodelist); + bool pushWord(const Quackle::LetterString& word, bool inSmaller, int pb); + void print(vector< Node* >& m_nodelist); int letterSum() const; int wordCount() const; bool equals(const Node &n) const; Quackle::Letter c; - bool t; bool insmallerdict; - int playability; + int playability; // if nonzero, then terminates word vector<Node> children; int pointer; @@ -69,9 +70,16 @@ private: int m_encodableWords; int m_unencodableWords; - vector< Node* > nodelist; - Quackle::AlphabetParameters *alphas; - Node root; + int m_duplicateWords; + vector< Node* > m_nodelist; + Quackle::AlphabetParameters *m_alphas; + Node m_root; + union { + char charptr[16]; + int32_t int32ptr[4]; + } m_hash; + + static const char m_versionNumber = 1; }; #endif |