summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Fultz <jfultz@wolfram.com>2015-08-24 00:51:48 -0500
committerJohn Fultz <jfultz@wolfram.com>2015-08-24 00:51:48 -0500
commitd1f5f768764d439f02520d9c6c017fcd3ae96b83 (patch)
treee54e047c16bbb0da22b3978cdbbe9b3f86f7add0
parent5e5d414c57d5c7dd8a3037dda1555db5fb7eb486 (diff)
Add a new DAWG format.
Make reader and writer for the new format, while maintaing compatibility with the old. Things to note of the new format... * Now has a header, with version number, MD5, and word count. * No longer has terminator bit. Nodes are terminated by a non-zero playability. * Which means letters have one more bit. So we can now support more than 32 letters. Important for Slovak alphabet. Also, various cleanups and refactorings.
-rw-r--r--fixedstring.h1
-rw-r--r--lexiconparameters.cpp29
-rw-r--r--lexiconparameters.h9
-rw-r--r--quackleio/dawgfactory.cpp144
-rw-r--r--quackleio/dawgfactory.h26
5 files changed, 125 insertions, 84 deletions
diff --git a/fixedstring.h b/fixedstring.h
index 46d1011..e8db0bf 100644
--- a/fixedstring.h
+++ b/fixedstring.h
@@ -54,6 +54,7 @@ class FixedLengthString
size_type size() const { return length(); }
void clear() { m_end = m_data; }
void push_back(char c);
+ const char* constData() const { return m_data; }
int compare(const FixedLengthString& s) const;
diff --git a/lexiconparameters.cpp b/lexiconparameters.cpp
index 9826ca3..ca09fa5 100644
--- a/lexiconparameters.cpp
+++ b/lexiconparameters.cpp
@@ -25,15 +25,16 @@
using namespace Quackle;
-class V0DawgInterpreter : public DawgInterpreter
+class Quackle::V0DawgInterpreter : public DawgInterpreter
{
- virtual void loadDawg(ifstream &file, unsigned char *dawg)
+ virtual void loadDawg(ifstream &file, LexiconParameters &lexparams)
{
int i = 0;
+ file.unget(); // version 0 doesn't have a version byte...it's just the node byte which is always set to 0
while (!file.eof())
{
- file.read((char*)(dawg) + i, 7);
+ file.read((char*)(lexparams.m_dawg) + i, 7);
i += 7;
}
}
@@ -54,15 +55,19 @@ class V0DawgInterpreter : public DawgInterpreter
virtual int versionNumber() const { return 0; }
};
-class V1DawgInterpreter : public DawgInterpreter
+class Quackle::V1DawgInterpreter : public DawgInterpreter
{
- virtual void loadDawg(ifstream &file, unsigned char *dawg)
+ virtual void loadDawg(ifstream &file, LexiconParameters &lexparams)
{
int i = 0;
+ unsigned char bytes[3];
+ file.read(lexparams.m_hash, sizeof(lexparams.m_hash));
+ file.read((char*)bytes, 3);
+ lexparams.m_wordcount = (bytes[0] << 16) | (bytes[1] << 8) | bytes[2];
while (!file.eof())
{
- file.read((char*)(dawg) + i, 7);
+ file.read((char*)(lexparams.m_dawg) + i, 7);
i += 7;
}
}
@@ -73,10 +78,10 @@ class V1DawgInterpreter : public DawgInterpreter
p = (dawg[index] << 16) + (dawg[index + 1] << 8) + (dawg[index + 2]);
letter = dawg[index + 3];
- t = (letter & 32) != 0;
- lastchild = (letter & 64) != 0;
+ t = (p != 0);
+ lastchild = ((letter & 64) != 0);
british = !(letter & 128);
- letter = (letter & 31) + QUACKLE_FIRST_LETTER;
+ letter = (letter & 63) + QUACKLE_FIRST_LETTER;
playability = (dawg[index + 4] << 16) + (dawg[index + 5] << 8) + (dawg[index + 6]);
}
@@ -84,8 +89,9 @@ class V1DawgInterpreter : public DawgInterpreter
};
LexiconParameters::LexiconParameters()
- : m_dawg(0), m_gaddag(0)
+ : m_dawg(NULL), m_gaddag(NULL), m_interpreter(NULL), m_wordcount(0)
{
+ memset(m_hash, 0, sizeof(m_hash));
}
LexiconParameters::~LexiconParameters()
@@ -124,7 +130,6 @@ void LexiconParameters::loadDawg(const string &filename)
}
char versionByte = file.get();
- file.unget();
switch(versionByte)
{
case 0:
@@ -140,7 +145,7 @@ void LexiconParameters::loadDawg(const string &filename)
m_dawg = new unsigned char[7000000];
- m_interpreter->loadDawg(file, m_dawg);
+ m_interpreter->loadDawg(file, *this);
}
void LexiconParameters::loadGaddag(const string &filename)
diff --git a/lexiconparameters.h b/lexiconparameters.h
index 4c77cd1..4b6369d 100644
--- a/lexiconparameters.h
+++ b/lexiconparameters.h
@@ -28,15 +28,20 @@ namespace Quackle
class DawgInterpreter
{
public:
- virtual void loadDawg(ifstream &file, unsigned char *dawg) = 0;
+ virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) = 0;
virtual void dawgAt(const unsigned char *dawg, int index, unsigned int &p, Letter &letter, bool &t, bool &lastchild, bool &british, int &playability) const = 0;
virtual int versionNumber() const = 0;
virtual ~DawgInterpreter() {};
};
+class V0DawgInterpreter;
+class V1DawgInterpreter;
class LexiconParameters
{
+ friend class Quackle::V0DawgInterpreter;
+ friend class Quackle::V1DawgInterpreter;
+
public:
LexiconParameters();
~LexiconParameters();
@@ -75,6 +80,8 @@ protected:
unsigned char *m_gaddag;
string m_lexiconName;
DawgInterpreter *m_interpreter;
+ char m_hash[16];
+ int m_wordcount;
};
}
diff --git a/quackleio/dawgfactory.cpp b/quackleio/dawgfactory.cpp
index 8a37766..6fb5be0 100644
--- a/quackleio/dawgfactory.cpp
+++ b/quackleio/dawgfactory.cpp
@@ -19,6 +19,7 @@
#include <iostream>
#include <QtCore>
+#include <QCryptographicHash>
#include "dawgfactory.h"
#include "util.h"
@@ -28,19 +29,20 @@ DawgFactory::DawgFactory(const QString& alphabetFile)
{
QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters;
flexure->load(alphabetFile);
- alphas = flexure;
-
- root.t = false;
- root.insmallerdict = false;
- root.playability = 0;
- root.c = QUACKLE_BLANK_MARK;
- root.pointer = 0;
- root.lastchild = true;
+ m_alphas = flexure;
+
+ m_root.insmallerdict = false;
+ m_root.playability = 0;
+ m_root.c = QUACKLE_BLANK_MARK;
+ m_root.pointer = 0;
+ m_root.lastchild = true;
+
+ m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0;
}
DawgFactory::~DawgFactory()
{
- delete alphas;
+ delete m_alphas;
}
bool DawgFactory::pushWord(const QString& word, bool inSmaller, int playability)
@@ -48,36 +50,52 @@ bool DawgFactory::pushWord(const QString& word, bool inSmaller, int playability)
UVString originalString = QuackleIO::Util::qstringToString(word);
UVString leftover;
- Quackle::LetterString encodedWord = alphas->encode(originalString, &leftover);
+ Quackle::LetterString encodedWord = m_alphas->encode(originalString, &leftover);
if (leftover.empty())
{
- ++m_encodableWords;
- root.pushWord(encodedWord, inSmaller, playability);
- return true;
+ if (m_root.pushWord(encodedWord, inSmaller, playability))
+ {
+ ++m_encodableWords;
+ hashWord(encodedWord);
+ return true;
+ }
+ ++m_duplicateWords;
+ return false;
}
++m_unencodableWords;
return false;
}
+void DawgFactory::hashWord(const Quackle::LetterString &word)
+{
+ QCryptographicHash wordhash(QCryptographicHash::Md5);
+ wordhash.addData(word.constData(), word.length());
+ QByteArray wordhashbytes = wordhash.result();
+ m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0];
+ m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1];
+ m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2];
+ m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3];
+}
+
void DawgFactory::generate()
{
const int bucketcount = 2000;
vector< int > bucket[bucketcount];
- nodelist.clear();
- nodelist.push_back(&root);
- root.print(nodelist);
+ m_nodelist.clear();
+ m_nodelist.push_back(&m_root);
+ m_root.print(m_nodelist);
- nodelist[0]->letterSum();
+ m_nodelist[0]->letterSum();
- for (unsigned int i = 0; i < nodelist.size(); i++)
+ for (unsigned int i = 0; i < m_nodelist.size(); i++)
{
- bucket[nodelist[i]->sum % bucketcount].push_back(i);
- nodelist[i]->pointer = 0;
- nodelist[i]->written = false;
- nodelist[i]->deleted = false;
- nodelist[i]->cloneof = NULL;
+ bucket[m_nodelist[i]->sum % bucketcount].push_back(i);
+ m_nodelist[i]->pointer = 0;
+ m_nodelist[i]->written = false;
+ m_nodelist[i]->deleted = false;
+ m_nodelist[i]->cloneof = NULL;
}
for (int b = 0; b < bucketcount; b++)
@@ -86,19 +104,19 @@ void DawgFactory::generate()
continue;
for (vector<int>::iterator it = bucket[b].begin(); it != bucket[b].end() - 1; it++)
{
- if (!nodelist[(*it)]->deleted)
+ if (!m_nodelist[(*it)]->deleted)
{
for (vector<int>::iterator jt = it + 1; jt != bucket[b].end(); jt++)
{
- if (!nodelist[(*jt)]->deleted)
+ if (!m_nodelist[(*jt)]->deleted)
{
// cout << "Comparing " << (*it) << " and " << (*jt) << endl;
- if (nodelist[(*it)]->equals(nodelist[(*jt)][0]))
+ if (m_nodelist[(*it)]->equals(m_nodelist[(*jt)][0]))
{
//cout << "Hey! " << (*it) << " == " << (*jt) << endl;
// ones[l].erase(jt);
- nodelist[(*jt)]->deleted = true;
- nodelist[(*jt)]->cloneof = nodelist[(*it)];
+ m_nodelist[(*jt)]->deleted = true;
+ m_nodelist[(*jt)]->cloneof = m_nodelist[(*it)];
}
}
}
@@ -106,51 +124,53 @@ void DawgFactory::generate()
}
}
- nodelist.clear();
- nodelist.push_back(&root);
- root.print(nodelist);
+ m_nodelist.clear();
+ m_nodelist.push_back(&m_root);
+ m_root.print(m_nodelist);
}
void DawgFactory::writeIndex(const QString& filename)
{
ofstream out(QuackleIO::Util::qstringToStdString(filename).c_str(), ios::out | ios::binary);
+ unsigned char bytes[7];
+
+ bytes[0] = (m_encodableWords & 0x00FF0000) >> 16;
+ bytes[1] = (m_encodableWords & 0x0000FF00) >> 8;
+ bytes[2] = (m_encodableWords & 0x000000FF);
- for (unsigned int i = 0; i < nodelist.size(); i++) {
- //cout << nodelist[i]->c << " " << nodelist[i]->pointer << " " << nodelist[i]->t << " " << nodelist[i]->lastchild << endl;
- Node* n = nodelist[i];
+ out.write(m_hash.charptr, sizeof(m_hash.charptr));
+ out.write((char*)bytes, 3);
+
+ for (unsigned int i = 0; i < m_nodelist.size(); i++) {
+ //cout << m_nodelist[i]->c << " " << m_nodelist[i]->pointer << " " << m_nodelist[i]->t << " " << m_nodelist[i]->lastchild << endl;
+ Node* n = m_nodelist[i];
unsigned int p;
- if (nodelist[i]->deleted)
+ if (m_nodelist[i]->deleted)
{
- p = (unsigned int)(nodelist[i]->cloneof->pointer);
- // n = nodelist[i]->cloneof;
+ p = (unsigned int)(m_nodelist[i]->cloneof->pointer);
+ // n = m_nodelist[i]->cloneof;
}
else
- p = (unsigned int)(nodelist[i]->pointer);
+ p = (unsigned int)(m_nodelist[i]->pointer);
- char bytes[7];
- unsigned char n1 = (p & 0x00FF0000) >> 16;
- unsigned char n2 = (p & 0x0000FF00) >> 8;
- unsigned char n3 = (p & 0x000000FF);
- unsigned char n4 = n->c - QUACKLE_FIRST_LETTER;
+ bytes[0] = (p & 0x00FF0000) >> 16;
+ bytes[1] = (p & 0x0000FF00) >> 8;
+ bytes[2] = (p & 0x000000FF);
+ bytes[3] = n->c - QUACKLE_FIRST_LETTER;
unsigned int pb = n->playability;
- unsigned char n5 = (pb & 0x00FF0000) >> 16;
- unsigned char n6 = (pb & 0x0000FF00) >> 8;
- unsigned char n7 = (pb & 0x000000FF);
+ bytes[4] = (pb & 0x00FF0000) >> 16;
+ bytes[5] = (pb & 0x0000FF00) >> 8;
+ bytes[6] = (pb & 0x000000FF);
- if (n->t) {
- n4 |= 32;
- }
if (n->lastchild) {
- n4 |= 64;
+ bytes[3] |= 64;
}
if (n->insmallerdict) {
- n4 |= 128;
+ bytes[3] |= 128;
}
- bytes[0] = n1; bytes[1] = n2; bytes[2] = n3; bytes[3] = n4;
- bytes[4] = n5; bytes[5] = n6; bytes[6] = n7;
- out.write(bytes, 7);
+ out.write((char*)bytes, 7);
}
}
@@ -193,11 +213,13 @@ void DawgFactory::Node::print(vector< Node* >& nodelist)
}
-void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmaller, int pb)
+// returns true if the word was actually added...false if it's a duplicate.
+bool DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmaller, int pb)
{
+ bool added;
if (word.length() == 0) {
- t = true;
- playability = pb;
+ added = (playability == 0);
+ playability = (pb == 0) ? 1 : pb; // word terminators nodes are marked by nonzero playability in the v1 DAWG format
insmallerdict = inSmaller;
}
else {
@@ -217,7 +239,6 @@ void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmall
if (index == -1) {
Node n;
n.c = first;
- n.t = false;
n.playability = 0;
n.insmallerdict = false;
n.pointer = 0;
@@ -226,12 +247,13 @@ void DawgFactory::Node::pushWord(const Quackle::LetterString& word, bool inSmall
index = children.size() - 1;
}
- children[index].pushWord(rest, inSmaller, pb);
+ added = children[index].pushWord(rest, inSmaller, pb);
}
sumexplored = false;
deleted = false;
written = false;
+ return added;
}
@@ -243,8 +265,6 @@ bool DawgFactory::Node::equals(const Node &n) const
return false;
if (children.size() != n.children.size())
return false;
- if (t != n.t)
- return false;
if (insmallerdict != n.insmallerdict)
return false;
if (sum != n.sum)
@@ -259,7 +279,7 @@ bool DawgFactory::Node::equals(const Node &n) const
int DawgFactory::Node::wordCount() const
{
- int wordCount = (t ? 0 : 1);
+ int wordCount = ((playability == 0) ? 0 : 1);
for (size_t i = 0; i < children.size(); i++)
wordCount += children[i].wordCount();
return wordCount;
diff --git a/quackleio/dawgfactory.h b/quackleio/dawgfactory.h
index b2cfb76..13837c4 100644
--- a/quackleio/dawgfactory.h
+++ b/quackleio/dawgfactory.h
@@ -29,29 +29,30 @@ public:
DawgFactory(const QString& alphabetFile);
~DawgFactory();
- int wordCount() const { return root.wordCount(); };
- int nodeCount() const { return nodelist.size(); };
+ int wordCount() const { return m_root.wordCount(); };
+ int nodeCount() const { return m_nodelist.size(); };
int encodableWords() const { return m_encodableWords; };
int unencodableWords() const { return m_unencodableWords; };
+ int duplicateWords() const { return m_duplicateWords; };
bool pushWord(const QString& word, bool inSmaller, int playability);
+ void hashWord(const Quackle::LetterString &word);
void generate();
void writeIndex(const QString& fname);
private:
class Node {
public:
- void pushWord(const Quackle::LetterString& word, bool inSmaller, int pb);
- void print(vector< Node* >& nodelist);
+ bool pushWord(const Quackle::LetterString& word, bool inSmaller, int pb);
+ void print(vector< Node* >& m_nodelist);
int letterSum() const;
int wordCount() const;
bool equals(const Node &n) const;
Quackle::Letter c;
- bool t;
bool insmallerdict;
- int playability;
+ int playability; // if nonzero, then terminates word
vector<Node> children;
int pointer;
@@ -69,9 +70,16 @@ private:
int m_encodableWords;
int m_unencodableWords;
- vector< Node* > nodelist;
- Quackle::AlphabetParameters *alphas;
- Node root;
+ int m_duplicateWords;
+ vector< Node* > m_nodelist;
+ Quackle::AlphabetParameters *m_alphas;
+ Node m_root;
+ union {
+ char charptr[16];
+ int32_t int32ptr[4];
+ } m_hash;
+
+ static const char m_versionNumber = 1;
};
#endif