diff options
author | John Fultz <jfultz@wolfram.com> | 2015-08-24 00:51:48 -0500 |
---|---|---|
committer | John Fultz <jfultz@wolfram.com> | 2015-08-24 00:51:48 -0500 |
commit | d1f5f768764d439f02520d9c6c017fcd3ae96b83 (patch) | |
tree | e54e047c16bbb0da22b3978cdbbe9b3f86f7add0 /lexiconparameters.cpp | |
parent | 5e5d414c57d5c7dd8a3037dda1555db5fb7eb486 (diff) |
Add a new DAWG format.
Make reader and writer for the new format, while
maintaing compatibility with the old. Things to
note of the new format...
* Now has a header, with version number, MD5, and word count.
* No longer has terminator bit. Nodes are terminated by
a non-zero playability.
* Which means letters have one more bit. So we can now
support more than 32 letters. Important for Slovak alphabet.
Also, various cleanups and refactorings.
Diffstat (limited to 'lexiconparameters.cpp')
-rw-r--r-- | lexiconparameters.cpp | 29 |
1 files changed, 17 insertions, 12 deletions
diff --git a/lexiconparameters.cpp b/lexiconparameters.cpp index 9826ca3..ca09fa5 100644 --- a/lexiconparameters.cpp +++ b/lexiconparameters.cpp @@ -25,15 +25,16 @@ using namespace Quackle; -class V0DawgInterpreter : public DawgInterpreter +class Quackle::V0DawgInterpreter : public DawgInterpreter { - virtual void loadDawg(ifstream &file, unsigned char *dawg) + virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) { int i = 0; + file.unget(); // version 0 doesn't have a version byte...it's just the node byte which is always set to 0 while (!file.eof()) { - file.read((char*)(dawg) + i, 7); + file.read((char*)(lexparams.m_dawg) + i, 7); i += 7; } } @@ -54,15 +55,19 @@ class V0DawgInterpreter : public DawgInterpreter virtual int versionNumber() const { return 0; } }; -class V1DawgInterpreter : public DawgInterpreter +class Quackle::V1DawgInterpreter : public DawgInterpreter { - virtual void loadDawg(ifstream &file, unsigned char *dawg) + virtual void loadDawg(ifstream &file, LexiconParameters &lexparams) { int i = 0; + unsigned char bytes[3]; + file.read(lexparams.m_hash, sizeof(lexparams.m_hash)); + file.read((char*)bytes, 3); + lexparams.m_wordcount = (bytes[0] << 16) | (bytes[1] << 8) | bytes[2]; while (!file.eof()) { - file.read((char*)(dawg) + i, 7); + file.read((char*)(lexparams.m_dawg) + i, 7); i += 7; } } @@ -73,10 +78,10 @@ class V1DawgInterpreter : public DawgInterpreter p = (dawg[index] << 16) + (dawg[index + 1] << 8) + (dawg[index + 2]); letter = dawg[index + 3]; - t = (letter & 32) != 0; - lastchild = (letter & 64) != 0; + t = (p != 0); + lastchild = ((letter & 64) != 0); british = !(letter & 128); - letter = (letter & 31) + QUACKLE_FIRST_LETTER; + letter = (letter & 63) + QUACKLE_FIRST_LETTER; playability = (dawg[index + 4] << 16) + (dawg[index + 5] << 8) + (dawg[index + 6]); } @@ -84,8 +89,9 @@ class V1DawgInterpreter : public DawgInterpreter }; LexiconParameters::LexiconParameters() - : m_dawg(0), m_gaddag(0) + : m_dawg(NULL), m_gaddag(NULL), m_interpreter(NULL), m_wordcount(0) { + memset(m_hash, 0, sizeof(m_hash)); } LexiconParameters::~LexiconParameters() @@ -124,7 +130,6 @@ void LexiconParameters::loadDawg(const string &filename) } char versionByte = file.get(); - file.unget(); switch(versionByte) { case 0: @@ -140,7 +145,7 @@ void LexiconParameters::loadDawg(const string &filename) m_dawg = new unsigned char[7000000]; - m_interpreter->loadDawg(file, m_dawg); + m_interpreter->loadDawg(file, *this); } void LexiconParameters::loadGaddag(const string &filename) |