/*
* Quackle -- Crossword game artificial intelligence and analysis tool
* Copyright (C) 2005-2019 Jason Katz-Brown, John O'Laughlin, and John Fultz.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#include "dawgfactory.h"
#include "util.h"
using namespace std;
DawgFactory::DawgFactory(const QString &alphabetFile)
: m_encodableWords(0), m_unencodableWords(0), m_duplicateWords(0),
m_countsByLength(Quackle::FixedLengthString::maxSize, 0)
{
QuackleIO::FlexibleAlphabetParameters *flexure = new QuackleIO::FlexibleAlphabetParameters;
flexure->load(alphabetFile);
m_alphas = flexure;
m_root.insmallerdict = false;
m_root.playability = 0;
m_root.c = QUACKLE_BLANK_MARK;
m_root.pointer = 0;
m_root.lastchild = true;
m_hash.int32ptr[0] = m_hash.int32ptr[1] = m_hash.int32ptr[2] = m_hash.int32ptr[3] = 0;
}
DawgFactory::~DawgFactory()
{
delete m_alphas;
}
bool DawgFactory::pushWord(const UVString &word, bool inSmaller, int playability)
{
UVString leftover;
Quackle::LetterString encodedWord = m_alphas->encode(word, &leftover);
if (leftover.empty())
return pushWord(encodedWord, inSmaller, playability);
++m_unencodableWords;
return false;
}
bool DawgFactory::pushWord(const Quackle::LetterString &word, bool inSmaller, int playability)
{
if (m_root.pushWord(word, inSmaller, playability))
{
++m_encodableWords;
++m_countsByLength[word.length()];
hashWord(word);
return true;
}
++m_duplicateWords;
return false;
}
void DawgFactory::hashWord(const Quackle::LetterString &word)
{
QCryptographicHash wordhash(QCryptographicHash::Md5);
wordhash.addData(word.constData(), word.length());
QByteArray wordhashbytes = wordhash.result();
m_hash.int32ptr[0] ^= ((const int32_t*)wordhashbytes.constData())[0];
m_hash.int32ptr[1] ^= ((const int32_t*)wordhashbytes.constData())[1];
m_hash.int32ptr[2] ^= ((const int32_t*)wordhashbytes.constData())[2];
m_hash.int32ptr[3] ^= ((const int32_t*)wordhashbytes.constData())[3];
}
void DawgFactory::generate()
{
const int bucketcount = 2000;
vector< int > bucket[bucketcount];
m_nodelist.clear();
m_nodelist.push_back(&m_root);
m_root.print(m_nodelist);
m_nodelist[0]->letterSum();
for (unsigned int i = 0; i < m_nodelist.size(); i++)
{
bucket[m_nodelist[i]->sum % bucketcount].push_back(i);
m_nodelist[i]->pointer = 0;
m_nodelist[i]->written = false;
m_nodelist[i]->deleted = false;
m_nodelist[i]->cloneof = NULL;
}
for (int b = 0; b < bucketcount; b++)
{
if (bucket[b].size() == 0)
continue;
for (vector::iterator it = bucket[b].begin(); it != bucket[b].end() - 1; it++)
{
if (!m_nodelist[(*it)]->deleted)
{
for (vector::iterator jt = it + 1; jt != bucket[b].end(); jt++)
{
if (!m_nodelist[(*jt)]->deleted)
{
// cout << "Comparing " << (*it) << " and " << (*jt) << endl;
if (m_nodelist[(*it)]->equals(m_nodelist[(*jt)][0]))
{
//cout << "Hey! " << (*it) << " == " << (*jt) << endl;
// ones[l].erase(jt);
m_nodelist[(*jt)]->deleted = true;
m_nodelist[(*jt)]->cloneof = m_nodelist[(*it)];
}
}
}
}
}
}
m_nodelist.clear();
m_nodelist.push_back(&m_root);
m_root.print(m_nodelist);
}
void DawgFactory::writeIndex(const string &filename)
{
ofstream out(filename.c_str(), ios::out | ios::binary);
unsigned char bytes[7];
bytes[0] = (m_encodableWords & 0x00FF0000) >> 16;
bytes[1] = (m_encodableWords & 0x0000FF00) >> 8;
bytes[2] = (m_encodableWords & 0x000000FF);
out.put(1); // DAWG format version 1
out.write(m_hash.charptr, sizeof(m_hash.charptr));
out.write((char*)bytes, 3);
out.put((char)m_alphas->length());
for (Quackle::Letter i = m_alphas->firstLetter(); i <= m_alphas->lastLetter(); i++)
{
QString letterText = QuackleIO::Util::uvStringToQString(m_alphas->letterParameter(i).text());
QByteArray utf8bytes = letterText.toUtf8();
string utf8LetterText(utf8bytes.constData());
out << utf8LetterText << ' ';
}
for (unsigned int i = 0; i < m_nodelist.size(); i++) {
//cout << m_nodelist[i]->c << " " << m_nodelist[i]->pointer << " " << m_nodelist[i]->t << " " << m_nodelist[i]->lastchild << endl;
Node* n = m_nodelist[i];
unsigned int p;
if (m_nodelist[i]->deleted)
{
p = (unsigned int)(m_nodelist[i]->cloneof->pointer);
// n = m_nodelist[i]->cloneof;
}
else
p = (unsigned int)(m_nodelist[i]->pointer);
bytes[0] = (p & 0x00FF0000) >> 16;
bytes[1] = (p & 0x0000FF00) >> 8;
bytes[2] = (p & 0x000000FF);
bytes[3] = n->c - QUACKLE_FIRST_LETTER;
unsigned int pb = n->playability;
bytes[4] = (pb & 0x00FF0000) >> 16;
bytes[5] = (pb & 0x0000FF00) >> 8;
bytes[6] = (pb & 0x000000FF);
if (n->lastchild) {
bytes[3] |= 64;
}
if (n->insmallerdict) {
bytes[3] |= 128;
}
out.write((char*)bytes, 7);
}
}
string DawgFactory::letterCountString() const
{
ostringstream str;
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
const int letterCount = j * 4 + i + 2;
if (j != 0)
str << "\t";
if (m_countsByLength[letterCount] > 0)
str << letterCount << "s: " << std::setw(7) << std::right << std::setfill(' ') << m_countsByLength[letterCount];
}
str << "\n";
}
return str.str();
}
void DawgFactory::Node::print(vector< Node* > &nodelist)
{
written = true;
if (children.size() == 0)
return;
if (!deleted)
{
//cout << " Setting pointer to " << nodelist.size() << " before I push_back the children." << endl;
pointer = (int)nodelist.size();
}
else
{
pointer = cloneof->pointer;
//cout << " Setting pointer to clone's (" << pointer << ") and not pushing anything." << endl;
}
if (!deleted)
{
for (unsigned int i = 0; i < children.size(); i++) {
nodelist.push_back(&children[i]);
}
for (unsigned int i = 0; i < children.size(); i++) {
if (!children[i].deleted)
children[i].print(nodelist);
else if (!children[i].cloneof->written)
children[i].cloneof->print(nodelist);
}
}
if (children.size() > 0)
children[children.size() - 1].lastchild = true;
}
// returns true if the word was actually added...false if it's a duplicate.
bool DawgFactory::Node::pushWord(const Quackle::LetterString &word, bool inSmaller, int pb)
{
bool added;
if (word.length() == 0) {
added = (playability == 0);
playability = (pb == 0) ? 1 : pb; // word terminators nodes are marked by nonzero playability in the v1 DAWG format
insmallerdict = inSmaller;
}
else {
char first = word[0];
Quackle::LetterString rest = word.substr(1, word.length() - 1);
int index = -1;
// cout << "first: " << first << ", rest: " << rest << endl;
for (unsigned int i = 0; i < children.size(); i++) {
if (children[i].c == first) {
index = i;
break;
}
}
if (index == -1) {
Node n;
n.c = first;
n.playability = 0;
n.insmallerdict = false;
n.pointer = 0;
n.lastchild = false;
children.push_back(n);
index = (int)children.size() - 1;
}
added = children[index].pushWord(rest, inSmaller, pb);
}
sumexplored = false;
deleted = false;
written = false;
return added;
}
bool DawgFactory::Node::equals(const Node &n) const
{
if (playability != n.playability)
return false;
if (c != n.c)
return false;
if (children.size() != n.children.size())
return false;
if (insmallerdict != n.insmallerdict)
return false;
if (sum != n.sum)
return false;
for (unsigned int i = 0; i < children.size(); i++)
if (!children[i].equals(n.children[i]))
return false;
return true;
}
int DawgFactory::Node::letterSum() const
{
if (sumexplored)
return sum;
sumexplored = true;
// djb2 checksum
sum = 5381 * 33 + (int) c;
for (unsigned int i = 0; i < children.size(); i++)
sum = (sum << 5) + sum + children[i].letterSum();
return sum;
}