diff options
author | Leo Tenenbaum <pommicket@gmail.com> | 2020-01-19 23:21:56 -0500 |
---|---|---|
committer | Leo Tenenbaum <pommicket@gmail.com> | 2020-01-19 23:21:56 -0500 |
commit | 4a1467810d1cf9498db9507f36f78a0bd385c1e9 (patch) | |
tree | e212837c08ac0f1dd308cf800b1a8fdf6cdd3a3f | |
parent | d4cf195c8ad0338fef7ccb08d0767360378b24d2 (diff) |
extracted hash table for future use by foreign to avoid loading libraries twice
-rw-r--r-- | README.html | 4 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | data_structures.c (renamed from arr.c) | 167 | ||||
-rw-r--r-- | identifiers.c | 90 | ||||
-rw-r--r-- | misc.c | 2 | ||||
-rw-r--r-- | tests.c | 1 | ||||
-rw-r--r-- | toc.c | 4 | ||||
-rw-r--r-- | types.h | 49 |
8 files changed, 243 insertions, 78 deletions
diff --git a/README.html b/README.html index f00bf27..baa360d 100644 --- a/README.html +++ b/README.html @@ -46,7 +46,9 @@ it is nearly as fast in theory.</p> <h3><code>toc</code> Compiler Source Code</h3> -<p>Most of the source code for the <code>toc</code> compiler is licensed under the GNU General Public License, version 3. See <code>LICENSE</code> for more information.</p> +<p>Most of the source code for the <code>toc</code> compiler is licensed under the GNU General Public License, version 3, and the rest (some small general utilities) is in the public domain. Each source file begins with a comment explaining its license.</p> + +<p>See <code>LICENSE</code> for the GNU General Public License.</p> <p><code>toc</code> is written in C, for speed and portability. It has no dependencies, other than the C runtime library. If you want to be able to call external C functions at compile time, however, you will need <code>libffcall</code> and <code>libdl</code> (so this is only currently supported on Unix-y systems).</p> @@ -43,7 +43,9 @@ On other systems, you can just compile main.c with a C compiler. `toc` uses seve ### `toc` Compiler Source Code -Most of the source code for the `toc` compiler is licensed under the GNU General Public License, version 3. See `LICENSE` for more information. +Most of the source code for the `toc` compiler is licensed under the GNU General Public License, version 3, and the rest (some small general utilities) is in the public domain. Each source file begins with a comment explaining its license. + +See `LICENSE` for the GNU General Public License. `toc` is written in C, for speed and portability. It has no dependencies, other than the C runtime library. If you want to be able to call external C functions at compile time, however, you will need `libffcall` and `libdl` (so this is only currently supported on Unix-y systems). diff --git a/arr.c b/data_structures.c index b44935d..970d966 100644 --- a/arr.c +++ b/data_structures.c @@ -1,7 +1,26 @@ -/* - Copyright (C) 2019, 2020 Leo Tenenbaum. - This file is part of toc. toc is distributed under version 3 of the GNU General Public License, without any warranty whatsoever. - You should have received a copy of the GNU General Public License along with toc. If not, see <https://www.gnu.org/licenses/>. +/* +Dynamic arrays and hash tables in C + +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. +For more information, please refer to <http://unlicense.org/> */ /* OPTIM: is it faster to store void *end? */ @@ -79,8 +98,7 @@ static void arr_set_len_(void **arr, size_t n, size_t item_sz) { } static void arr_set_lena_(void **arr, size_t n, size_t item_sz, Allocator *a) { if (n == 0) { - /* OPTIM: arr_cleara */ - *arr = NULL; + arr_cleara_(arr, item_sz, a); return; } arr_resva_(arr, n, item_sz, a); @@ -215,3 +233,140 @@ static void arr_test(void) { arr_clear(&foos); } #endif + +static U64 str_hash(const char *s, size_t len) { + U32 x = 0xabcdef01; + U32 y = 0x31415926; + U64 hash = 0; + for (size_t i = 0; i < len; ++i) { + hash += (U64)x * (unsigned char)(*s) + y; + x = rand_u32(x); + y = rand_u32(y); + ++s; + } + return hash; +} + +static inline void str_hash_table_create(StrHashTable *t, size_t data_size, Allocator *allocr) { + t->slots = NULL; + t->data_size = data_size; + t->nentries = 0; + t->allocr = allocr; + t->rand_seed = 0xabacabad; +} + +static StrHashTableSlot **str_hash_table_slot_get(StrHashTableSlot **slots, const char *s, size_t s_len, size_t i) { + StrHashTableSlot **slot; + size_t slots_cap = arr_len(slots); + while (1) { + assert(i < slots_cap); + slot = &slots[i]; + if (!*slot) break; + if (s && (*slot)->str && + s_len == (*slot)->len && memcmp(s, (*slot)->str, s_len) == 0) + break; + i = (i+1) % slots_cap; + } + return slot; +} + +static void str_hash_table_grow(StrHashTable *t) { + size_t slots_cap = arr_len(t->slots); + if (slots_cap <= 2 * t->nentries) { + StrHashTableSlot **new_slots = NULL; + size_t new_slots_cap = slots_cap * 2 + 10; + arr_set_len(&new_slots, new_slots_cap); + arr_zero(new_slots); + arr_foreach(t->slots, StrHashTableSlotPtr, slotp) { + StrHashTableSlot *slot = *slotp; + if (slot) { + U64 new_hash = str_hash(slot->str, slot->len); + StrHashTableSlot **new_slot = str_hash_table_slot_get(new_slots, slot->str, slot->len, new_hash % new_slots_cap); + *new_slot = slot; + } + } + arr_cleara(&t->slots, t->allocr); + t->slots = new_slots; + slots_cap = new_slots_cap; + } +} + +static inline size_t str_hash_table_slot_size(StrHashTable *t) { + return sizeof(StrHashTableSlot) + ((t->data_size + sizeof(MaxAlign) - 1) / sizeof(MaxAlign)) * sizeof(MaxAlign); +} + +static StrHashTableSlot *str_hash_table_insert_(StrHashTable *t, const char *str, size_t len) { + str_hash_table_grow(t); + size_t slots_cap = arr_len(t->slots); + U64 hash = str_hash(str, len); + StrHashTableSlot **slot = str_hash_table_slot_get(t->slots, str, len, hash % slots_cap); + if (!*slot) { + *slot = allocr_calloc(t->allocr, 1, str_hash_table_slot_size(t)); + (*slot)->str = str; + (*slot)->len = len; + ++t->nentries; + } + return *slot; +} + +/* use this if you don't need the slot */ +static inline void *str_hash_table_insert(StrHashTable *t, const char *str, size_t len) { + return str_hash_table_insert_(t, str, len)->data; +} + +static StrHashTableSlot *str_hash_table_insert_anonymous_(StrHashTable *t) { + str_hash_table_grow(t); + size_t slots_cap = arr_len(t->slots); + U32 slot_idx = (U32)((t->rand_seed = rand_u32(t->rand_seed)) % slots_cap); + StrHashTableSlot **slot = str_hash_table_slot_get(t->slots, NULL, 0, slot_idx); + if (!*slot) { + *slot = allocr_calloc(t->allocr, 1, str_hash_table_slot_size(t)); + ++t->nentries; + } + return *slot; +} + +/* use this if you don't need the slot */ +static inline void *str_hash_table_insert_anonymous(StrHashTable *t) { + return str_hash_table_insert_anonymous_(t)->data; +} + + +static void str_hash_table_free(StrHashTable *t) { + arr_foreach(t->slots, StrHashTableSlotPtr, slotp) { + allocr_free(t->allocr, *slotp, str_hash_table_slot_size(t)); + } + arr_cleara(&t->slots, t->allocr); +} + +static StrHashTableSlot *str_hash_table_get_(StrHashTable *t, const char *str, size_t len) { + size_t slot_index = str_hash(str, len) % arr_len(t->slots); + return *str_hash_table_slot_get(t->slots, str, len, slot_index); +} + +static inline void *str_hash_table_get(StrHashTable *t, const char *str, size_t len) { + return str_hash_table_get_(t, str, len)->data; +} + +#ifdef TOC_DEBUG +static void str_hash_table_test(void) { + StrHashTable t; + str_hash_table_create(&t, sizeof(int), NULL); + int *p = str_hash_table_insert(&t, "Hello", 5); + *p = 182; + int *q = str_hash_table_insert(&t, "Hello", 5); + assert(p == q); + assert(*q == 182); + *q = 112; + int *r = str_hash_table_insert(&t, "Hellop", 6); + assert(p != r); + assert(*r == 0); + *r = 999; + int *s = str_hash_table_insert_anonymous(&t); + assert(p != s && r != s); + *s = 123; + int *u = str_hash_table_get(&t, "Hello", 5); + assert(p == u); + str_hash_table_free(&t); +} +#endif diff --git a/identifiers.c b/identifiers.c index 52f09b0..a17c002 100644 --- a/identifiers.c +++ b/identifiers.c @@ -28,22 +28,22 @@ static int is_ident(int c) { /* Initialize Identifiers. */ static void idents_create(Identifiers *ids) { - ids->slots = NULL; - ids->nidents = 0; + str_hash_table_create(&ids->table, sizeof(IdentSlot) - sizeof(StrHashTableSlot), NULL); ids->rseed = 0x27182818; } -static U64 ident_hash(char **s) { - U32 x = 0xabcdef01; - U32 y = 0x31415926; - U64 hash = 0; +/* advances s until a non-identifier character is reached, then returns the number of characters advanced */ +static size_t ident_str_len(char **s) { + char *original = *s; while (is_ident(**s)) { - hash += (U64)x * (unsigned char)(**s) + y; - x = rand_u32(x); - y = rand_u32(y); ++*s; } - return hash; + return (size_t)(*s - original); +} + +static U64 ident_hash(char **s) { + char *original = *s; + return str_hash(original, ident_str_len(s)); } /* are these strings equal, up to the first non-ident character? */ @@ -57,7 +57,7 @@ static bool ident_str_eq_str(const char *s, const char *t) { static inline bool ident_eq_str(Identifier i, const char *s) { if (i->anonymous) return false; - return ident_str_eq_str(i->text, s); + return ident_str_eq_str(i->str, s); } @@ -78,55 +78,25 @@ static IdentSlot **ident_slots_insert(IdentSlot **slots, char *s, size_t i) { static Identifier ident_new_anonymous(Identifiers *ids) { U32 idx = rand_u32(ids->rseed); ids->rseed = idx; - IdentSlot **slot = ident_slots_insert(ids->slots, NULL, idx % arr_len(ids->slots)); - *slot = err_calloc(1, sizeof **slot); - ++ids->nidents; - (*slot)->anonymous = true; - (*slot)->len = 3; - (*slot)->text = "???"; - return *slot; + IdentSlot *slot = (IdentSlot *)str_hash_table_insert_anonymous_(&ids->table); + slot->anonymous = true; + return slot; } /* moves s to the char after the identifier */ /* inserts if does not exist. reads until non-ident char is found. */ /* advances past identifier */ static Identifier ident_insert(Identifiers *ids, char **s) { - size_t nslots = arr_len(ids->slots); - if (nslots <= 2*ids->nidents) { - IdentSlot **slots = ids->slots; - /* reserve more space */ - IdentSlot **new_slots = NULL; - size_t new_nslots = nslots * 2 + 10; - arr_set_len(&new_slots, new_nslots); - arr_zero(new_slots); - arr_foreach(slots, IdentSlotPtr, slotp) { - IdentSlot *slot = *slotp; - if (slot) { - char *ptr = slot->text; - U64 new_hash = ident_hash(&ptr); - IdentSlot **new_slot = ident_slots_insert(new_slots, slot->text, new_hash % new_nslots); - *new_slot = slot; - } - } - arr_clear(&slots); - ids->slots = new_slots; - nslots = new_nslots; - } char *original = *s; - U64 hash = ident_hash(s); - IdentSlot **slot = ident_slots_insert(ids->slots, original, hash % arr_len(ids->slots)); - if (!*slot) { - *slot = err_calloc(1, sizeof **slot); - ++ids->nidents; - (*slot)->text = original; - (*slot)->len = (size_t)(*s - original); - } - return *slot; + size_t len = ident_str_len(s); + return (Identifier)str_hash_table_insert_(&ids->table, original, len); } static char *ident_to_str(Identifier i) { char *str = err_malloc(i->len + 1); - /* for some reason, GCC thinks that i->len is -1 when this is called from type_to_str_ (in release mode) */ + /* for some reason, GCC thinks that i->len is -1 when this is called from type_to_str_ (in release mode) + TODO: test this now (some stuff was changed) + */ #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push @@ -134,7 +104,7 @@ static char *ident_to_str(Identifier i) { #pragma GCC diagnostic ignored "-Wrestrict" #pragma GCC diagnostic ignored "-Warray-bounds" #endif - memcpy(str, i->text, i->len); + memcpy(str, i->str, i->len); #if !defined(__clang__) && defined(__GNUC__) @@ -146,7 +116,7 @@ static char *ident_to_str(Identifier i) { static void fprint_ident(FILE *out, Identifier id) { - fwrite(id->text, 1, id->len, out); + fwrite(id->str, 1, id->len, out); } static void fprint_ident_debug(FILE *out, Identifier id) { @@ -174,7 +144,7 @@ static void fprint_ident_reduced_charset(FILE *out, Identifier id) { fprintf(out, "a%p__",(void *)id); return; } - for (char *s = id->text; is_ident(*s); ++s) { + for (const char *s = id->str; is_ident(*s); ++s) { int c = (unsigned char)(*s); if (c > 127) { fprintf(out, "x__%x", c); @@ -187,14 +157,13 @@ static void fprint_ident_reduced_charset(FILE *out, Identifier id) { /* NULL = no such identifier. returns identifier "foo" for both "foo\0" and "foo+92384324..." */ static Identifier ident_get(Identifiers *ids, char *s) { char *ptr = s; - U64 hash = ident_hash(&ptr); - IdentSlot **slot = ident_slots_insert(ids->slots, s, hash % arr_len(ids->slots)); - return *slot; + size_t len = ident_str_len(&ptr); + return (Identifier)str_hash_table_get_(&ids->table, s, len); } static Identifier ident_translate(Identifier i, Identifiers *to_idents) { if (!i || i->anonymous) return NULL; - Identifier new_ident = ident_get(to_idents, i->text); + Identifier new_ident = ident_get(to_idents, i->str); return new_ident; } @@ -213,16 +182,15 @@ static IdentDecl *ident_decl(Identifier i) { /* returns true if i and j are equal, even if they're not in the same table */ static bool ident_eq(Identifier i, Identifier j) { - return ident_str_eq_str(i->text, j->text); + return i->len == j->len && memcmp(i->str, j->str, i->len) == 0; } static void idents_free(Identifiers *ids) { - arr_foreach(ids->slots, IdentSlotPtr, slotp) { - IdentSlot *slot = *slotp; + arr_foreach(ids->table.slots, StrHashTableSlotPtr, slotp) { + IdentSlot *slot = *(IdentSlot **)slotp; if (slot) arr_clear(&slot->decls); - free(slot); } - arr_clear(&ids->slots); + str_hash_table_free(&ids->table); } #ifdef TOC_DEBUG @@ -53,3 +53,5 @@ static U32 rand_u32(U32 seed) { return (U32)((seed64 * 0x832f0fda4e1a8642 + 0x41d49cd5459a2ab4) >> 32); } + + @@ -31,6 +31,7 @@ static void test_all(void) { allocr_test(); arr_test(); block_arr_test(); + str_hash_table_test(); idents_test(); binfile_test(); } @@ -69,10 +69,10 @@ static inline bool type_is_slicechar(Type *t) { /* utilities */ #include "allocator.c" -#include "arr.c" +#include "misc.c" +#include "data_structures.c" #include "location.c" #include "err.c" -#include "misc.c" #include "blockarr.c" #include "instance_table.c" #include "copy.c" @@ -1,7 +1,27 @@ /* - Copyright (C) 2019, 2020 Leo Tenenbaum. - This file is part of toc. toc is distributed under version 3 of the GNU General Public License, without any warranty whatsoever. - You should have received a copy of the GNU General Public License along with toc. If not, see <https://www.gnu.org/licenses/>. +toc's types. Note that although these types are in the public domain, +the code which uses them (i.e. most of the rest of toc) is not. + +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. +For more information, please refer to <http://unlicense.org/> */ /* NOTE: make sure you edit copy.c and package.c and cgen_recurse_subexprs/types when you make a change to expression-related types or type-related types in this file! */ @@ -170,23 +190,38 @@ typedef struct IdentDecl { } IdentDecl; typedef struct IdentSlot { + char *str; + size_t len; bool export_name; /* is this identifier's name important? */ bool anonymous; /* is this identifier not part of a tree? */ bool imported; /* was this identifier imported from a package? */ - char *text; /* actual name of the identifier */ - size_t len; /* length of name */ U64 export_id; /* 0 if there's no exported identifier here, otherwise unique positive integer associated with this identifier */ struct Package *from_pkg; struct Package *pkg; /* NULL if this is not associated with a package */ IdentDecl *decls; /* array of declarations of this identifier */ } IdentSlot; +typedef struct { + const char *str; + size_t len; + MaxAlign data[]; +} StrHashTableSlot; + +typedef StrHashTableSlot *StrHashTableSlotPtr; + +typedef struct { + StrHashTableSlot **slots; + Allocator *allocr; + U32 rand_seed; + size_t data_size; + size_t nentries; /* # of filled slots */ +} StrHashTable; + typedef IdentSlot *Identifier; typedef IdentSlot *IdentSlotPtr; typedef struct Identifiers { - IdentSlot **slots; - U64 nidents; + StrHashTable table; U32 rseed; } Identifiers; |