From 52f08356aedd7ff2bc5c3fdb9effac98d98b0c63 Mon Sep 17 00:00:00 2001 From: pommicket Date: Thu, 25 Sep 2025 14:59:07 -0400 Subject: Add translingual definitions --- index.html | 41 +++++++++++- src/definitions.rs | 185 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 148 insertions(+), 78 deletions(-) diff --git a/index.html b/index.html index 3bd880c..116827f 100644 --- a/index.html +++ b/index.html @@ -16,6 +16,7 @@

These are various lists of words extracted from Wiktionary data dumps. Some of the code used to produce them is available here.
+ Of course, all these lists undoubtedly contain errors because Wiktionary contains errors.
You can do whatever you like with them, subject to Wiktionary's licensing, where applicable.

@@ -26,10 +27,44 @@ Words labelled offensive on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
  • - English definitions: en-definitions.txt.xz (22MB compressed, 115MB uncompressed, 1,629,682 entries)
    - Every English definition in English wiktionary. Format is WORD DEFINITION - on each line (note: delimiter is 2 spaces).
    + English definitions: + en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries) + and
    Translingual definitions: + trans-definitions.txt.xz (MB compressed, MB uncompressed, entries)
    + Every English/Translingual definition in English Wiktionary. + Format is WORD PART_OF_SPEECH DEFINITION + on each line (note the two spaces between word and part of speech).
    Words can have multiple definitions; they are listed as separate lines.
    + PART_OF_SPEECH is one of the following: + DEFINITION is in the wikitext format.
    It’s possible that there are parsing errors, but I haven’t spotted any yet.
  • diff --git a/src/definitions.rs b/src/definitions.rs index bb7eb04..d3d0192 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -66,7 +66,6 @@ enum Section { Conjunction, PrepositionalPhrase, Proverb, - Idiom, Phrase, Suffix, Prefix, @@ -80,7 +79,6 @@ enum Section { PunctuationMark, DiacriticalMark, Determiner, - Participle, Particle, Contraction, Letter, @@ -97,7 +95,8 @@ impl Section { "Adjective" | "Proper adjective" | "Adjectives" => Adjective, "Noun" => Noun, "Proper noun" => ProperNoun, - "Verb" | "Verb phrase" | "Verb form" => Verb, + // All but Verb are not used enough to warrant their own categories + "Verb" | "Verb phrase" | "Verb form" | "Participle" => Verb, "Adverbial phrase" | "Adverb" => Adverb, "Interjection" => Interjection, "Conjunction" => Conjunction, @@ -109,20 +108,18 @@ impl Section { "Infix" => Infix, "Interfix" => Interfix, "Pronoun" => Pronoun, - "Phrase" => Phrase, - "Symbol" => Symbol, + // Idiom is not used enough to warrant its own category (only appears 12 times) + "Phrase" | "Idiom" => Phrase, + "Symbol" | "Cuneiform sign" | "Iteration mark" => Symbol, "Preposition" => Preposition, "Punctuation mark" => PunctuationMark, - "Diacritical mark" => DiacriticalMark, + "Diacritical mark" | "Diacritic" => DiacriticalMark, "Article" | "Determiner" => Determiner, - "Participle" => Participle, "Particle" => Particle, "Contraction" => Contraction, - "Idiom" => Idiom, "Letter" => Letter, - "Affix" | "Combining form" => Affix, - // currently at least ev (abbr. 
for even, ever, every) has this designation - "Multiple parts of speech" => UnknownPoS, + "Affix" | "Combining form" | "Simulfix" => Affix, + "Multiple parts of speech" | "Syllable" => UnknownPoS, // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper" "Proper" | "Proper Noun" => ProperNoun, x if x.starts_with("Proper noun ") => ProperNoun, @@ -185,11 +182,37 @@ impl Section { | "Declension" | "Synonyms and related terms" | "Additional notes" - | "Related vocabulary" => NotDefinition, - x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition, + | "Related vocabulary" + | "Glyph origin" + | "Han character" + | "Derived characters" + | "Related characters" + | "Related symbols" + | "Design" + | "Forms" + | "Ligature" + | "Derived signs" + | "Derived symbols" + | "Derived glyphs" + | "Production" + | "Example" + | "Origin" + | "Bibliography" + | "Formation" + | "Derived Characters" + | "Composition" + | "Depiction" + | "Derived forms" => NotDefinition, + x if x.starts_with("Pronunciation") + || x.starts_with("Etymology") + || x.starts_with("Terms suffixed with ") + || x.starts_with("Symbol origin") => + { + NotDefinition + } // mistakes that exist in 20250701 dump "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes" - | "Translate" => NotDefinition, + | "Translate" | "Etymolohy" | "Derived chracters" => NotDefinition, _ => { eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})"); return None; @@ -208,7 +231,6 @@ impl Section { Conjunction => "conjunction", PrepositionalPhrase => "phrase.prepositional", Proverb => "phrase.proverb", - Idiom => "phrase.idiom", Phrase => "phrase", Suffix => "affix.suffix", Prefix => "affix.prefix", @@ -218,14 +240,13 @@ impl Section { Affix => "affix", Pronoun => "pronoun", Symbol => "symbol", + PunctuationMark => "symbol.punctuation", + Letter => "symbol.letter", + DiacriticalMark => "symbol.diacritic", Preposition => "preposition", - PunctuationMark 
=> "punctuation", - DiacriticalMark => "diacritic", Determiner => "determiner", - Participle => "participle", Particle => "particle", Contraction => "contraction", - Letter => "letter", Number => "number", UnknownPoS => "unknown", NotDefinition => panic!(), @@ -233,7 +254,11 @@ impl Section { } } -fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box> { +fn parse_xml( + reader: impl BufRead, + output: &mut Definitions, + language: &str, +) -> Result<(), Box> { let mut config = xml::reader::ParserConfig::default(); config.cdata_to_characters = true; #[derive(Debug, Clone, Copy)] @@ -281,52 +306,54 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box Result<(), Box) -> Result<(), Box> { - let mut output = Definitions::default(); let mut files: Vec = vec![]; for arg in args { if arg == "-h" || arg == "--help" { println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); - println!(" Extract English-language definitions from Wiktionary"); - println!(" data dump files, writing output to definitions.txt."); - println!(" Each line of the output file is of the format: Word Definition"); + println!(" Extract English-language and Translingual definitions from Wiktionary"); + println!( + " data dump files, writing output to en-definitions.txt and trans-definitions.txt." + ); + println!( + " Each line of the output file is of the format: Word Part_of_Speech Definition" + ); println!( " Note the two spaces—this avoids ambiguity when the word contains a space." 
); @@ -400,17 +430,22 @@ pub fn definitions(args: Vec) -> Result<(), Box> { return Err("Aborted.".into()); } } - for input_filename in &files { - let input = File::open(input_filename) - .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; - let reader = BufReader::new(input); - println!("Parsing {input_filename}..."); - parse_xml(reader, &mut output) - .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + for (lang, abbrev) in [("English", "en"), ("Translingual", "trans")] { + let mut output = Definitions::default(); + for input_filename in &files { + let input = File::open(input_filename) + .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; + let reader = BufReader::new(input); + println!("({lang}) Parsing {input_filename}..."); + parse_xml(reader, &mut output, lang) + .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + } + println!("Sorting {} definitions...", output.definitions.len()); + output.sort(); + crate::do_write(&format!("{abbrev}-definitions.txt"), |writer| { + output.write_to(writer) + })?; } - println!("Sorting {} definitions...", output.definitions.len()); - output.sort(); - crate::do_write("definitions.txt", |writer| output.write_to(writer))?; println!("Done!"); Ok(()) } -- cgit v1.2.3