diff options
author | pommicket <pommicket@gmail.com> | 2025-09-25 14:59:07 -0400 |
---|---|---|
committer | pommicket <pommicket@gmail.com> | 2025-09-25 14:59:07 -0400 |
commit | 52f08356aedd7ff2bc5c3fdb9effac98d98b0c63 (patch) | |
tree | adc37da02bd27404dc39f31965b09bd2e9110223 | |
parent | 1d6462d9c03c620d24d113443d24fcfce984c817 (diff) |
Add translingual definitions
-rw-r--r-- | index.html | 41 | ||||
-rw-r--r-- | src/definitions.rs | 185 |
2 files changed, 148 insertions, 78 deletions
@@ -16,6 +16,7 @@ <p> These are various lists of words extracted from Wiktionary data dumps. Some of the code used to produce them is available <a href="https://github.com/pommicket/wiktionary" target="_blank">here</a>.<br> + Of course, all these lists undoubtedly contain errors because Wiktionary contains errors.<br> You can do whatever you like with them, subject to <a href="https://en.wiktionary.org/wiki/Wiktionary:Copyrights" target="_blank">Wiktionary's licensing</a>, where applicable. </p> @@ -26,10 +27,44 @@ Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process). </li> <li> - English definitions: <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (22MB compressed, 115MB uncompressed, 1,629,682 entries)</a>.¹<br> - Every English definition in English wiktionary. Format is <code style="white-space: pre;">WORD DEFINITION</code> - on each line (note: delimiter is <b>2</b> spaces).<br> + English definitions: + <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)</a> + and<br>Translingual definitions: + <a href="/wiktionary/trans-definitions.txt.xz">trans-definitions.txt.xz (MB compressed, MB uncompressed, entries)</a>.¹<br> + Every English/Translingual definition in English wiktionary. + Format is <code style="white-space: pre;">WORD PART_OF_SPEECH DEFINITION</code> + on each line (note the two spaces between word and part of speech).<br> Words can have multiple definitions; they are listed as separate lines.<br> + <code>PART_OF_SPEECH</code> is one of the following: + <ul> + <li><code>%adjective</code> (e.g. <i>unbelievable</i>)</li> + <li><code>%noun</code> (e.g. <i>belief</i>)</li> + <li><code>%noun.proper</code> (e.g. <i>France</i>)</li> + <li><code>%verb</code> (e.g. <i>believe</i>)</li> + <li><code>%adverb</code> (e.g. <i>unbelievably</i>)</li> + <li><code>%interjection</code> (e.g. 
<i>yowza</i>)</li> + <li><code>%particle</code> (e.g. <i>O</i>)</li> + <li><code>%conjunction</code> (e.g. <i>unless</i>)</li> + <li><code>%preposition</code> (e.g. <i>into</i>)</li> + <li><code>%determiner</code> (e.g. <i>the</i>)</li> + <li><code>%pronoun</code> (e.g. <i>yourself</i>)</li> + <li><code>%contraction</code> (e.g. <i>woulda</i>)</li> + <li><code>%number</code> (e.g. <i>2</i>, <i>twenty-seven</i>)</li> + <li><code>%phrase</code> (e.g. <i>you'd better believe it</i>)</li> + <li><code>%phrase.prepositional</code> (e.g. <i>beyond belief</i>)</li> + <li><code>%phrase.proverb</code> (e.g. <i>seeing is believing</i>)</li> + <li><code>%affix</code> (e.g. <i>🅱</i>, a “simulfix”)</li> + <li><code>%affix.prefix</code> (e.g. <i>un-</i>)</li> + <li><code>%affix.suffix</code> (e.g. <i>-ism</i>)</li> + <li><code>%affix.infix</code> (e.g. <i>-fuckin-</i>)</li> + <li><code>%affix.circumfix</code> (e.g. <i>a- -ing</i>)</li> + <li><code>%affix.interfix</code> (rare, e.g. <i>-retin-</i>)</li> + <li><code>%symbol</code> (e.g. <i>℞</i>)</li> + <li><code>%symbol.punctuation</code> (e.g. <i>…</i>)</li> + <li><code>%symbol.letter</code> (e.g. <i>b</i>)</li> + <li><code>%symbol.diacritic</code> (e.g. <i>◌́</i>)</li> + <li><code>%unknown</code> — couldn’t be determined/none of the above</li> + </ul> <code>DEFINITION</code> is in the wikitext format.<br> It’s possible that there are parsing errors, but I haven’t spotted any yet. 
</li> diff --git a/src/definitions.rs b/src/definitions.rs index bb7eb04..d3d0192 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -66,7 +66,6 @@ enum Section { Conjunction, PrepositionalPhrase, Proverb, - Idiom, Phrase, Suffix, Prefix, @@ -80,7 +79,6 @@ enum Section { PunctuationMark, DiacriticalMark, Determiner, - Participle, Particle, Contraction, Letter, @@ -97,7 +95,8 @@ impl Section { "Adjective" | "Proper adjective" | "Adjectives" => Adjective, "Noun" => Noun, "Proper noun" => ProperNoun, - "Verb" | "Verb phrase" | "Verb form" => Verb, + // All but Verb are not used enough to warrant their own categories + "Verb" | "Verb phrase" | "Verb form" | "Participle" => Verb, "Adverbial phrase" | "Adverb" => Adverb, "Interjection" => Interjection, "Conjunction" => Conjunction, @@ -109,20 +108,18 @@ impl Section { "Infix" => Infix, "Interfix" => Interfix, "Pronoun" => Pronoun, - "Phrase" => Phrase, - "Symbol" => Symbol, + // Idiom is not used enough to warrant its own category (only appears 12 times) + "Phrase" | "Idiom" => Phrase, + "Symbol" | "Cuneiform sign" | "Iteration mark" => Symbol, "Preposition" => Preposition, "Punctuation mark" => PunctuationMark, - "Diacritical mark" => DiacriticalMark, + "Diacritical mark" | "Diacritic" => DiacriticalMark, "Article" | "Determiner" => Determiner, - "Participle" => Participle, "Particle" => Particle, "Contraction" => Contraction, - "Idiom" => Idiom, "Letter" => Letter, - "Affix" | "Combining form" => Affix, - // currently at least ev (abbr. 
for even, ever, every) has this designation - "Multiple parts of speech" => UnknownPoS, + "Affix" | "Combining form" | "Simulfix" => Affix, + "Multiple parts of speech" | "Syllable" => UnknownPoS, // 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper" "Proper" | "Proper Noun" => ProperNoun, x if x.starts_with("Proper noun ") => ProperNoun, @@ -185,11 +182,37 @@ impl Section { | "Declension" | "Synonyms and related terms" | "Additional notes" - | "Related vocabulary" => NotDefinition, - x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition, + | "Related vocabulary" + | "Glyph origin" + | "Han character" + | "Derived characters" + | "Related characters" + | "Related symbols" + | "Design" + | "Forms" + | "Ligature" + | "Derived signs" + | "Derived symbols" + | "Derived glyphs" + | "Production" + | "Example" + | "Origin" + | "Bibliography" + | "Formation" + | "Derived Characters" + | "Composition" + | "Depiction" + | "Derived forms" => NotDefinition, + x if x.starts_with("Pronunciation") + || x.starts_with("Etymology") + || x.starts_with("Terms suffixed with ") + || x.starts_with("Symbol origin") => + { + NotDefinition + } // mistakes that exist in 20250701 dump "Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes" - | "Translate" => NotDefinition, + | "Translate" | "Etymolohy" | "Derived chracters" => NotDefinition, _ => { eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})"); return None; @@ -208,7 +231,6 @@ impl Section { Conjunction => "conjunction", PrepositionalPhrase => "phrase.prepositional", Proverb => "phrase.proverb", - Idiom => "phrase.idiom", Phrase => "phrase", Suffix => "affix.suffix", Prefix => "affix.prefix", @@ -218,14 +240,13 @@ impl Section { Affix => "affix", Pronoun => "pronoun", Symbol => "symbol", + PunctuationMark => "symbol.punctuation", + Letter => "symbol.letter", + DiacriticalMark => "symbol.diacritic", Preposition => "preposition", - PunctuationMark 
=> "punctuation", - DiacriticalMark => "diacritic", Determiner => "determiner", - Participle => "participle", Particle => "particle", Contraction => "contraction", - Letter => "letter", Number => "number", UnknownPoS => "unknown", NotDefinition => panic!(), @@ -233,7 +254,11 @@ impl Section { } } -fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { +fn parse_xml( + reader: impl BufRead, + output: &mut Definitions, + language: &str, +) -> Result<(), Box<dyn Error>> { let mut config = xml::reader::ParserConfig::default(); config.cdata_to_characters = true; #[derive(Debug, Clone, Copy)] @@ -281,52 +306,54 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d if name.local_name == "page" { title.clear(); } else if name.local_name == "text" { - if ns == 0 - && let Some(eng_start) = body.find("==English==\n") - { - let mut curr_section = None; - let eng = &body[eng_start..]; - let eng_end = eng - .as_bytes() - .windows(4) - .position(|w| w.starts_with(b"\n==") && w[3] != b'=') - .unwrap_or(eng.len()); - let eng = &eng[..eng_end]; - for (i, w) in eng.as_bytes().windows(3).enumerate() { - if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") { - let mut section = &eng[i + 3..]; - while let Some(s) = section.strip_prefix('=') { - section = s; - } - let Some((section, _)) = section - .split_once('\n') - .and_then(|(first_line, _)| first_line.split_once('=')) - else { - continue; - }; - curr_section = Section::from_name(section, &title); - continue; - } - if curr_section == Some(Section::NotDefinition) { - continue; + let body = std::mem::take(&mut body); + if ns != 0 { + continue; + } + let Some(lang_start) = body.find(&format!("=={language}==\n")) else { + continue; + }; + let mut curr_section = None; + let lang = &body[lang_start..]; + let lang_end = lang + .as_bytes() + .windows(4) + .position(|w| w.starts_with(b"\n==") && w[3] != b'=') + .unwrap_or(lang.len()); + let lang = &lang[..lang_end]; + for 
(i, w) in lang.as_bytes().windows(3).enumerate() { + if w == b"\n==" && lang.get(i + 3..i + 4) == Some("=") { + let mut section = &lang[i + 3..]; + while let Some(s) = section.strip_prefix('=') { + section = s; } - if w != b"\n# " { + let Some((section, _)) = section + .split_once('\n') + .and_then(|(first_line, _)| first_line.split_once('=')) + else { continue; - } - let definition = - eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); - let definition = remove_comments(definition); - if curr_section.is_none() { + }; + curr_section = Section::from_name(section, &title); + continue; + } + if curr_section == Some(Section::NotDefinition) { + continue; + } + if w != b"\n# " { + continue; + } + let definition = lang[i + 3..] + .split_once('\n') + .map_or(&lang[i + 3..], |x| x.0); + let definition = remove_comments(definition); + let curr_section = curr_section.unwrap_or_else(|| { + if false { eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}"); } - output.add_definition( - &title[..], - curr_section.unwrap_or(Section::UnknownPoS), - &definition, - ); - } + Section::UnknownPoS + }); + output.add_definition(&title[..], curr_section, &definition); } - body.clear(); } else if name.local_name == "ns" { ns = ns_str.parse().unwrap_or(1); } @@ -338,14 +365,17 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d } pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { - let mut output = Definitions::default(); let mut files: Vec<String> = vec![]; for arg in args { if arg == "-h" || arg == "--help" { println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); - println!(" Extract English-language definitions from Wiktionary"); - println!(" data dump files, writing output to definitions.txt."); - println!(" Each line of the output file is of the format: Word Definition"); + println!(" Extract English-language and Translingual definitions from Wiktionary"); + println!( + " data dump files, writing output to 
en-definitions.txt and trans-definitions.txt." + ); + println!( + " Each line of the output file is of the format: Word Part_of_Speech Definition" + ); println!( " Note the two spaces—this avoids ambiguity when the word contains a space." ); @@ -400,17 +430,22 @@ pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { return Err("Aborted.".into()); } } - for input_filename in &files { - let input = File::open(input_filename) - .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; - let reader = BufReader::new(input); - println!("Parsing {input_filename}..."); - parse_xml(reader, &mut output) - .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + for (lang, abbrev) in [("English", "en"), ("Translingual", "trans")] { + let mut output = Definitions::default(); + for input_filename in &files { + let input = File::open(input_filename) + .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; + let reader = BufReader::new(input); + println!("({lang}) Parsing {input_filename}..."); + parse_xml(reader, &mut output, lang) + .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + } + println!("Sorting {} definitions...", output.definitions.len()); + output.sort(); + crate::do_write(&format!("{abbrev}-definitions.txt"), |writer| { + output.write_to(writer) + })?; } - println!("Sorting {} definitions...", output.definitions.len()); - output.sort(); - crate::do_write("definitions.txt", |writer| output.write_to(writer))?; println!("Done!"); Ok(()) } |