Add translingual definitions

author: pommicket <pommicket@gmail.com> 2025-09-25 14:59:07 -0400
committer: pommicket <pommicket@gmail.com> 2025-09-25 14:59:07 -0400
commit: 52f08356aedd7ff2bc5c3fdb9effac98d98b0c63 (patch)
tree: adc37da02bd27404dc39f31965b09bd2e9110223
parent: 1d6462d9c03c620d24d113443d24fcfce984c817 (diff)
2 files changed, 148 insertions, 78 deletions
diff --git a/index.html b/index.html
index 3bd880c..116827f 100644
--- a/index.html
+++ b/index.html
@@ -16,6 +16,7 @@
 	<p>
 		These are various lists of words extracted from Wiktionary data dumps. Some of the code
 		used to produce them is available <a href="https://github.com/pommicket/wiktionary" target="_blank">here</a>.<br>
+		Of course, all these lists undoubtedly contain errors because Wiktionary contains errors.<br>
 		You can do whatever you like with them, subject to
 		<a href="https://en.wiktionary.org/wiki/Wiktionary:Copyrights" target="_blank">Wiktionary's licensing</a>, where applicable.
 	</p>
@@ -26,10 +27,44 @@
 			Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
 		</li>
 		<li>
-			English definitions: <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (22MB compressed, 115MB uncompressed, 1,629,682 entries)</a>.¹<br>
-			Every English definition in English wiktionary. Format is <code style="white-space: pre;">WORD  DEFINITION</code>
-			on each line (note: delimiter is <b>2</b> spaces).<br>
+			English definitions:
+			<a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)</a>
+			and<br>Translingual definitions:
+			<a href="/wiktionary/trans-definitions.txt.xz">trans-definitions.txt.xz (MB compressed, MB uncompressed, entries)</a>.¹<br>
+			Every English/Translingual definition in English Wiktionary.
+			Format is <code style="white-space: pre;">WORD  PART_OF_SPEECH DEFINITION</code>
+			on each line (note the two spaces between word and part of speech).<br>
 			Words can have multiple definitions; they are listed as separate lines.<br>
+			<code>PART_OF_SPEECH</code> is one of the following:
+			<ul>
+				<li><code>%adjective</code> (e.g. <i>unbelievable</i>)</li>
+				<li><code>%noun</code> (e.g. <i>belief</i>)</li>
+				<li><code>%noun.proper</code> (e.g. <i>France</i>)</li>
+				<li><code>%verb</code> (e.g. <i>believe</i>)</li>
+				<li><code>%adverb</code> (e.g. <i>unbelievably</i>)</li>
+				<li><code>%interjection</code> (e.g. <i>yowza</i>)</li>
+				<li><code>%particle</code> (e.g. <i>O</i>)</li>
+				<li><code>%conjunction</code> (e.g. <i>unless</i>)</li>
+				<li><code>%preposition</code> (e.g. <i>into</i>)</li>
+				<li><code>%determiner</code> (e.g. <i>the</i>)</li>
+				<li><code>%pronoun</code> (e.g. <i>yourself</i>)</li>
+				<li><code>%contraction</code> (e.g. <i>woulda</i>)</li>
+				<li><code>%number</code> (e.g. <i>2</i>, <i>twenty-seven</i>)</li>
+				<li><code>%phrase</code> (e.g. <i>you'd better believe it</i>)</li>
+				<li><code>%phrase.prepositional</code> (e.g. <i>beyond belief</i>)</li>
+				<li><code>%phrase.proverb</code> (e.g. <i>seeing is believing</i>)</li>
+				<li><code>%affix</code> (e.g. <i>🅱</i>, a “simulfix”)</li>
+				<li><code>%affix.prefix</code> (e.g. <i>un-</i>)</li>
+				<li><code>%affix.suffix</code> (e.g. <i>-ism</i>)</li>
+				<li><code>%affix.infix</code> (e.g. <i>-fuckin-</i>)</li>
+				<li><code>%affix.circumfix</code> (e.g. <i>a- -ing</i>)</li>
+				<li><code>%affix.interfix</code> (rare, e.g. <i>-retin-</i>)</li>
+				<li><code>%symbol</code> (e.g. <i>℞</i>)</li>
+				<li><code>%symbol.punctuation</code> (e.g. <i>…</i>)</li>
+				<li><code>%symbol.letter</code> (e.g. <i>b</i>)</li>
+				<li><code>%symbol.diacritic</code> (e.g. <i>◌́</i>)</li>
+				<li><code>%unknown</code> — couldn’t be determined/none of the above</li>
+			</ul>
 			<code>DEFINITION</code> is in the wikitext format.<br>
 			It’s possible that there are parsing errors, but I haven’t spotted any yet.
 		</li>
diff --git a/src/definitions.rs b/src/definitions.rs
index bb7eb04..d3d0192 100644
--- a/src/definitions.rs
+++ b/src/definitions.rs
@@ -66,7 +66,6 @@ enum Section {
 	Conjunction,
 	PrepositionalPhrase,
 	Proverb,
-	Idiom,
 	Phrase,
 	Suffix,
 	Prefix,
@@ -80,7 +79,6 @@ enum Section {
 	PunctuationMark,
 	DiacriticalMark,
 	Determiner,
-	Participle,
 	Particle,
 	Contraction,
 	Letter,
@@ -97,7 +95,8 @@ impl Section {
 			"Adjective" | "Proper adjective" | "Adjectives" => Adjective,
 			"Noun" => Noun,
 			"Proper noun" => ProperNoun,
-			"Verb" | "Verb phrase" | "Verb form" => Verb,
+			// "Verb phrase", "Verb form" and "Participle" are not used enough to warrant their own categories
+			"Verb" | "Verb phrase" | "Verb form" | "Participle" => Verb,
 			"Adverbial phrase" | "Adverb" => Adverb,
 			"Interjection" => Interjection,
 			"Conjunction" => Conjunction,
@@ -109,20 +108,18 @@ impl Section {
 			"Infix" => Infix,
 			"Interfix" => Interfix,
 			"Pronoun" => Pronoun,
-			"Phrase" => Phrase,
-			"Symbol" => Symbol,
+			// Idiom is not used enough to warrant its own category (only appears 12 times)
+			"Phrase" | "Idiom" => Phrase,
+			"Symbol" | "Cuneiform sign" | "Iteration mark" => Symbol,
 			"Preposition" => Preposition,
 			"Punctuation mark" => PunctuationMark,
-			"Diacritical mark" => DiacriticalMark,
+			"Diacritical mark" | "Diacritic" => DiacriticalMark,
 			"Article" | "Determiner" => Determiner,
-			"Participle" => Participle,
 			"Particle" => Particle,
 			"Contraction" => Contraction,
-			"Idiom" => Idiom,
 			"Letter" => Letter,
-			"Affix" | "Combining form" => Affix,
-			// currently at least ev (abbr. for even, ever, every) has this designation
-			"Multiple parts of speech" => UnknownPoS,
+			"Affix" | "Combining form" | "Simulfix" => Affix,
+			"Multiple parts of speech" | "Syllable" => UnknownPoS,
 			// 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
 			"Proper" | "Proper Noun" => ProperNoun,
 			x if x.starts_with("Proper noun ") => ProperNoun,
@@ -185,11 +182,37 @@ impl Section {
 			| "Declension"
 			| "Synonyms and related terms"
 			| "Additional notes"
-			| "Related vocabulary" => NotDefinition,
-			x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition,
+			| "Related vocabulary"
+			| "Glyph origin"
+			| "Han character"
+			| "Derived characters"
+			| "Related characters"
+			| "Related symbols"
+			| "Design"
+			| "Forms"
+			| "Ligature"
+			| "Derived signs"
+			| "Derived symbols"
+			| "Derived glyphs"
+			| "Production"
+			| "Example"
+			| "Origin"
+			| "Bibliography"
+			| "Formation"
+			| "Derived Characters"
+			| "Composition"
+			| "Depiction"
+			| "Derived forms" => NotDefinition,
+			x if x.starts_with("Pronunciation")
+				|| x.starts_with("Etymology")
+				|| x.starts_with("Terms suffixed with ")
+				|| x.starts_with("Symbol origin") =>
+			{
+				NotDefinition
+			}
 			// mistakes that exist in 20250701 dump
 			"Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
-			| "Translate" => NotDefinition,
+			| "Translate" | "Etymolohy" | "Derived chracters" => NotDefinition,
 			_ => {
 				eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
 				return None;
@@ -208,7 +231,6 @@ impl Section {
 			Conjunction => "conjunction",
 			PrepositionalPhrase => "phrase.prepositional",
 			Proverb => "phrase.proverb",
-			Idiom => "phrase.idiom",
 			Phrase => "phrase",
 			Suffix => "affix.suffix",
 			Prefix => "affix.prefix",
@@ -218,14 +240,13 @@ impl Section {
 			Affix => "affix",
 			Pronoun => "pronoun",
 			Symbol => "symbol",
+			PunctuationMark => "symbol.punctuation",
+			Letter => "symbol.letter",
+			DiacriticalMark => "symbol.diacritic",
 			Preposition => "preposition",
-			PunctuationMark => "punctuation",
-			DiacriticalMark => "diacritic",
 			Determiner => "determiner",
-			Participle => "participle",
 			Particle => "particle",
 			Contraction => "contraction",
-			Letter => "letter",
 			Number => "number",
 			UnknownPoS => "unknown",
 			NotDefinition => panic!(),
@@ -233,7 +254,11 @@ impl Section {
 	}
 }
 
-fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+fn parse_xml(
+	reader: impl BufRead,
+	output: &mut Definitions,
+	language: &str,
+) -> Result<(), Box<dyn Error>> {
 	let mut config = xml::reader::ParserConfig::default();
 	config.cdata_to_characters = true;
 	#[derive(Debug, Clone, Copy)]
@@ -281,52 +306,54 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d
 				if name.local_name == "page" {
 					title.clear();
 				} else if name.local_name == "text" {
-					if ns == 0
-						&& let Some(eng_start) = body.find("==English==\n")
-					{
-						let mut curr_section = None;
-						let eng = &body[eng_start..];
-						let eng_end = eng
-							.as_bytes()
-							.windows(4)
-							.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
-							.unwrap_or(eng.len());
-						let eng = &eng[..eng_end];
-						for (i, w) in eng.as_bytes().windows(3).enumerate() {
-							if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") {
-								let mut section = &eng[i + 3..];
-								while let Some(s) = section.strip_prefix('=') {
-									section = s;
-								}
-								let Some((section, _)) = section
-									.split_once('\n')
-									.and_then(|(first_line, _)| first_line.split_once('='))
-								else {
-									continue;
-								};
-								curr_section = Section::from_name(section, &title);
-								continue;
-							}
-							if curr_section == Some(Section::NotDefinition) {
-								continue;
+					let body = std::mem::take(&mut body);
+					if ns != 0 {
+						continue;
+					}
+					let Some(lang_start) = body.find(&format!("=={language}==\n")) else {
+						continue;
+					};
+					let mut curr_section = None;
+					let lang = &body[lang_start..];
+					let lang_end = lang
+						.as_bytes()
+						.windows(4)
+						.position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+						.unwrap_or(lang.len());
+					let lang = &lang[..lang_end];
+					for (i, w) in lang.as_bytes().windows(3).enumerate() {
+						if w == b"\n==" && lang.get(i + 3..i + 4) == Some("=") {
+							let mut section = &lang[i + 3..];
+							while let Some(s) = section.strip_prefix('=') {
+								section = s;
 							}
-							if w != b"\n# " {
+							let Some((section, _)) = section
+								.split_once('\n')
+								.and_then(|(first_line, _)| first_line.split_once('='))
+							else {
 								continue;
-							}
-							let definition =
-								eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
-							let definition = remove_comments(definition);
-							if curr_section.is_none() {
+							};
+							curr_section = Section::from_name(section, &title);
+							continue;
+						}
+						if curr_section == Some(Section::NotDefinition) {
+							continue;
+						}
+						if w != b"\n# " {
+							continue;
+						}
+						let definition = lang[i + 3..]
+							.split_once('\n')
+							.map_or(&lang[i + 3..], |x| x.0);
+						let definition = remove_comments(definition);
+						let curr_section = curr_section.unwrap_or_else(|| {
+							if false {
 								eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
 							}
-							output.add_definition(
-								&title[..],
-								curr_section.unwrap_or(Section::UnknownPoS),
-								&definition,
-							);
-						}
+							Section::UnknownPoS
+						});
+						output.add_definition(&title[..], curr_section, &definition);
 					}
-					body.clear();
 				} else if name.local_name == "ns" {
 					ns = ns_str.parse().unwrap_or(1);
 				}
@@ -338,14 +365,17 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d
 }
 
 pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
-	let mut output = Definitions::default();
 	let mut files: Vec<String> = vec![];
 	for arg in args {
 		if arg == "-h" || arg == "--help" {
 			println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
-			println!("    Extract English-language definitions from Wiktionary");
-			println!("    data dump files, writing output to definitions.txt.");
-			println!("    Each line of the output file is of the format: Word  Definition");
+			println!("    Extract English-language and Translingual definitions from Wiktionary");
+			println!(
+				"    data dump files, writing output to en-definitions.txt and trans-definitions.txt."
+			);
+			println!(
+				"    Each line of the output file is of the format: Word  Part_of_Speech Definition"
+			);
 			println!(
 				"    Note the two spaces—this avoids ambiguity when the word contains a space."
 			);
@@ -400,17 +430,22 @@ pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
 			return Err("Aborted.".into());
 		}
 	}
-	for input_filename in &files {
-		let input = File::open(input_filename)
-			.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
-		let reader = BufReader::new(input);
-		println!("Parsing {input_filename}...");
-		parse_xml(reader, &mut output)
-			.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+	for (lang, abbrev) in [("English", "en"), ("Translingual", "trans")] {
+		let mut output = Definitions::default();
+		for input_filename in &files {
+			let input = File::open(input_filename)
+				.map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+			let reader = BufReader::new(input);
+			println!("({lang}) Parsing {input_filename}...");
+			parse_xml(reader, &mut output, lang)
+				.map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+		}
+		println!("Sorting {} definitions...", output.definitions.len());
+		output.sort();
+		crate::do_write(&format!("{abbrev}-definitions.txt"), |writer| {
+			output.write_to(writer)
+		})?;
 	}
-	println!("Sorting {} definitions...", output.definitions.len());
-	output.sort();
-	crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
 	println!("Done!");
 	Ok(())
 }
author	pommicket <pommicket@gmail.com>	2025-09-25 14:59:07 -0400
committer	pommicket <pommicket@gmail.com>	2025-09-25 14:59:07 -0400
commit	52f08356aedd7ff2bc5c3fdb9effac98d98b0c63 (patch)
tree	adc37da02bd27404dc39f31965b09bd2e9110223
parent	1d6462d9c03c620d24d113443d24fcfce984c817 (diff)