summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- index.html 41
-rw-r--r-- src/definitions.rs 185
2 files changed, 148 insertions, 78 deletions
diff --git a/index.html b/index.html
index 3bd880c..116827f 100644
--- a/index.html
+++ b/index.html
@@ -16,6 +16,7 @@
<p>
These are various lists of words extracted from Wiktionary data dumps. Some of the code
used to produce them is available <a href="https://github.com/pommicket/wiktionary" target="_blank">here</a>.<br>
+ Of course, all these lists undoubtedly contain errors because Wiktionary contains errors.<br>
You can do whatever you like with them, subject to
<a href="https://en.wiktionary.org/wiki/Wiktionary:Copyrights" target="_blank">Wiktionary's licensing</a>, where applicable.
</p>
@@ -26,10 +27,44 @@
Words labelled <i>offensive</i> on Wiktionary were filtered out (overly aggressively—some totally inoffensive words were removed in the process).
</li>
<li>
- English definitions: <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (22MB compressed, 115MB uncompressed, 1,629,682 entries)</a>.¹<br>
- Every English definition in English wiktionary. Format is <code style="white-space: pre;">WORD DEFINITION</code>
- on each line (note: delimiter is <b>2</b> spaces).<br>
+ English definitions:
+ <a href="/wiktionary/en-definitions.txt.xz">en-definitions.txt.xz (23MB compressed, 127MB uncompressed, 1,629,482 entries)</a>
+ and<br>Translingual definitions:
+ <a href="/wiktionary/trans-definitions.txt.xz">trans-definitions.txt.xz (MB compressed, MB uncompressed, entries)</a>.¹<br>
+ Every English/Translingual definition in English Wiktionary.
+ Format is <code style="white-space: pre;">WORD  PART_OF_SPEECH DEFINITION</code>
+ on each line (note the two spaces between word and part of speech).<br>
Words can have multiple definitions; they are listed as separate lines.<br>
+ <code>PART_OF_SPEECH</code> is one of the following:
+ <ul>
+ <li><code>%adjective</code> (e.g. <i>unbelievable</i>)</li>
+ <li><code>%noun</code> (e.g. <i>belief</i>)</li>
+ <li><code>%noun.proper</code> (e.g. <i>France</i>)</li>
+ <li><code>%verb</code> (e.g. <i>believe</i>)</li>
+ <li><code>%adverb</code> (e.g. <i>unbelievably</i>)</li>
+ <li><code>%interjection</code> (e.g. <i>yowza</i>)</li>
+ <li><code>%particle</code> (e.g. <i>O</i>)</li>
+ <li><code>%conjunction</code> (e.g. <i>unless</i>)</li>
+ <li><code>%preposition</code> (e.g. <i>into</i>)</li>
+ <li><code>%determiner</code> (e.g. <i>the</i>)</li>
+ <li><code>%pronoun</code> (e.g. <i>yourself</i>)</li>
+ <li><code>%contraction</code> (e.g. <i>woulda</i>)</li>
+ <li><code>%number</code> (e.g. <i>2</i>, <i>twenty-seven</i>)</li>
+ <li><code>%phrase</code> (e.g. <i>you'd better believe it</i>)</li>
+ <li><code>%phrase.prepositional</code> (e.g. <i>beyond belief</i>)</li>
+ <li><code>%phrase.proverb</code> (e.g. <i>seeing is believing</i>)</li>
+ <li><code>%affix</code> (e.g. <i>🅱</i>, a “simulfix”)</li>
+ <li><code>%affix.prefix</code> (e.g. <i>un-</i>)</li>
+ <li><code>%affix.suffix</code> (e.g. <i>-ism</i>)</li>
+ <li><code>%affix.infix</code> (e.g. <i>-fuckin-</i>)</li>
+ <li><code>%affix.circumfix</code> (e.g. <i>a- -ing</i>)</li>
+ <li><code>%affix.interfix</code> (rare, e.g. <i>-retin-</i>)</li>
+ <li><code>%symbol</code> (e.g. <i>℞</i>)</li>
+ <li><code>%symbol.punctuation</code> (e.g. <i>…</i>)</li>
+ <li><code>%symbol.letter</code> (e.g. <i>b</i>)</li>
+ <li><code>%symbol.diacritic</code> (e.g. <i>◌́</i>)</li>
+ <li><code>%unknown</code> — couldn’t be determined/none of the above</li>
+ </ul>
<code>DEFINITION</code> is in the wikitext format.<br>
It’s possible that there are parsing errors, but I haven’t spotted any yet.
</li>
diff --git a/src/definitions.rs b/src/definitions.rs
index bb7eb04..d3d0192 100644
--- a/src/definitions.rs
+++ b/src/definitions.rs
@@ -66,7 +66,6 @@ enum Section {
Conjunction,
PrepositionalPhrase,
Proverb,
- Idiom,
Phrase,
Suffix,
Prefix,
@@ -80,7 +79,6 @@ enum Section {
PunctuationMark,
DiacriticalMark,
Determiner,
- Participle,
Particle,
Contraction,
Letter,
@@ -97,7 +95,8 @@ impl Section {
"Adjective" | "Proper adjective" | "Adjectives" => Adjective,
"Noun" => Noun,
"Proper noun" => ProperNoun,
- "Verb" | "Verb phrase" | "Verb form" => Verb,
+ // All but Verb are not used enough to warrant their own categories
+ "Verb" | "Verb phrase" | "Verb form" | "Participle" => Verb,
"Adverbial phrase" | "Adverb" => Adverb,
"Interjection" => Interjection,
"Conjunction" => Conjunction,
@@ -109,20 +108,18 @@ impl Section {
"Infix" => Infix,
"Interfix" => Interfix,
"Pronoun" => Pronoun,
- "Phrase" => Phrase,
- "Symbol" => Symbol,
+ // Idiom is not used enough to warrant its own category (only appears 12 times)
+ "Phrase" | "Idiom" => Phrase,
+ "Symbol" | "Cuneiform sign" | "Iteration mark" => Symbol,
"Preposition" => Preposition,
"Punctuation mark" => PunctuationMark,
- "Diacritical mark" => DiacriticalMark,
+ "Diacritical mark" | "Diacritic" => DiacriticalMark,
"Article" | "Determiner" => Determiner,
- "Participle" => Participle,
"Particle" => Particle,
"Contraction" => Contraction,
- "Idiom" => Idiom,
"Letter" => Letter,
- "Affix" | "Combining form" => Affix,
- // currently at least ev (abbr. for even, ever, every) has this designation
- "Multiple parts of speech" => UnknownPoS,
+ "Affix" | "Combining form" | "Simulfix" => Affix,
+ "Multiple parts of speech" | "Syllable" => UnknownPoS,
// 20250701 erroneously has "Proper noun 1" and "Proper Noun" and "Proper"
"Proper" | "Proper Noun" => ProperNoun,
x if x.starts_with("Proper noun ") => ProperNoun,
@@ -185,11 +182,37 @@ impl Section {
| "Declension"
| "Synonyms and related terms"
| "Additional notes"
- | "Related vocabulary" => NotDefinition,
- x if x.starts_with("Pronunciation") || x.starts_with("Etymology") => NotDefinition,
+ | "Related vocabulary"
+ | "Glyph origin"
+ | "Han character"
+ | "Derived characters"
+ | "Related characters"
+ | "Related symbols"
+ | "Design"
+ | "Forms"
+ | "Ligature"
+ | "Derived signs"
+ | "Derived symbols"
+ | "Derived glyphs"
+ | "Production"
+ | "Example"
+ | "Origin"
+ | "Bibliography"
+ | "Formation"
+ | "Derived Characters"
+ | "Composition"
+ | "Depiction"
+ | "Derived forms" => NotDefinition,
+ x if x.starts_with("Pronunciation")
+ || x.starts_with("Etymology")
+ || x.starts_with("Terms suffixed with ")
+ || x.starts_with("Symbol origin") =>
+ {
+ NotDefinition
+ }
// mistakes that exist in 20250701 dump
"Alternate forms" | "English" | "Etymyology" | "See Also" | "Usage Notes"
- | "Translate" => NotDefinition,
+ | "Translate" | "Etymolohy" | "Derived chracters" => NotDefinition,
_ => {
eprintln!("\x1b[1mUnrecognized section {section}\x1b[0m (in page {title})");
return None;
@@ -208,7 +231,6 @@ impl Section {
Conjunction => "conjunction",
PrepositionalPhrase => "phrase.prepositional",
Proverb => "phrase.proverb",
- Idiom => "phrase.idiom",
Phrase => "phrase",
Suffix => "affix.suffix",
Prefix => "affix.prefix",
@@ -218,14 +240,13 @@ impl Section {
Affix => "affix",
Pronoun => "pronoun",
Symbol => "symbol",
+ PunctuationMark => "symbol.punctuation",
+ Letter => "symbol.letter",
+ DiacriticalMark => "symbol.diacritic",
Preposition => "preposition",
- PunctuationMark => "punctuation",
- DiacriticalMark => "diacritic",
Determiner => "determiner",
- Participle => "participle",
Particle => "particle",
Contraction => "contraction",
- Letter => "letter",
Number => "number",
UnknownPoS => "unknown",
NotDefinition => panic!(),
@@ -233,7 +254,11 @@ impl Section {
}
}
-fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
+fn parse_xml(
+ reader: impl BufRead,
+ output: &mut Definitions,
+ language: &str,
+) -> Result<(), Box<dyn Error>> {
let mut config = xml::reader::ParserConfig::default();
config.cdata_to_characters = true;
#[derive(Debug, Clone, Copy)]
@@ -281,52 +306,54 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d
if name.local_name == "page" {
title.clear();
} else if name.local_name == "text" {
- if ns == 0
- && let Some(eng_start) = body.find("==English==\n")
- {
- let mut curr_section = None;
- let eng = &body[eng_start..];
- let eng_end = eng
- .as_bytes()
- .windows(4)
- .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
- .unwrap_or(eng.len());
- let eng = &eng[..eng_end];
- for (i, w) in eng.as_bytes().windows(3).enumerate() {
- if w == b"\n==" && eng.get(i + 3..i + 4) == Some("=") {
- let mut section = &eng[i + 3..];
- while let Some(s) = section.strip_prefix('=') {
- section = s;
- }
- let Some((section, _)) = section
- .split_once('\n')
- .and_then(|(first_line, _)| first_line.split_once('='))
- else {
- continue;
- };
- curr_section = Section::from_name(section, &title);
- continue;
- }
- if curr_section == Some(Section::NotDefinition) {
- continue;
+ let body = std::mem::take(&mut body);
+ if ns != 0 {
+ continue;
+ }
+ let Some(lang_start) = body.find(&format!("=={language}==\n")) else {
+ continue;
+ };
+ let mut curr_section = None;
+ let lang = &body[lang_start..];
+ let lang_end = lang
+ .as_bytes()
+ .windows(4)
+ .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
+ .unwrap_or(lang.len());
+ let lang = &lang[..lang_end];
+ for (i, w) in lang.as_bytes().windows(3).enumerate() {
+ if w == b"\n==" && lang.get(i + 3..i + 4) == Some("=") {
+ let mut section = &lang[i + 3..];
+ while let Some(s) = section.strip_prefix('=') {
+ section = s;
}
- if w != b"\n# " {
+ let Some((section, _)) = section
+ .split_once('\n')
+ .and_then(|(first_line, _)| first_line.split_once('='))
+ else {
continue;
- }
- let definition =
- eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
- let definition = remove_comments(definition);
- if curr_section.is_none() {
+ };
+ curr_section = Section::from_name(section, &title);
+ continue;
+ }
+ if curr_section == Some(Section::NotDefinition) {
+ continue;
+ }
+ if w != b"\n# " {
+ continue;
+ }
+ let definition = lang[i + 3..]
+ .split_once('\n')
+ .map_or(&lang[i + 3..], |x| x.0);
+ let definition = remove_comments(definition);
+ let curr_section = curr_section.unwrap_or_else(|| {
+ if false {
eprintln!("\x1b[1mMissing part of speech\x1b[0m for {title}");
}
- output.add_definition(
- &title[..],
- curr_section.unwrap_or(Section::UnknownPoS),
- &definition,
- );
- }
+ Section::UnknownPoS
+ });
+ output.add_definition(&title[..], curr_section, &definition);
}
- body.clear();
} else if name.local_name == "ns" {
ns = ns_str.parse().unwrap_or(1);
}
@@ -338,14 +365,17 @@ fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<d
}
pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
- let mut output = Definitions::default();
let mut files: Vec<String> = vec![];
for arg in args {
if arg == "-h" || arg == "--help" {
println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
- println!(" Extract English-language definitions from Wiktionary");
- println!(" data dump files, writing output to definitions.txt.");
- println!(" Each line of the output file is of the format: Word Definition");
+ println!(" Extract English-language and Translingual definitions from Wiktionary");
+ println!(
+ " data dump files, writing output to en-definitions.txt and trans-definitions.txt."
+ );
+ println!(
+ " Each line of the output file is of the format: Word Part_of_Speech Definition"
+ );
println!(
" Note the two spaces—this avoids ambiguity when the word contains a space."
);
@@ -400,17 +430,22 @@ pub fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
return Err("Aborted.".into());
}
}
- for input_filename in &files {
- let input = File::open(input_filename)
- .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
- let reader = BufReader::new(input);
- println!("Parsing {input_filename}...");
- parse_xml(reader, &mut output)
- .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+ for (lang, abbrev) in [("English", "en"), ("Translingual", "trans")] {
+ let mut output = Definitions::default();
+ for input_filename in &files {
+ let input = File::open(input_filename)
+ .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
+ let reader = BufReader::new(input);
+ println!("({lang}) Parsing {input_filename}...");
+ parse_xml(reader, &mut output, lang)
+ .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
+ }
+ println!("Sorting {} definitions...", output.definitions.len());
+ output.sort();
+ crate::do_write(&format!("{abbrev}-definitions.txt"), |writer| {
+ output.write_to(writer)
+ })?;
}
- println!("Sorting {} definitions...", output.definitions.len());
- output.sort();
- crate::do_write("definitions.txt", |writer| output.write_to(writer))?;
println!("Done!");
Ok(())
}