use std::borrow::Cow; use std::error::Error; use std::fs::File; use std::io::{self, BufReader, BufWriter, prelude::*}; use std::process::ExitCode; #[derive(Default)] struct Definitions { definitions: Vec<(Box, Box)>, } // replace sequences of 2+ spaces with a single space fn compact_spaces(s: &str) -> String { let mut s: String = s.into(); // quite inefficient, but it doesn't really matter for our purposes. while s.contains(" ") { s = s.replace(" ", " ").to_owned(); } s } impl Definitions { fn add_definition(&mut self, word: &str, def: &str) { self.definitions .push((compact_spaces(word).into(), compact_spaces(def).into())) } fn sort(&mut self) { self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); } fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { for (title, definition) in &self.definitions { writeln!(w, "{title} {definition}")?; } Ok(()) } } // remove HTML comments from string #[must_use] fn remove_comments(mut text: &str) -> Cow { if !text.contains("").map_or(text.len(), |i| i + 3); text = &text[comment_end..]; } new_str.push_str(text); Cow::Owned(new_str) } fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box> { let mut config = xml::reader::ParserConfig::default(); config.cdata_to_characters = true; #[derive(Debug, Clone, Copy)] enum Tag { Other, Title, Text, Ns, } let mut tags = vec![]; let mut title = String::new(); let mut body = String::new(); let mut ns = 1; let mut ns_str = String::new(); for event in xml::reader::EventReader::new_with_config(reader, config) { let event = event?; use xml::reader::XmlEvent; match (&event, tags.last()) { (XmlEvent::StartElement { name, .. }, _) => { let name = name.local_name.as_str(); match name { "title" => { tags.push(Tag::Title); title.clear(); } "text" => tags.push(Tag::Text), "ns" => { tags.push(Tag::Ns); ns_str.clear(); } _ => tags.push(Tag::Other), } } (XmlEvent::Characters(s), Some(Tag::Title)) => { title.push_str(s); } (XmlEvent::Characters(s), Some(Tag::Ns)) => { ns_str.push_str(s); } (XmlEvent::Characters(s), Some(Tag::Text)) => { body.push_str(s); } (XmlEvent::EndElement { name, .. }, _) => { tags.pop(); if name.local_name == "page" { title.clear(); } else if name.local_name == "text" { if ns == 0 && let Some(eng_start) = body.find("==English==\n") { let eng = &body[eng_start..]; let eng_end = eng .as_bytes() .windows(4) .position(|w| w.starts_with(b"\n==") && w[3] != b'=') .unwrap_or(eng.len()); let eng = &eng[..eng_end]; for (i, w) in eng.as_bytes().windows(3).enumerate() { if w != b"\n# " { continue; } let definition = eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); let definition = remove_comments(definition); output.add_definition(&title[..], &definition); } } body.clear(); } else if name.local_name == "ns" { ns = ns_str.parse().unwrap_or(1); } } _ => {} } } Ok(()) } fn do_write(path: &str, write_func: W) -> Result<(), Box> where W: FnOnce(BufWriter) -> Result<(), E>, { println!("Writing output to {path}..."); let tmp_name = format!(".{path}.tmp"); let file = File::create(&tmp_name).map_err(|e| format!("Error creating {tmp_name}: {e}"))?; let writer = BufWriter::new(file); write_func(writer).map_err(|e| format!("Error writing to {tmp_name}: {e}"))?; _ = std::fs::remove_file(path); // OK if this already exists std::fs::rename(&tmp_name, path) .map_err(|e| format!("Error renaming {tmp_name} => {path}: {e}"))?; Ok(()) } fn definitions(args: Vec) -> Result<(), Box> { let mut output = Definitions::default(); let mut files: Vec = vec![]; for arg in args { if arg == "-h" || arg == "--help" { println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); println!(" Extract English-language definitions from Wiktionary"); println!(" data dump files, writing output to definitions.txt."); println!(" Each line of the output file is of the format: Word Definition"); println!( " Note the two spaces—this avoids ambiguity when the word contains a space." ); println!(" A single Word can have multiple Definitions."); println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); return Ok(()); } files.push(arg.to_owned()); } let files_from_pwd = || -> Result, Box> { let mut files = vec![]; for file in std::fs::read_dir(".")? { let file = file?; let mut r#type = file.file_type()?; let name = file.file_name(); if r#type.is_symlink() { // get type of thing symlink is pointing to r#type = std::fs::metadata(file.path())?.file_type(); } if !r#type.is_file() { continue; } let Some(name) = name.to_str() else { continue; }; if name.contains("wiktionary") && name.contains(".xml-p") { files.push(name.into()); } } files.sort(); Ok(files) }; if files.is_empty() { files = files_from_pwd() .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; println!("No files specified on command line."); println!("These files were found in the PWD:"); for file in &files { println!(" {file}"); } print!("Proceed with these files [Y/n]? "); _ = std::io::stdout().flush(); let mut line = String::new(); let result = std::io::stdin().read_line(&mut line); let line = line.trim(); if result.is_err() || line .chars() .next() .is_some_and(|c| c.to_lowercase().to_string() != "y") { return Err("Aborted.".into()); } } for input_filename in &files { let input = File::open(input_filename) .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; let reader = BufReader::new(input); println!("Parsing {input_filename}..."); parse_xml(reader, &mut output) .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; } println!("Sorting {} definitions...", output.definitions.len()); output.sort(); do_write("definitions.txt", |writer| output.write_to(writer))?; println!("Done!"); Ok(()) } fn try_main() -> Result<(), Box> { let mut args = std::env::args_os().skip(1); let command = args.next(); let no_command = "No command specified. Commands available: - definitions"; let Some(command) = command else { return Err(no_command.into()); }; if command == "-h" || command == "--help" { return Err(no_command.into()); } let mut command_args = vec![]; for arg in args { let Some(arg) = arg.to_str() else { return Err(format!( "Bad UTF-8 in argument: {}", arg.to_string_lossy().escape_debug() ) .into()); }; command_args.push(arg.to_owned()); } match &command.to_string_lossy()[..] { "definitions" => definitions(command_args), x => Err(format!("Unrecognized command: {x}").into()), } } fn main() -> ExitCode { use std::time::Instant; let start_time = Instant::now(); if let Err(e) = try_main() { eprintln!("Error: {e}"); return ExitCode::FAILURE; } println!( "Time taken: {:?}", Instant::now().duration_since(start_time) ); ExitCode::SUCCESS }