diff options
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..3b9ffa6 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,268 @@ +use std::borrow::Cow; +use std::error::Error; +use std::fs::File; +use std::io::{self, BufReader, BufWriter, prelude::*}; +use std::process::ExitCode; + +#[derive(Default)] +struct Definitions { + definitions: Vec<(Box<str>, Box<str>)>, +} + +// replace sequences of 2+ spaces with a single space +fn compact_spaces(s: &str) -> String { + let mut s: String = s.into(); + // quite inefficient, but it doesn't really matter for our purposes. + while s.contains(" ") { + s = s.replace(" ", " ").to_owned(); + } + s +} + +impl Definitions { + fn add_definition(&mut self, word: &str, def: &str) { + self.definitions + .push((compact_spaces(word).into(), compact_spaces(def).into())) + } + fn sort(&mut self) { + self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); + } + fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { + for (title, definition) in &self.definitions { + writeln!(w, "{title} {definition}")?; + } + Ok(()) + } +} + +// remove HTML comments from string +#[must_use] +fn remove_comments(mut text: &str) -> Cow<str> { + if !text.contains("<!--") { + // (by far) most common case + return Cow::Borrowed(text); + } + let mut new_str = String::new(); + while let Some(comment_start) = text.find("<!--") { + new_str.push_str(&text[..comment_start]); + text = &text[comment_start..]; + let comment_end = text.find("-->").map_or(text.len(), |i| i + 3); + text = &text[comment_end..]; + } + new_str.push_str(text); + Cow::Owned(new_str) +} + +fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { + let mut config = xml::reader::ParserConfig::default(); + config.cdata_to_characters = true; + #[derive(Debug, Clone, Copy)] + enum Tag { + Other, + Title, + Text, + Ns, + } + let mut tags = vec![]; + let mut title = String::new(); + let mut body = String::new(); + let mut ns = 1; + let mut ns_str = String::new(); + for event in xml::reader::EventReader::new_with_config(reader, config) { + let event = event?; + use xml::reader::XmlEvent; + match (&event, tags.last()) { + (XmlEvent::StartElement { name, .. }, _) => { + let name = name.local_name.as_str(); + match name { + "title" => { + tags.push(Tag::Title); + title.clear(); + } + "text" => tags.push(Tag::Text), + "ns" => { + tags.push(Tag::Ns); + ns_str.clear(); + } + _ => tags.push(Tag::Other), + } + } + (XmlEvent::Characters(s), Some(Tag::Title)) => { + title.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Ns)) => { + ns_str.push_str(s); + } + (XmlEvent::Characters(s), Some(Tag::Text)) => { + body.push_str(s); + } + (XmlEvent::EndElement { name, .. }, _) => { + tags.pop(); + if name.local_name == "page" { + title.clear(); + } else if name.local_name == "text" { + if ns == 0 + && let Some(eng_start) = body.find("==English==\n") + { + let eng = &body[eng_start..]; + let eng_end = eng + .as_bytes() + .windows(4) + .position(|w| w.starts_with(b"\n==") && w[3] != b'=') + .unwrap_or(eng.len()); + let eng = &eng[..eng_end]; + for (i, w) in eng.as_bytes().windows(3).enumerate() { + if w != b"\n# " { + continue; + } + let definition = + eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); + let definition = remove_comments(definition); + output.add_definition(&title[..], &definition); + } + } + body.clear(); + } else if name.local_name == "ns" { + ns = ns_str.parse().unwrap_or(1); + } + } + _ => {} + } + } + Ok(()) +} + +fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>> +where + W: FnOnce(BufWriter<File>) -> Result<(), E>, +{ + println!("Writing output to {path}..."); + let tmp_name = format!(".{path}.tmp"); + let file = File::create(&tmp_name).map_err(|e| format!("Error creating {tmp_name}: {e}"))?; + let writer = BufWriter::new(file); + write_func(writer).map_err(|e| format!("Error writing to {tmp_name}: {e}"))?; + _ = std::fs::remove_file(path); // OK if this already exists + std::fs::rename(&tmp_name, path) + .map_err(|e| format!("Error renaming {tmp_name} => {path}: {e}"))?; + Ok(()) +} + +fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { + let mut output = Definitions::default(); + let mut files: Vec<String> = vec![]; + for arg in args { + if arg == "-h" || arg == "--help" { + println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); + println!(" Extract English-language definitions from Wiktionary"); + println!(" data dump files, writing output to definitions.txt."); + println!(" Each line of the output file is of the format: Word Definition"); + println!( + " Note the two spaces—this avoids ambiguity when the word contains a space." + ); + println!(" A single Word can have multiple Definitions."); + println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); + return Ok(()); + } + files.push(arg.to_owned()); + } + let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> { + let mut files = vec![]; + for file in std::fs::read_dir(".")? { + let file = file?; + let mut r#type = file.file_type()?; + let name = file.file_name(); + if r#type.is_symlink() { + // get type of thing symlink is pointing to + r#type = std::fs::metadata(file.path())?.file_type(); + } + if !r#type.is_file() { + continue; + } + let Some(name) = name.to_str() else { + continue; + }; + if name.contains("wiktionary") && name.contains(".xml-p") { + files.push(name.into()); + } + } + files.sort(); + Ok(files) + }; + if files.is_empty() { + files = files_from_pwd() + .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; + println!("No files specified on command line."); + println!("These files were found in the PWD:"); + for file in &files { + println!(" {file}"); + } + print!("Proceed with these files [Y/n]? "); + _ = std::io::stdout().flush(); + let mut line = String::new(); + let result = std::io::stdin().read_line(&mut line); + let line = line.trim(); + if result.is_err() + || line + .chars() + .next() + .is_some_and(|c| c.to_lowercase().to_string() != "y") + { + return Err("Aborted.".into()); + } + } + for input_filename in &files { + let input = File::open(input_filename) + .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; + let reader = BufReader::new(input); + println!("Parsing {input_filename}..."); + parse_xml(reader, &mut output) + .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; + } + println!("Sorting {} definitions...", output.definitions.len()); + output.sort(); + do_write("definitions.txt", |writer| output.write_to(writer))?; + println!("Done!"); + Ok(()) +} + +fn try_main() -> Result<(), Box<dyn Error>> { + let mut args = std::env::args_os().skip(1); + let command = args.next(); + let no_command = "No command specified. Commands available: +- definitions"; + let Some(command) = command else { + return Err(no_command.into()); + }; + if command == "-h" || command == "--help" { + return Err(no_command.into()); + } + let mut command_args = vec![]; + for arg in args { + let Some(arg) = arg.to_str() else { + return Err(format!( + "Bad UTF-8 in argument: {}", + arg.to_string_lossy().escape_debug() + ) + .into()); + }; + command_args.push(arg.to_owned()); + } + match &command.to_string_lossy()[..] { + "definitions" => definitions(command_args), + x => Err(format!("Unrecognized command: {x}").into()), + } +} + +fn main() -> ExitCode { + use std::time::Instant; + let start_time = Instant::now(); + if let Err(e) = try_main() { + eprintln!("Error: {e}"); + return ExitCode::FAILURE; + } + println!( + "Time taken: {:?}", + Instant::now().duration_since(start_time) + ); + ExitCode::SUCCESS +} |