diff options
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 211 |
1 files changed, 3 insertions, 208 deletions
diff --git a/src/main.rs b/src/main.rs index 3b9ffa6..f484ade 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,136 +1,9 @@ -use std::borrow::Cow; use std::error::Error; use std::fs::File; -use std::io::{self, BufReader, BufWriter, prelude::*}; +use std::io::BufWriter; use std::process::ExitCode; -#[derive(Default)] -struct Definitions { - definitions: Vec<(Box<str>, Box<str>)>, -} - -// replace sequences of 2+ spaces with a single space -fn compact_spaces(s: &str) -> String { - let mut s: String = s.into(); - // quite inefficient, but it doesn't really matter for our purposes. - while s.contains(" ") { - s = s.replace(" ", " ").to_owned(); - } - s -} - -impl Definitions { - fn add_definition(&mut self, word: &str, def: &str) { - self.definitions - .push((compact_spaces(word).into(), compact_spaces(def).into())) - } - fn sort(&mut self) { - self.definitions.sort_by(|x, y| x.0.cmp(&y.0)); - } - fn write_to(&mut self, mut w: impl Write) -> io::Result<()> { - for (title, definition) in &self.definitions { - writeln!(w, "{title} {definition}")?; - } - Ok(()) - } -} - -// remove HTML comments from string -#[must_use] -fn remove_comments(mut text: &str) -> Cow<str> { - if !text.contains("<!--") { - // (by far) most common case - return Cow::Borrowed(text); - } - let mut new_str = String::new(); - while let Some(comment_start) = text.find("<!--") { - new_str.push_str(&text[..comment_start]); - text = &text[comment_start..]; - let comment_end = text.find("-->").map_or(text.len(), |i| i + 3); - text = &text[comment_end..]; - } - new_str.push_str(text); - Cow::Owned(new_str) -} - -fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> { - let mut config = xml::reader::ParserConfig::default(); - config.cdata_to_characters = true; - #[derive(Debug, Clone, Copy)] - enum Tag { - Other, - Title, - Text, - Ns, - } - let mut tags = vec![]; - let mut title = String::new(); - let mut body = String::new(); - let mut ns = 1; - let mut ns_str = String::new(); - for event in xml::reader::EventReader::new_with_config(reader, config) { - let event = event?; - use xml::reader::XmlEvent; - match (&event, tags.last()) { - (XmlEvent::StartElement { name, .. }, _) => { - let name = name.local_name.as_str(); - match name { - "title" => { - tags.push(Tag::Title); - title.clear(); - } - "text" => tags.push(Tag::Text), - "ns" => { - tags.push(Tag::Ns); - ns_str.clear(); - } - _ => tags.push(Tag::Other), - } - } - (XmlEvent::Characters(s), Some(Tag::Title)) => { - title.push_str(s); - } - (XmlEvent::Characters(s), Some(Tag::Ns)) => { - ns_str.push_str(s); - } - (XmlEvent::Characters(s), Some(Tag::Text)) => { - body.push_str(s); - } - (XmlEvent::EndElement { name, .. }, _) => { - tags.pop(); - if name.local_name == "page" { - title.clear(); - } else if name.local_name == "text" { - if ns == 0 - && let Some(eng_start) = body.find("==English==\n") - { - let eng = &body[eng_start..]; - let eng_end = eng - .as_bytes() - .windows(4) - .position(|w| w.starts_with(b"\n==") && w[3] != b'=') - .unwrap_or(eng.len()); - let eng = &eng[..eng_end]; - for (i, w) in eng.as_bytes().windows(3).enumerate() { - if w != b"\n# " { - continue; - } - let definition = - eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0); - let definition = remove_comments(definition); - output.add_definition(&title[..], &definition); - } - } - body.clear(); - } else if name.local_name == "ns" { - ns = ns_str.parse().unwrap_or(1); - } - } - _ => {} - } - } - Ok(()) -} +mod definitions; fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>> where @@ -147,84 +20,6 @@ where Ok(()) } -fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> { - let mut output = Definitions::default(); - let mut files: Vec<String> = vec![]; - for arg in args { - if arg == "-h" || arg == "--help" { - println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME")); - println!(" Extract English-language definitions from Wiktionary"); - println!(" data dump files, writing output to definitions.txt."); - println!(" Each line of the output file is of the format: Word Definition"); - println!( - " Note the two spaces—this avoids ambiguity when the word contains a space." - ); - println!(" A single Word can have multiple Definitions."); - println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*"); - return Ok(()); - } - files.push(arg.to_owned()); - } - let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> { - let mut files = vec![]; - for file in std::fs::read_dir(".")? { - let file = file?; - let mut r#type = file.file_type()?; - let name = file.file_name(); - if r#type.is_symlink() { - // get type of thing symlink is pointing to - r#type = std::fs::metadata(file.path())?.file_type(); - } - if !r#type.is_file() { - continue; - } - let Some(name) = name.to_str() else { - continue; - }; - if name.contains("wiktionary") && name.contains(".xml-p") { - files.push(name.into()); - } - } - files.sort(); - Ok(files) - }; - if files.is_empty() { - files = files_from_pwd() - .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?; - println!("No files specified on command line."); - println!("These files were found in the PWD:"); - for file in &files { - println!(" {file}"); - } - print!("Proceed with these files [Y/n]? "); - _ = std::io::stdout().flush(); - let mut line = String::new(); - let result = std::io::stdin().read_line(&mut line); - let line = line.trim(); - if result.is_err() - || line - .chars() - .next() - .is_some_and(|c| c.to_lowercase().to_string() != "y") - { - return Err("Aborted.".into()); - } - } - for input_filename in &files { - let input = File::open(input_filename) - .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?; - let reader = BufReader::new(input); - println!("Parsing {input_filename}..."); - parse_xml(reader, &mut output) - .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?; - } - println!("Sorting {} definitions...", output.definitions.len()); - output.sort(); - do_write("definitions.txt", |writer| output.write_to(writer))?; - println!("Done!"); - Ok(()) -} - fn try_main() -> Result<(), Box<dyn Error>> { let mut args = std::env::args_os().skip(1); let command = args.next(); @@ -248,7 +43,7 @@ fn try_main() -> Result<(), Box<dyn Error>> { command_args.push(arg.to_owned()); } match &command.to_string_lossy()[..] { - "definitions" => definitions(command_args), + "definitions" => definitions::definitions(command_args), x => Err(format!("Unrecognized command: {x}").into()), } } |