summaryrefslogtreecommitdiff
path: root/src/main.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/main.rs')
-rw-r--r--src/main.rs211
1 files changed, 3 insertions, 208 deletions
diff --git a/src/main.rs b/src/main.rs
index 3b9ffa6..f484ade 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,136 +1,9 @@
-use std::borrow::Cow;
use std::error::Error;
use std::fs::File;
-use std::io::{self, BufReader, BufWriter, prelude::*};
+use std::io::BufWriter;
use std::process::ExitCode;
/// Accumulates `(word, definition)` pairs and writes them out as text.
#[derive(Default)]
struct Definitions {
    /// Pairs in insertion order until [`Definitions::sort`] is called.
    definitions: Vec<(Box<str>, Box<str>)>,
}

/// Replaces every run of two or more spaces (`' '`) with a single space.
///
/// Works in a single pass: a space is dropped whenever the previously seen
/// character was also a space. Other whitespace (tabs, newlines) is untouched.
fn compact_spaces(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut prev_was_space = false;
    for c in s.chars() {
        let is_space = c == ' ';
        if !(is_space && prev_was_space) {
            out.push(c);
        }
        prev_was_space = is_space;
    }
    out
}

impl Definitions {
    /// Stores one definition for `word`.
    ///
    /// Both strings are space-compacted first, so neither can contain the
    /// two-space separator used by [`Definitions::write_to`].
    fn add_definition(&mut self, word: &str, def: &str) {
        self.definitions
            .push((compact_spaces(word).into(), compact_spaces(def).into()));
    }

    /// Sorts the entries by word.
    ///
    /// The sort is stable, so several definitions of the same word keep the
    /// order in which they were added.
    fn sort(&mut self) {
        self.definitions.sort_by(|x, y| x.0.cmp(&y.0));
    }

    /// Writes one `Word  Definition` line per entry to `w`.
    ///
    /// The separator is two spaces so that a word containing a single space
    /// stays unambiguous.
    ///
    /// # Errors
    /// Returns the first I/O error reported by `w`.
    fn write_to(&self, mut w: impl Write) -> io::Result<()> {
        for (title, definition) in &self.definitions {
            writeln!(w, "{title}  {definition}")?;
        }
        Ok(())
    }
}
-
/// Removes HTML comments (`<!-- ... -->`) from `text`.
///
/// Returns the input borrowed and unchanged when it contains no `<!--`, which
/// avoids an allocation; otherwise returns a new string with every comment
/// cut out. A comment that is never closed extends to the end of the input.
#[must_use]
fn remove_comments(text: &str) -> Cow<'_, str> {
    let Some(first) = text.find("<!--") else {
        return Cow::Borrowed(text);
    };
    let mut stripped = String::with_capacity(text.len());
    let mut rest = text;
    let mut next = Some(first);
    while let Some(start) = next {
        // Keep everything before the comment opener.
        stripped.push_str(&rest[..start]);
        let comment = &rest[start..];
        // Skip past the terminator, or to the end if there is none.
        let end = comment.find("-->").map_or(comment.len(), |i| i + 3);
        rest = &comment[end..];
        next = rest.find("<!--");
    }
    stripped.push_str(rest);
    Cow::Owned(stripped)
}
-
-fn parse_xml(reader: impl BufRead, output: &mut Definitions) -> Result<(), Box<dyn Error>> {
- let mut config = xml::reader::ParserConfig::default();
- config.cdata_to_characters = true;
- #[derive(Debug, Clone, Copy)]
- enum Tag {
- Other,
- Title,
- Text,
- Ns,
- }
- let mut tags = vec![];
- let mut title = String::new();
- let mut body = String::new();
- let mut ns = 1;
- let mut ns_str = String::new();
- for event in xml::reader::EventReader::new_with_config(reader, config) {
- let event = event?;
- use xml::reader::XmlEvent;
- match (&event, tags.last()) {
- (XmlEvent::StartElement { name, .. }, _) => {
- let name = name.local_name.as_str();
- match name {
- "title" => {
- tags.push(Tag::Title);
- title.clear();
- }
- "text" => tags.push(Tag::Text),
- "ns" => {
- tags.push(Tag::Ns);
- ns_str.clear();
- }
- _ => tags.push(Tag::Other),
- }
- }
- (XmlEvent::Characters(s), Some(Tag::Title)) => {
- title.push_str(s);
- }
- (XmlEvent::Characters(s), Some(Tag::Ns)) => {
- ns_str.push_str(s);
- }
- (XmlEvent::Characters(s), Some(Tag::Text)) => {
- body.push_str(s);
- }
- (XmlEvent::EndElement { name, .. }, _) => {
- tags.pop();
- if name.local_name == "page" {
- title.clear();
- } else if name.local_name == "text" {
- if ns == 0
- && let Some(eng_start) = body.find("==English==\n")
- {
- let eng = &body[eng_start..];
- let eng_end = eng
- .as_bytes()
- .windows(4)
- .position(|w| w.starts_with(b"\n==") && w[3] != b'=')
- .unwrap_or(eng.len());
- let eng = &eng[..eng_end];
- for (i, w) in eng.as_bytes().windows(3).enumerate() {
- if w != b"\n# " {
- continue;
- }
- let definition =
- eng[i + 3..].split_once('\n').map_or(&eng[i + 3..], |x| x.0);
- let definition = remove_comments(definition);
- output.add_definition(&title[..], &definition);
- }
- }
- body.clear();
- } else if name.local_name == "ns" {
- ns = ns_str.parse().unwrap_or(1);
- }
- }
- _ => {}
- }
- }
- Ok(())
-}
+mod definitions;
fn do_write<W, E: Error>(path: &str, write_func: W) -> Result<(), Box<dyn Error>>
where
@@ -147,84 +20,6 @@ where
Ok(())
}
-fn definitions(args: Vec<String>) -> Result<(), Box<dyn Error>> {
- let mut output = Definitions::default();
- let mut files: Vec<String> = vec![];
- for arg in args {
- if arg == "-h" || arg == "--help" {
- println!("Usage: {} definitions [FILES]", env!("CARGO_PKG_NAME"));
- println!(" Extract English-language definitions from Wiktionary");
- println!(" data dump files, writing output to definitions.txt.");
- println!(" Each line of the output file is of the format: Word Definition");
- println!(
- " Note the two spaces—this avoids ambiguity when the word contains a space."
- );
- println!(" A single Word can have multiple Definitions.");
- println!(" If FILES is not specified, will use ./*wiktionary*.xml-p*");
- return Ok(());
- }
- files.push(arg.to_owned());
- }
- let files_from_pwd = || -> Result<Vec<String>, Box<dyn Error>> {
- let mut files = vec![];
- for file in std::fs::read_dir(".")? {
- let file = file?;
- let mut r#type = file.file_type()?;
- let name = file.file_name();
- if r#type.is_symlink() {
- // get type of thing symlink is pointing to
- r#type = std::fs::metadata(file.path())?.file_type();
- }
- if !r#type.is_file() {
- continue;
- }
- let Some(name) = name.to_str() else {
- continue;
- };
- if name.contains("wiktionary") && name.contains(".xml-p") {
- files.push(name.into());
- }
- }
- files.sort();
- Ok(files)
- };
- if files.is_empty() {
- files = files_from_pwd()
- .map_err(|e| format!("No files specified and couldn't list PWD ({e}). Aborting."))?;
- println!("No files specified on command line.");
- println!("These files were found in the PWD:");
- for file in &files {
- println!(" {file}");
- }
- print!("Proceed with these files [Y/n]? ");
- _ = std::io::stdout().flush();
- let mut line = String::new();
- let result = std::io::stdin().read_line(&mut line);
- let line = line.trim();
- if result.is_err()
- || line
- .chars()
- .next()
- .is_some_and(|c| c.to_lowercase().to_string() != "y")
- {
- return Err("Aborted.".into());
- }
- }
- for input_filename in &files {
- let input = File::open(input_filename)
- .map_err(|e| format!("Couldn't open {input_filename}: {e}"))?;
- let reader = BufReader::new(input);
- println!("Parsing {input_filename}...");
- parse_xml(reader, &mut output)
- .map_err(|e| format!("Couldn't parse {input_filename}: {e}"))?;
- }
- println!("Sorting {} definitions...", output.definitions.len());
- output.sort();
- do_write("definitions.txt", |writer| output.write_to(writer))?;
- println!("Done!");
- Ok(())
-}
-
fn try_main() -> Result<(), Box<dyn Error>> {
let mut args = std::env::args_os().skip(1);
let command = args.next();
@@ -248,7 +43,7 @@ fn try_main() -> Result<(), Box<dyn Error>> {
command_args.push(arg.to_owned());
}
match &command.to_string_lossy()[..] {
- "definitions" => definitions(command_args),
+ "definitions" => definitions::definitions(command_args),
x => Err(format!("Unrecognized command: {x}").into()),
}
}