From f5f2bae2a43a8508235e5635d3d19d9e488bc029 Mon Sep 17 00:00:00 2001 From: pommicket Date: Sun, 6 Nov 2022 11:16:32 -0500 Subject: cleanup, nice command line interface --- Cargo.lock | 201 +++++++++++ Cargo.toml | 1 + src/elf.rs | 21 +- src/linker.rs | 1001 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 1026 +++------------------------------------------------------ test.c | 7 +- 6 files changed, 1272 insertions(+), 985 deletions(-) create mode 100644 src/linker.rs diff --git a/Cargo.lock b/Cargo.lock index 6beeb6d..bda8a2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,207 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "clap" +version = "4.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e67816e006b17427c9b4386915109b494fec2d929c63e3bd3561234cbf1bf1e" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "once_cell", + "strsim", + "termcolor", +] + +[[package]] +name = "clap_derive" +version = "4.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16a1b0f6422af32d5da0c58e2703320f379216ee70198241c84173a8c5ac28f3" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "heck" +version = 
"0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "libc" +version = "0.2.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" + +[[package]] +name = "once_cell" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" + +[[package]] +name = "os_str_bytes" +version = "6.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3baf96e39c5359d2eb0dd6ccb42c62b91d9678aa68160d261b9e0ccbf9e9dea9" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + [[package]] name = "tinyld" version = "0.1.0" +dependencies = [ + "clap", +] + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 
+dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index 51821ce..c287cd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,4 @@ version = "0.1.0" edition = "2021" [dependencies] +clap = { version = "4.0", features = ["derive"] } diff --git a/src/elf.rs b/src/elf.rs index acc3e18..033910a 100644 --- a/src/elf.rs +++ b/src/elf.rs @@ -529,19 +529,32 @@ impl Reader for Reader32LE { let mut sym_buf = [0; 16]; reader.read_exact(&mut sym_buf)?; let sym = Sym32::from_bytes(sym_buf); + let r#type = (sym.info & 0xf).into(); + let bind = (sym.info >> 4).into(); + let mut size = sym.size.into(); + let value = match sym.shndx { SHN_UNDEF => SymbolValue::Undefined, SHN_ABS => SymbolValue::Absolute(sym.value.into()), - idx if idx < ehdr.shnum => SymbolValue::SectionOffset(idx, sym.value.into()), + idx if idx < ehdr.shnum => { + if r#type == SymbolType::Section { + // section symbols have a size of 0, it seems. + // i don't know why they don't just use the size of the section. + // i'm replacing it here. it makes the code easier to write. 
+ size = shdrs[idx as usize].size.into(); + } + SymbolValue::SectionOffset(idx, sym.value.into()) + }, x => return Err(BadSymShNdx(x)), }; + let symbol = Symbol { name: sym.name.into(), value, - r#type: (sym.info & 0xf).into(), - bind: (sym.info >> 4).into(), - size: sym.size.into(), + r#type, + bind, + size, }; symtab.push(symbols.len()); symbols.push(symbol); diff --git a/src/linker.rs b/src/linker.rs new file mode 100644 index 0000000..b7069f6 --- /dev/null +++ b/src/linker.rs @@ -0,0 +1,1001 @@ +use crate::{elf, util}; +use io::{BufRead, Seek, Write}; +use std::collections::{BTreeMap, HashMap}; +use std::{fmt, io, mem, fs, path}; + +use elf::ToBytes; +use elf::Reader as ELFReader; +use util::u32_from_le_slice; + +pub enum LinkError { + IO(io::Error), + TooLarge, + NoEntry(String), // no entry point + EntryNotDefined(String), // entry point is declared, but not defined +} + +type LinkResult = Result; + +impl fmt::Display for LinkError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use LinkError::*; + match self { + IO(e) => write!(f, "IO error: {e}"), + TooLarge => write!(f, "executable file would be too large."), + NoEntry(name) => write!(f, "entry point '{name}' not found."), + EntryNotDefined(name) => write!(f, "entry point '{name}' declared, but not defined."), + } + } +} + +impl From for LinkError { + fn from(e: io::Error) -> Self { + Self::IO(e) + } +} + +impl From<&LinkError> for String { + fn from(e: &LinkError) -> Self { + format!("{e}") + } +} + +pub enum LinkWarning { + RelSymNotFound { source: String, name: String }, + RelUnsupported(u8), + RelOOB(String, u64), + RelNoData(String, u64), + RelNoValue(String), +} + +impl fmt::Display for LinkWarning { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use LinkWarning::*; + match self { + RelSymNotFound { source, name } => write!(f, "undefined symbol '{name}' (in {source}) (relocation ignored)."), + RelOOB(text, offset) => write!(f, "relocation applied to {text}+0x{offset:x}, which 
goes outside of the symbol (it will be ignored)."), + RelNoData(source, offset) => write!( + f, + "relocation {source}+0x{offset:x} not in a data/text section. it will be ignored." + ), + RelNoValue(name) => write!(f, "can't figure out value of symbol '{name}' (relocation ignored)."), + RelUnsupported(x) => write!(f, "Unsupported relocation type {x} (relocation ignored)."), + } + } +} + +impl From<&LinkWarning> for String { + fn from(e: &LinkWarning) -> Self { + format!("{e}") + } +} + +pub enum ObjectError { + Elf(elf::Error), + BadType, + BadUtf8, + BadSymtab, + BadLink(u64), + BadRelHeader, + UnsupportedRelocation(u8), + BadSymIdx(u64), + NoStrtab, +} + +impl From for ObjectError { + fn from(e: elf::Error) -> Self { + Self::Elf(e) + } +} + +impl From<&ObjectError> for String { + fn from(e: &ObjectError) -> String { + format!("{e}") + } +} + +impl fmt::Display for ObjectError { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + use ObjectError::*; + match self { + // Display for UnexpectedEof *should* be this but is less clear + // ("failed to fill whole buffer") + Elf(e) => write!(f, "{e}"), + BadType => write!(f, "wrong type of ELF file (not an object file)"), + BadUtf8 => write!(f, "bad UTF-8 in ELF file"), + BadSymtab => write!(f, "bad ELF symbol table"), + BadRelHeader => write!(f, "bad ELF relocation header"), + UnsupportedRelocation(x) => write!(f, "unsupported relocation type: {x}"), + BadLink(i) => write!(f, "bad ELF link: {i}"), + BadSymIdx(i) => write!(f, "bad symbol index: {i}"), + NoStrtab => write!(f, "object has no .strtab section"), + } + } +} + +// to be more efficientâ„¢, we use integers to keep track of symbol names. 
+type SymbolNameType = u32; +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +struct SymbolName(SymbolNameType); +struct SymbolNames { + count: SymbolNameType, + to_string: Vec, + by_string: HashMap, +} + +impl SymbolNames { + fn new() -> Self { + Self { + count: 0, + to_string: vec![], + by_string: HashMap::new(), + } + } + + fn add(&mut self, name: String) -> SymbolName { + match self.by_string.get(&name) { + Some(id) => *id, + None => { + // new symbol + let id = SymbolName(self.count); + self.count += 1; + self.by_string.insert(name.clone(), id); + self.to_string.push(name); + id + } + } + } + + #[allow(dead_code)] + fn get_str(&self, id: SymbolName) -> Option<&str> { + self.to_string.get(id.0 as usize).map(|s| &s[..]) + } + + #[allow(dead_code)] + fn get(&self, name: &str) -> Option { + self.by_string.get(name).copied() + } +} + +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +struct SourceId(u32); + +impl SourceId { + const NONE: Self = Self(u32::MAX); +} + +type SymbolIdType = u32; +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +struct SymbolId(SymbolIdType); + +#[derive(Copy, Clone, Debug)] +#[allow(dead_code)] // @TODO @TEMPORARY +enum SymbolType { + Function, + Object, + Other, +} + +#[derive(Debug)] +enum SymbolValue { + Bss(u64), + Data(Vec), + Absolute(u64), +} + +#[allow(dead_code)] // @TODO @TEMPORARY +#[derive(Debug)] +struct SymbolInfo { + r#type: elf::SymbolType, + value: Option, + size: u64, +} + +struct Symbols { + info: Vec, + locations: HashMap, + global: HashMap, + weak: HashMap, + local: HashMap<(SourceId, SymbolName), SymbolId>, +} + +impl Symbols { + fn new() -> Self { + Self { + info: vec![], + global: HashMap::new(), + weak: HashMap::new(), + local: HashMap::new(), + locations: HashMap::new(), + } + } + + fn add_(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = SymbolId(self.info.len() as _); + self.info.push(info); + self.locations.insert(id, (source, name)); + id + } + + fn 
add_weak(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = self.add_(source, name, info); + self.weak.insert(name, id); + id + } + + fn add_local(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = self.add_(source, name, info); + self.local.insert((source, name), id); + id + } + + fn add_global(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = self.add_(source, name, info); + self.global.insert(name, id); + id + } + + fn get_mut_info_from_id(&mut self, id: SymbolId) -> Option<&mut SymbolInfo> { + self.info.get_mut(id.0 as usize) + } + + fn get_info_from_id(&self, id: SymbolId) -> Option<&SymbolInfo> { + self.info.get(id.0 as usize) + } + + fn get_id_from_name(&self, source: SourceId, name: SymbolName) -> Option { + self.local + .get(&(source, name)) + .or_else(|| self.global.get(&name)) + .or_else(|| self.weak.get(&name)) + .copied() + } + + fn get_location_from_id(&self, id: SymbolId) -> Option<(SourceId, SymbolName)> { + self.locations.get(&id).copied() + } + + fn count(&self) -> usize { + self.info.len() + } +} + +#[derive(Debug, Clone)] +struct Relocation { + r#where: (SymbolId, u64), // (symbol containing relocation, offset in symbol where relocation needs to be applied) + source_id: SourceId, + sym: SymbolName, + r#type: elf::RelType, + addend: i64, +} + +pub struct Linker { + symbols: Symbols, + symbol_names: SymbolNames, + relocations: Vec, + undefined_relocations: Vec, // library relocations + sources: Vec, // object files + libraries: Vec, + bss_size: u64, // output bss size + bss_addr: u64, // output bss address + data_addr: u64, // output data address + symbol_data_offsets: HashMap, // for symbols with data, this holds the offsets into the data segment. + warn: fn(LinkWarning), +} + +// this maps between offsets in an object file and symbols defined in that file. +// this is used to figure out where relocations are taking place. 
+struct SymbolOffsetMap { + map: BTreeMap<(u64, u64), SymbolId>, +} + +impl SymbolOffsetMap { + fn new() -> Self { + SymbolOffsetMap { + map: BTreeMap::new(), + } + } + + fn add_symbol(&mut self, offset: u64, size: u64, id: SymbolId) { + if size > 0 { + self.map.insert((offset, offset + size), id); + } + } + + // returns symbol, offset in symbol. + // e.g. a relocation might happen at main+0x33. + fn get(&self, offset: u64) -> Option<(SymbolId, u64)> { + let mut r = self.map.range(..(offset, u64::MAX)); + let (key, value) = r.next_back()?; + if offset >= key.0 && offset < key.1 { + // offset corresponds to somewhere in this symbol + Some((*value, offset - key.0)) + } else { + None + } + } +} + +// graph of which symbols use which symbols +// this is needed so we don't emit anything for unused symbols. +type SymbolGraph = HashMap>; + +struct Executable { + interp: Vec, + load_addr: u64, + bss: Option<(u64, u64)>, + relocations: Vec<(Relocation, u64)>, + strtab: Vec, + symbol_strtab_offsets: HashMap, + lib_strtab_offsets: Vec, +} + +impl Executable { + pub fn new(load_addr: u64) -> Self { + Self { + bss: None, + load_addr, + interp: vec![], + relocations: vec![], + lib_strtab_offsets: vec![], + symbol_strtab_offsets: HashMap::new(), + strtab: vec![0], + } + } + + pub fn set_bss(&mut self, addr: u64, size: u64) { + self.bss = Some((addr, size)); + } + + pub fn set_interp(&mut self, interp: &str) { + self.interp = interp.as_bytes().into(); + self.interp.push(b'\0'); + } + + fn add_string(&mut self, s: &str) -> u64 { + let ret = self.strtab.len() as u64; + self.strtab.extend(s.as_bytes()); + self.strtab.push(b'\0'); + ret + } + + pub fn add_lib(&mut self, lib: &str) { + let s = self.add_string(lib); + self.lib_strtab_offsets.push(s); + } + + pub fn add_relocation(&mut self, symbol_names: &SymbolNames, rel: &Relocation, addr: u64) { + let name = rel.sym; + + if self.symbol_strtab_offsets.get(&name).is_none() { + let s = symbol_names.get_str(name).unwrap(); + let offset = 
self.add_string(s); + self.symbol_strtab_offsets.insert(name, offset); + } + self.relocations.push((rel.clone(), addr)); + } + + fn segment_count(&self) -> u16 { + let mut count = 1 /*data*/; + if !self.interp.is_empty() { + count += 2 /*interp,dyntab*/; + } + if self.bss.is_some() { + count += 1 /*bss*/; + } + count + } + + fn ph_offset(&self) -> u64 { + elf::Ehdr32::size_of() as u64 + } + + fn ph_size(&self) -> u64 { + elf::Phdr32::size_of() as u64 * u64::from(self.segment_count()) + } + + fn data_offset(&self) -> u64 { + self.ph_offset() + self.ph_size() + } + + pub fn data_addr(&self) -> u64 { + self.load_addr + self.data_offset() + } + + pub fn write(&self, data: &[u8], mut out: T) -> LinkResult<()> { + let load_addr = self.load_addr as u32; + + // start by writing data. + out.seek(io::SeekFrom::Start(self.data_offset()))?; + out.write_all(data)?; + + let mut interp_offset = 0; + let mut dyntab_offset = 0; + let mut interp_size = 0; + let mut dyntab_size = 0; + if !self.interp.is_empty() { + // now interp + interp_offset = out.stream_position()?; + out.write_all(&self.interp)?; + interp_size = self.interp.len() as u32; + // now strtab + let strtab_offset = out.stream_position()?; + out.write_all(&self.strtab)?; + // now symtab + let symtab_offset = out.stream_position()?; + let null_symbol = [0; mem::size_of::()]; + out.write_all(&null_symbol)?; + let mut symbols: HashMap = HashMap::new(); + for (i, (sym, strtab_offset)) in self.symbol_strtab_offsets.iter().enumerate() { + symbols.insert(*sym, (i + 1) as u32); + // @TODO: allow STT_OBJECT as fell + let sym = elf::Sym32 { + name: *strtab_offset as u32, + info: elf::STB_GLOBAL << 4 | elf::STT_FUNC, + value: 0, + size: 0, + other: 0, + shndx: 0, + }; + out.write_all(&sym.to_bytes())?; + } + // now reltab + let reltab_offset = out.stream_position()?; + for (reloc, addr) in self.relocations.iter() { + let index = *symbols.get(&reloc.sym).unwrap(); + let rel = elf::Rel32 { + offset: *addr as u32, + info: index << 8 
| u32::from(reloc.r#type.to_x86_u8().unwrap()), + }; + out.write_all(&rel.to_bytes())?; + } + let reltab_size = out.stream_position()? - reltab_offset; + // now hash + let hashtab_offset = out.stream_position()?; + // put everything in a single bucket + let nsymbols = symbols.len() as u32; + out.write_all(&u32::to_le_bytes(1))?; // nbucket + out.write_all(&u32::to_le_bytes(nsymbols + 1))?; // nchain + out.write_all(&u32::to_le_bytes(0))?; // bucket begins at 0 + // chain 1 -> 2 -> 3 -> ... -> n -> 0 + for i in 1..nsymbols { + out.write_all(&u32::to_le_bytes(i))?; + } + out.write_all(&u32::to_le_bytes(0))?; + // i don't know why this needs to be here. + out.write_all(&u32::to_le_bytes(0))?; + + // now dyntab + dyntab_offset = out.stream_position()?; + let mut dyn_data = vec![ + elf::DT_RELSZ, + reltab_size as u32, + elf::DT_RELENT, + 8, + elf::DT_REL, + load_addr + reltab_offset as u32, + elf::DT_STRSZ, + self.strtab.len() as u32, + elf::DT_STRTAB, + load_addr + strtab_offset as u32, + elf::DT_SYMENT, + 16, + elf::DT_SYMTAB, + load_addr + symtab_offset as u32, + elf::DT_HASH, + load_addr + hashtab_offset as u32, + ]; + for lib in &self.lib_strtab_offsets { + dyn_data.extend([elf::DT_NEEDED, *lib as u32]); + } + dyn_data.extend([elf::DT_NULL, 0]); + let mut dyn_bytes = Vec::with_capacity(dyn_data.len() * 4); + for x in dyn_data { + dyn_bytes.extend(u32::to_le_bytes(x)); + } + dyntab_size = dyn_bytes.len() as u32; + out.write_all(&dyn_bytes)?; + } + + let file_size: u32 = out + .stream_position()? 
+ .try_into() + .map_err(|_| LinkError::TooLarge)?; + + out.seek(io::SeekFrom::Start(0))?; + + let ehdr = elf::Ehdr32 { + phnum: self.segment_count(), + phoff: elf::Ehdr32::size_of() as u32, + entry: self + .data_addr() + .try_into() + .map_err(|_| LinkError::TooLarge)?, + ..Default::default() + }; + out.write_all(&ehdr.to_bytes())?; + + let phdr_data = elf::Phdr32 { + flags: elf::PF_R | elf::PF_W | elf::PF_X, // read, write, execute + offset: 0, + vaddr: load_addr, + filesz: file_size, + memsz: file_size, + ..Default::default() + }; + out.write_all(&phdr_data.to_bytes())?; + + if let Some((bss_addr, bss_size)) = self.bss { + // for some reason, linux doesn't like executables + // with memsz > filesz != 0 + // so we need two segments. + let bss_size: u32 = bss_size.try_into().map_err(|_| LinkError::TooLarge)?; + let phdr_bss = elf::Phdr32 { + flags: elf::PF_R | elf::PF_W, // read, write + offset: 0, + vaddr: bss_addr as u32, + filesz: 0, + memsz: bss_size as u32, + ..Default::default() + }; + out.write_all(&phdr_bss.to_bytes())?; + } + + if !self.interp.is_empty() { + let phdr_interp = elf::Phdr32 { + r#type: elf::PT_INTERP, + flags: elf::PF_R, + offset: interp_offset as u32, + vaddr: load_addr + interp_offset as u32, + filesz: interp_size as u32, + memsz: interp_size as u32, + align: 1, + ..Default::default() + }; + out.write_all(&phdr_interp.to_bytes())?; + + let phdr_dynamic = elf::Phdr32 { + r#type: elf::PT_DYNAMIC, + flags: elf::PF_R, + offset: dyntab_offset as u32, + vaddr: load_addr + dyntab_offset as u32, + filesz: dyntab_size as u32, + memsz: dyntab_size as u32, + align: 1, + ..Default::default() + }; + out.write_all(&phdr_dynamic.to_bytes())?; + } + + Ok(()) + } +} + +impl Linker { + fn default_warn_handler(warning: LinkWarning) { + eprintln!("warning: {warning}"); + } + + // why use fn of all things to transmit warnings? 
+ // well, it's very nice for stuff to not need a mutable reference + // to emit warnings, and this is basically the only way of doing it. + // if you need to mutate state in your warning handler, you can always + // use a mutex. + pub fn _set_warning_handler(&mut self, warn: fn(LinkWarning)) { + self.warn = warn; + } + + pub fn new() -> Self { + Linker { + symbols: Symbols::new(), + symbol_names: SymbolNames::new(), + bss_addr: 0, + bss_size: 0, + data_addr: 0, + relocations: vec![], + undefined_relocations: vec![], + sources: vec![], + libraries: vec![], + symbol_data_offsets: HashMap::new(), + warn: Self::default_warn_handler, + } + } + + fn source_name(&self, id: SourceId) -> &str { + &self.sources[id.0 as usize] + } + + fn add_symbol( + &mut self, + source: SourceId, + elf: &elf::Reader32LE, + offset_map: &mut SymbolOffsetMap, + symbol: &elf::Symbol, + ) -> Result<(), ObjectError> { + let mut data_offset = None; + let name = elf.symbol_name(symbol)?; + let name_id = self.symbol_names.add(name); + + let value = match symbol.value { + elf::SymbolValue::Undefined => None, + elf::SymbolValue::Absolute(n) => Some(SymbolValue::Absolute(n)), + elf::SymbolValue::SectionOffset(shndx, offset) => { + match elf.section_type(shndx) { + Some(elf::SectionType::ProgBits) => { + let mut data = vec![0; symbol.size as usize]; + data_offset = Some(elf.section_offset(shndx).unwrap() + offset); + elf.read_section_data_exact(shndx, offset, &mut data)?; + Some(SymbolValue::Data(data)) + }, + Some(elf::SectionType::NoBits) => { + let p = self.bss_size; + self.bss_size += symbol.size; + Some(SymbolValue::Bss(p)) + }, + _ => None, // huh + } + } + }; + + let info = SymbolInfo { + r#type: symbol.r#type, + value, + size: symbol.size, + }; + let symbol_id = match symbol.bind { + elf::SymbolBind::Local => self.symbols.add_local(source, name_id, info), + elf::SymbolBind::Global => self.symbols.add_global(source, name_id, info), + elf::SymbolBind::Weak => self.symbols.add_weak(source, 
name_id, info), + _ => return Ok(()), // eh + }; + + if let Some(offset) = data_offset { + offset_map.add_symbol(offset, symbol.size, symbol_id); + } + Ok(()) + } + + /// add an object file (.o). + /// name doesn't need to correspond to the actual file name. + /// it only exists for debugging purposes. + pub fn add_object( + &mut self, + name: &str, + reader: T, + ) -> Result<(), ObjectError> { + use ObjectError::*; + + let mut offset_map = SymbolOffsetMap::new(); + + let source_id = SourceId(self.sources.len() as _); + self.sources.push(name.into()); + + let elf = elf::Reader32LE::new(reader)?; + if elf.r#type() != elf::Type::Rel { + return Err(BadType); + } + + for symbol in elf.symbols() { + self.add_symbol(source_id, &elf, &mut offset_map, symbol)?; + } + + for rel in elf.relocations() { + if let Some(r#where) = offset_map.get(rel.offset) { + let sym = self.symbol_names.add(elf.symbol_name(&rel.symbol)?); + self.relocations.push(Relocation { + r#where, + source_id, + sym, + r#type: rel.r#type, + addend: rel.addend, + }); + } else { + self.emit_warning(LinkWarning::RelNoData( + self.source_name(source_id).into(), + rel.entry_offset + )); + } + } + + Ok(()) + } + + pub fn add_library(&mut self, name: &str) -> Result<(), ObjectError> { + self.libraries.push(name.into()); + Ok(()) + } + + fn symbol_name_str(&self, id: SymbolName) -> &str { + self.symbol_names.get_str(id).unwrap_or("???") + } + + fn emit_warning(&self, warning: LinkWarning) { + (self.warn)(warning); + } + + fn emit_warning_rel_sym_not_found(&self, source: SourceId, name: SymbolName) { + let warn = LinkWarning::RelSymNotFound { + source: self.source_name(source).into(), + name: self.symbol_name_str(name).into(), + }; + self.emit_warning(warn); + } + + // get symbol ID, producing a warning if it does not exist. 
+ fn get_symbol_id(&self, source_id: SourceId, name: SymbolName) -> Option { + // @TODO: don't warn about the same symbol twice + let sym = self.symbols.get_id_from_name(source_id, name); + if sym.is_none() { + self.emit_warning_rel_sym_not_found(source_id, name); + } + sym + } + + // generates a string like main.c:some_function + fn symbol_id_location_string(&self, id: SymbolId) -> String { + if let Some((source, name)) = self.symbols.get_location_from_id(id) { + return format!( + "{}:{}", + self.source_name(source), + self.symbol_name_str(name) + ); + } + "???".into() + } + + fn get_symbol_value(&self, sym: SymbolId) -> Option { + let info = self.symbols.get_info_from_id(sym)?; + use SymbolValue::*; + match info.value.as_ref()? { + Data(_) => self + .symbol_data_offsets + .get(&sym) + .map(|&o| o + self.data_addr), + Bss(x) => Some(self.bss_addr + *x), + Absolute(a) => Some(*a), + } + } + + fn get_rel_apply_data_offset(&self, rel: &Relocation) -> Option { + let apply_symbol = rel.r#where.0; + let r = self.symbol_data_offsets.get(&apply_symbol)?; + Some(*r + rel.r#where.1) + } + + fn apply_relocation(&mut self, rel: Relocation, data: &mut [u8]) -> Result<(), LinkError> { + let apply_symbol = rel.r#where.0; + let apply_offset = match self.get_rel_apply_data_offset(&rel) { + Some(data_offset) => data_offset, + None => return Ok(()), // this relocation isn't in a data section so there's nothing we can do about it + }; + let pc = apply_offset + self.data_addr; + + let symbol = match self.get_symbol_id(rel.source_id, rel.sym) { + None => return Ok(()), // we emitted a warning in get_symbol_id + Some(sym) => sym, + }; + + let symbol_value = match self.get_symbol_value(symbol) { + None => { + // this symbol is defined in a library + //self.emit_warning(LinkWarning::RelNoValue(self.symbol_id_location_string(symbol))); + self.undefined_relocations.push(rel); + return Ok(()); + } + Some(v) => v, + }; + + let addend = rel.addend; + + enum Value { + U32(u32), + } + use 
elf::RelType::*; + use Value::*; + + let value = match rel.r#type { + Direct32 => U32(symbol_value as u32 + addend as u32), + Pc32 => U32(symbol_value as u32 + addend as u32 - pc as u32), + Other(x) => {self.emit_warning(LinkWarning::RelUnsupported(x)); return Ok(()) }, + }; + + let apply_symbol_info = match self.symbols.get_mut_info_from_id(apply_symbol) { + Some(info) => info, + None => { + // this shouldn't happen. + self.emit_warning_rel_sym_not_found(rel.source_id, rel.sym); + return Ok(()); + } + }; + + use SymbolValue::*; + + // guarantee failure if apply_offset can't be converted to usize. + let apply_start = apply_offset.try_into().unwrap_or(usize::MAX - 32); + + match apply_symbol_info.value { + Some(Data(_)) => { + let mut in_bounds = true; + match value { + U32(u) => { + if let Some(apply_to) = data.get_mut(apply_start..apply_start + 4) { + let curr_val = u32_from_le_slice(apply_to); + apply_to.copy_from_slice(&(u + curr_val).to_le_bytes()); + } else { + in_bounds = false; + } + } + }; + + if !in_bounds { + self.emit_warning(LinkWarning::RelOOB( + self.symbol_id_location_string(apply_symbol), + apply_offset, + )); + } + } + _ => { + self.emit_warning(LinkWarning::RelNoData( + self.source_name(rel.source_id).into(), + apply_offset, + )); + } + } + + Ok(()) + } + + /// "easy" input API. + /// infers the file type of input, and calls the appropriate function (e.g. `add_object`) + /// if there return value is `Err(s)`, `s` will be a nicely formatted error string. + pub fn add_input(&mut self, input: &str) -> Result<(), String> { + enum FileType { + Object, + DynamicLibrary, + Other + } + + use FileType::*; + + fn file_type(input: &str) -> FileType { + if input.ends_with(".o") { + return Object; + } + if input.ends_with(".so") { + return DynamicLibrary; + } + if input.find(".so.").is_some() { + // e.g. 
libc.so.6, some_library.so.12.7.3 + return DynamicLibrary; + } + Other + } + + match file_type(input) { + Object => { + let file = fs::File::open(input) + .map_err(|e| format!("Error opening {input}: {e}"))?; + let mut file = io::BufReader::new(file); + self.add_object(input, &mut file) + .map_err(|e| format!("Error processing object file {input}: {e}")) + }, + DynamicLibrary => { + self.add_library(input) + .map_err(|e| format!("Error processing library file {input}: {e}")) + }, + Other => { + Err(format!("Unrecognized file type: {input}")) + } + } + } + + + // we don't want to link unused symbols. + // we start by calling this on the entry function, then it recursively calls itself for each symbol used. + fn add_data_for_symbol( + &mut self, + data: &mut Vec, + symbol_graph: &SymbolGraph, + id: SymbolId, + ) -> Result<(), LinkError> { + // deal with cycles + if self.symbol_data_offsets.contains_key(&id) { + return Ok(()); + } + + if let Some(info) = self.symbols.get_info_from_id(id) { + if let Some(SymbolValue::Data(d)) = &info.value { + // set address + self.symbol_data_offsets.insert(id, data.len() as u64); + // add data + data.extend(d); + } + } + + for reference in symbol_graph.get(&id).unwrap_or(&vec![]) { + self.add_data_for_symbol(data, symbol_graph, *reference)?; + } + + Ok(()) + } + + pub fn link(mut self, out: T, entry: &str) -> LinkResult<()> { + let mut symbol_graph = SymbolGraph::with_capacity(self.symbols.count()); + + let relocations = mem::take(&mut self.relocations); + + // compute symbol graph + for rel in relocations.iter() { + use std::collections::hash_map::Entry; + if let Some(symbol) = self.get_symbol_id(rel.source_id, rel.sym) { + let apply_symbol = rel.r#where.0; + match symbol_graph.entry(apply_symbol) { + Entry::Occupied(mut o) => { + o.get_mut().push(symbol); + } + Entry::Vacant(v) => { + v.insert(vec![symbol]); + } + } + } + } + + let symbol_graph = symbol_graph; // no more mutating + + let mut exec = Executable::new(0x400000); + 
self.bss_addr = 0x50000000; + exec.set_bss(self.bss_addr, self.bss_size); + exec.set_interp("/lib/ld-linux.so.2"); + for lib in self.libraries.iter() { + exec.add_lib(lib); + } + + self.data_addr = exec.data_addr(); + + let entry_name_id = self + .symbol_names + .get(entry) + .ok_or_else(|| LinkError::NoEntry(entry.into()))?; + let entry_id = self + .symbols + .get_id_from_name(SourceId::NONE, entry_name_id) + .ok_or_else(|| LinkError::EntryNotDefined(entry.into()))?; + + let mut data = vec![]; + self.add_data_for_symbol(&mut data, &symbol_graph, entry_id)?; + + for rel in relocations { + self.apply_relocation(rel, &mut data)?; + } + + for rel in mem::take(&mut self.undefined_relocations) { + if let Some(data_offset) = self.get_rel_apply_data_offset(&rel) { + exec.add_relocation(&self.symbol_names, &rel, self.data_addr + data_offset); + } + } + + exec.write(&data, out) + } + + /// "easy" linking API. + pub fn link_to_file>(self, path: P, entry: &str) -> Result<(), String> { + let path = path.as_ref(); + let mut out_options = fs::OpenOptions::new(); + out_options + .write(true) + .create(true) + .truncate(true); + #[cfg(unix)] + { + use std::os::unix::fs::OpenOptionsExt; + out_options.mode(0o755); + } + + let output = out_options.open(path) + .map_err(|e| format!("Error opening output file {}: {e}", path.to_string_lossy()))?; + let mut output = io::BufWriter::new(output); + + self.link(&mut output, entry) + .map_err(|e| format!("Error linking {}: {e}", path.to_string_lossy())) + } +} diff --git a/src/main.rs b/src/main.rs index 2300c27..bae6d71 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,1001 +1,73 @@ // you will need gcc-multilib to compile a 32-bit executable (with stdlib) // you need to use -fno-pic with gcc -- got,plt relocations aren't supported // and also make the executable bigger. 
-use fs::File; -use io::{BufReader, BufWriter, Seek, Write}; -use std::collections::{BTreeMap, HashMap}; -use std::{fmt, fs, io, mem}; +extern crate clap; + +use clap::Parser; #[cfg(target_endian = "big")] compile_error! {"WHY do you have a big endian machine???? it's the 21st century, buddy. this program won't work fuck you"} mod elf; mod util; - -use elf::ToBytes; -use elf::Reader as ELFReader; -use util::u32_from_le_slice; - -pub enum LinkError { - IO(io::Error), - TooLarge, - NoEntry(String), // no entry point - EntryNotDefined(String), // entry point is declared, but not defined -} - -type LinkResult<T> = Result<T, LinkError>; - -impl fmt::Display for LinkError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use LinkError::*; - match self { - IO(e) => write!(f, "IO error: {e}"), - TooLarge => write!(f, "executable file would be too large."), - NoEntry(name) => write!(f, "entry point '{name}' not found."), - EntryNotDefined(name) => write!(f, "entry point '{name}' declared, but not defined."), - } - } -} - -impl From<io::Error> for LinkError { - fn from(e: io::Error) -> Self { - Self::IO(e) - } -} - -impl From<&LinkError> for String { - fn from(e: &LinkError) -> Self { - format!("{e}") - } -} - -pub enum LinkWarning { - RelSymNotFound { source: String, name: String }, - RelUnsupported(u8), - RelOOB(String, u64), - RelNoData(String, u64), - RelNoValue(String), -} - -impl fmt::Display for LinkWarning { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use LinkWarning::*; - match self { - RelSymNotFound { source, name } => write!(f, "undefined symbol '{name}' (in {source}) (relocation ignored)."), - RelOOB(text, offset) => write!(f, "relocation applied to {text}+0x{offset:x}, which goes outside of the symbol (it will be ignored)."), - RelNoData(source, offset) => write!( - f, - "offset {source}+0x{offset:x} not in a data/text section. relocation will be ignored." 
- ), - RelNoValue(name) => write!(f, "can't figure out value of symbol '{name}' (relocation ignored)."), - RelUnsupported(x) => write!(f, "Unsupported relocation type {x} (relocation ignored)."), - } - } -} - -impl From<&LinkWarning> for String { - fn from(e: &LinkWarning) -> Self { - format!("{e}") - } -} - -pub enum ObjectError { - Elf(elf::Error), - BadType, - BadUtf8, - BadSymtab, - BadLink(u64), - BadRelHeader, - UnsupportedRelocation(u8), - BadSymIdx(u64), - NoStrtab, -} - -impl From<elf::Error> for ObjectError { - fn from(e: elf::Error) -> Self { - Self::Elf(e) - } -} - -impl From<&ObjectError> for String { - fn from(e: &ObjectError) -> String { - format!("{e}") - } -} - -impl fmt::Display for ObjectError { - fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - use ObjectError::*; - match self { - // Display for UnexpectedEof *should* be this but is less clear - // ("failed to fill whole buffer") - Elf(e) => write!(f, "{e}"), - BadType => write!(f, "wrong type of ELF file (not an object file)"), - BadUtf8 => write!(f, "bad UTF-8 in ELF file"), - BadSymtab => write!(f, "bad ELF symbol table"), - BadRelHeader => write!(f, "bad ELF relocation header"), - UnsupportedRelocation(x) => write!(f, "unsupported relocation type: {x}"), - BadLink(i) => write!(f, "bad ELF link: {i}"), - BadSymIdx(i) => write!(f, "bad symbol index: {i}"), - NoStrtab => write!(f, "object has no .strtab section"), - } - } -} - -// to be more efficient™, we use integers to keep track of symbol names. 
-type SymbolNameType = u32; -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -struct SymbolName(SymbolNameType); -struct SymbolNames { - count: SymbolNameType, - to_string: Vec, - by_string: HashMap, -} - -impl SymbolNames { - fn new() -> Self { - Self { - count: 0, - to_string: vec![], - by_string: HashMap::new(), - } - } - - fn add(&mut self, name: String) -> SymbolName { - match self.by_string.get(&name) { - Some(id) => *id, - None => { - // new symbol - let id = SymbolName(self.count); - self.count += 1; - self.by_string.insert(name.clone(), id); - self.to_string.push(name); - id - } - } - } - - #[allow(dead_code)] - fn get_str(&self, id: SymbolName) -> Option<&str> { - self.to_string.get(id.0 as usize).map(|s| &s[..]) - } - - #[allow(dead_code)] - fn get(&self, name: &str) -> Option { - self.by_string.get(name).copied() - } -} - -#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -struct SourceId(u32); - -impl SourceId { - const NONE: Self = Self(u32::MAX); -} - -type SymbolIdType = u32; -#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -struct SymbolId(SymbolIdType); - -#[derive(Copy, Clone, Debug)] -#[allow(dead_code)] // @TODO @TEMPORARY -enum SymbolType { - Function, - Object, - Other, -} - -#[derive(Debug)] -enum SymbolValue { - Bss(u64), - Data(Vec), - Absolute(u64), -} - -#[allow(dead_code)] // @TODO @TEMPORARY -#[derive(Debug)] -struct SymbolInfo { - r#type: elf::SymbolType, - value: Option, - size: u64, -} - -struct Symbols { - info: Vec, - locations: HashMap, - global: HashMap, - weak: HashMap, - local: HashMap<(SourceId, SymbolName), SymbolId>, -} - -impl Symbols { - fn new() -> Self { - Self { - info: vec![], - global: HashMap::new(), - weak: HashMap::new(), - local: HashMap::new(), - locations: HashMap::new(), - } - } - - fn add_(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = SymbolId(self.info.len() as _); - self.info.push(info); - self.locations.insert(id, (source, name)); - id - } - - fn 
add_weak(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(source, name, info); - self.weak.insert(name, id); - id - } - - fn add_local(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(source, name, info); - self.local.insert((source, name), id); - id - } - - fn add_global(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(source, name, info); - self.global.insert(name, id); - id - } - - fn get_mut_info_from_id(&mut self, id: SymbolId) -> Option<&mut SymbolInfo> { - self.info.get_mut(id.0 as usize) - } - - fn get_info_from_id(&self, id: SymbolId) -> Option<&SymbolInfo> { - self.info.get(id.0 as usize) - } - - fn get_id_from_name(&self, source: SourceId, name: SymbolName) -> Option { - self.local - .get(&(source, name)) - .or_else(|| self.global.get(&name)) - .or_else(|| self.weak.get(&name)) - .copied() - } - - fn get_location_from_id(&self, id: SymbolId) -> Option<(SourceId, SymbolName)> { - self.locations.get(&id).copied() - } - - fn count(&self) -> usize { - self.info.len() - } -} - -#[derive(Debug, Clone)] -struct Relocation { - r#where: (SymbolId, u64), // (symbol containing relocation, offset in symbol where relocation needs to be applied) - source_id: SourceId, - sym: SymbolName, - r#type: elf::RelType, - addend: i64, -} - -struct Linker { - symbols: Symbols, - symbol_names: SymbolNames, - relocations: Vec, - undefined_relocations: Vec, // library relocations - sources: Vec, - bss_size: u64, // output bss size - bss_addr: u64, // output bss address - data_addr: u64, // output data address - symbol_data_offsets: HashMap, // for symbols with data, this holds the offsets into the data segment. - warn: fn(LinkWarning), -} - -// this maps between offsets in an object file and symbols defined in that file. -// this is used to figure out where relocations are taking place. 
-struct SymbolOffsetMap { - map: BTreeMap<(u64, u64), SymbolId>, -} - -impl SymbolOffsetMap { - fn new() -> Self { - SymbolOffsetMap { - map: BTreeMap::new(), - } - } - - fn add_symbol(&mut self, offset: u64, size: u64, id: SymbolId) { - if size > 0 { - self.map.insert((offset, offset + size), id); - } - } - - // returns symbol, offset in symbol. - // e.g. a relocation might happen at main+0x33. - fn get(&self, offset: u64) -> Option<(SymbolId, u64)> { - let mut r = self.map.range(..(offset, u64::MAX)); - let (key, value) = r.next_back()?; - if offset >= key.0 && offset < key.1 { - // offset corresponds to somewhere in this symbol - Some((*value, offset - key.0)) - } else { - None - } - } -} - -// graph of which symbols use which symbols -// this is needed so we don't emit anything for unused symbols. -type SymbolGraph = HashMap>; - -struct Executable { - interp: Vec, - load_addr: u64, - bss: Option<(u64, u64)>, - relocations: Vec<(Relocation, u64)>, - strtab: Vec, - symbol_strtab_offsets: HashMap, - lib_strtab_offsets: Vec, -} - -impl Executable { - pub fn new(load_addr: u64) -> Self { - Self { - bss: None, - load_addr, - interp: vec![], - relocations: vec![], - lib_strtab_offsets: vec![], - symbol_strtab_offsets: HashMap::new(), - strtab: vec![0], - } - } - - pub fn set_bss(&mut self, addr: u64, size: u64) { - self.bss = Some((addr, size)); - } - - pub fn set_interp(&mut self, interp: &str) { - self.interp = interp.as_bytes().into(); - self.interp.push(b'\0'); - } - - fn add_string(&mut self, s: &str) -> u64 { - let ret = self.strtab.len() as u64; - self.strtab.extend(s.as_bytes()); - self.strtab.push(b'\0'); - ret - } - - pub fn add_lib(&mut self, lib: &str) { - let s = self.add_string(lib); - self.lib_strtab_offsets.push(s); - } - - pub fn add_relocation(&mut self, symbol_names: &SymbolNames, rel: &Relocation, addr: u64) { - let name = rel.sym; - - if self.symbol_strtab_offsets.get(&name).is_none() { - let s = symbol_names.get_str(name).unwrap(); - let offset = 
self.add_string(s); - self.symbol_strtab_offsets.insert(name, offset); - } - self.relocations.push((rel.clone(), addr)); - } - - fn segment_count(&self) -> u16 { - let mut count = 1 /*data*/; - if !self.interp.is_empty() { - count += 2 /*interp,dyntab*/; - } - if self.bss.is_some() { - count += 1 /*bss*/; - } - count - } - - fn ph_offset(&self) -> u64 { - elf::Ehdr32::size_of() as u64 - } - - fn ph_size(&self) -> u64 { - elf::Phdr32::size_of() as u64 * u64::from(self.segment_count()) - } - - fn data_offset(&self) -> u64 { - self.ph_offset() + self.ph_size() - } - - pub fn data_addr(&self) -> u64 { - self.load_addr + self.data_offset() - } - - pub fn write(&self, data: &[u8], out: &mut T) -> LinkResult<()> { - let load_addr = self.load_addr as u32; - - // start by writing data. - out.seek(io::SeekFrom::Start(self.data_offset()))?; - out.write_all(data)?; - - let mut interp_offset = 0; - let mut dyntab_offset = 0; - let mut interp_size = 0; - let mut dyntab_size = 0; - if !self.interp.is_empty() { - // now interp - interp_offset = out.stream_position()?; - out.write_all(&self.interp)?; - interp_size = self.interp.len() as u32; - // now strtab - let strtab_offset = out.stream_position()?; - out.write_all(&self.strtab)?; - // now symtab - let symtab_offset = out.stream_position()?; - let null_symbol = [0; mem::size_of::()]; - out.write_all(&null_symbol)?; - let mut symbols: HashMap = HashMap::new(); - for (i, (sym, strtab_offset)) in self.symbol_strtab_offsets.iter().enumerate() { - symbols.insert(*sym, (i + 1) as u32); - // @TODO: allow STT_OBJECT as fell - let sym = elf::Sym32 { - name: *strtab_offset as u32, - info: elf::STB_GLOBAL << 4 | elf::STT_FUNC, - value: 0, - size: 0, - other: 0, - shndx: 0, - }; - out.write_all(&sym.to_bytes())?; - } - // now reltab - let reltab_offset = out.stream_position()?; - for (reloc, addr) in self.relocations.iter() { - let index = *symbols.get(&reloc.sym).unwrap(); - let rel = elf::Rel32 { - offset: *addr as u32, - info: index << 8 
| u32::from(reloc.r#type.to_x86_u8().unwrap()), - }; - out.write_all(&rel.to_bytes())?; - } - let reltab_size = out.stream_position()? - reltab_offset; - // now hash - let hashtab_offset = out.stream_position()?; - // put everything in a single bucket - let nsymbols = symbols.len() as u32; - out.write_all(&u32::to_le_bytes(1))?; // nbucket - out.write_all(&u32::to_le_bytes(nsymbols + 1))?; // nchain - out.write_all(&u32::to_le_bytes(0))?; // bucket begins at 0 - // chain 1 -> 2 -> 3 -> ... -> n -> 0 - for i in 1..nsymbols { - out.write_all(&u32::to_le_bytes(i))?; - } - out.write_all(&u32::to_le_bytes(0))?; - // i don't know why this needs to be here. - out.write_all(&u32::to_le_bytes(0))?; - - // now dyntab - dyntab_offset = out.stream_position()?; - let mut dyn_data = vec![ - elf::DT_RELSZ, - reltab_size as u32, - elf::DT_RELENT, - 8, - elf::DT_REL, - load_addr + reltab_offset as u32, - elf::DT_STRSZ, - self.strtab.len() as u32, - elf::DT_STRTAB, - load_addr + strtab_offset as u32, - elf::DT_SYMENT, - 16, - elf::DT_SYMTAB, - load_addr + symtab_offset as u32, - elf::DT_HASH, - load_addr + hashtab_offset as u32, - ]; - for lib in &self.lib_strtab_offsets { - dyn_data.extend([elf::DT_NEEDED, *lib as u32]); - } - dyn_data.extend([elf::DT_NULL, 0]); - let mut dyn_bytes = Vec::with_capacity(dyn_data.len() * 4); - for x in dyn_data { - dyn_bytes.extend(u32::to_le_bytes(x)); - } - dyntab_size = dyn_bytes.len() as u32; - out.write_all(&dyn_bytes)?; - } - - let file_size: u32 = out - .stream_position()? 
- .try_into() - .map_err(|_| LinkError::TooLarge)?; - - out.seek(io::SeekFrom::Start(0))?; - - let ehdr = elf::Ehdr32 { - phnum: self.segment_count(), - phoff: elf::Ehdr32::size_of() as u32, - entry: self - .data_addr() - .try_into() - .map_err(|_| LinkError::TooLarge)?, - ..Default::default() - }; - out.write_all(&ehdr.to_bytes())?; - - let phdr_data = elf::Phdr32 { - flags: elf::PF_R | elf::PF_W | elf::PF_X, // read, write, execute - offset: 0, - vaddr: load_addr, - filesz: file_size, - memsz: file_size, - ..Default::default() - }; - out.write_all(&phdr_data.to_bytes())?; - - if let Some((bss_addr, bss_size)) = self.bss { - // for some reason, linux doesn't like executables - // with memsz > filesz != 0 - // so we need two segments. - let bss_size: u32 = bss_size.try_into().map_err(|_| LinkError::TooLarge)?; - let phdr_bss = elf::Phdr32 { - flags: elf::PF_R | elf::PF_W, // read, write - offset: 0, - vaddr: bss_addr as u32, - filesz: 0, - memsz: bss_size as u32, - ..Default::default() - }; - out.write_all(&phdr_bss.to_bytes())?; - } - - if !self.interp.is_empty() { - let phdr_interp = elf::Phdr32 { - r#type: elf::PT_INTERP, - flags: elf::PF_R, - offset: interp_offset as u32, - vaddr: load_addr + interp_offset as u32, - filesz: interp_size as u32, - memsz: interp_size as u32, - align: 1, - ..Default::default() - }; - out.write_all(&phdr_interp.to_bytes())?; - - let phdr_dynamic = elf::Phdr32 { - r#type: elf::PT_DYNAMIC, - flags: elf::PF_R, - offset: dyntab_offset as u32, - vaddr: load_addr + dyntab_offset as u32, - filesz: dyntab_size as u32, - memsz: dyntab_size as u32, - align: 1, - ..Default::default() - }; - out.write_all(&phdr_dynamic.to_bytes())?; - } - - Ok(()) - } -} - -impl Linker { - fn default_warn_handler(warning: LinkWarning) { - eprintln!("warning: {warning}"); - } - - // why use fn of all things to transmit warnings? 
- // well, it's very nice for stuff to not need a mutable reference - // to emit warnings, and this is basically the only way of doing it. - // if you need to mutate state in your warning handler, you can always - // use a mutex. - pub fn _set_warning_handler(&mut self, warn: fn(LinkWarning)) { - self.warn = warn; - } - - pub fn new() -> Self { - Linker { - symbols: Symbols::new(), - symbol_names: SymbolNames::new(), - bss_addr: 0, - bss_size: 0, - data_addr: 0, - relocations: vec![], - undefined_relocations: vec![], - sources: vec![], - symbol_data_offsets: HashMap::new(), - warn: Self::default_warn_handler, - } - } - - fn source_name(&self, id: SourceId) -> &str { - &self.sources[id.0 as usize] - } - - fn add_symbol( - &mut self, - source: SourceId, - elf: &elf::Reader32LE, - offset_map: &mut SymbolOffsetMap, - symbol: &elf::Symbol, - ) -> Result<(), ObjectError> { - let mut data_offset = None; - let name = elf.symbol_name(symbol)?; - println!("{name}"); - let name_id = self.symbol_names.add(name); - - let value = match symbol.value { - elf::SymbolValue::Undefined => None, - elf::SymbolValue::Absolute(n) => Some(SymbolValue::Absolute(n)), - elf::SymbolValue::SectionOffset(shndx, offset) => { - match elf.section_type(shndx) { - Some(elf::SectionType::ProgBits) => { - let mut data = vec![0; symbol.size as usize]; - data_offset = Some(elf.section_offset(shndx).unwrap() + offset); - elf.read_section_data_exact(shndx, offset, &mut data)?; - Some(SymbolValue::Data(data)) - }, - Some(elf::SectionType::NoBits) => { - let p = self.bss_size; - self.bss_size += symbol.size; - Some(SymbolValue::Bss(p)) - }, - _ => None, // huh - } - } - }; - - let info = SymbolInfo { - r#type: symbol.r#type, - value, - size: symbol.size, - }; - let symbol_id = match symbol.bind { - elf::SymbolBind::Local => self.symbols.add_local(source, name_id, info), - elf::SymbolBind::Global => self.symbols.add_global(source, name_id, info), - elf::SymbolBind::Weak => self.symbols.add_weak(source, 
name_id, info), - _ => return Ok(()), // eh - }; - - if let Some(offset) = data_offset { - offset_map.add_symbol(offset, symbol.size, symbol_id); - } - Ok(()) - } - - pub fn process_object( - &mut self, - name: &str, - reader: &mut BufReader, - ) -> Result<(), ObjectError> { - use ObjectError::*; - - let mut offset_map = SymbolOffsetMap::new(); - - let source_id = SourceId(self.sources.len() as _); - self.sources.push(name.into()); - - let elf = elf::Reader32LE::new(reader)?; - if elf.r#type() != elf::Type::Rel { - return Err(BadType); - } - - for symbol in elf.symbols() { - self.add_symbol(source_id, &elf, &mut offset_map, symbol)?; - } - - for rel in elf.relocations() { - if let Some(r#where) = offset_map.get(rel.offset) { - let sym = self.symbol_names.add(elf.symbol_name(&rel.symbol)?); - self.relocations.push(Relocation { - r#where, - source_id, - sym, - r#type: rel.r#type, - addend: rel.addend, - }); - } else { - self.emit_warning(LinkWarning::RelNoData( - self.source_name(source_id).into(), - rel.entry_offset - )); - } - } - - Ok(()) - } - - fn symbol_name_str(&self, id: SymbolName) -> &str { - self.symbol_names.get_str(id).unwrap_or("???") - } - - fn emit_warning(&self, warning: LinkWarning) { - (self.warn)(warning); - } - - fn emit_warning_rel_sym_not_found(&self, source: SourceId, name: SymbolName) { - let warn = LinkWarning::RelSymNotFound { - source: self.source_name(source).into(), - name: self.symbol_name_str(name).into(), - }; - self.emit_warning(warn); - } - - // get symbol ID, producing a warning if it does not exist. 
- fn get_symbol_id(&self, source_id: SourceId, name: SymbolName) -> Option { - // @TODO: don't warn about the same symbol twice - let sym = self.symbols.get_id_from_name(source_id, name); - if sym.is_none() { - self.emit_warning_rel_sym_not_found(source_id, name); - } - sym - } - - // generates a string like main.c:some_function - fn symbol_id_location_string(&self, id: SymbolId) -> String { - if let Some((source, name)) = self.symbols.get_location_from_id(id) { - return format!( - "{}:{}", - self.source_name(source), - self.symbol_name_str(name) - ); - } - "???".into() - } - - fn get_symbol_value(&self, sym: SymbolId) -> Option { - let info = self.symbols.get_info_from_id(sym)?; - use SymbolValue::*; - match info.value.as_ref()? { - Data(_) => self - .symbol_data_offsets - .get(&sym) - .map(|&o| o + self.data_addr), - Bss(x) => Some(self.bss_addr + *x), - Absolute(a) => Some(*a), - } - } - - fn get_rel_apply_data_offset(&self, rel: &Relocation) -> Option { - let apply_symbol = rel.r#where.0; - let r = self.symbol_data_offsets.get(&apply_symbol)?; - Some(*r + rel.r#where.1) - } - - fn apply_relocation(&mut self, rel: Relocation, data: &mut [u8]) -> Result<(), LinkError> { - let apply_symbol = rel.r#where.0; - let apply_offset = match self.get_rel_apply_data_offset(&rel) { - Some(data_offset) => data_offset, - None => return Ok(()), // this relocation isn't in a data section so there's nothing we can do about it - }; - let pc = apply_offset + self.data_addr; - - let symbol = match self.get_symbol_id(rel.source_id, rel.sym) { - None => return Ok(()), // we emitted a warning in get_symbol_id - Some(sym) => sym, - }; - - let symbol_value = match self.get_symbol_value(symbol) { - None => { - // this symbol is defined in a library - //self.emit_warning(LinkWarning::RelNoValue(self.symbol_id_location_string(symbol))); - self.undefined_relocations.push(rel); - return Ok(()); - } - Some(v) => v, - }; - - let addend = rel.addend; - - enum Value { - U32(u32), - } - use 
elf::RelType::*; - use Value::*; - - let value = match rel.r#type { - Direct32 => U32(symbol_value as u32 + addend as u32), - Pc32 => U32(symbol_value as u32 + addend as u32 - pc as u32), - Other(x) => {self.emit_warning(LinkWarning::RelUnsupported(x)); return Ok(()) }, - }; - - let apply_symbol_info = match self.symbols.get_mut_info_from_id(apply_symbol) { - Some(info) => info, - None => { - // this shouldn't happen. - self.emit_warning_rel_sym_not_found(rel.source_id, rel.sym); - return Ok(()); - } - }; - - use SymbolValue::*; - - // guarantee failure if apply_offset can't be converted to usize. - let apply_start = apply_offset.try_into().unwrap_or(usize::MAX - 32); - - match apply_symbol_info.value { - Some(Data(_)) => { - let mut in_bounds = true; - match value { - U32(u) => { - if let Some(apply_to) = data.get_mut(apply_start..apply_start + 4) { - let curr_val = u32_from_le_slice(apply_to); - apply_to.copy_from_slice(&(u + curr_val).to_le_bytes()); - } else { - in_bounds = false; - } - } - }; - - if !in_bounds { - self.emit_warning(LinkWarning::RelOOB( - self.symbol_id_location_string(apply_symbol), - apply_offset, - )); - } - } - _ => { - self.emit_warning(LinkWarning::RelNoData( - self.source_name(rel.source_id).into(), - apply_offset, - )); - } - } - - Ok(()) - } - - // we don't want to link unused symbols. - // we start by calling this on the entry function, then it recursively calls itself for each symbol used. 
- pub fn add_data_for_symbol( - &mut self, - data: &mut Vec, - symbol_graph: &SymbolGraph, - id: SymbolId, - ) -> Result<(), LinkError> { - // deal with cycles - if self.symbol_data_offsets.contains_key(&id) { - return Ok(()); - } - - if let Some(info) = self.symbols.get_info_from_id(id) { - if let Some(SymbolValue::Data(d)) = &info.value { - // set address - self.symbol_data_offsets.insert(id, data.len() as u64); - // add data - data.extend(d); - } - } - - for reference in symbol_graph.get(&id).unwrap_or(&vec![]) { - self.add_data_for_symbol(data, symbol_graph, *reference)?; - } - - Ok(()) - } - - pub fn link(mut self, out: &mut BufWriter) -> LinkResult<()> { - let mut symbol_graph = SymbolGraph::with_capacity(self.symbols.count()); - - let relocations = mem::take(&mut self.relocations); - - // compute symbol graph - for rel in relocations.iter() { - use std::collections::hash_map::Entry; - if let Some(symbol) = self.get_symbol_id(rel.source_id, rel.sym) { - let apply_symbol = rel.r#where.0; - match symbol_graph.entry(apply_symbol) { - Entry::Occupied(mut o) => { - o.get_mut().push(symbol); - } - Entry::Vacant(v) => { - v.insert(vec![symbol]); - } - } - } - } - - let symbol_graph = symbol_graph; // no more mutating - - let mut exec = Executable::new(0x400000); - self.bss_addr = 0x50000000; - exec.set_bss(self.bss_addr, self.bss_size); - exec.set_interp("/lib/ld-linux.so.2"); - exec.add_lib("libc.so.6"); - - self.data_addr = exec.data_addr(); - - let entry_name_str = "main"; - let entry_name_id = self - .symbol_names - .get(entry_name_str) - .ok_or_else(|| LinkError::NoEntry(entry_name_str.into()))?; - let entry_id = self - .symbols - .get_id_from_name(SourceId::NONE, entry_name_id) - .ok_or_else(|| LinkError::EntryNotDefined(entry_name_str.into()))?; - - let mut data = vec![]; - self.add_data_for_symbol(&mut data, &symbol_graph, entry_id)?; - - for rel in relocations { - self.apply_relocation(rel, &mut data)?; - } - - for rel in mem::take(&mut 
self.undefined_relocations) { - if let Some(data_offset) = self.get_rel_apply_data_offset(&rel) { - exec.add_relocation(&self.symbol_names, &rel, self.data_addr + data_offset); - } - } - - exec.write(&data, out) - } -} - -fn main() { - let mut args = std::env::args(); - args.next(); // program name - let args: Vec = args.collect(); - if args.len() == 1 && args[0] == "--nya" { +mod linker; + +#[derive(Parser, Debug)] +struct Args { + /// Input files: object files (.o) and shared libraries (.so) are supported. + inputs: Vec, + /// If set, the program will not be linked against libc. + /// + /// This makes the executable smaller. + #[arg(long = "no-std-lib", default_value_t = false)] + no_std_lib: bool, + /// Output executable path. + #[arg(short = 'o', long = "output", default_value = "a.out")] + output: String, + /// The name of the function which will be used as the entry point. + #[arg(short = 'e', long = "entry", default_value = "entry")] + entry: String, + /// :3 + #[arg(long = "nya")] + nya: bool +} + +fn main_() -> Result<(), String> { + let args = Args::parse(); + + if args.nya { println!("hai uwu ^_^"); - return; + return Ok(()); } - let mut inputs: Vec = args; + + let inputs = &args.inputs; + + let mut linker = linker::Linker::new(); + if inputs.is_empty() { if cfg!(debug_assertions) { - inputs.push("test.o".into()); + // ease of use when debugging + linker.add_input("test.o")?; } else { - eprintln!("no arguments provided."); - return; + return Err("no inputs provided.".into()); } } - let mut object_files = vec![]; - let mut libraries = vec![]; - - for input in inputs { - if input.ends_with(".o") { - object_files.push(input); - } else if input.ends_with(".so") { - libraries.push(input); - } + + if !args.no_std_lib { + linker.add_input("libc.so.6")?; } - let mut linker = Linker::new(); - - for filename in &object_files { - let file = match File::open(filename) { - Ok(file) => file, - Err(e) => { - eprintln!("Error opening {filename}: {e}"); - return; - } - 
}; - let mut file = BufReader::new(file); - if let Err(e) = linker.process_object(filename, &mut file) { - eprintln!("Error processing object file {filename}: {e}"); - return; - } + for input in inputs.iter() { + linker.add_input(input)?; } + + linker.link_to_file(&args.output, &args.entry) +} - use std::os::unix::fs::OpenOptionsExt; - let mut out_options = fs::OpenOptions::new(); - out_options - .write(true) - .create(true) - .truncate(true) - .mode(0o755); - - let mut output = match out_options.open("a.out") { - Ok(out) => BufWriter::new(out), - Err(e) => { - eprintln!("Error opening output file: {e}"); - return; - } - }; - - if let Err(e) = linker.link(&mut output) { - eprintln!("Error linking: {e}"); +fn main() { + if let Err(e) = main_() { + eprintln!("{e}"); } } diff --git a/test.c b/test.c index 8cf9a82..ef50528 100644 --- a/test.c +++ b/test.c @@ -2,10 +2,9 @@ #include int x; -void main() { - puts("HellO"); - puts("there"); +void entry() { + puts("Hello, world!"); //printf("hello\n"); - exit(123); + exit(0); } -- cgit v1.2.3