diff options
-rw-r--r-- | src/main.rs | 417 |
1 files changed, 317 insertions, 100 deletions
diff --git a/src/main.rs b/src/main.rs index 53d0cf7..e4ac72c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,12 +3,15 @@ use io::{BufRead, BufReader, BufWriter, Read, Seek, Write}; use std::collections::{BTreeMap, HashMap}; use std::{fmt, fs, io, mem, ptr}; +#[cfg(target_endian = "big")] +compile_error! {"WHY do you have a big endian machine???? it's the 21st century, buddy. this program won't work fuck you"} + mod elf; pub enum LinkError { IO(io::Error), TooLarge, - NoEntry(String), // no entry point + NoEntry(String), // no entry point EntryNotDefined(String), // entry point is declared, but not defined } @@ -18,8 +21,8 @@ impl fmt::Display for LinkError { match self { IO(e) => write!(f, "IO error: {e}"), TooLarge => write!(f, "executable file would be too large."), - NoEntry(name) => write!(f, "entry point {name} not found."), - EntryNotDefined(name) => write!(f, "entry point {name} declared, but not defined."), + NoEntry(name) => write!(f, "entry point '{name}' not found."), + EntryNotDefined(name) => write!(f, "entry point '{name}' declared, but not defined."), } } } @@ -37,19 +40,23 @@ impl From<&LinkError> for String { } pub enum LinkWarning { - SymNotFound(String), - RelocationIgnored(u64), + RelSymNotFound { source: String, name: String }, + RelOOB(String, u64), + RelNoData(String, u64), + RelNoValue(String), } impl fmt::Display for LinkWarning { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use LinkWarning::*; match self { - SymNotFound(s) => write!(f, "symbol not found: {s}"), - RelocationIgnored(offset) => write!( + RelSymNotFound { source, name } => write!(f, "undefined symbol '{name}' (in {source}) (relocation ignored)."), + RelOOB(text, offset) => write!(f, "relocation applied to {text}+0x{offset:x}, which goes outside of the symbol (it will be ignored)."), + RelNoData(source, offset) => write!( f, - "offset {offset} not in a data/text section. relocation will not be applied." + "offset {source}+0x{offset:x} not in a data/text section. relocation will be ignored." ), + RelNoValue(name) => write!(f, "can't figure out value of symbol '{name}' (relocation ignored)."), } } } @@ -73,6 +80,7 @@ pub enum ElfError { BadRelHeader, UnsupportedRelocation(u8), BadSymIdx(u64), + NoStrtab, IO(io::Error), } @@ -105,6 +113,7 @@ impl fmt::Display for ElfError { UnsupportedRelocation(x) => write!(f, "unsupported relocation type: {x}"), BadLink(i) => write!(f, "bad ELF link: {i}"), BadSymIdx(i) => write!(f, "bad symbol index: {i}"), + NoStrtab => write!(f, "object has no .strtab section"), } } } @@ -166,8 +175,9 @@ impl SourceId { const NONE: Self = Self(u32::MAX); } +type SymbolIdType = u32; #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -struct SymbolId(u32); +struct SymbolId(SymbolIdType); #[derive(Copy, Clone, Debug)] enum SymbolType { @@ -193,6 +203,11 @@ struct SymbolInfo { struct Symbols { info: Vec<SymbolInfo>, + // this field isn't strictly needed for linking, + // but it lets us print nice error messages like + // error linking main.c:some_function instead of + // error linking symbol 387 + locations: HashMap<SymbolId, (SourceId, SymbolName)>, global: HashMap<SymbolName, SymbolId>, weak: HashMap<SymbolName, SymbolId>, local: HashMap<(SourceId, SymbolName), SymbolId>, @@ -205,48 +220,57 @@ impl Symbols { global: HashMap::new(), weak: HashMap::new(), local: HashMap::new(), + locations: HashMap::new(), } } - fn add_(&mut self, info: SymbolInfo) -> SymbolId { + fn add_(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { let id = SymbolId(self.info.len() as _); self.info.push(info); + self.locations.insert(id, (source, name)); id } - fn add_weak(&mut self, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(info); + fn add_weak(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = self.add_(source, name, info); self.weak.insert(name, id); id } fn add_local(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(info); + let id = self.add_(source, name, info); self.local.insert((source, name), id); id } - fn add_global(&mut self, name: SymbolName, info: SymbolInfo) -> SymbolId { - let id = self.add_(info); + fn add_global(&mut self, source: SourceId, name: SymbolName, info: SymbolInfo) -> SymbolId { + let id = self.add_(source, name, info); self.global.insert(name, id); id } + fn get_mut_info_from_id(&mut self, id: SymbolId) -> Option<&mut SymbolInfo> { + self.info.get_mut(id.0 as usize) + } + fn get_info_from_id(&self, id: SymbolId) -> Option<&SymbolInfo> { self.info.get(id.0 as usize) } fn get_id_from_name(&self, source: SourceId, name: SymbolName) -> Option<SymbolId> { - self - .local + self.local .get(&(source, name)) .or_else(|| self.global.get(&name)) .or_else(|| self.weak.get(&name)) .map(|r| *r) } - - fn get_info_from_name(&self, source: SourceId, name: SymbolName) -> Option<&SymbolInfo> { - self.get_info_from_id(self.get_id_from_name(source, name)?) + + fn get_location_from_id(&self, id: SymbolId) -> Option<(SourceId, SymbolName)> { + self.locations.get(&id).map(|r| *r) + } + + fn count(&self) -> usize { + self.info.len() } } @@ -277,20 +301,25 @@ impl RelocationType { struct Relocation { r#where: (SymbolId, u64), // (symbol containing relocation, offset in symbol where relocation needs to be applied) source_id: SourceId, + source_offset: u64, sym: SymbolName, r#type: RelocationType, addend: i64, } struct Linker { - strtab_offset: u64, - source_count: u32, + src_strtab_offset: u64, // .strtab offset in current object file + src_shstrtab_offset: u64, // .shstrtab offset in current object file symbols: Symbols, symbol_names: SymbolNames, relocations: Vec<Relocation>, sections: Vec<elf::Shdr32>, - warnings: Vec<LinkWarning>, - bss_size: u64, + sources: Vec<String>, + bss_size: u64, // output bss size + bss_addr: u64, // output bss address + data_addr: u64, // output data address + symbol_addrs: HashMap<SymbolId, u64>, // output addresses of symbols + warn: fn(LinkWarning), } // this maps between offsets in an object file and symbols defined in that file. @@ -326,22 +355,59 @@ impl AddrMap { } } +// graph of which symbols use which symbols +// this is needed so we don't emit anything for unused symbols. +type SymbolGraph = HashMap<SymbolId, Vec<SymbolId>>; + +const MAX_REL_SIZE: usize = 8; // this seems reasonable + impl Linker { - fn new() -> Self { + fn default_warn_handler(warning: LinkWarning) { + eprintln!("warning: {warning}"); + } + + // why use fn of all things to transmit warnings? + // well, it's very nice for stuff to not need a mutable reference + // to emit warnings, and this is basically the only way of doing it. + // if you need to mutate state in your warning handler, you can always + // use a mutex. + pub fn _set_warning_handler(&mut self, warn: fn(LinkWarning)) { + self.warn = warn; + } + + pub fn new() -> Self { Linker { symbols: Symbols::new(), symbol_names: SymbolNames::new(), - source_count: 0, - strtab_offset: 0, + src_strtab_offset: 0, + src_shstrtab_offset: 0, + bss_addr: 0, bss_size: 0, + data_addr: 0, sections: vec![], relocations: vec![], - warnings: vec![], + sources: vec![], + symbol_addrs: HashMap::new(), + warn: Self::default_warn_handler, } } - fn get_str(&self, reader: &mut BufReader<File>, offset: u32) -> Result<String, ElfError> { - reader.seek(io::SeekFrom::Start(offset as u64 + self.strtab_offset))?; + fn source_name(&self, id: SourceId) -> &str { + &self.sources[id.0 as usize] + } + + fn get_shstrtab(&self, reader: &mut BufReader<File>, offset: u32) -> Result<String, ElfError> { + reader.seek(io::SeekFrom::Start( + offset as u64 + self.src_shstrtab_offset, + ))?; + let mut bytes = vec![]; + reader.read_until(0, &mut bytes)?; + bytes.pop(); // remove terminating \0 + String::from_utf8(bytes).map_err(|_| ElfError::BadUtf8) + } + + fn get_strtab(&self, reader: &mut BufReader<File>, offset: u32) -> Result<String, ElfError> { + reader.seek(io::SeekFrom::Start(offset as u64 + self.src_strtab_offset))?; let mut bytes = vec![]; reader.read_until(0, &mut bytes)?; bytes.pop(); // remove terminating \0 @@ -349,7 +415,7 @@ impl Linker { } // returns SymbolName corresponding to the symbol - fn add_symbol( + fn read_symbol( &mut self, source: SourceId, addr_map: &mut AddrMap, @@ -370,7 +436,7 @@ impl Linker { let sym: ElfSym = unsafe { mem::transmute(sym_buf) }; let r#type = sym.info & 0xf; let bind = sym.info >> 4; - let name = self.get_str(reader, sym.name)?; + let name = self.get_strtab(reader, sym.name)?; let name_id = self.symbol_names.add(name); let size = sym.size as u64; @@ -414,8 +480,8 @@ impl Linker { }; let symbol_id = match bind { elf::STB_LOCAL => self.symbols.add_local(source, name_id, info), - elf::STB_GLOBAL => self.symbols.add_global(name_id, info), - elf::STB_WEAK => self.symbols.add_weak(name_id, info), + elf::STB_GLOBAL => self.symbols.add_global(source, name_id, info), + elf::STB_WEAK => self.symbols.add_weak(source, name_id, info), _ => return Ok(name_id), }; @@ -427,7 +493,7 @@ impl Linker { fn add_relocation_x86( &mut self, - symtab: &HashMap<u32, SymbolName>, + symtab: &[SymbolName], addr_map: &AddrMap, source_id: SourceId, offset: u64, @@ -438,11 +504,12 @@ impl Linker { let sym_idx = info >> 8; if let Some(r#where) = addr_map.get(offset) { - match symtab.get(&sym_idx) { + match symtab.get(sym_idx as usize) { Some(sym) => { self.relocations.push(Relocation { r#where, source_id, + source_offset: offset, sym: *sym, r#type: RelocationType::from_x86_u8(r#type)?, addend: addend.into(), @@ -451,20 +518,27 @@ impl Linker { None => return Err(ElfError::BadSymIdx(sym_idx.into())), } } else { - self.warnings.push(LinkWarning::RelocationIgnored(offset)); + self.emit_warning(LinkWarning::RelNoData( + self.source_name(source_id).into(), + offset, + )); } Ok(()) } - pub fn process_object(&mut self, reader: &mut BufReader<File>) -> Result<(), ElfError> { + pub fn process_object( + &mut self, + name: &str, + reader: &mut BufReader<File>, + ) -> Result<(), ElfError> { use ElfError::*; let mut addr_map = AddrMap::new(); reader.seek(io::SeekFrom::Start(0))?; - let source_id = SourceId(self.source_count); - self.source_count += 1; + let source_id = SourceId(self.sources.len() as _); + self.sources.push(name.into()); let mut elf = [0u8; 0x34]; reader.read_exact(&mut elf)?; @@ -490,8 +564,8 @@ impl Linker { } let mut shdr_buf = [0u8; 0x28]; - self.strtab_offset = { - // read .strtab header + self.src_shstrtab_offset = { + // read .shstrtab header reader.seek(elf.section_seek(elf.shstrndx))?; reader.read_exact(&mut shdr_buf)?; let shdr: elf::Shdr32 = unsafe { mem::transmute(shdr_buf) }; @@ -504,12 +578,18 @@ impl Linker { reader.seek(elf.section_seek(s_idx))?; reader.read_exact(&mut shdr_buf)?; let shdr: elf::Shdr32 = unsafe { mem::transmute(shdr_buf) }; - let name = self.get_str(reader, shdr.name)?; + let name = self.get_shstrtab(reader, shdr.name)?; sections_by_name.insert(name.clone(), shdr.clone()); self.sections.push(shdr); } - let mut symtab = HashMap::new(); + self.src_strtab_offset = if let Some(strtab) = sections_by_name.get(".strtab") { + strtab.offset.into() + } else { + return Err(NoStrtab); + }; + + let mut symtab = vec![]; if let Some(shdr) = sections_by_name.get(".symtab") { // read .symtab let size = shdr.size as u64; @@ -522,17 +602,17 @@ impl Linker { symtab.reserve(count as usize); for sym_idx in 0..count { reader.seek(io::SeekFrom::Start(offset + sym_idx as u64 * entsize))?; - let name = self.add_symbol(source_id, &mut addr_map, reader)?; - symtab.insert(sym_idx, name); + let name = self.read_symbol(source_id, &mut addr_map, reader)?; + symtab.push(name); } } for shdr in sections_by_name.values() { - // we only process relocations relating to .symtab currently. + // @TODO @FIX we only process relocations relating to .symtab currently. match self.sections.get(shdr.link as usize) { None => continue, Some(h) => { - if self.get_str(reader, h.name)? != ".symtab" { + if self.get_shstrtab(reader, h.name)? != ".symtab" { continue; } } @@ -628,55 +708,194 @@ impl Linker { Ok(()) } - fn get_name_str(&self, id: SymbolName) -> Option<&str> { - self.symbol_names.get_str(id) + fn symbol_name_str(&self, id: SymbolName) -> &str { + self.symbol_names.get_str(id).unwrap_or("???") } - // get symbol, producing a warning if it does not exist. - fn get_symbol(&mut self, source_id: SourceId, name: SymbolName) -> Option<&SymbolInfo> { - let sym = self.symbols.get_info_from_name(source_id, name); + fn emit_warning(&self, warning: LinkWarning) { + (self.warn)(warning); + } + + fn emit_warning_rel_sym_not_found(&self, source: SourceId, name: SymbolName) { + let warn = LinkWarning::RelSymNotFound { + source: self.source_name(source).into(), + name: self.symbol_name_str(name).into(), + }; + self.emit_warning(warn); + } + + // get symbol ID, producing a warning if it does not exist. + fn get_symbol_id(&self, source_id: SourceId, name: SymbolName) -> Option<SymbolId> { + // @TODO: don't warn about the same symbol twice + let sym = self.symbols.get_id_from_name(source_id, name); if sym.is_none() { - let warn = LinkWarning::SymNotFound(self.get_name_str(name).unwrap_or("???").into()); - self.warnings.push(warn); + self.emit_warning_rel_sym_not_found(source_id, name); } sym } - fn apply_relocation(&mut self, rel: Relocation) -> Result<(), LinkError> { - let symbol = match self.get_symbol(rel.source_id, rel.sym) { - None => return Ok(()), + // generates a string like main.c:some_function + fn symbol_id_location_string(&self, id: SymbolId) -> String { + if let Some((source, name)) = self.symbols.get_location_from_id(id) { + return format!( + "{}:{}", + self.source_name(source), + self.symbol_name_str(name) + ); + } + "???".into() + } + + fn get_symbol_value(&self, sym: SymbolId) -> Option<u64> { + let info = self.symbols.get_info_from_id(sym)?; + use SymbolValue::*; + match (&info.value).as_ref()? { + Data(_) => self.symbol_addrs.get(&sym).map(|r| *r), + Bss(x) => Some(self.bss_addr + *x), + Absolute(a) => Some(*a), + } + } + + fn get_relocation_data( + &self, + rel: &Relocation, + pc: u64, + data: &mut [u8; MAX_REL_SIZE], + ) -> Result<usize, LinkError> { + // use RelocationType::*; + + let symbol = match self.get_symbol_id(rel.source_id, rel.sym) { + None => return Ok(0), // we emitted a warning in get_symbol_id Some(sym) => sym, }; - println!("{rel:?} {symbol:?}"); + + let symbol_value = match self.get_symbol_value(symbol) { + None => { + self.emit_warning(LinkWarning::RelNoValue(self.symbol_id_location_string(symbol))); + return Ok(0) + }, + Some(v) => v, + }; + + let addend = rel.addend; + + enum Value { + U32(u32), + } + use Value::*; + use RelocationType::*; + + let value = match rel.r#type { + Direct32 => U32(symbol_value as u32 + addend as u32), + Pc32 => U32(symbol_value as u32 + addend as u32 - pc as u32), + _ => todo!(), + }; + + match value { + U32(u) => { + (&mut data[..4]).copy_from_slice(&u32::to_le_bytes(u)); + Ok(4) + }, + } + } + + fn apply_relocation(&mut self, rel: Relocation) -> Result<(), LinkError> { + let apply_symbol = rel.r#where.0; + let apply_offset = rel.r#where.1; + + let apply_addr = match self.symbol_addrs.get(&apply_symbol) { + None => return Ok(()), // this relocation isn't in a section we care about + Some(a) => *a, + }; + + let mut rel_data = [0; MAX_REL_SIZE]; + let rel_data_size = self.get_relocation_data(&rel, apply_addr, &mut rel_data)?; + let rel_data = &rel_data[..rel_data_size]; + + let apply_symbol_info = match self.symbols.get_mut_info_from_id(apply_symbol) { + Some(info) => info, + None => { + // this shouldn't happen. + self.emit_warning_rel_sym_not_found(rel.source_id, rel.sym); + return Ok(()); + } + }; + + use SymbolValue::*; + let mut oob = false; + match &mut apply_symbol_info.value { + Some(Data(data)) => { + let apply_start = apply_offset as usize; + let apply_end = apply_start + rel_data.len(); + if apply_end < apply_start || apply_end > data.len() { + oob = true; + } else { + (&mut data[apply_start..apply_end]).copy_from_slice(rel_data); + } + } + _ => { + self.emit_warning(LinkWarning::RelNoData( + self.source_name(rel.source_id).into(), + rel.source_offset, + )); + } + } + + if oob { + // prevent double mut borrow by moving this here + self.emit_warning(LinkWarning::RelOOB( + self.symbol_id_location_string(apply_symbol), + apply_offset, + )); + } + Ok(()) } - + // we don't want to link unused symbols. // we start by calling this on the entry function, then it recursively calls itself for each symbol used. - pub fn add_data_for_symbol(&mut self, data: &mut Vec<u8>, symbol_graph: &HashMap<SymbolId, Vec<SymbolId>>, - symbol_addrs: &mut HashMap<SymbolId, u64>, id: SymbolId) -> Result<(), LinkError> { + pub fn add_data_for_symbol( + &mut self, + data: &mut Vec<u8>, + symbol_graph: &SymbolGraph, + id: SymbolId, + ) -> Result<(), LinkError> { // deal with cycles - if symbol_addrs.contains_key(&id) { + if self.symbol_addrs.contains_key(&id) { return Ok(()); } - symbol_addrs.insert(id, data.len() as u64); + self.symbol_addrs + .insert(id, self.data_addr + (data.len() as u64)); for reference in symbol_graph.get(&id).unwrap_or(&vec![]) { - self.add_data_for_symbol(data, symbol_graph, symbol_addrs, *reference)?; + self.add_data_for_symbol(data, symbol_graph, *reference)?; } - + Ok(()) } - pub fn link<T: Write>( - &mut self, - out: &mut BufWriter<T>, - ) -> Result<Vec<LinkWarning>, LinkError> { - // we have to use an index because for all rust knows, - // apply_relocation modifies self.relocations (it doesn't). - for i in 0..self.relocations.len() { - self.apply_relocation(self.relocations[i].clone())?; + pub fn link<T: Write>(&mut self, out: &mut BufWriter<T>) -> Result<(), LinkError> { + let mut symbol_graph = SymbolGraph::with_capacity(self.symbols.count()); + + let relocations = mem::take(&mut self.relocations); + + // compute symbol graph + for rel in relocations.iter() { + use std::collections::hash_map::Entry; + if let Some(symbol) = self.get_symbol_id(rel.source_id, rel.sym) { + let apply_symbol = rel.r#where.0; + match symbol_graph.entry(apply_symbol) { + Entry::Occupied(mut o) => { + o.get_mut().push(symbol); + } + Entry::Vacant(v) => { + v.insert(vec![symbol]); + } + } + } } + let symbol_graph = symbol_graph; // no more mutating + let segment_addr: u32 = 0x400000; let data_size = 0; @@ -692,25 +911,28 @@ impl Linker { header.entry = entry_point; out.write_all(&header.to_bytes())?; - //let data_addr = segment_addr + header_size; - //let bss_addr = segment_addr + file_size; + let data_addr = segment_addr + header_size; + self.data_addr = data_addr.into(); + let bss_addr = segment_addr + file_size; + self.bss_addr = bss_addr.into(); let bss_size: u32 = self.bss_size.try_into().map_err(|_| LinkError::TooLarge)?; - + let entry_name_str = "entry"; - let entry_name_id = self.symbol_names.get(entry_name_str).ok_or_else(|| LinkError::NoEntry(entry_name_str.into()))?; - let entry_id = self.symbols.get_id_from_name(SourceId::NONE, entry_name_id).ok_or_else(|| LinkError::EntryNotDefined(entry_name_str.into()))?; - let mut symbol_addrs = HashMap::new(); - - self.add_data_for_symbol(&mut symbol_addrs, entry_id); -// -// let _get_symbol_value = |val: SymbolValue| -> u64 { -// use SymbolValue::*; -// match val { -// Absolute(n) => n, -// Bss(x) => bss_addr as u64 + x, -// Data(_d) => todo!(), -// } -// }; + let entry_name_id = self + .symbol_names + .get(entry_name_str) + .ok_or_else(|| LinkError::NoEntry(entry_name_str.into()))?; + let entry_id = self + .symbols + .get_id_from_name(SourceId::NONE, entry_name_id) + .ok_or_else(|| LinkError::EntryNotDefined(entry_name_str.into()))?; + + let mut data = vec![]; + self.add_data_for_symbol(&mut data, &symbol_graph, entry_id)?; + + for rel in relocations { + self.apply_relocation(rel)?; + } let phdr = elf::Phdr32 { flags: 0b111, // read, write, execute @@ -722,7 +944,7 @@ impl Linker { }; out.write_all(&phdr.to_bytes())?; - Ok(mem::take(&mut self.warnings)) + Ok(()) } } @@ -766,7 +988,7 @@ fn main() { } }; let mut file = BufReader::new(file); - if let Err(e) = linker.process_object(&mut file) { + if let Err(e) = linker.process_object(filename, &mut file) { eprintln!("Error processing object file {filename}: {e}"); return; } @@ -788,12 +1010,7 @@ fn main() { } }; - match linker.link(&mut output) { - Err(e) => eprintln!("Error linking: {e}"), - Ok(warnings) => { - for warning in warnings { - eprintln!("Warning: {warning}"); - } - } + if let Err(e) = linker.link(&mut output) { + eprintln!("Error linking: {e}"); } } |