summaryrefslogtreecommitdiff
path: root/pom_parser/__init__.py
blob: 35d0f497e3b24944585079ddf41efbff3d4c009e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
r'''Parser for the [POM configuration file format](https://www.pom.computer).

\mainpage pom_parser

See \ref pom_parser.
'''
import io
from typing import Optional, Any, Iterable, Iterator

class Error(ValueError):
	r'''An error raised by pom_parser.

Several errors can be chained together through `next`; `str(error)` renders
the whole chain, one `file:line: message` entry per line.

Attributes
----------
- `next: Optional[Error]` -
	Next error (used when there are multiple errors in a file)
- `message: str` -
	Error message as a string. Note that this does not include
	file/line information, or all errors in a list, so you most
	likely want to use str(error) instead.
- `file: str` -
	File name where error occurred.
- `line: int` -
	Line number where error occurred.
- `line_number: int` -
	Same value as `line` (kept as an alias for backward compatibility).
'''

	next: Optional['Error']
	message: str
	file: str
	line: int
	line_number: int
	def __init__(self, file: str, line_number: int, message: str) -> None:
		self.file = file
		# `line` is the documented attribute; `line_number` used to be the only
		# one actually set, so keep both in sync.
		self.line = line_number
		self.line_number = line_number
		self.message = message
		self.next = None

	def __str__(self) -> str:
		# Walk the linked list of errors and render each one on its own line.
		err: Optional['Error'] = self
		messages = []
		while err:
			messages.append(f'{err.file}:{err.line_number}: {err.message}')
			err = err.next
		return '\n'.join(messages)

	@staticmethod
	def _from_list(l: list['Error']) -> 'Error':
		r'''Chain the errors in `l` through their `next` attributes and return the first one.

`l` must not be empty (an empty list raises `IndexError`).'''
		for (current, following) in zip(l, l[1:]):
			current.next = following
		return l[0]

class Item:
	r'''
An item (key-value pair) in a POM configuration.

Attributes
----------
- `key: str` -
	The key.
- `value: str` -
	The value.
- `file: str` -
	File name where item was defined.
- `line: int` -
	Line number where item was defined.
- `read: bool` -
	Has this item been accessed by a \ref pom_parser.Configuration `get_*` method?
'''
	key: str
	value: str
	file: str
	line: int
	read: bool
	def __repr__(self) -> str:
		return f'<Item {self.key} at {self.file}:{self.line}>'

	def _error(self, message: str) -> 'Error':
		r'''Build an error located at this item's file and line.'''
		return Error(self.file, self.line, message)

	def _parse_uint(self, using: Optional[str] = None) -> Optional[int]:
		r'''Parse a non-negative integer below 2^53, or return `None` if invalid.

Accepts an optional leading `+`, then either hexadecimal digits after
`0x`/`0X` or decimal digits without leading zeros.

\param using String to parse instead of `self.value`.'''
		s = self.value if using is None else using
		if s.startswith('+'):
			s = s[1:]
		if s.startswith(('0x', '0X')):
			digits = s[2:]
			# an empty digit string must be rejected here: int('', 16) raises
			if not digits or not all(c in '0123456789abcdefABCDEF' for c in digits):
				return None
			value = int(digits, 16)
		elif s == '0':
			return 0
		else:
			# reject empty strings (int('') raises) and leading zeros
			if not s or s.startswith('0'):
				return None
			if not all(c in '0123456789' for c in s):
				return None
			value = int(s)
		if value >> 53:
			return None
		return value

	def _parse_int(self) -> Optional[int]:
		r'''Parse `self.value` as an integer with absolute value below 2^53, or return `None`.'''
		sign = 1
		value = self.value
		if value.startswith('-'):
			# _parse_uint would accept the +, so rule out -+ explicitly
			if value.startswith('-+'):
				return None
			sign = -1
			value = value[1:]
		uint = self._parse_uint(value)
		if uint is None:
			return None
		return uint * sign

	def _parse_float(self) -> Optional[float]:
		r'''Parse `self.value` as a floating-point number, or return `None` if invalid.'''
		value = self.value
		if not value or not all(c in '0123456789eE+-.' for c in value):
			return None
		for (i, c) in enumerate(value):
			# ensure . is preceded and followed by digit
			if c == '.' and (i == 0 or i == len(value)-1 or \
				not value[i+1].isdigit() or not value[i-1].isdigit()):
				return None
		try:
			return float(value)
		except ValueError:
			# right alphabet, wrong shape (e.g. '1e', '+-', 'e5')
			return None

	def _parse_bool(self) -> Optional[bool]:
		r'''Parse `self.value` as yes/true/on or no/false/off, or return `None` if it is neither.'''
		value = self.value
		if value in ('yes', 'true', 'on'):
			return True
		if value in ('no', 'false', 'off'):
			return False
		return None

	def _parse_list(self) -> list[str]:
		r'''Split `self.value` on commas into entries stripped of spaces and tabs.

`\,` gives a literal comma and `\\` a literal backslash; any other backslash
(including one at the very end of the value) is kept as-is.
An empty final entry is dropped.'''
		chars = iter(self.value)
		list_ = []
		entry: list[str] = []
		while (c := next(chars, '')):
			if c == ',':
				list_.append(''.join(entry).strip(' \t'))
				entry = []
			elif c == '\\':
				c = next(chars, '')
				# Compare against a tuple, not a string: '' (end of value) is
				# "in" every string, which used to drop a trailing backslash.
				if c not in (',', '\\'):
					entry.append('\\')
				entry.append(c)
			else:
				entry.append(c)
		last_entry = ''.join(entry).strip(' \t')
		if last_entry:
			list_.append(last_entry)
		return list_

class Configuration:
	r'''A POM configuration.

Maps keys (with `.` separating sections from sub-keys) to
\ref pom_parser.Item objects and provides typed `get_*` accessors.
Instances are populated through `_init` rather than a constructor.'''
	# key (relative to this configuration) -> item
	_items: dict[str, 'Item']
	# section name (relative to this configuration) -> (file, line) of its earliest item
	_section_locations: dict[str, tuple[str, int]]
	def __repr__(self) -> str:
		return '\n'.join(f'{item.key}: {repr(item.value)}' for item in self._items.values())

	def _init(self, items: dict[str, 'Item']) -> None:
		r'''Install `items` and index the location of every section they imply.

Sections are derived from the dictionary keys (not `item.key`), so the index
is also correct for configurations returned by \ref section, whose keys are
relative to the section.'''
		self._items = items
		self._section_locations = {}
		for key, item in items.items():
			# every prefix of the key ending just before a '.' names a section
			for (i, c) in enumerate(key):
				if c != '.':
					continue
				section = key[:i]
				known = self._section_locations.get(section)
				if known is None or known[1] > item.line:
					self._section_locations[section] = (item.file, item.line)

	def has(self, key: str) -> bool:
		r'''Returns whether this configuration contains `key`.'''
		return key in self._items

	def location(self, key: str) -> Optional[tuple[str, int]]:
		r'''Returns the location of `key` as `(filename, line_number)`, or `None` if it's not defined.

If `key` is not itself defined but names a section, the location of the
item with the lowest line number in that section is returned.'''
		item = self._items.get(key)
		if item is None:
			return self._section_locations.get(key, None)
		return (item.file, item.line)

	def get(self, key: str, default: Optional[str] = None) -> Optional[str]:
		r'''Get value associated with `key`.

\param key Key to look up
\param default Default to use when `key` is not defined
		'''
		item = self._items.get(key)
		if item is None:
			return default
		item.read = True
		return item.value

	def get_uint(self, key: str, default: Optional[int] = None) -> Optional[int]:
		r'''Get value associated with `key`, and parse as an unsigned integer.

\param key Key to look up
\param default Default to use when `key` is not defined

\exception pom_parser.Error The key is defined, but its value is
not a valid unsigned integer (< 2^53).
		'''
		item = self._items.get(key)
		if item is None:
			return None if default is None else int(default)
		item.read = True
		uint = item._parse_uint()
		if uint is None:
			raise item._error(f'Value {repr(item.value)} for {item.key} is '
				'not a valid (non-negative) integer.')
		return uint

	def get_int(self, key: str, default: Optional[int] = None) -> Optional[int]:
		r'''Get value associated with `key`, and parse as an integer.

\param key Key to look up
\param default Default to use when `key` is not defined

\exception pom_parser.Error The key is defined, but
its value is not a valid integer (with absolute value < 2^53).
		'''
		item = self._items.get(key)
		if item is None:
			return None if default is None else int(default)
		item.read = True
		intv = item._parse_int()
		if intv is None:
			raise item._error(f'Value {repr(item.value)} for {item.key} is not a valid integer.')
		return intv

	def get_float(self, key: str, default: Optional[float] = None) -> Optional[float]:
		r'''Get value associated with `key`, and parse as a floating-point number.

\param key Key to look up
\param default Default to use when `key` is not defined

\exception pom_parser.Error The key is defined, but its value is not a valid floating-point number.
		'''
		item = self._items.get(key)
		if item is None:
			return None if default is None else float(default)
		item.read = True
		floatv = item._parse_float()
		if floatv is None:
			raise item._error(f'Value {repr(item.value)} for {item.key} is not a valid number.')
		return floatv

	def get_bool(self, key: str, default: Optional[bool] = None) -> Optional[bool]:
		r'''Get value associated with `key`, and parse as a boolean (yes/no/on/off/true/false).

\param key Key to look up
\param default Default to use when `key` is not defined

\exception pom_parser.Error The key is defined, but its value is not one of the six mentioned above.
		'''
		item = self._items.get(key)
		if item is None:
			return None if default is None else bool(default)
		item.read = True
		boolv = item._parse_bool()
		if boolv is None:
			raise item._error(f'Value {repr(item.value)} for {item.key} is '
				'invalid (want on/off/yes/no/true/false)')
		return boolv

	def get_list(self, key: str, default: Optional[list[str]] = None) -> Optional[list[str]]:
		r'''Get value associated with `key`, and parse as a comma-separated list.

Literal commas can be included in the list by using `\,`.

\param key Key to look up
\param default Default to use when `key` is not defined (returned as-is, not copied)
		'''
		item = self._items.get(key)
		if item is None:
			return default
		item.read = True
		return item._parse_list()


	def items(self) -> Iterator['Item']:
		r'''Get all items (key-value pairs) in configuration.

The returned items are shallow copies, so modifying them does not affect this configuration.

The order of the returned items is arbitrary and may change in future versions.'''
		import copy
		return iter(map(copy.copy, self._items.values()))

	def keys(self) -> Iterator[str]:
		r'''Get all "direct" keys (unique first components of keys) in configuration.

The order of the returned keys is arbitrary and may change in future versions.'''
		return iter({key.split('.', 1)[0] for key in self._items})

	def unread_keys(self) -> Iterator[str]:
		r'''Get all keys which have not been accessed using a `get_*` method.

The order of the returned keys is arbitrary and may change in future versions.'''
		return (item.key for item in self._items.values() if not item.read)

	def section(self, name: str) -> 'Configuration':
		r'''Extract a "section" out of a configuration.

Specifically, this will return a configuration consisting of all keys starting
with `name.` (with the `name.` stripped out) and their values.

Matching is done against the keys of this configuration, so taking a section
of a section works. The items in the result are shallow copies; their `key`
attribute is left unchanged.
'''
		import copy
		name_dot = name + '.'
		section_items = {
			key[len(name_dot):]: copy.copy(item)
			for key, item in self._items.items()
			if key.startswith(name_dot)
		}
		conf = Configuration()
		conf._init(section_items)
		return conf

	def merge(self, other: 'Configuration') -> 'Configuration':
		r'''Merge `other` configuration into `self`, preferring values in `other`.

Neither `self` nor `other` is modified: a new configuration holding shallow
copies of the items is returned.'''
		import copy
		new_items = {key: copy.copy(item) for key, item in other._items.items()}
		for key, item in self._items.items():
			if key not in new_items:
				new_items[key] = copy.copy(item)
		conf = Configuration()
		conf._init(new_items)
		return conf


def _parse_hex_digit(d: Optional[str]) -> Optional[int]:
	r'''Return the value (0-15) of the single hexadecimal digit `d`.

Returns `None` when `d` is `None` or is not exactly one of the characters
`0-9`, `a-f`, `A-F`.'''
	if not isinstance(d, str) or len(d) != 1:
		return None
	# each alphabet is paired with the value of its first character
	for alphabet, base in (('0123456789', 0), ('abcdef', 10), ('ABCDEF', 10)):
		offset = alphabet.find(d)
		if offset >= 0:
			return base + offset
	return None

class _Parser:
	r'''Parser for a single POM file.

Reads `file` line by line (as bytes, decoded as UTF-8), collecting parsed
items in `items` and problems in `errors`. Parsing continues after an error
so that several errors can be reported at once.'''
	# number of the line most recently read (1-based; 0 before any read)
	line_number: int
	# file name used in error messages and item locations
	filename: str
	# name of the section most recently opened with [section], '' if none
	current_section: str
	errors: list['Error']
	file: io.BufferedIOBase
	# full key -> item
	items: dict[str, 'Item']

	def __init__(self, filename: str, file: io.BufferedIOBase):
		self.errors = []
		self.filename = filename
		self.file = file
		self.line_number = 0
		self.current_section = ''
		self.items = {}

	def _error(self, message: str) -> None:
		r'''Record an error at the current line.'''
		self.errors.append(Error(self.filename, self.line_number, message))

	def _check_key(self, key: str) -> None:
		r'''Record an error if `key` is empty, has misplaced dots, or contains an illegal character.'''
		if not key:
			self._error('Empty key (expected something before =)')
			return
		if '..' in key:
			self._error(f"Key {key} shouldn't contain ..")
			return
		if key.startswith('.'):
			self._error(f"Key {key} shouldn't start with .")
			return
		if key.endswith('.'):
			self._error(f"Key {key} shouldn't end with .")
			return
		for c in key:
			o = ord(c)
			# Bit n of this constant is set when the ASCII character with
			# code n is illegal in a key. Code points >= 128 shift the
			# constant down to 0 and are therefore always accepted.
			if (0xf800000178000001fc001bffffffffff >> o) & 1:
				self._error(f"Key {key} contains illegal character {c}")

	def _process_escape_sequence(self, chars: Iterator[str]) -> str:
		r'''Decode the escape sequence following a backslash.

\param chars Iterator positioned just after the backslash; the characters
making up the sequence are consumed from it.

Returns the decoded text, or `''` after recording an error when the
sequence is invalid.'''
		def bad_escape_sequence(chs: Iterable[Optional[str]]) -> str:
			seq = ''.join(c for c in chs if c)
			self._error(f'Invalid escape sequence: \\{seq}')
			return ''
		c = next(chars, None)
		simple_sequences: dict[str | None, str] = {
			'n': '\n', 't': '\t', 'r': '\r',
			'\'': '\'', '"': '"', '`': '`',
			# \, stays escaped in the value so that list parsing can
			# tell it apart from a separator
			',': '\\,', '\\': '\\'
		}
		simple = simple_sequences.get(c)
		if simple is not None:
			return simple
		if c == 'x':
			# \xNN: exactly two hex digits, value in 0x01..0x7F
			c1 = next(chars, None)
			c2 = next(chars, None)
			dig1 = _parse_hex_digit(c1)
			dig2 = _parse_hex_digit(c2)
			if dig1 is None or dig2 is None:
				return bad_escape_sequence((c, c1, c2))
			value = dig1 << 4 | dig2
			if value == 0 or value >= 0x80:
				return bad_escape_sequence((c, c1, c2))
			return chr(value)
		if c == 'u':
			# \u{N...}: up to six hex digits between braces
			open_brace = next(chars, None)
			if open_brace != '{':
				return bad_escape_sequence((c, open_brace))
			sequence: list[str | None] = ['u{']
			value = 0
			for i in range(7):
				c = next(chars, None)
				sequence.append(c)
				if c == '}':
					break
				if i == 6:
					# seventh character must have been the closing brace
					return bad_escape_sequence(sequence)
				digit = _parse_hex_digit(c)
				if digit is None:
					return bad_escape_sequence(sequence)
				value <<= 4
				value |= digit
			# reject NUL, surrogates and values beyond the Unicode range
			if value == 0 or \
				0xD800 <= value <= 0xDFFF or \
				value > 0x10FFFF:
				return bad_escape_sequence(sequence)
			return chr(value)
		bad_escape_sequence((c,))
		return ''

	def _read_line(self) -> Optional[str]:
		r'''Read the next line, without its line ending.

Returns `None` at end of file. If the line is not valid UTF-8 or contains
an ASCII control character other than tab, an error is recorded and `''`
is returned.'''
		line_bytes = self.file.readline()
		if not line_bytes:
			return None
		self.line_number += 1
		try:
			line = line_bytes.decode()
		except UnicodeDecodeError:
			self._error('Bad UTF-8')
			return ''
		if self.line_number == 1 and line.startswith('\ufeff'):
			# skip byte order mark
			line = line[1:]
		if line.endswith('\r\n'):
			line = line[:-2]
		elif line.endswith('\n'):
			line = line[:-1]
		for c in line:
			if ord(c) < 32 and c != '\t':
				self._error(f'Invalid character in file: ASCII control character {ord(c)}')
				return ''
		return line

	def _parse_quoted_value(self, value_start: str) -> str:
		r'''Parse a quoted value, which may continue over several lines.

\param value_start Rest of the current line, starting at the opening
delimiter (`"` or `` ` ``).

Returns the unescaped value. If the closing delimiter is never found, an
error is recorded at the line where the value started and `''` is returned.'''
		delimiter = value_start[0]
		start_line = self.line_number
		line = value_start[1:] + '\n'
		value = []
		while True:
			chars = iter(line)
			while (c := next(chars, None)) is not None:
				if c == '\\':
					value.append(self._process_escape_sequence(chars))
				elif c == delimiter:
					# only whitespace may follow the closing delimiter
					for stray in chars:
						if stray not in ' \t\n':
							self._error(f'Stray {stray} after string.')
					return ''.join(value)
				else:
					value.append(c)
			next_line = self._read_line()
			if next_line is None:
				# report the error where the string was opened
				self.line_number = start_line
				self._error(f'Closing {delimiter} not found.')
				return ''
			line = next_line + '\n'

	def _parse_line(self) -> bool:
		r'''Parse the next line (plus any continuation lines of a quoted value).

Returns `False` at end of file and `True` otherwise, whether or not the
line was valid.'''
		line = self._read_line()
		if line is None:
			return False
		line = line.lstrip(' \t')
		if not line or line.startswith('#'):
			# blank line or comment
			return True
		if line.startswith('['):
			line = line.rstrip(' \t')
			if not line.endswith(']'):
				self._error('[ with no matching ]')
				return True
			self.current_section = line[1:-1]
			# [] is allowed: it goes back to having no section
			if self.current_section:
				self._check_key(self.current_section)
			return True
		equals_idx = line.find('=')
		if equals_idx == -1:
			self._error('Invalid line — should either start with [ or contain =')
			return True
		relative_key = line[:equals_idx].rstrip(' \t')
		self._check_key(relative_key)
		# Remember where the item is defined now: parsing a multi-line quoted
		# value advances self.line_number to the line of the closing quote.
		definition_line = self.line_number
		value = line[equals_idx+1:].lstrip(' \t')
		if value.startswith(('"', '`')):
			value = self._parse_quoted_value(value)
		else:
			value = value.rstrip(' \t')
		key = f'{self.current_section}.{relative_key}' if self.current_section else relative_key
		item = Item()
		item.key = key
		item.read = False
		item.value = value
		item.file = self.filename
		item.line = definition_line
		self.items[key] = item
		return True

def load_file(filename: str, file: io.BufferedIOBase) -> Configuration:
	r'''Parse a configuration out of a binary file object.

\param filename File name to use for errors.
\param file File object, such as one returned from `open` (in binary mode).

\exception pom_parser.Error The configuration is invalid in some way
(all errors found are chained together).'''
	parser = _Parser(filename, file)
	# consume the whole file, even after errors, so every error gets reported
	more_lines = True
	while more_lines:
		more_lines = parser._parse_line()
	if not parser.errors:
		conf = Configuration()
		conf._init(parser.items)
		return conf
	raise Error._from_list(parser.errors)

def load_string(filename: str, string: str) -> Configuration:
	r'''Parse a configuration held in a string.

\param filename File name to use for errors.
\param string String containing configuration.

\exception pom_parser.Error The configuration is invalid in some way.'''
	# the parser works on bytes, so wrap the encoded text in an in-memory file
	encoded = string.encode()
	stream = io.BytesIO(encoded)
	return load_file(filename, stream)

def load_path(path: str) -> Configuration:
	r'''Parse the configuration stored in the file at `path`.

\param path Path of the file to read; also used as the file name for errors.

\exception pom_parser.Error The configuration is invalid in some way.'''
	# binary mode: decoding is done line by line by the parser
	with open(path, 'rb') as stream:
		conf = load_file(path, stream)
	return conf