Coverage for ase / io / ulm.py: 90.48%
378 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 08:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 08:22 +0000
1# fmt: off
3"""
4ULM files
5=========
7*Simple and efficient pythonic file-format*
9Stores ndarrays as binary data and Python's built-in datatypes
10(bool, int, float, complex, str, dict, list, tuple, None) as json.
12.. autofunction:: open
13.. autoexception:: InvalidULMFileError
16File layout
17-----------
19When there is only a single item::
21 0: "- of Ulm" (magic prefix, ascii)
22 8: " " (tag, ascii)
23 24: version (int64)
24 32: nitems (int64)
25 40: 48 (position of offsets, int64)
26 48: p0 (offset to json data, int64)
27 56: array1, array2, ... (8-byte aligned ndarrays)
28 p0: n (length of json data, int64)
29 p0+8: json data
30 p0+8+n: EOF
33Examples
34--------
36Writing:
38>>> import numpy as np
39>>> import ase.io.ulm as ulm
40>>> with ulm.open('x.ulm', 'w') as w:
41... w.write(a=np.ones(7), b=42, c='abc')
42... w.write(d=3.14)
45Reading:
47>>> r = ulm.open('x.ulm')
48>>> print(r.c)
49abc
50>>> r.close()
52To see what's inside 'x.ulm' do this::
54 $ ase ulm x.ulm
55 x.ulm (tag: "", 1 item)
56 item #0:
57 {
58 a: <ndarray shape=(7,) dtype=float64>,
59 b: 42,
60 c: abc,
61 d: 3.14}
64.. autoclass:: Writer
65 :members:
67.. autoclass:: Reader
68 :members:
71More examples
72-------------
74In the following we append to the ulm-file from above and demonstrate
75how to write a big array in chunks:
77>>> w = ulm.open('x.ulm', 'a')
78>>> w.add_array('bigarray', (10, 1000), float)
79>>> for i in range(10):
80... w.fill(np.ones(1000))
81...
82>>> w.close()
84Now read first and second items:
86>>> with ulm.open('x.ulm') as r:
87... print(r.keys())
88dict_keys(['a', 'b', 'c', 'd'])
89>>> with ulm.open('x.ulm', index=1) as r:
90... print(r.keys())
91dict_keys(['bigarray'])
93To get all the data, it is possible to iterate over the items in the file.
95>>> for i, r in enumerate(ulm.Reader('x.ulm')):
96... for k in r.keys():
97... print(i, k)
980 a
990 b
1000 c
1010 d
1021 bigarray
103>>> r.close()
105The different parts (items) of the file are numbered by the index
106argument:
108>>> r = ulm.Reader('x.ulm')
109>>> r[1].bigarray.shape
110(10, 1000)
111>>> r.close()
114Versions
115--------
1171) Initial version.
1192) Added support for big endian machines. Json data may now have
120 _little_endian=False item.
1223) Changed magic string from "AFFormat" to "- of Ulm".
123"""
125import numbers
126from pathlib import Path
128import numpy as np
130from ase.io.formats import is_compressed
131from ase.io.jsonio import decode, encode
132from ase.utils import plural
# File-format version: written into the header of new files and required
# to match when an existing file is opened for appending.
VERSION = 3
# Growth factor of the offsets table; see Writer.sync().
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...
def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only accepted for mode 'r'.
    tag: str
        Magic ID string.  Only accepted for modes 'w' and 'a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against the exact mode strings: a substring test on 'wa'
    # would also let '' and 'wa' slip through.
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Unknown mode {mode!r}: must be 'r', 'w' or 'a'.")
    assert index is None
    return Writer(filename, mode, tag or '')
# Second name for this module's open(); print_ulm_info() calls it as ulmopen.
ulmopen = open
def align(fd):
    """Pad *fd* up to the next multiple of 8 bytes and return that position.

    The gap, if any, is filled with ``#`` bytes written at the current
    position.  Nothing is written when the position is already aligned.
    """
    position = fd.tell()
    excess = position % 8
    if excess:
        gap = 8 - excess
        fd.write(b'#' * gap)
        position += gap
    return position
def writeint(fd, n, pos=None):
    """Write *n* as a 64 bit integer, at *pos* if given, else where *fd* is.

    The bytes are swapped on big endian machines, so the value always ends
    up little endian in the file.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if np.little_endian:
        payload = value.tobytes()
    else:
        payload = value.byteswap().tobytes()
    fd.write(payload)
def readints(fd, n):
    """Read *n* 64 bit integers from the current position of *fd*.

    Returns an int64 array.  On big endian machines the bytes are swapped
    after reading.
    """
    raw = fd.read(int(n * 8))
    values = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() returns a readonly view, so an in-place swap is not
    # possible; byteswap() hands back a swapped copy instead.
    return values.byteswap()
def file_has_fileno(fd):
    """Tell whether file implements a working fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish."""
    try:
        # A missing method raises AttributeError; a method that exists but
        # is unsupported by the object raises OSError (IOError is an alias).
        fd.fileno()
    except (AttributeError, OSError):
        return False
    else:
        return True
class Writer:
    """Writer for ulm-files.

    Scalars and containers are collected in a dictionary that is written
    as json by sync() (or close()); ndarrays are written straight to the
    file and only their shape, dtype and file position go into the json.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file-like object
            Filename or an already opened binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to fill instead of a fresh one.  Used by child()
            for child-writers, which share the parent's file and therefore
            neither open a file nor prepare a header.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            # Start a new file when asked to, or when appending to a path
            # that does not exist yet or is empty.
            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                opened_here = isinstance(fd, Path)
                if opened_here:
                    fd = fd.open('r+b')

                try:
                    (version, self.nitems, self.pos0,
                     offsets) = read_header(fd)[1:]
                    assert version == VERSION
                except BaseException:
                    # Do not leak the file we just opened ourselves.
                    if opened_here:
                        fd.close()
                    raise
                # Round the offsets table up to its allocated size
                # (1, N1, N1*N1, ...).
                n = 1
                while self.nitems > n:
                    n *= N1
                padding = np.zeros(n - self.nitems, np.int64)
                self.offsets = np.concatenate((offsets, padding))
                fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.

        Raises AssertionError if the previously added array has not been
        filled completely.
        """

        # Check before touching the file or the data dictionary: failing
        # afterwards would leave alignment padding in the middle of the
        # unfinished array and a metadata entry for an array with no data.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' marks the entry as a non-plain value for Reader.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        # All but the first axis of the chunk must match the trailing axes
        # of the array being written.
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays.  This finishes the current item; data written
        afterwards goes into a new item."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a table N1 times larger at
            # the end of the file and point the header (byte 40) to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)  # item count lives at byte 32
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, str, None or ndarray) then it must
        have a obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain scalars.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)
class DummyWriter:
    """Stand-in with the same methods as Writer that discards everything."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Accept and ignore an array declaration."""

    def fill(self, a):
        """Accept and ignore an array chunk."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Accept and ignore any data."""

    def child(self, name):
        """Return this same object as its own child-writer."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        return 0
def read_header(fd):
    """Read the ulm header from the start of *fd*.

    Returns the tuple ``(tag, version, nitems, pos0, offsets)`` where
    *offsets* holds *nitems* integers read from position *pos0*.  Raises
    InvalidULMFileError if the first 8 bytes are neither the current nor
    the old ("AFFormat") magic string.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)
class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic string."""
class Reader:
    """Reader for one item of an ulm-file.

    The stored keys are available as attributes; ndarrays are read from
    the file when the attribute is accessed.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file-like object
            Filename, or an object with a read() method which is used
            as-is and not closed by close().
        index: int
            Index of the item to read.
        data: dict
            Already decoded item data.  When given, no header is read;
            this is how readers for children and for ``reader[i]`` are
            made.
        """

        self._little_endian = _little_endian

        self.must_close_fd = False
        if not hasattr(fd, 'read'):
            self.must_close_fd = True
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        if data is None:
            try:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                # Reading the first item is inside the try as well, so a
                # bad index or a damaged item does not leak the file.
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}
            except BaseException:
                if self.must_close_fd:
                    fd.close()
                raise

        self._parse_data(data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Build self._data from a decoded item dictionary.

        Keys ending in '.' are turned into NDArrayReader objects (if the
        value has an 'ndarray' entry) or child Reader objects, and stored
        without the trailing '.'.
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Go through __dict__ instead of self._data: while _data is not
        # set (bare, copied or unpickled instance) ``self._data`` would
        # call __getattr__ again and recurse without end.
        try:
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current index to the last.

        The same Reader object is yielded every time; it is re-pointed at
        the next item before each step.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for *name* without reading the data.

        With *indices*, return a reader for that sub-array instead.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json dictionary of item *index*.

        Also updates self._little_endian from the item's
        '_little_endian' entry (True when absent).
        """
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line description of the keys and values.

        With *verbose*, ndarrays are read and printed in full instead of
        being summarized by shape and dtype.
        """
        # NOTE(review): the whitespace-only literals in this method are
        # kept exactly as found; confirm their widths against upstream.
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append(f'{indent}{key}: {s}')
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file if this reader opened it."""
        if self.must_close_fd:
            self._fd.close()
class NDArrayReader:
    """Lazy view of an ndarray stored in an ulm-file.

    Data is read from the file only when the object is indexed or read()
    is called.  Indexing works along the first axis only.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create reader for an array stored in *fd*.

        fd: file-like object
            Open binary file holding the data.
        shape: sequence of int
            Shape of the stored array.
        dtype: numpy.dtype
            Data type of the stored array.
        offset: int
            File position of the first array element.
        little_endian: bool
            Byte order of the stored data.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Optional post-processing applied by __getitem__:
        self.scale = 1.0  # factor multiplied onto the data
        self.length_of_last_dimension = None  # truncate last axis to this

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read row *i* or the rows selected by slice *i* (first axis).

        Raises IndexError for an integer outside ``-len(self)`` to
        ``len(self) - 1``.
        """
        nrows = len(self)
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += nrows
            # Without this check a too-negative index would be handed on
            # as a slice and silently select some other row.
            if not 0 <= i < nrows:
                raise IndexError(
                    f'index out of range for axis 0 with size {nrows}')
            return self[i:i + 1][0]

        start, stop, step = i.indices(nrows)
        rows = range(start, stop, step)
        if rows:
            # Smallest contiguous block [first, last) covering all selected
            # rows; this also works for negative steps.
            first = min(rows[0], rows[-1])
            last = max(rows[0], rows[-1]) + 1
        else:
            first = last = 0
        stride = np.prod(self.shape[1:], dtype=int)
        count = (last - first) * stride
        if count == 0:
            a = np.empty(0, self.dtype)
        else:
            self.fd.seek(self.offset + first * self.itemsize * stride)
            if self.hasfileno and not is_compressed(self.fd):
                a = np.fromfile(self.fd, self.dtype, count)
            else:
                # Not as fast, but works for reading from tar-files:
                a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                                  self.dtype)
        a = a.reshape((last - first,) + self.shape[1:])
        if step != 1:
            # For a negative step this starts at the last row read, which
            # is the first row selected.
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            # frombuffer() arrays are read-only: scale a copy in that case.
            if not a.flags.writeable:
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return an NDArrayReader for the sub-array at *indices*.

        The indices select along the leading axes; nothing is read from
        the file.  The scale factor is passed on to the new reader.
        """
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # len(indices) rather than the loop variable, so that calling
        # without indices does not fail on an unbound name.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p
def print_ulm_info(filename, index=None, verbose=False):
    """Print the tag, the number of items and the content of an ulm-file.

    filename: str
        File to describe.
    index: int
        Item to print.  All items are printed when None.
    verbose: bool
        Passed on to Reader.tostr().
    """
    # The with-statement makes sure the reader is closed again, also when
    # reading or printing fails.
    with ulmopen(filename, 'r') as b:
        indices = range(len(b)) if index is None else [index]
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))
def copy(reader: str | Path | Reader,
         writer: str | Path | Writer,
         exclude: set[str] | frozenset[str] = frozenset(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader, writer:
        Reader/Writer objects, or anything accepted by their constructors.
        Objects created here are closed again before returning, also
        when copying fails; objects passed in are left open.
    exclude:
        Keys to skip.  A key is looked up as ``name + '.' + key``, so a
        top-level key ``a`` is written ``'.a'`` and its child ``b`` is
        written ``'.a.b'``.
    name:
        Prefix for the current level; used by the recursion into
        children.
    """
    close_reader = not isinstance(reader, Reader)
    if close_reader:
        reader = Reader(reader)
    try:
        close_writer = not isinstance(writer, Writer)
        if close_writer:
            writer = Writer(writer)
        try:
            for key, value in reader._data.items():
                path = name + '.' + key
                if path in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, path)
                else:
                    writer.write(key, value)
        finally:
            if close_writer:
                writer.close()
    finally:
        if close_reader:
            reader.close()