Coverage for /builds/ase/ase/ase/io/ulm.py: 90.50%
379 statements
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
1# fmt: off
3"""
4ULM files
5=========
7*Simple and efficient pythonic file-format*
9Stores ndarrays as binary data and Python's built-in datatypes
10(bool, int, float, complex, str, dict, list, tuple, None) as json.
12.. autofunction:: open
13.. autoexception:: InvalidULMFileError
16File layout
17-----------
19When there is only a single item::
21 0: "- of Ulm" (magic prefix, ascii)
22 8: " " (tag, ascii)
23 24: version (int64)
24 32: nitems (int64)
25 40: 48 (position of offsets, int64)
26 48: p0 (offset to json data, int64)
27 56: array1, array2, ... (8-byte aligned ndarrays)
28 p0: n (length of json data, int64)
29 p0+8: json data
30 p0+8+n: EOF
33Examples
34--------
36Writing:
38>>> import numpy as np
39>>> import ase.io.ulm as ulm
40>>> with ulm.open('x.ulm', 'w') as w:
41... w.write(a=np.ones(7), b=42, c='abc')
42... w.write(d=3.14)
45Reading:
47>>> r = ulm.open('x.ulm')
48>>> print(r.c)
49abc
50>>> r.close()
52To see what's inside 'x.ulm' do this::
54 $ ase ulm x.ulm
55 x.ulm (tag: "", 1 item)
56 item #0:
57 {
58 a: <ndarray shape=(7,) dtype=float64>,
59 b: 42,
60 c: abc,
61 d: 3.14}
64.. autoclass:: Writer
65 :members:
67.. autoclass:: Reader
68 :members:
71More examples
72-------------
In the following we append to the ulm-file from above and demonstrate
how to write a big array in chunks:
77>>> w = ulm.open('x.ulm', 'a')
78>>> w.add_array('bigarray', (10, 1000), float)
79>>> for i in range(10):
80... w.fill(np.ones(1000))
81...
82>>> w.close()
84Now read first and second items:
86>>> with ulm.open('x.ulm') as r:
87... print(r.keys())
88dict_keys(['a', 'b', 'c', 'd'])
89>>> with ulm.open('x.ulm', index=1) as r:
90... print(r.keys())
91dict_keys(['bigarray'])
93To get all the data, it is possible to iterate over the items in the file.
95>>> for i, r in enumerate(ulm.Reader('x.ulm')):
96... for k in r.keys():
97... print(i, k)
980 a
990 b
1000 c
1010 d
1021 bigarray
103>>> r.close()
105The different parts (items) of the file are numbered by the index
106argument:
108>>> r = ulm.Reader('x.ulm')
109>>> r[1].bigarray.shape
110(10, 1000)
111>>> r.close()
114Versions
115--------
1171) Initial version.
1192) Added support for big endian machines. Json data may now have
120 _little_endian=False item.
1223) Changed magic string from "AFFormat" to "- of Ulm".
123"""
125import numbers
126from pathlib import Path
127from typing import Set, Union
129import numpy as np
131from ase.io.formats import is_compressed
132from ase.io.jsonio import decode, encode
133from ase.utils import plural
# File-format version: written into the header of new files and required
# to match when an existing file is opened for appending.
VERSION = 3
# Growth factor of the offsets table.
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...
def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed with mode='r'.
    tag: str
        Magic ID string.  Only allowed with mode='w' or mode='a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against the exact mode strings: a substring test such as
    # "mode in 'wa'" would also accept '' and 'wa'.
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Invalid mode {mode!r}: must be 'r', 'w' or 'a'")
    assert index is None
    return Writer(filename, mode, tag or '')


# Alias that stays usable where the name ``open`` refers to the builtin.
ulmopen = open
def align(fd):
    """Pad *fd* with ``#`` bytes up to an 8 byte boundary.

    Returns the (aligned) position of the file after padding.  Nothing is
    written when the current position is already a multiple of 8.
    """
    here = fd.tell()
    gap = -here % 8  # number of bytes missing to reach the next boundary
    if gap:
        fd.write(b'#' * gap)
    return here + gap
def writeint(fd, n, pos=None):
    """Write *n* to *fd* as a little-endian 64 bit integer.

    If *pos* is given, seek there first; otherwise write at the current
    position.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if not np.little_endian:
        # The file stores integers little-endian regardless of the host.
        value = value.byteswap()
    fd.write(value.tobytes())
def readints(fd, n):
    """Read *n* little-endian 64 bit integers from *fd*.

    Returns an int64 ndarray in native byte order.
    """
    raw = fd.read(int(n * 8))
    ints = np.frombuffer(raw, dtype=np.int64, count=n)
    # frombuffer() gives a read-only view, so the swap (needed on big endian
    # hosts only) has to produce a new array rather than work in place.
    return ints if np.little_endian else ints.byteswap()
def file_has_fileno(fd):
    """Tell whether file implements fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish.
    """
    try:
        # AttributeError: no such method at all.
        # OSError: the method exists but the object has no descriptor.
        fd.fileno()
    except (AttributeError, OSError):
        return False
    else:
        return True
class Writer:
    """Writer for ulm-files.

    Scalars and other json-compatible values are collected in ``self.data``
    and written as one json block by :meth:`sync`; ndarrays are written
    directly to the file and only their shape, dtype and file position are
    kept in ``self.data``.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename (str or Path) or an already opened binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Used internally by :meth:`child`: the dictionary that this
            writer fills in.  When given, *fd* is used as is and no file
            header is prepared.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            # A missing or empty file opened for appending is treated like
            # a new file.
            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                opened_here = isinstance(fd, Path)
                if opened_here:
                    fd = fd.open('r+b')

                try:
                    version, self.nitems, self.pos0, offsets = \
                        read_header(fd)[1:]
                    assert version == VERSION
                    # Capacity of the offsets table: smallest power of N1
                    # that holds all existing items.
                    n = 1
                    while self.nitems > n:
                        n *= N1
                    padding = np.zeros(n - self.nitems, np.int64)
                    self.offsets = np.concatenate((offsets, padding))
                    fd.seek(0, 2)
                except BaseException:
                    # Do not leak a file that we opened ourselves.
                    if opened_here:
                        fd.close()
                    raise

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check before touching the file or self.data so that a failed call
        # leaves the writer unchanged.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # A trailing '.' in the key marks a non-json entry (see
        # Reader._parse_data).
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        """Write the pending file header, if any, exactly once."""
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        # Trailing dimensions of the chunk must match those of the array.
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays.  This finishes the current item; data written
        afterwards goes into a new item."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # Offsets table is full: write a table N1 times larger at the
            # end of the file and point the header at it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, str, tuple, list, dict, None or ndarray) then it must have
        a obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain scalars.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file.

        Pending data is written first.  The file is closed even if that
        fails."""
        n = int('_little_endian' in self.data)
        try:
            if len(self.data) > n:
                # There is more than the "_little_endian" key.
                # Write that stuff before closing:
                self.sync()
            else:
                # Make sure header has been written (empty ulm-file):
                self._write_header()
        finally:
            self.fd.close()

    def __len__(self):
        return int(self.nitems)
class DummyWriter:
    """Object with the same methods as Writer that does nothing at all."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Ignore the array declaration."""
        return None

    def fill(self, a):
        """Ignore the array data."""
        return None

    def sync(self):
        """Nothing to write."""
        return None

    def write(self, *args, **kwargs):
        """Ignore all data."""
        return None

    def child(self, name):
        """Return this same object as its own child-writer."""
        return self

    def close(self):
        """Nothing to close."""
        return None

    def __len__(self):
        return 0
def read_header(fd):
    """Read the file header from the start of *fd*.

    Returns the tuple (tag, version, nitems, pos0, offsets) where offsets
    is the array of *nitems* integers stored at position *pos0*.  Raises
    InvalidULMFileError if the file does not start with one of the two
    accepted magic strings.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)
class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic string."""
class Reader:
    """Reader for one item of an ulm-file (or for a nested group).

    Entries are available as attributes; ndarrays are read from the file
    each time the attribute is accessed.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Anything without a ``read`` attribute is taken to be a path
            and opened in binary mode; that file is then closed by
            :meth:`close`.
        index: int
            Index of item to read.
        data: dict
            Used internally: already decoded data.  When given, nothing is
            read from the file header.
        _little_endian: bool
            Used internally: byte order of the ndarrays described by *data*.
        """

        self._little_endian = _little_endian

        self.must_close_fd = False
        if not hasattr(fd, 'read'):
            self.must_close_fd = True
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        try:
            if data is None:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}

            self._parse_data(data)
        except BaseException:
            # Whatever went wrong (bad header, bad index, bad json ...):
            # do not leak a file that we opened ourselves.
            if self.must_close_fd:
                fd.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Build self._data from decoded json *data*.

        Keys ending in '.' are either ndarray descriptions or nested
        dictionaries; they are replaced by NDArrayReader and Reader objects
        stored under the key without the dot."""
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Go through __dict__ instead of self._data: if _data has not been
        # set (object created without __init__, e.g. while being copied),
        # self._data would call __getattr__ again and recurse without end.
        try:
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current one to the last.

        The same Reader object is yielded every time, re-loaded with the
        data of the next item."""
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for array *name* without reading it.

        With *indices*, return a reader for that sub-array instead."""
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item *index*.

        Also sets self._little_endian from the item's '_little_endian'
        entry (True when absent)."""
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return a multi-line description of the contents.

        With verbose=True array values are read and shown instead of just
        their shape and dtype."""
        # NOTE(review): whitespace inside the string literals below is
        # reproduced from an extract where runs of spaces look collapsed -
        # confirm the indent widths against the original file.
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append(f'{indent}{key}: {s}')
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file if it was opened by this reader."""
        if self.must_close_fd:
            self._fd.close()
class NDArrayReader:
    """Lazy reader for one ndarray stored in an ulm-file.

    Nothing is read until the object is indexed or read() is called.
    Two attributes may be changed after construction and are applied to
    everything that is read: ``scale`` (a factor the data is multiplied by)
    and ``length_of_last_dimension`` (if not None, the last axis is
    truncated to this length).
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Describe an array of *shape* and *dtype* at byte *offset* in *fd*.

        little_endian tells the byte order of the data in the file."""
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read part of the array, indexed along the first axis.

        *i* is an integer or a slice (any step, including negative ones).
        Only the rows between the first and last selected row are read
        from the file."""
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]

        rows = range(*i.indices(len(self)))
        step = rows.step
        if rows:
            # Contiguous span of rows covering the selection; for a negative
            # step rows[0] is the highest row and rows[-1] the lowest.
            first = min(rows[0], rows[-1])
            nrows = max(rows[0], rows[-1]) - first + 1
        else:
            first = nrows = 0

        stride = np.prod(self.shape[1:], dtype=int)  # numbers per row
        count = nrows * stride
        if count == 0:
            a = np.empty(0, self.dtype)
        else:
            self.fd.seek(int(self.offset + first * self.itemsize * stride))
            if self.hasfileno and not is_compressed(self.fd):
                a = np.fromfile(self.fd, self.dtype, count)
            else:
                # Not as fast, but works for reading from tar-files:
                a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                                  self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])
        if step != 1:
            # With a negative step this starts from the last row read,
            # which is rows[0].
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # Data from frombuffer() cannot be scaled in place.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return a reader for the sub-array selected by leading *indices*.

        Nothing is read; the returned reader points at the position of the
        sub-array in the file.  Without indices the returned reader covers
        the whole array."""
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p
def print_ulm_info(filename, index=None, verbose=False):
    """Print a description of the contents of an ulm-file.

    filename: str
        Filename.
    index: int
        Index of the item to show.  Default is to show all items.
    verbose: bool
        Show the values of arrays and not just their shape and dtype.
    """
    # The context manager makes sure the file is closed again, also if
    # reading one of the items fails.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))
def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader, writer:
        Reader/Writer objects, or filenames.  Files opened here from a
        filename are closed again before returning, also when copying
        fails; objects passed in are left open.
    exclude:
        Keys to skip.  A key is matched with its full dotted path, which
        starts with a dot: '.key' at the top level, '.group.key' inside
        nested group 'group'.  The set is only read, never modified.
    name:
        Used internally in recursive calls: dotted path of the group
        being copied.
    """
    close_reader = False
    close_writer = False
    try:
        if not isinstance(reader, Reader):
            reader = Reader(reader)
            close_reader = True
        if not isinstance(writer, Writer):
            writer = Writer(writer)
            close_writer = True
        for key, value in reader._data.items():
            if name + '.' + key in exclude:
                continue
            if isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, Reader):
                # Nested group: copy recursively into a child-writer.
                copy(value, writer.child(key), exclude, name + '.' + key)
            else:
                writer.write(key, value)
    finally:
        # Close what we opened ourselves, whether or not copying succeeded.
        try:
            if close_reader:
                reader.close()
        finally:
            if close_writer:
                writer.close()