Coverage for ase / io / ulm.py: 90.48%

378 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-30 08:22 +0000

1# fmt: off 

2 

3""" 

4ULM files 

5========= 

6 

7*Simple and efficient pythonic file-format* 

8 

9Stores ndarrays as binary data and Python's built-in datatypes 

10(bool, int, float, complex, str, dict, list, tuple, None) as json. 

11 

12.. autofunction:: open 

13.. autoexception:: InvalidULMFileError 

14 

15 

16File layout 

17----------- 

18 

19When there is only a single item:: 

20 

21 0: "- of Ulm" (magic prefix, ascii) 

22 8: " " (tag, ascii) 

23 24: version (int64) 

24 32: nitems (int64) 

25 40: 48 (position of offsets, int64) 

26 48: p0 (offset to json data, int64) 

27 56: array1, array2, ... (8-byte aligned ndarrays) 

28 p0: n (length of json data, int64) 

29 p0+8: json data 

30 p0+8+n: EOF 

31 

32 

33Examples 

34-------- 

35 

36Writing: 

37 

38>>> import numpy as np 

39>>> import ase.io.ulm as ulm 

40>>> with ulm.open('x.ulm', 'w') as w: 

41... w.write(a=np.ones(7), b=42, c='abc') 

42... w.write(d=3.14) 

43 

44 

45Reading: 

46 

47>>> r = ulm.open('x.ulm') 

48>>> print(r.c) 

49abc 

50>>> r.close() 

51 

52To see what's inside 'x.ulm' do this:: 

53 

54 $ ase ulm x.ulm 

55 x.ulm (tag: "", 1 item) 

56 item #0: 

57 { 

58 a: <ndarray shape=(7,) dtype=float64>, 

59 b: 42, 

60 c: abc, 

61 d: 3.14} 

62 

63 

64.. autoclass:: Writer 

65 :members: 

66 

67.. autoclass:: Reader 

68 :members: 

69 

70 

71More examples 

72------------- 

73 

74In the following we append to the ulm-file from above and demonstrate 

75how to write a big array in chunks: 

76 

77>>> w = ulm.open('x.ulm', 'a') 

78>>> w.add_array('bigarray', (10, 1000), float) 

79>>> for i in range(10): 

80... w.fill(np.ones(1000)) 

81... 

82>>> w.close() 

83 

84Now read first and second items: 

85 

86>>> with ulm.open('x.ulm') as r: 

87... print(r.keys()) 

88dict_keys(['a', 'b', 'c', 'd']) 

89>>> with ulm.open('x.ulm', index=1) as r: 

90... print(r.keys()) 

91dict_keys(['bigarray']) 

92 

93To get all the data, it is possible to iterate over the items in the file. 

94 

95>>> for i, r in enumerate(ulm.Reader('x.ulm')): 

96... for k in r.keys(): 

97... print(i, k) 

980 a 

990 b 

1000 c 

1010 d 

1021 bigarray 

103>>> r.close() 

104 

105The different parts (items) of the file are numbered by the index 

106argument: 

107 

108>>> r = ulm.Reader('x.ulm') 

109>>> r[1].bigarray.shape 

110(10, 1000) 

111>>> r.close() 

112 

113 

114Versions 

115-------- 

116 

1171) Initial version. 

118 

1192) Added support for big endian machines. Json data may now have 

120 _little_endian=False item. 

121 

1223) Changed magic string from "AFFormat" to "- of Ulm". 

123""" 

124 

125import numbers 

126from pathlib import Path 

127 

128import numpy as np 

129 

130from ase.io.formats import is_compressed 

131from ase.io.jsonio import decode, encode 

132from ase.utils import plural 

133 

# Format version stored in the header of every newly written file; appending
# to an existing file requires that its stored version equals this value.
VERSION = 3

# Growth factor of the offsets table.
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...

136 

137 

def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed with mode 'r'.
    tag: str
        Magic ID string.  Only allowed with modes 'w' and 'a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode* is
    not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against a tuple: a substring test against 'wa' would also
    # accept '' (and silently treat it as append mode).
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Invalid mode {mode!r}: must be 'r', 'w' or 'a'")
    assert index is None
    return Writer(filename, mode, tag or '')


# Alias that stays usable where the name "open" refers to something else.
ulmopen = open

164 

165 

def align(fd):
    """Pad fd with b'#' up to the next multiple of 8 bytes.

    Returns the (aligned) position after padding.  Nothing is written when
    the current position is already a multiple of 8.
    """
    position = fd.tell()
    padding = -position % 8  # bytes missing to reach the next boundary
    if padding:
        fd.write(b'#' * padding)
    return position + padding

174 

175 

def writeint(fd, n, pos=None):
    """Write n as a 64 bit integer at pos, or at the current position.

    The bytes written are little-endian regardless of the host byte order.
    """
    if pos is not None:
        fd.seek(pos)
    number = np.array(n, np.int64)
    # Big-endian hosts must swap to get little-endian bytes on disk.
    raw = number.tobytes() if np.little_endian else number.byteswap().tobytes()
    fd.write(raw)

184 

185 

def readints(fd, n):
    """Read n 64 bit integers from fd and return them as an int64 array."""
    raw = fd.read(int(n * 8))
    values = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() returns a readonly view, so an in-place byteswap is not
    # possible; byteswap() hands back a swapped copy instead.
    return values.byteswap()

193 

194 

def file_has_fileno(fd):
    """Tell whether file implements fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish."""
    try:
        # AttributeError: there is no fileno attribute at all.
        # OSError: the method exists but there is no usable descriptor.
        fd.fileno()
    except (AttributeError, OSError):
        return False
    else:
        return True

210 

211 

class Writer:
    """Write items to an ulm-file.

    Key/value pairs are collected in a dictionary and ndarrays are written
    to the file right away.  :meth:`sync` (also called by :meth:`close`
    when there is pending data) stores the dictionary as json and registers
    it as a new item in the file.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename, or an already opened binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect the data in.  Passed by :meth:`child`;
            when given, *fd* is used as it is and no header is prepared.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            # Appending to a missing or empty file is the same as writing
            # a new one.
            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                if isinstance(fd, Path):
                    fd = fd.open('r+b')

                version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
                assert version == VERSION
                # Pad the offsets table to its capacity: 1, N1, N1*N1, ...
                n = 1
                while self.nitems > n:
                    n *= N1
                padding = np.zeros(n - self.nitems, np.int64)
                self.offsets = np.concatenate((offsets, padding))
                fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # Data for the array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.

        Raises AssertionError if the previously added array has not been
        filled completely.
        """

        # Check before touching the file or self.data so that a failure
        # leaves neither padding bytes nor a dangling array entry behind.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' in the key marks the value as something other
        # than plain json data.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written.

        The chunk must have the dtype given to add_array(), its trailing
        dimensions must match those of the full array and it must not
        contain more numbers than are still missing.
        """
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a table that is N1 times
            # larger at the end of the file and point the header to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item:
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain numbers.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)

424 

425 

class DummyWriter:
    """Stand-in with the same methods as Writer that writes nothing."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Accept and ignore an array declaration."""

    def fill(self, a):
        """Accept and ignore an array chunk."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Accept and ignore data."""

    def child(self, name):
        """Return self, so children ignore their data as well."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        return 0

453 

454 

def read_header(fd):
    """Read header and offsets table from the start of an ulm-file.

    Returns the tuple (tag, version, nitems, pos0, offsets).  Raises
    InvalidULMFileError if the file does not start with one of the known
    magic prefixes.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic not in (b'- of Ulm', b'AFFormat'):
        raise InvalidULMFileError('This is not an ULM formatted file.')
    tag = fd.read(16).decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    # pos0 is where the table of item offsets is stored.
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)

464 

465 

class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic prefix."""

468 

469 

class Reader:
    """Read one item of an ulm-file.

    The values of the item are available as attributes.  ndarrays are read
    from the file each time they are accessed and nested dictionaries
    written by child writers show up as nested Reader objects.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Filename, or an object with a read() method.  A file opened
            here is closed by close(); a file object passed in is not.
        index: int
            Index of item to read.
        data: dict
            Already decoded data.  When given, nothing is read from the
            header of the file (used for nested readers and by
            __getitem__()).
        _little_endian: bool
            Byte order of the arrays described by *data*.
        """

        self._little_endian = _little_endian

        self.must_close_fd = False
        if not hasattr(fd, 'read'):
            self.must_close_fd = True
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        try:
            if data is None:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                data = self._read_data(index) if self._nitems > 0 else {}
            self._parse_data(data)
        except BaseException:
            # Do not leak a file that we opened ourselves, no matter
            # whether the header, the json data or the parsing failed.
            if self.must_close_fd:
                fd.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Build self._data, wrapping arrays and nested dictionaries."""
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                # A trailing '.' marks an ndarray description or a nested
                # dictionary.
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Look _data up in the instance dictionary: going through normal
        # attribute access would call __getattr__() again and recurse
        # forever when _data has not been set yet.
        data = self.__dict__.get('_data', {})
        try:
            value = data[attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current one to the last.

        The same Reader object is yielded every time; it is updated in
        place to hold the data of the next item.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for array *name* without reading it.

        With indices, a reader for the corresponding sub-array is
        returned.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item *index*.

        Also updates self._little_endian from the data.
        """
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent='    '):
        """Return a multi-line string describing the data.

        Arrays are shown as a summary of shape and dtype unless *verbose*
        is true, in which case they are read and printed.
        """
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + '    ')
            else:
                s = str(value).replace('\n', '\n  ' + ' ' * len(key) + indent)
            strings.append(f'{indent}{key}: {s}')
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file if it was opened by this reader."""
        if self.must_close_fd:
            self._fd.close()

617 

618 

class NDArrayReader:
    """Lazy reader for one ndarray stored in an ulm-file.

    Nothing is read until the object is indexed or read() is called.
    Indexing and slicing act on the first dimension only.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create array reader.

        fd: file object
            File to read from.
        shape: sequence of int
            Shape of the array.
        dtype: numpy.dtype
            Data type of the array.
        offset: int
            Position in the file of the first byte of the array.
        little_endian: bool
            Whether the data in the file is little-endian.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Adjustments applied to everything that is read; may be changed
        # by the user of the object:
        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read part of the array.

        i: int or slice
            Index or slice along the first dimension.  Slices must have a
            positive step.

        Raises IndexError for an integer that is out of range and
        ValueError for a slice with a negative step.
        """
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]
        start, stop, step = i.indices(len(self))
        if step < 0:
            # The rows are read front to back; reading backwards is not
            # implemented and would return the wrong rows.
            raise ValueError('negative step is not supported')
        # An empty slice (stop before start) must give a count of zero
        # and not a negative one.
        stop = max(start, stop)
        stride = np.prod(self.shape[1:], dtype=int)
        offset = self.offset + start * self.itemsize * stride
        self.fd.seek(offset)
        count = (stop - start) * stride
        if self.hasfileno and not is_compressed(self.fd):
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a.shape = (stop - start,) + self.shape[1:]
        if step != 1:
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() returns readonly array: scale a copy
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return reader for the sub-array at the given leading indices.

        Nothing is read from the file.  Without indices the returned
        reader covers the whole array.
        """
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # One leading dimension is dropped for each index.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p

681 

682 

def print_ulm_info(filename, index=None, verbose=False):
    """Print a description of the contents of an ulm-file.

    filename: str
        Name of the file.
    index: int
        Index of the item to show.  All items are shown if None.
    verbose: bool
        Print the contents of arrays instead of only shape and dtype.
    """
    # Use the reader as a context manager so the file is closed again,
    # also when describing one of the items fails.
    with ulmopen(filename, 'r') as b:
        indices = range(len(b)) if index is None else [index]
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))

694 

695 

def copy(reader: str | Path | Reader,
         writer: str | Path | Writer,
         exclude: set[str] | frozenset[str] = frozenset(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader: str, Path or Reader
        What to copy from.  A filename is opened here and closed again
        before returning.
    writer: str, Path or Writer
        Where to copy to.  A filename is opened here and closed again
        before returning.
    exclude: set of str
        Keys to skip.  Each key is written with a leading '.' and with
        '.' between the levels of nested data, e.g. '.a' or '.a.b'.
    name: str
        Prefix of the keys at the current level of nesting (used by the
        recursion).
    """
    close_reader = not isinstance(reader, Reader)
    if close_reader:
        reader = Reader(reader)
    try:
        close_writer = not isinstance(writer, Writer)
        if close_writer:
            writer = Writer(writer)
        try:
            for key, value in reader._data.items():
                if name + '.' + key in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, name + '.' + key)
                else:
                    writer.write(key, value)
        finally:
            # Only close what was opened here - also if copying failed.
            if close_writer:
                writer.close()
    finally:
        if close_reader:
            reader.close()