Coverage for /builds/ase/ase/ase/io/ulm.py: 90.50%

379 statements  

« prev     ^ index     » next       coverage.py v7.5.3, created at 2025-08-02 00:12 +0000

1# fmt: off 

2 

3""" 

4ULM files 

5========= 

6 

7*Simple and efficient pythonic file-format* 

8 

9Stores ndarrays as binary data and Python's built-in datatypes 

10(bool, int, float, complex, str, dict, list, tuple, None) as json. 

11 

12.. autofunction:: open 

13.. autoexception:: InvalidULMFileError 

14 

15 

16File layout 

17----------- 

18 

19When there is only a single item:: 

20 

21 0: "- of Ulm" (magic prefix, ascii) 

22 8: " " (tag, ascii) 

23 24: version (int64) 

24 32: nitems (int64) 

25 40: 48 (position of offsets, int64) 

26 48: p0 (offset to json data, int64) 

27 56: array1, array2, ... (8-byte aligned ndarrays) 

28 p0: n (length of json data, int64) 

29 p0+8: json data 

30 p0+8+n: EOF 

31 

32 

33Examples 

34-------- 

35 

36Writing: 

37 

38>>> import numpy as np 

39>>> import ase.io.ulm as ulm 

40>>> with ulm.open('x.ulm', 'w') as w: 

41... w.write(a=np.ones(7), b=42, c='abc') 

42... w.write(d=3.14) 

43 

44 

45Reading: 

46 

47>>> r = ulm.open('x.ulm') 

48>>> print(r.c) 

49abc 

50>>> r.close() 

51 

52To see what's inside 'x.ulm' do this:: 

53 

54 $ ase ulm x.ulm 

55 x.ulm (tag: "", 1 item) 

56 item #0: 

57 { 

58 a: <ndarray shape=(7,) dtype=float64>, 

59 b: 42, 

60 c: abc, 

61 d: 3.14} 

62 

63 

64.. autoclass:: Writer 

65 :members: 

66 

67.. autoclass:: Reader 

68 :members: 

69 

70 

71More examples 

72------------- 

73 

74In the following we append to the ulm-file from above and demonstrate 

75how to write a big array in chunks: 

76 

77>>> w = ulm.open('x.ulm', 'a') 

78>>> w.add_array('bigarray', (10, 1000), float) 

79>>> for i in range(10): 

80... w.fill(np.ones(1000)) 

81... 

82>>> w.close() 

83 

84Now read first and second items: 

85 

86>>> with ulm.open('x.ulm') as r: 

87... print(r.keys()) 

88dict_keys(['a', 'b', 'c', 'd']) 

89>>> with ulm.open('x.ulm', index=1) as r: 

90... print(r.keys()) 

91dict_keys(['bigarray']) 

92 

93To get all the data, it is possible to iterate over the items in the file. 

94 

95>>> for i, r in enumerate(ulm.Reader('x.ulm')): 

96... for k in r.keys(): 

97... print(i, k) 

980 a 

990 b 

1000 c 

1010 d 

1021 bigarray 

103>>> r.close() 

104 

105The different parts (items) of the file are numbered by the index 

106argument: 

107 

108>>> r = ulm.Reader('x.ulm') 

109>>> r[1].bigarray.shape 

110(10, 1000) 

111>>> r.close() 

112 

113 

114Versions 

115-------- 

116 

1171) Initial version. 

118 

1192) Added support for big endian machines. Json data may now have 

120 _little_endian=False item. 

121 

1223) Changed magic string from "AFFormat" to "- of Ulm". 

123""" 

124 

125import numbers 

126from pathlib import Path 

127from typing import Set, Union 

128 

129import numpy as np 

130 

131from ase.io.formats import is_compressed 

132from ase.io.jsonio import decode, encode 

133from ase.utils import plural 

134 

# File-format version number stored in the header of newly written files
# (see the "Versions" section of the module docstring).
VERSION = 3
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...

137 

138 

def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed when reading.
    tag: str
        Magic ID string.  Only allowed when writing.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode*
    is not one of 'r', 'w' or 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against a tuple: a substring test on 'wa' would also
    # accept '' and 'wa'.
    if mode not in ('w', 'a'):
        raise ValueError(
            f"Invalid mode {mode!r}: must be 'r', 'w' or 'a'")
    assert index is None
    return Writer(filename, mode, tag or '')


ulmopen = open

165 

166 

def align(fd):
    """Pad the file with ``#`` bytes up to the next multiple of 8 bytes.

    Nothing is written when the current position is already aligned.
    Returns the (aligned) position."""
    here = fd.tell()
    excess = here % 8
    if excess:
        npad = 8 - excess
        fd.write(b'#' * npad)
        here += npad
    return here

175 

176 

def writeint(fd, n, pos=None):
    """Write n as a little-endian 64 bit integer.

    The integer goes to position *pos* if given (the file is left
    positioned right after it), otherwise to the current position."""
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    # The file is always little-endian; swap on big-endian machines.
    raw = value.tobytes() if np.little_endian else value.byteswap().tobytes()
    fd.write(raw)

185 

186 

def readints(fd, n):
    """Read n little-endian 64 bit integers from the current position.

    Returns them as an int64 ndarray."""
    raw = fd.read(int(n * 8))
    ints = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return ints
    # frombuffer() gives a readonly view, so an in-place byteswap is
    # not possible - swap into a new array instead.
    return ints.byteswap()

194 

195 

def file_has_fileno(fd):
    """Tell whether file has a working fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish."""
    try:
        # A missing method gives AttributeError; a method that exists
        # but is unsupported gives OSError when called.
        fd.fileno()
    except (AttributeError, OSError):
        return False
    return True

211 

212 

class Writer:
    """Writer for ulm-files.

    Simple values are collected in a dictionary that is written as json
    by :meth:`sync`; ndarrays are written directly to the file as 8-byte
    aligned binary data and only described in the dictionary."""

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str
            Filename.  A :class:`~pathlib.Path` or an already open
            binary file object is also accepted.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect values in.  Used by :meth:`child`;
            when given, *fd* is used as it is and no header is prepared
            or read.
        """

        assert mode in 'aw'

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

            if isinstance(fd, str):
                fd = Path(fd)

            # Appending to a missing or empty file is the same as
            # writing a new one.
            if mode == 'w' or (isinstance(fd, Path) and
                               not (fd.is_file() and
                                    fd.stat().st_size > 0)):
                self.nitems = 0
                self.pos0 = 48
                self.offsets = np.array([-1], np.int64)

                if isinstance(fd, Path):
                    fd = fd.open('wb')

                # File format identifier and other stuff:
                a = np.array([VERSION, self.nitems, self.pos0], np.int64)
                if not np.little_endian:
                    a.byteswap(True)
                self.header = (f'- of Ulm{tag:16}'.encode('ascii') +
                               a.tobytes() +
                               self.offsets.tobytes())
            else:
                if isinstance(fd, Path):
                    fd = fd.open('r+b')

                version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
                assert version == VERSION
                # Size of the offsets table: smallest of
                # 1, N1, N1*N1, ... that holds nitems.
                n = 1
                while self.nitems > n:
                    n *= N1
                padding = np.zeros(n - self.nitems, np.int64)
                self.offsets = np.concatenate((offsets, padding))
                fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check before touching the file or the data dictionary, so that
        # a rejected call leaves the writer unchanged.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' marks the entry as something other than a
        # plain json value; the array itself starts at position i.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        # Check the size before updating the counter, so that a chunk
        # that is too big does not corrupt the bookkeeping.
        assert a.size <= self.nmissing
        self.nmissing -= a.size

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # The offsets table is full: write a new one, N1 times
            # bigger, at the end of the file and point the header at it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item:
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional arrays are stored as plain values
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        return Writer(self.fd, data=dct)

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)

425 

426 

class DummyWriter:
    """Stand-in with the same methods as Writer that writes nothing."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Do nothing."""

    def fill(self, a):
        """Do nothing."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Do nothing."""

    def child(self, name):
        """Return the dummy writer itself."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        return 0

454 

455 

def read_header(fd):
    """Read the header of an ulm-file.

    Returns the tuple (tag, version, nitems, pos0, offsets).  Raises
    InvalidULMFileError if the file does not start with one of the
    known magic strings."""
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    tag = fd.read(16).decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    # pos0 is where the table of offsets starts:
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)

465 

466 

class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic string."""

469 

470 

471class Reader: 

472 def __init__(self, fd, index=0, data=None, _little_endian=None): 

473 """Create reader.""" 

474 

475 self._little_endian = _little_endian 

476 

477 self.must_close_fd = False 

478 if not hasattr(fd, 'read'): 

479 self.must_close_fd = True 

480 fd = Path(fd).open('rb') 

481 

482 self._fd = fd 

483 self._index = index 

484 

485 if data is None: 

486 try: 

487 (self._tag, self._version, self._nitems, self._pos0, 

488 self._offsets) = read_header(fd) 

489 except BaseException: 

490 if self.must_close_fd: 

491 fd.close() 

492 raise 

493 if self._nitems > 0: 

494 data = self._read_data(index) 

495 else: 

496 data = {} 

497 

498 self._parse_data(data) 

499 

500 def __enter__(self): 

501 return self 

502 

503 def __exit__(self, exc_type, exc_value, tb): 

504 self.close() 

505 

506 def _parse_data(self, data): 

507 self._data = {} 

508 for name, value in data.items(): 

509 if name.endswith('.'): 

510 if 'ndarray' in value: 

511 shape, dtype, offset = value['ndarray'] 

512 dtype = dtype.encode() # compatibility with Numpy 1.4 

513 value = NDArrayReader(self._fd, 

514 shape, 

515 np.dtype(dtype), 

516 offset, 

517 self._little_endian) 

518 else: 

519 value = Reader(self._fd, data=value, 

520 _little_endian=self._little_endian) 

521 name = name[:-1] 

522 

523 self._data[name] = value 

524 

525 def get_tag(self): 

526 """Return special tag string.""" 

527 return self._tag 

528 

529 def keys(self): 

530 """Return list of keys.""" 

531 return self._data.keys() 

532 

533 def asdict(self): 

534 """Read everything now and convert to dict.""" 

535 dct = {} 

536 for key, value in self._data.items(): 

537 if isinstance(value, NDArrayReader): 

538 value = value.read() 

539 elif isinstance(value, Reader): 

540 value = value.asdict() 

541 dct[key] = value 

542 return dct 

543 

544 __dir__ = keys # needed for tab-completion 

545 

546 def __getattr__(self, attr): 

547 try: 

548 value = self._data[attr] 

549 except KeyError: 

550 raise AttributeError(attr) 

551 if isinstance(value, NDArrayReader): 

552 return value.read() 

553 return value 

554 

555 def __contains__(self, key): 

556 return key in self._data 

557 

558 def __iter__(self): 

559 yield self 

560 for i in range(self._index + 1, self._nitems): 

561 self._index = i 

562 data = self._read_data(i) 

563 self._parse_data(data) 

564 yield self 

565 

566 def get(self, attr, value=None): 

567 """Get attr or value if no such attr.""" 

568 try: 

569 return self.__getattr__(attr) 

570 except AttributeError: 

571 return value 

572 

573 def proxy(self, name, *indices): 

574 value = self._data[name] 

575 assert isinstance(value, NDArrayReader) 

576 if indices: 

577 return value.proxy(*indices) 

578 return value 

579 

580 def __len__(self): 

581 return int(self._nitems) 

582 

583 def _read_data(self, index): 

584 self._fd.seek(self._offsets[index]) 

585 size = int(readints(self._fd, 1)[0]) 

586 data = decode(self._fd.read(size).decode(), False) 

587 self._little_endian = data.pop('_little_endian', True) 

588 return data 

589 

590 def __getitem__(self, index): 

591 """Return Reader for item *index*.""" 

592 data = self._read_data(index) 

593 return Reader(self._fd, index, data, self._little_endian) 

594 

595 def tostr(self, verbose=False, indent=' '): 

596 keys = sorted(self._data) 

597 strings = [] 

598 for key in keys: 

599 value = self._data[key] 

600 if verbose and isinstance(value, NDArrayReader): 

601 value = value.read() 

602 if isinstance(value, NDArrayReader): 

603 s = '<ndarray shape={} dtype={}>'.format(value.shape, 

604 value.dtype) 

605 elif isinstance(value, Reader): 

606 s = value.tostr(verbose, indent + ' ') 

607 else: 

608 s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent) 

609 strings.append(f'{indent}{key}: {s}') 

610 return '{\n' + ',\n'.join(strings) + '}' 

611 

612 def __str__(self): 

613 return self.tostr(False, '').replace('\n', ' ') 

614 

615 def close(self): 

616 if self.must_close_fd: 

617 self._fd.close() 

618 

619 

class NDArrayReader:
    """Lazy reader for an ndarray stored in an ulm-file.

    Nothing is read before read() is called or the object is indexed."""

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create reader for array of given shape and dtype.

        The array data starts at position *offset* in the file *fd* and
        is little-endian if *little_endian* is true."""
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Adjustments applied to everything that is read:
        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read part of the array (index or slice along first axis)."""
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]
        start, stop, step = i.indices(len(self))
        # Number of elements per index along the first axis:
        stride = np.prod(self.shape[1:], dtype=int)
        offset = self.offset + start * self.itemsize * stride
        self.fd.seek(offset)
        count = (stop - start) * stride
        # Test the cheap flag first; is_compressed() is then only
        # consulted for files that have a fileno().
        if self.hasfileno and not is_compressed(self.fd):
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a.shape = (stop - start,) + self.shape[1:]
        if step != 1:
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # Arrays from frombuffer() are readonly and can't be
                # scaled in-place.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return NDArrayReader for the sub-array at the given indices.

        One index is used for each of the leading axes.  Without indices,
        the proxy covers the whole array."""
        stride = self.size // len(self)
        start = 0
        i = -1  # no axes consumed if there are no indices
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        p = NDArrayReader(self.fd, self.shape[i + 1:], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p

682 

683 

def print_ulm_info(filename, index=None, verbose=False):
    """Print description of the contents of an ulm-file.

    filename: str
        Filename.
    index: int
        Index of item to describe.  Defaults to all items.
    verbose: bool
        Passed on to :meth:`Reader.tostr`.
    """
    # Use the reader as a context manager so that the file is closed
    # again, also when printing fails.
    with ulmopen(filename, 'r') as b:
        indices = range(len(b)) if index is None else [index]
        print('{} (tag: "{}", {})'.format(filename, b.get_tag(),
                                          plural(len(b), 'item')))
        for i in indices:
            print(f'item #{i}:')
            print(b[i].tostr(verbose))

695 

696 

def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader:
        Reader object, or the name of a file to read from.
    writer:
        Writer object, or the name of a file to write to.
    exclude:
        Keys to skip.  Each key is given with its full dotted path and
        a leading dot ('.key', '.child.key').  The set is only read,
        never modified.
    name:
        Dotted path of the reader being copied.  Used when the function
        calls itself for nested readers.

    A reader or writer created here from a filename is also closed
    here; objects passed in by the caller are left open.
    """
    close_reader = False
    close_writer = False
    try:
        if not isinstance(reader, Reader):
            reader = Reader(reader)
            close_reader = True
        if not isinstance(writer, Writer):
            writer = Writer(writer)
            close_writer = True
        for key, value in reader._data.items():
            if name + '.' + key in exclude:
                continue
            if isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, Reader):
                copy(value, writer.child(key), exclude, name + '.' + key)
            else:
                writer.write(key, value)
    finally:
        # Close what we opened ourselves, also if the copy failed.
        if close_reader:
            reader.close()
        if close_writer:
            writer.close()