Coverage for /builds/ase/ase/ase/io/proteindatabank.py: 91.03%
145 statements
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
1# fmt: off
3"""Module to read and write atoms in PDB file format.
5See::
7 http://www.wwpdb.org/documentation/file-format
9Note: The PDB format saves cell lengths and angles; hence the absolute
10orientation is lost when saving. Saving and loading a file will
11conserve the scaled positions, not the absolute ones.
12"""
14import warnings
16import numpy as np
18from ase.atoms import Atoms
19from ase.cell import Cell
20from ase.io.espresso import label_to_symbol
21from ase.utils import reader, writer
24def read_atom_line(line_full):
25 """
26 Read atom line from pdb format
27 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H
28 """
29 line = line_full.rstrip('\n')
30 type_atm = line[0:6]
31 if type_atm == "ATOM " or type_atm == "HETATM":
33 name = line[12:16].strip()
35 altloc = line[16]
36 resname = line[17:21].strip()
37 # chainid = line[21] # Not used
39 seq = line[22:26].split()
40 if len(seq) == 0:
41 resseq = 1
42 else:
43 resseq = int(seq[0]) # sequence identifier
44 # icode = line[26] # insertion code, not used
46 # atomic coordinates
47 try:
48 coord = np.array([float(line[30:38]),
49 float(line[38:46]),
50 float(line[46:54])], dtype=np.float64)
51 except ValueError:
52 raise ValueError("Invalid or missing coordinate(s)")
54 # occupancy & B factor
55 try:
56 occupancy = float(line[54:60])
57 except ValueError:
58 occupancy = None # Rather than arbitrary zero or one
60 if occupancy is not None and occupancy < 0:
61 warnings.warn("Negative occupancy in one or more atoms")
63 try:
64 bfactor = float(line[60:66])
65 except ValueError:
66 # The PDB use a default of zero if the data is missing
67 bfactor = 0.0
69 # segid = line[72:76] # not used
70 symbol = line[76:78].strip().upper()
72 else:
73 raise ValueError("Only ATOM and HETATM supported")
75 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq
78@reader
79def read_proteindatabank(fileobj, index=-1, read_arrays=True):
80 """Read PDB files."""
81 images = []
82 orig = np.identity(3)
83 trans = np.zeros(3)
84 occ = []
85 bfactor = []
86 residuenames = []
87 residuenumbers = []
88 atomtypes = []
90 symbols = []
91 positions = []
92 cell = None
93 pbc = None
95 def build_atoms():
96 atoms = Atoms(symbols=symbols,
97 cell=cell, pbc=pbc,
98 positions=positions)
100 if not read_arrays:
101 return atoms
103 info = {'occupancy': occ,
104 'bfactor': bfactor,
105 'residuenames': residuenames,
106 'atomtypes': atomtypes,
107 'residuenumbers': residuenumbers}
108 for name, array in info.items():
109 if len(array) == 0:
110 pass
111 elif len(array) != len(atoms):
112 warnings.warn('Length of {} array, {}, '
113 'different from number of atoms {}'.
114 format(name, len(array), len(atoms)))
115 else:
116 atoms.set_array(name, np.array(array))
117 return atoms
119 for line in fileobj.readlines():
120 if line.startswith('CRYST1'):
121 cellpar = [float(line[6:15]), # a
122 float(line[15:24]), # b
123 float(line[24:33]), # c
124 float(line[33:40]), # alpha
125 float(line[40:47]), # beta
126 float(line[47:54])] # gamma
127 cell = Cell.new(cellpar)
128 pbc = True
129 for c in range(3):
130 if line.startswith('ORIGX' + '123'[c]):
131 orig[c] = [float(line[10:20]),
132 float(line[20:30]),
133 float(line[30:40])]
134 trans[c] = float(line[45:55])
136 if line.startswith('ATOM') or line.startswith('HETATM'):
137 # Atom name is arbitrary and does not necessarily
138 # contain the element symbol. The specification
139 # requires the element symbol to be in columns 77+78.
140 # Fall back to Atom name for files that do not follow
141 # the spec, e.g. packmol.
143 # line_info = symbol, name, altloc, resname, coord, occupancy,
144 # bfactor, resseq
145 line_info = read_atom_line(line)
147 try:
148 symbol = label_to_symbol(line_info[0])
149 except (KeyError, IndexError):
150 symbol = label_to_symbol(line_info[1])
152 position = np.dot(orig, line_info[4]) + trans
153 atomtypes.append(line_info[1])
154 residuenames.append(line_info[3])
155 if line_info[5] is not None:
156 occ.append(line_info[5])
157 bfactor.append(line_info[6])
158 residuenumbers.append(line_info[7])
160 symbols.append(symbol)
161 positions.append(position)
163 if line.startswith('END'):
164 # End of configuration reached
165 # According to the latest PDB file format (v3.30),
166 # this line should start with 'ENDMDL' (not 'END'),
167 # but in this way PDB trajectories from e.g. CP2K
168 # are supported (also VMD supports this format).
169 atoms = build_atoms()
170 images.append(atoms)
171 occ = []
172 bfactor = []
173 residuenames = []
174 residuenumbers = []
175 atomtypes = []
176 symbols = []
177 positions = []
178 cell = None
179 pbc = None
181 if len(images) == 0:
182 atoms = build_atoms()
183 images.append(atoms)
184 return images[index]
187@writer
188def write_proteindatabank(fileobj, images, write_arrays=True):
189 """Write images to PDB-file."""
190 rot_t = None
191 if hasattr(images, 'get_positions'):
192 images = [images]
194 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890
195 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f'
196 ' %2s \n')
198 # RasMol complains if the atom index exceeds 100000. There might
199 # be a limit of 5 digit numbers in this field.
200 MAXNUM = 100000
202 symbols = images[0].get_chemical_symbols()
203 natoms = len(symbols)
205 for n, atoms in enumerate(images):
206 if atoms.get_pbc().any():
207 currentcell = atoms.get_cell()
208 cellpar = currentcell.cellpar()
209 _, rot_t = currentcell.standard_form()
210 # ignoring Z-value, using P1 since we have all atoms defined
211 # explicitly
212 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n'
213 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2],
214 cellpar[3], cellpar[4], cellpar[5]))
215 fileobj.write('MODEL ' + str(n + 1) + '\n')
216 p = atoms.get_positions()
217 if rot_t is not None:
218 p = p.dot(rot_t.T)
219 occupancy = np.ones(len(atoms))
220 bfactor = np.zeros(len(atoms))
221 residuenames = ['MOL '] * len(atoms)
222 residuenumbers = np.ones(len(atoms))
223 names = atoms.get_chemical_symbols()
224 if write_arrays:
225 if 'occupancy' in atoms.arrays:
226 occupancy = atoms.get_array('occupancy')
227 if 'bfactor' in atoms.arrays:
228 bfactor = atoms.get_array('bfactor')
229 if 'residuenames' in atoms.arrays:
230 residuenames = atoms.get_array('residuenames')
231 if 'residuenumbers' in atoms.arrays:
232 residuenumbers = atoms.get_array('residuenumbers')
233 if 'atomtypes' in atoms.arrays:
234 names = atoms.get_array('atomtypes')
235 for a in range(natoms):
236 x, y, z = p[a]
237 occ = occupancy[a]
238 bf = bfactor[a]
239 resname = residuenames[a].ljust(4)
240 resseq = residuenumbers[a]
241 name = names[a]
242 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq,
243 x, y, z, occ, bf, symbols[a].upper()))
244 fileobj.write('ENDMDL\n')