Coverage for /builds/ase/ase/ase/io/proteindatabank.py: 91.03%

145 statements  

« prev     ^ index     » next       coverage.py v7.5.3, created at 2025-08-02 00:12 +0000

1# fmt: off 

2 

3"""Module to read and write atoms in PDB file format. 

4 

5See:: 

6 

7 http://www.wwpdb.org/documentation/file-format 

8 

9Note: The PDB format saves cell lengths and angles; hence the absolute 

10orientation is lost when saving. Saving and loading a file will 

11conserve the scaled positions, not the absolute ones. 

12""" 

13 

14import warnings 

15 

16import numpy as np 

17 

18from ase.atoms import Atoms 

19from ase.cell import Cell 

20from ase.io.espresso import label_to_symbol 

21from ase.utils import reader, writer 

22 

23 

24def read_atom_line(line_full): 

25 """ 

26 Read atom line from pdb format 

27 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H 

28 """ 

29 line = line_full.rstrip('\n') 

30 type_atm = line[0:6] 

31 if type_atm == "ATOM " or type_atm == "HETATM": 

32 

33 name = line[12:16].strip() 

34 

35 altloc = line[16] 

36 resname = line[17:21].strip() 

37 # chainid = line[21] # Not used 

38 

39 seq = line[22:26].split() 

40 if len(seq) == 0: 

41 resseq = 1 

42 else: 

43 resseq = int(seq[0]) # sequence identifier 

44 # icode = line[26] # insertion code, not used 

45 

46 # atomic coordinates 

47 try: 

48 coord = np.array([float(line[30:38]), 

49 float(line[38:46]), 

50 float(line[46:54])], dtype=np.float64) 

51 except ValueError: 

52 raise ValueError("Invalid or missing coordinate(s)") 

53 

54 # occupancy & B factor 

55 try: 

56 occupancy = float(line[54:60]) 

57 except ValueError: 

58 occupancy = None # Rather than arbitrary zero or one 

59 

60 if occupancy is not None and occupancy < 0: 

61 warnings.warn("Negative occupancy in one or more atoms") 

62 

63 try: 

64 bfactor = float(line[60:66]) 

65 except ValueError: 

66 # The PDB use a default of zero if the data is missing 

67 bfactor = 0.0 

68 

69 # segid = line[72:76] # not used 

70 symbol = line[76:78].strip().upper() 

71 

72 else: 

73 raise ValueError("Only ATOM and HETATM supported") 

74 

75 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq 

76 

77 

78@reader 

79def read_proteindatabank(fileobj, index=-1, read_arrays=True): 

80 """Read PDB files.""" 

81 images = [] 

82 orig = np.identity(3) 

83 trans = np.zeros(3) 

84 occ = [] 

85 bfactor = [] 

86 residuenames = [] 

87 residuenumbers = [] 

88 atomtypes = [] 

89 

90 symbols = [] 

91 positions = [] 

92 cell = None 

93 pbc = None 

94 

95 def build_atoms(): 

96 atoms = Atoms(symbols=symbols, 

97 cell=cell, pbc=pbc, 

98 positions=positions) 

99 

100 if not read_arrays: 

101 return atoms 

102 

103 info = {'occupancy': occ, 

104 'bfactor': bfactor, 

105 'residuenames': residuenames, 

106 'atomtypes': atomtypes, 

107 'residuenumbers': residuenumbers} 

108 for name, array in info.items(): 

109 if len(array) == 0: 

110 pass 

111 elif len(array) != len(atoms): 

112 warnings.warn('Length of {} array, {}, ' 

113 'different from number of atoms {}'. 

114 format(name, len(array), len(atoms))) 

115 else: 

116 atoms.set_array(name, np.array(array)) 

117 return atoms 

118 

119 for line in fileobj.readlines(): 

120 if line.startswith('CRYST1'): 

121 cellpar = [float(line[6:15]), # a 

122 float(line[15:24]), # b 

123 float(line[24:33]), # c 

124 float(line[33:40]), # alpha 

125 float(line[40:47]), # beta 

126 float(line[47:54])] # gamma 

127 cell = Cell.new(cellpar) 

128 pbc = True 

129 for c in range(3): 

130 if line.startswith('ORIGX' + '123'[c]): 

131 orig[c] = [float(line[10:20]), 

132 float(line[20:30]), 

133 float(line[30:40])] 

134 trans[c] = float(line[45:55]) 

135 

136 if line.startswith('ATOM') or line.startswith('HETATM'): 

137 # Atom name is arbitrary and does not necessarily 

138 # contain the element symbol. The specification 

139 # requires the element symbol to be in columns 77+78. 

140 # Fall back to Atom name for files that do not follow 

141 # the spec, e.g. packmol. 

142 

143 # line_info = symbol, name, altloc, resname, coord, occupancy, 

144 # bfactor, resseq 

145 line_info = read_atom_line(line) 

146 

147 try: 

148 symbol = label_to_symbol(line_info[0]) 

149 except (KeyError, IndexError): 

150 symbol = label_to_symbol(line_info[1]) 

151 

152 position = np.dot(orig, line_info[4]) + trans 

153 atomtypes.append(line_info[1]) 

154 residuenames.append(line_info[3]) 

155 if line_info[5] is not None: 

156 occ.append(line_info[5]) 

157 bfactor.append(line_info[6]) 

158 residuenumbers.append(line_info[7]) 

159 

160 symbols.append(symbol) 

161 positions.append(position) 

162 

163 if line.startswith('END'): 

164 # End of configuration reached 

165 # According to the latest PDB file format (v3.30), 

166 # this line should start with 'ENDMDL' (not 'END'), 

167 # but in this way PDB trajectories from e.g. CP2K 

168 # are supported (also VMD supports this format). 

169 atoms = build_atoms() 

170 images.append(atoms) 

171 occ = [] 

172 bfactor = [] 

173 residuenames = [] 

174 residuenumbers = [] 

175 atomtypes = [] 

176 symbols = [] 

177 positions = [] 

178 cell = None 

179 pbc = None 

180 

181 if len(images) == 0: 

182 atoms = build_atoms() 

183 images.append(atoms) 

184 return images[index] 

185 

186 

187@writer 

188def write_proteindatabank(fileobj, images, write_arrays=True): 

189 """Write images to PDB-file.""" 

190 rot_t = None 

191 if hasattr(images, 'get_positions'): 

192 images = [images] 

193 

194 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890 

195 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f' 

196 ' %2s \n') 

197 

198 # RasMol complains if the atom index exceeds 100000. There might 

199 # be a limit of 5 digit numbers in this field. 

200 MAXNUM = 100000 

201 

202 symbols = images[0].get_chemical_symbols() 

203 natoms = len(symbols) 

204 

205 for n, atoms in enumerate(images): 

206 if atoms.get_pbc().any(): 

207 currentcell = atoms.get_cell() 

208 cellpar = currentcell.cellpar() 

209 _, rot_t = currentcell.standard_form() 

210 # ignoring Z-value, using P1 since we have all atoms defined 

211 # explicitly 

212 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n' 

213 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2], 

214 cellpar[3], cellpar[4], cellpar[5])) 

215 fileobj.write('MODEL ' + str(n + 1) + '\n') 

216 p = atoms.get_positions() 

217 if rot_t is not None: 

218 p = p.dot(rot_t.T) 

219 occupancy = np.ones(len(atoms)) 

220 bfactor = np.zeros(len(atoms)) 

221 residuenames = ['MOL '] * len(atoms) 

222 residuenumbers = np.ones(len(atoms)) 

223 names = atoms.get_chemical_symbols() 

224 if write_arrays: 

225 if 'occupancy' in atoms.arrays: 

226 occupancy = atoms.get_array('occupancy') 

227 if 'bfactor' in atoms.arrays: 

228 bfactor = atoms.get_array('bfactor') 

229 if 'residuenames' in atoms.arrays: 

230 residuenames = atoms.get_array('residuenames') 

231 if 'residuenumbers' in atoms.arrays: 

232 residuenumbers = atoms.get_array('residuenumbers') 

233 if 'atomtypes' in atoms.arrays: 

234 names = atoms.get_array('atomtypes') 

235 for a in range(natoms): 

236 x, y, z = p[a] 

237 occ = occupancy[a] 

238 bf = bfactor[a] 

239 resname = residuenames[a].ljust(4) 

240 resseq = residuenumbers[a] 

241 name = names[a] 

242 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq, 

243 x, y, z, occ, bf, symbols[a].upper())) 

244 fileobj.write('ENDMDL\n')