Coverage for /builds/ase/ase/ase/io/proteindatabank.py: 91.03%

1# fmt: off

3"""Module to read and write atoms in PDB file format.

5See::

7 http://www.wwpdb.org/documentation/file-format

9Note: The PDB format saves cell lengths and angles; hence the absolute

10orientation is lost when saving. Saving and loading a file will

11conserve the scaled positions, not the absolute ones.

12"""

14import warnings

16import numpy as np

18from ase.atoms import Atoms

19from ase.cell import Cell

20from ase.io.espresso import label_to_symbol

21from ase.utils import reader, writer

24def read_atom_line(line_full):

25 """

26 Read atom line from pdb format

27 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H

28 """

29 line = line_full.rstrip('\n')

30 type_atm = line[0:6]

31 if type_atm == "ATOM " or type_atm == "HETATM":

33 name = line[12:16].strip()

35 altloc = line[16]

36 resname = line[17:21].strip()

37 # chainid = line[21] # Not used

39 seq = line[22:26].split()

40 if len(seq) == 0:

41 resseq = 1

42 else:

43 resseq = int(seq[0]) # sequence identifier

44 # icode = line[26] # insertion code, not used

46 # atomic coordinates

47 try:

48 coord = np.array([float(line[30:38]),

49 float(line[38:46]),

50 float(line[46:54])], dtype=np.float64)

51 except ValueError:

52 raise ValueError("Invalid or missing coordinate(s)")

54 # occupancy & B factor

55 try:

56 occupancy = float(line[54:60])

57 except ValueError:

58 occupancy = None # Rather than arbitrary zero or one

60 if occupancy is not None and occupancy < 0:

61 warnings.warn("Negative occupancy in one or more atoms")

63 try:

64 bfactor = float(line[60:66])

65 except ValueError:

66 # The PDB use a default of zero if the data is missing

67 bfactor = 0.0

69 # segid = line[72:76] # not used

70 symbol = line[76:78].strip().upper()

72 else:

73 raise ValueError("Only ATOM and HETATM supported")

75 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq

78@reader

79def read_proteindatabank(fileobj, index=-1, read_arrays=True):

80 """Read PDB files."""

81 images = []

82 orig = np.identity(3)

83 trans = np.zeros(3)

84 occ = []

85 bfactor = []

86 residuenames = []

87 residuenumbers = []

88 atomtypes = []

90 symbols = []

91 positions = []

92 cell = None

93 pbc = None

95 def build_atoms():

96 atoms = Atoms(symbols=symbols,

97 cell=cell, pbc=pbc,

98 positions=positions)

100 if not read_arrays:

101 return atoms

102

103 info = {'occupancy': occ,

104 'bfactor': bfactor,

105 'residuenames': residuenames,

106 'atomtypes': atomtypes,

107 'residuenumbers': residuenumbers}

108 for name, array in info.items():

109 if len(array) == 0:

110 pass

111 elif len(array) != len(atoms):

112 warnings.warn('Length of {} array, {}, '

113 'different from number of atoms {}'.

114 format(name, len(array), len(atoms)))

115 else:

116 atoms.set_array(name, np.array(array))

117 return atoms

118

119 for line in fileobj.readlines():

120 if line.startswith('CRYST1'):

121 cellpar = [float(line[6:15]), # a

122 float(line[15:24]), # b

123 float(line[24:33]), # c

124 float(line[33:40]), # alpha

125 float(line[40:47]), # beta

126 float(line[47:54])] # gamma

127 cell = Cell.new(cellpar)

128 pbc = True

129 for c in range(3):

130 if line.startswith('ORIGX' + '123'[c]):

131 orig[c] = [float(line[10:20]),

132 float(line[20:30]),

133 float(line[30:40])]

134 trans[c] = float(line[45:55])

135

136 if line.startswith('ATOM') or line.startswith('HETATM'):

137 # Atom name is arbitrary and does not necessarily

138 # contain the element symbol. The specification

139 # requires the element symbol to be in columns 77+78.

140 # Fall back to Atom name for files that do not follow

141 # the spec, e.g. packmol.

142

143 # line_info = symbol, name, altloc, resname, coord, occupancy,

144 # bfactor, resseq

145 line_info = read_atom_line(line)

146

147 try:

148 symbol = label_to_symbol(line_info[0])

149 except (KeyError, IndexError):

150 symbol = label_to_symbol(line_info[1])

151

152 position = np.dot(orig, line_info[4]) + trans

153 atomtypes.append(line_info[1])

154 residuenames.append(line_info[3])

155 if line_info[5] is not None:

156 occ.append(line_info[5])

157 bfactor.append(line_info[6])

158 residuenumbers.append(line_info[7])

159

160 symbols.append(symbol)

161 positions.append(position)

162

163 if line.startswith('END'):

164 # End of configuration reached

165 # According to the latest PDB file format (v3.30),

166 # this line should start with 'ENDMDL' (not 'END'),

167 # but in this way PDB trajectories from e.g. CP2K

168 # are supported (also VMD supports this format).

169 atoms = build_atoms()

170 images.append(atoms)

171 occ = []

172 bfactor = []

173 residuenames = []

174 residuenumbers = []

175 atomtypes = []

176 symbols = []

177 positions = []

178 cell = None

179 pbc = None

180

181 if len(images) == 0:

182 atoms = build_atoms()

183 images.append(atoms)

184 return images[index]

185

186

187@writer

188def write_proteindatabank(fileobj, images, write_arrays=True):

189 """Write images to PDB-file."""

190 rot_t = None

191 if hasattr(images, 'get_positions'):

192 images = [images]

193

194 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890

195 format = ('ATOM %5d %4s %4s %4d %8.3f%8.3f%8.3f%6.2f%6.2f'

196 ' %2s \n')

197

198 # RasMol complains if the atom index exceeds 100000. There might

199 # be a limit of 5 digit numbers in this field.

200 MAXNUM = 100000

201

202 symbols = images[0].get_chemical_symbols()

203 natoms = len(symbols)

204

205 for n, atoms in enumerate(images):

206 if atoms.get_pbc().any():

207 currentcell = atoms.get_cell()

208 cellpar = currentcell.cellpar()

209 _, rot_t = currentcell.standard_form()

210 # ignoring Z-value, using P1 since we have all atoms defined

211 # explicitly

212 cellformat = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n'

213 fileobj.write(cellformat % (cellpar[0], cellpar[1], cellpar[2],

214 cellpar[3], cellpar[4], cellpar[5]))

215 fileobj.write('MODEL ' + str(n + 1) + '\n')

216 p = atoms.get_positions()

217 if rot_t is not None:

218 p = p.dot(rot_t.T)

219 occupancy = np.ones(len(atoms))

220 bfactor = np.zeros(len(atoms))

221 residuenames = ['MOL '] * len(atoms)

222 residuenumbers = np.ones(len(atoms))

223 names = atoms.get_chemical_symbols()

224 if write_arrays:

225 if 'occupancy' in atoms.arrays:

226 occupancy = atoms.get_array('occupancy')

227 if 'bfactor' in atoms.arrays:

228 bfactor = atoms.get_array('bfactor')

229 if 'residuenames' in atoms.arrays:

230 residuenames = atoms.get_array('residuenames')

231 if 'residuenumbers' in atoms.arrays:

232 residuenumbers = atoms.get_array('residuenumbers')

233 if 'atomtypes' in atoms.arrays:

234 names = atoms.get_array('atomtypes')

235 for a in range(natoms):

236 x, y, z = p[a]

237 occ = occupancy[a]

238 bf = bfactor[a]

239 resname = residuenames[a].ljust(4)

240 resseq = residuenumbers[a]

241 name = names[a]

242 fileobj.write(format % ((a + 1) % MAXNUM, name, resname, resseq,

243 x, y, z, occ, bf, symbols[a].upper()))

244 fileobj.write('ENDMDL\n')