Coverage for ase/data/pubchem.py: 73.00%

1import json

2import urllib.request

3import warnings

4from collections import namedtuple

5from io import StringIO

6from urllib.error import HTTPError, URLError

8from ase.io import read

# Root of the PubChem REST endpoint; every request URL in this module
# is built by appending path components to it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Lightweight (search term, field name) pair describing one query.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])

class PubchemData:
    """
    Container for a single entry from the pubchem database.

    It simply keeps the two objects it is constructed with and hands
    them back through its accessor methods.

    Parameters
    ----------
    atoms:
        the structure belonging to the entry (stored unchanged)
    data:
        the remaining information belonging to the entry
        (stored unchanged)
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the `atoms` object given at construction."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the `data` object given at construction."""
        return self.data

def search_pubchem_raw(search, field, silent=False):
    """
    A helper function for searching pubchem.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, smiles string or conformer ID depending on
        the `field` you are searching
    field (str):
        the particular field you are searching with. 'name' searches
        common names, 'cid' searches the PubChem Chemical
        Identification Numbers which can be found on their website,
        'smiles' searches for compounds with the entered smiles string
        and 'conformers' fetches one specific conformer by its ID.
    silent (bool):
        if False (the default) and `field` is not 'conformers', one
        additional request is made to list the conformers of the
        compound and a warning is issued when there is more than one.

    returns
    -------
    data (str):
        a string containing the raw response from pubchem.

    raises
    ------
    ValueError:
        if PubChem answers the request with an HTTP error or if the
        server cannot be reached at all.
    """
    suffix = 'sdf?record_type=3d'

    # conformer records sit directly below the API root, every other
    # field is addressed through the `compound` domain
    if field == 'conformers':
        url = f'{base_url}/{field}/{search!s}/{suffix}'
    else:
        url = f'{base_url}/compound/{field}/{search!s}/{suffix}'

    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        # must be tested before URLError: HTTPError is a subclass of it
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # read right away and close the connection so that it is released
    # even when the conformer lookup below raises
    with response:
        data = response.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return data

def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters
    ----------
    data (str):
        the raw output from pubchem in string form

    returns
    -------
    atoms (ASE Atoms Object):
        An ASE atoms object containing the information from
        pubchem
    pubchem_data (dict):
        a dictionary containing the non-structural information
        from pubchem. Fields with a single non-empty line map to a
        str, all other fields map to a list of their non-empty lines.

    raises
    ------
    ValueError:
        if `data` does not contain a 'PUBCHEM_COMPOUND_CID' entry
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        # ValueError (still an Exception) matches the other helpers here
        raise ValueError(
            'There was a problem with the data returned by PubChem'
        )
    atoms = read(StringIO(data), format='sdf')

    # further analyze the text returned from pubchem; only the first
    # 'END\n' is used as separator so that a later value line ending in
    # 'END' cannot cut the record short
    other_info = data.split('END\n', 1)[1]
    other_info = other_info.split('$')[0]  # remove the $$$$ at the end

    # the structure of this string is > <field>\nentry_info\n
    pubchem_data = {}
    for data_field in other_info.split('> <'):  # split into the fields
        if data_field == '':
            continue
        # split once only: the value itself may contain '>\n'
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        entry_value = [a for a in entry_value.splitlines() if a != '']
        if len(entry_value) == 1:
            entry_value = entry_value[0]
        pubchem_data[field_name] = entry_value

    # recover partial charges
    charge_entry = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charge_entry is not None:
        # the first entry just contains the number of atoms with charges;
        # an entry collapsed to a plain str above holds that count only
        charges = charge_entry[1:] if isinstance(charge_entry, list) else []
        # each subsequent entry contains the index and charge of the atoms
        atom_charges = [0.0] * len(atoms)
        for charge_line in charges:
            i, charge = charge_line.split()
            # indices start at 1
            atom_charges[int(i) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data

140

141

def analyze_input(
    name=None, cid=None, smiles=None, conformer=None, silent=False
):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`. `silent` is accepted but
    not used by this function.

    returns
    -------
    search:
        the search term the user has entered; in a str every '#' is
        replaced by '%23'
    field:
        the name of the field being asked for, one of 'name', 'cid',
        'smiles' or 'conformers'

    raises
    ------
    ValueError:
        if none or more than one of the search terms is given
    """
    inputs = [name, cid, smiles, conformer]
    input_fields = ['name', 'cid', 'smiles', 'conformers']
    inputs_check = [a is not None for a in inputs]
    n_given = inputs_check.count(True)

    if n_given > 1:
        raise ValueError(
            'Only one search term may be entered at a time.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )
    if n_given == 0:
        raise ValueError(
            'No search was entered.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )

    # Figure out which input has been passed in
    index = inputs_check.index(True)
    field = input_fields[index]
    search = inputs[index]

    # convert hash (triple bond) to hex for URL
    # NOTE: '#' is the only character escaped here, any other
    # URL-reserved character is passed through unchanged
    if isinstance(search, str):
        search = search.replace('#', '%23')

    return PubchemSearch(search, field)

186

187

def available_conformer_search(search, field) -> list:
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer
    ids of a structure.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, or smiles string depending on the
        `field` you are searching
    field (str):
        the particular field you are searching with. 'name' searches
        common names, 'cid' searches the PubChem Chemical
        Identification Numbers which can be found on their website and
        'smiles' searches for compounds with the entered smiles string.

    returns
    -------
    conformers_ids (list):
        a list of the conformer IDs from PubChem, this is different
        than the CID numbers

    raises
    ------
    ValueError:
        if PubChem answers the request with an HTTP error or if the
        server cannot be reached at all.
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        # must be tested before URLError: HTTPError is a subclass of it
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # the context manager closes the connection once the body is read
    with response:
        record = json.loads(response.read().decode('utf-8'))

    # only the first `Information` entry of the reply is consulted
    return record['InformationList']['Information'][0]['ConformerID']

230

231

def pubchem_search(*args, **kwargs) -> PubchemData:
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search term may be
    passed in at a time.

    Parameters
    ----------
    name (str):
        the common name of the compound you're searching for
    cid (str or int):
        the cid of the compound you're searching for
    smiles (str):
        the smiles string of the compound you're searching for
    conformer (str or int):
        the conformer id of the compound you're searching for
    silent (bool):
        if True, the additional conformer lookup and the resulting
        multiple-conformer warning are skipped. Defaults to False.

    returns
    -------
    result (PubchemData):
        a pubchem data object containing the information on the
        requested entry
    """
    search, field = analyze_input(*args, **kwargs)
    # `analyze_input` validates `silent` (fifth positional argument or
    # keyword) but does not hand it back, so pick it up here; without
    # this the flag never reached the actual request
    silent = args[4] if len(args) > 4 else kwargs.get('silent', False)
    raw_pubchem = search_pubchem_raw(search, field, silent=silent)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)

259

260

def pubchem_conformer_search(*args, **kwargs) -> list:
    """
    Look up every conformer PubChem lists for one compound.
    Exactly one search term has to be supplied.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        one PubchemData object per conformer ID found for the search,
        in the order the IDs were returned
    """
    term, field_name = analyze_input(*args, **kwargs)
    ids = available_conformer_search(term, field_name)

    # every conformer is fetched with its own request
    results = []
    for conformer_id in ids:
        results.append(pubchem_search(conformer=conformer_id))
    return results

279

280

def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and return only the
    atoms object of the result. Exactly one search term has to be
    supplied.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    atoms (ASE Atoms Object):
        an ASE Atoms object containing the information on the
        requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()

298

299

def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and return
    only the atoms objects of the results. Exactly one search term has
    to be supplied.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        a list containing the atoms objects of all the conformers
        for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]

Coverage for ase / data / pubchem.py: 73.00%

100 statements