Coverage for ase / data / pubchem.py: 73.00%

100 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-30 08:22 +0000

1import json 

2import urllib.request 

3import warnings 

4from collections import namedtuple 

5from io import StringIO 

6from urllib.error import HTTPError, URLError 

7 

8from ase.io import read 

9 

# Root of the PubChem PUG REST API; the request URLs in this module are
# built by appending path segments to it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Lightweight (search, field) pair returned by `analyze_input`.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])

13 

14 

class PubchemData:
    """
    Container pairing a structure with its PubChem record.

    Parameters
    ----------
    atoms:
        the structure of the entry; stored unchanged as `self.atoms`
    data:
        the information accompanying the structure; stored unchanged
        as `self.data`
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the object stored in `self.atoms`."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the object stored in `self.data`."""
        return self.data

29 

30 

def search_pubchem_raw(search, field, silent=False):
    """
    A helper function for searching pubchem.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, smiles string or conformer ID depending
        on the `field` you are searching

    field (str):
        the particular field you are searching with. The values produced
        by `analyze_input` are 'name', 'cid', 'smiles' and 'conformers'.
        'conformers' is requested directly below the API root, every
        other field is requested below `compound/`.

    silent (bool):
        if True, skip the extra conformer look-up and the warning that
        is emitted when more than one conformer is available

    returns
    -------
    data (str):
        a string containing the raw response from pubchem.

    raises
    ------
    ValueError:
        if PubChem answers with an HTTP error or the servers cannot
        be reached
    """
    suffix = 'sdf?record_type=3d'

    url = (
        f'{base_url}/{field}/{search!s}/{suffix}'
        if field == 'conformers'
        else f'{base_url}/compound/{field}/{search!s}/{suffix}'
    )
    # HTTPError is a subclass of URLError, so it has to be handled first
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # read the payload right away and close the connection, so that it is
    # released even if the conformer look-up below raises
    with response:
        data = response.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return data

84 

85 

def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters
    ----------
    data (str):
        the raw output from pubchem in string form

    returns
    -------
    atoms (ASE Atoms Object):
        An ASE atoms object containing the information from
        pubchem
    pubchem_data (dict):
        a dictionary containing the non-structural information
        from pubchem. Each key is a field name; the value is a string
        for single-line entries and a list of strings otherwise

    raises
    ------
    ValueError:
        if `data` does not contain a 'PUBCHEM_COMPOUND_CID' field
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise ValueError(
            'There was a problem with the data returned by PubChem'
        )
    atoms = read(StringIO(data), format='sdf')

    # further analyze the text returned from pubchem: everything between
    # the first 'END' line and the '$$$$' record terminator is a sequence
    # of entries with the structure "> <field>\nentry_info\n"
    other_info = data.split('END\n', 1)[1]
    other_info = other_info.split('$$$$')[0]

    pubchem_data = {}
    for data_field in other_info.split('> <'):
        # the text in front of the first field holds no entry
        if not data_field.strip():
            continue
        # split only once so a '>' at a line end inside the value is kept
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        lines = [line for line in entry_value.splitlines() if line != '']
        pubchem_data[field_name] = lines[0] if len(lines) == 1 else lines

    # recover partial charges
    if 'PUBCHEM_MMFF94_PARTIAL_CHARGES' in pubchem_data:
        charge_lines = pubchem_data['PUBCHEM_MMFF94_PARTIAL_CHARGES']
        # a single-line entry was collapsed to a plain string above; it
        # then holds only the count and no per-atom lines follow
        if isinstance(charge_lines, str):
            charge_lines = [charge_lines]
        atom_charges = [0.0] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for line in charge_lines[1:]:
            index, charge = line.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data

140 

141 

def analyze_input(
    name=None, cid=None, smiles=None, conformer=None, silent=False
):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters
    ----------
    name, cid, smiles, conformer:
        the search terms, see `ase.data.pubchem.pubchem_search`.
        Exactly one of them must be given (i.e. not None)
    silent (bool):
        accepted so that callers can pass it along with the search
        term; it is not used by this function

    returns
    -------
    search:
        the search term the user has entered, with every '#' replaced
        by '%23' if it is a string
    field:
        the name of the field being asked for, one of 'name', 'cid',
        'smiles' or 'conformers'

    raises
    ------
    ValueError:
        if no search term or more than one search term was passed in
    """
    # maps the field name used in the request URL to the value passed in
    candidates = {
        'name': name,
        'cid': cid,
        'smiles': smiles,
        'conformers': conformer,
    }
    given = [
        (field, value)
        for field, value in candidates.items()
        if value is not None
    ]

    if len(given) > 1:
        raise ValueError(
            'Only one search term may be entered at a time.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )
    if not given:
        raise ValueError(
            'No search was entered.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )

    field, search = given[0]

    # convert hash (triple bond) to hex for URL
    if isinstance(search, str):
        search = search.replace('#', '%23')

    return PubchemSearch(search, field)

186 

187 

def available_conformer_search(search, field) -> list:
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, or smiles string depending on the
        `field` you are searching

    field (str):
        the particular field you are searching with; it is inserted
        into the request URL below `compound/`. The values produced by
        `analyze_input` are 'name', 'cid', 'smiles' and 'conformers'.

    returns
    -------
    conformers_ids (list):
        a list of the conformer IDs from PubChem, this is different
        than the CID numbers

    raises
    ------
    ValueError:
        if PubChem answers with an HTTP error or the servers cannot
        be reached
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'
    # HTTPError is a subclass of URLError, so it has to be handled first
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # close the connection as soon as the payload has been read
    with response:
        record = json.loads(response.read().decode('utf-8'))
    # only the first entry of the information list is used
    return record['InformationList']['Information'][0]['ConformerID']

230 

231 

def pubchem_search(*args, **kwargs) -> PubchemData:
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search term may be
    passed in at a time.

    Parameters
    ----------
    name (str):
        the common name of the compound you're searching for
    cid (str or int):
        the cid of the compound you're searching for
    smiles (str):
        the smiles string of the compound you're searching for
    conformer (str or int):
        the conformer id of the compound you're searching for
    silent (bool):
        if True, the check for (and warning about) additional
        conformers is skipped. Defaults to False

    returns
    -------
    result (PubchemData):
        a pubchem data object containing the information on the
        requested entry
    """
    # validates the arguments; raises ValueError unless exactly one
    # search term was given
    search, field = analyze_input(*args, **kwargs)
    # `analyze_input` accepts `silent` (keyword or fifth positional
    # argument) but does not use it, so it is forwarded here; otherwise
    # the caller's choice would be ignored
    silent = kwargs.get('silent', args[4] if len(args) > 4 else False)
    raw_pubchem = search_pubchem_raw(search, field, silent=silent)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)

259 

260 

def pubchem_conformer_search(*args, **kwargs) -> list:
    """
    Look up every conformer PubChem lists for a compound.
    Note that only one search term may be passed in at a time.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        one PubchemData object per conformer ID found for the search,
        in the order the IDs were returned
    """
    search, field = analyze_input(*args, **kwargs)
    results = []
    # each conformer ID is fetched with its own `pubchem_search` call
    for conformer_id in available_conformer_search(search, field):
        results.append(pubchem_search(conformer=conformer_id))
    return results

279 

280 

def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the arguments passed in and return only
    the atoms object of the result. Note that only one search term may
    be passed in at a time.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    atoms (ASE Atoms Object):
        an ASE Atoms object containing the information on the
        requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()

298 

299 

def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the arguments passed in and
    return only the atoms objects of the results.
    Note that only one search term may be passed in at a time.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        a list containing the atoms objects of all the conformers
        for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]

317 return conformers