Coverage for /builds/ase/ase/ase/data/pubchem.py: 73.00%

100 statements  

« prev     ^ index     » next       coverage.py v7.5.3, created at 2025-08-02 00:12 +0000

1import json 

2import urllib.request 

3import warnings 

4from collections import namedtuple 

5from io import StringIO 

6from urllib.error import HTTPError, URLError 

7 

8from ase.io import read 

9 

# Root of the PubChem PUG REST API; the request URLs in this module are
# built by appending path segments to it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Lightweight (search, field) pair describing a single query.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])

13 

14 

class PubchemData:
    """
    Container pairing a structure with its PubChem information.

    Parameters:
        atoms:
            the structure belonging to the entry; stored as given
        data:
            the accompanying PubChem information; stored as given
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the stored structure."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the stored PubChem information."""
        return self.data

29 

30 

def search_pubchem_raw(search, field, silent=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching

        field (str):
            the particular field you are searching with, e.g. 'name',
            'cid' or 'smiles'. 'name' searches common names, 'cid'
            searches the PubChem Chemical Identification Numbers which
            can be found on their website, and 'smiles' searches for
            compounds with the entered smiles string. The value
            'conformers' is handled specially: the request is sent to
            the conformer endpoint instead of the compound endpoint.

        silent (bool):
            if True, skip the lookup of available conformers and the
            warning that is issued when more than one exists

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if the server answers with an HTTP error, or if the server
            cannot be reached at all
    """
    suffix = 'sdf?record_type=3d'

    if field == 'conformers':
        url = f'{base_url}/{field}/{search!s}/{suffix}'
    else:
        url = f'{base_url}/compound/{field}/{search!s}/{suffix}'

    # HTTPError is a subclass of URLError, so it has to be handled first
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # read inside a context manager so the connection is always closed
    with response:
        data = response.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return data

82 

83 

def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms object containing the information from
            pubchem
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem. A field with a single non-empty line is stored
            as a string, any other field as a list of its non-empty lines

    raises:
        ValueError:
            if `data` does not contain 'PUBCHEM_COMPOUND_CID'
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise ValueError('There was a problem with the data returned by PubChem')
    atoms = read(StringIO(data), format='sdf')

    # further analyze the text returned from pubchem
    pubchem_data = {}
    other_info = data.split('END\n')[1]
    other_info = other_info.split('$')[0]  # remove the $$$$ at the end
    # the structure of this string is > <field>\nentry_info\n
    for data_field in other_info.split('> <'):
        # skip the empty/whitespace-only text in front of the first field
        if not data_field.strip():
            continue
        # split only on the first '>\n' so that a value which itself
        # contains '>\n' does not break the unpacking
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        lines = [line for line in entry_value.splitlines() if line != '']
        pubchem_data[field_name] = lines[0] if len(lines) == 1 else lines

    # recover partial charges
    charges = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charges is not None:
        # a one-line entry was collapsed to a plain string above; wrap it so
        # that slicing below drops a list item rather than a character
        if isinstance(charges, str):
            charges = [charges]
        atom_charges = [0.0] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for entry in charges[1:]:
            index, value = entry.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(value)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data

136 

137 

def analyze_input(
    name=None, cid=None, smiles=None, conformer=None, silent=False
):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters:
        name, cid, smiles, conformer:
            the possible search terms; exactly one of them must be
            given (i.e. not None)
        silent (bool):
            accepted so that callers can pass it through; it is not
            used by this function

    returns:
        search:
            the search term the user has entered, with any '#' in a
            string replaced by '%23'
        field:
            the name of the field being asked for ('name', 'cid',
            'smiles' or 'conformers')

    raises:
        ValueError:
            if more than one search term, or none at all, was given
    """
    candidates = [
        ('name', name),
        ('cid', cid),
        ('smiles', smiles),
        ('conformers', conformer),
    ]
    provided = [(key, value) for key, value in candidates if value is not None]

    if len(provided) > 1:
        raise ValueError(
            'Only one search term may be entered at a time.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )
    if not provided:
        raise ValueError(
            'No search was entered.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )

    field, search = provided[0]

    # convert hash (triple bond) to hex for URL
    if isinstance(search, str):
        search = search.replace('#', '%23')

    return PubchemSearch(search, field)

181 

182 

def available_conformer_search(search, field) -> list:
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching

        field (str):
            the particular field you are searching with, e.g. 'name',
            'cid' or 'smiles'. 'name' searches common names, 'cid'
            searches the PubChem Chemical Identification Numbers which
            can be found on their website, and 'smiles' searches for
            compounds with the entered smiles string.

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if the server answers with an HTTP error, or if the server
            cannot be reached at all
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'

    # HTTPError is a subclass of URLError, so it has to be handled first
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # read inside a context manager so the connection is always closed
    with response:
        record = json.loads(response.read().decode('utf-8'))
    return record['InformationList']['Information'][0]['ConformerID']

224 

225 

def pubchem_search(*args, **kwargs) -> PubchemData:
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, do not warn when the compound has more than one
            conformer. Defaults to False.

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """
    search, field = analyze_input(*args, **kwargs)

    # analyze_input accepts `silent` (keyword, or fifth positional argument)
    # but only hands back (search, field), so recover the flag here and
    # forward it; otherwise the conformer warning could never be suppressed.
    if 'silent' in kwargs:
        silent = kwargs['silent']
    elif len(args) > 4:
        silent = args[4]
    else:
        silent = False

    raw_pubchem = search_pubchem_raw(search, field, silent=silent)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)

251 

252 

def pubchem_conformer_search(*args, **kwargs) -> list:
    """
    Look up every conformer PubChem lists for a compound.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            one PubchemData object per conformer ID found for the
            search, in the order the IDs were returned
    """
    term, field_name = analyze_input(*args, **kwargs)
    found = []
    # each conformer ID is fetched with its own conformer query
    for conformer_id in available_conformer_search(term, field_name):
        found.append(pubchem_search(conformer=conformer_id))
    return found

269 

270 

def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and return only the
    atoms object of the result. Note that only one argument may be
    passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()

286 

287 

def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and return
    only the atoms objects of the results.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]

303 return conformers