Coverage for ase / data / pubchem.py: 73.00%
100 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 08:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 08:22 +0000
1import json
2import urllib.request
3import warnings
4from collections import namedtuple
5from io import StringIO
6from urllib.error import HTTPError, URLError
8from ase.io import read
# Root URL of the PubChem PUG REST service; request URLs are built on it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Lightweight (search, field) pair, as returned by `analyze_input`.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])
class PubchemData:
    """
    Container pairing a structure with its accompanying PubChem data.

    Parameters
    ----------
    atoms:
        the structure object to store; handed back by `get_atoms`
    data:
        the PubChem data to store; handed back by `get_pubchem_data`
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the stored atoms object."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the stored PubChem data."""
        return self.data
def search_pubchem_raw(search, field, silent=False):
    """
    A helper function for searching pubchem.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, smiles string or conformer ID depending
        on the `field` you are searching

    field (str):
        the particular field you are searching with. Possible values
        are 'name', 'cid', 'smiles' and 'conformers'. 'name' searches
        common names, 'cid' searches the PubChem Chemical
        Identification Numbers which can be found on their website,
        'smiles' searches for compounds with the entered smiles string
        and 'conformers' fetches one conformer by its conformer ID.

    silent (bool):
        if True, the extra conformer lookup and the resulting
        multiple-conformer warning are skipped.

    returns
    -------
    data (str):
        a string containing the raw response from pubchem.

    raises
    ------
    ValueError:
        if the search term cannot be found for the given field or the
        PubChem servers cannot be reached.
    """
    suffix = 'sdf?record_type=3d'

    if field == 'conformers':
        # conformer requests are not placed under the /compound namespace
        url = f'{base_url}/{field}/{search!s}/{suffix}'
    else:
        url = f'{base_url}/compound/{field}/{search!s}/{suffix}'

    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # read inside a context manager so the HTTP response is always closed,
    # even if the conformer lookup below raises
    with response:
        data = response.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return data
def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters
    ----------
    data (str):
        the raw output from pubchem in string form

    returns
    -------
    atoms (ASE Atoms Object):
        An ASE atoms object containing the information from
        pubchem. If the record carries the field
        'PUBCHEM_MMFF94_PARTIAL_CHARGES', those values are set as the
        initial charges of the atoms.
    pubchem_data (dict):
        a dictionary containing the non-structural information
        from pubchem, keyed by field name. A value is a single string
        when the field holds one non-empty line and a list of strings
        otherwise.

    raises
    ------
    ValueError:
        if `data` does not contain a 'PUBCHEM_COMPOUND_CID' field.
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise ValueError(
            'There was a problem with the data returned by PubChem'
        )
    atoms = read(StringIO(data), format='sdf')

    # everything after the first 'END' line is the data-field section;
    # maxsplit keeps field values that themselves end in 'END' intact
    other_info = data.split('END\n', 1)[1]
    # cut at the '$$$$' record delimiter rather than at any single '$',
    # which could legitimately occur inside a field value
    other_info = other_info.split('$$$$')[0]

    # the structure of this string is > <field>\nentry_info\n
    pubchem_data = {}
    for data_field in other_info.split('> <'):
        if not data_field.strip():
            continue
        # split only once so a value line ending in '>' cannot break parsing
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        lines = [line for line in entry_value.splitlines() if line != '']
        pubchem_data[field_name] = lines[0] if len(lines) == 1 else lines

    # recover partial charges
    charge_lines = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charge_lines is not None:
        if isinstance(charge_lines, str):
            # a lone count line was collapsed to a string above; re-wrap it
            # so the slice below drops that line instead of one character
            charge_lines = [charge_lines]
        atom_charges = [0.0] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for line in charge_lines[1:]:
            index, value = line.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(value)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data
def analyze_input(
    name=None, cid=None, smiles=None, conformer=None, silent=False
):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    silent (bool):
        accepted so callers can pass it through, but not used here.

    returns
    -------
    search:
        the search term the user has entered. If it is a string, any
        '#' is replaced by '%23' so it can be placed in a URL.
    field:
        the name of the field being asked for, one of 'name', 'cid',
        'smiles' or 'conformers'

    raises
    ------
    ValueError:
        if no search term, or more than one search term, is given.
    """
    # maps the PubChem field name to the value passed for it
    candidates = {
        'name': name,
        'cid': cid,
        'smiles': smiles,
        'conformers': conformer,
    }
    given = [
        (field, term) for field, term in candidates.items()
        if term is not None
    ]

    if len(given) > 1:
        raise ValueError(
            'Only one search term may be entered at a time.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )
    if not given:
        raise ValueError(
            'No search was entered.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )

    field, search = given[0]

    # convert hash (triple bond) to hex for URL
    if isinstance(search, str):
        search = search.replace('#', '%23')

    return PubchemSearch(search, field)
def available_conformer_search(search, field) -> list:
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters
    ----------
    search (str or int):
        the compound you are searching for. This can be either
        a common name, CID, or smiles string depending on the
        `field` you are searching

    field (str):
        the particular field you are searching with. Possible values
        are 'name', 'cid', and 'smiles'. 'name' searches common names,
        'cid' searches the PubChem Chemical Identification Numbers
        which can be found on their website and 'smiles' searches for
        compounds with the entered smiles string.

    returns
    -------
    conformers_ids (list):
        a list of the conformer IDs from PubChem, this is different
        than the CID numbers

    raises
    ------
    ValueError:
        if the search term cannot be found for the given field or the
        PubChem servers cannot be reached.
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'
    try:
        response = urllib.request.urlopen(url)
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # the context manager guarantees the HTTP response is closed after reading
    with response:
        record = json.loads(response.read().decode('utf-8'))
    # the conformer IDs are taken from the first entry of the information list
    return record['InformationList']['Information'][0]['ConformerID']
def pubchem_search(*args, **kwargs) -> PubchemData:
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters
    ----------
    name (str):
        the common name of the compound you're searching for
    cid (str or int):
        the cid of the compound you're searching for
    smiles (str):
        the smiles string of the compound you're searching for
    conformer (str or int):
        the conformer id of the compound you're searching for
    silent (bool):
        if True, the check for (and warning about) multiple conformers
        is skipped. Defaults to False.

    returns
    -------
    result (PubchemData):
        a pubchem data object containing the information on the
        requested entry
    """
    search, field = analyze_input(*args, **kwargs)
    # `analyze_input` validates `silent` but does not hand it back, so pick
    # it up here (keyword, or fifth positional argument) and forward it;
    # otherwise a caller's silent=True would be ignored
    silent = kwargs.get('silent', args[4] if len(args) > 4 else False)
    raw_pubchem = search_pubchem_raw(search, field, silent=silent)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)
def pubchem_conformer_search(*args, **kwargs) -> list:
    """
    Look up every conformer PubChem lists for a compound and fetch each.
    Only one search argument may be given per call.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        one PubchemData object per conformer ID found for the search,
        in the order the IDs were returned
    """
    term, kind = analyze_input(*args, **kwargs)
    results = []
    # each conformer ID is fetched with its own `pubchem_search` call
    for conformer_id in available_conformer_search(term, kind):
        results.append(pubchem_search(conformer=conformer_id))
    return results
def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and return only the
    atoms object of the result. Only one search argument may be given
    per call.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    atoms (ASE Atoms Object):
        an ASE Atoms object containing the information on the
        requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()
def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and return
    only the atoms objects of the results. Only one search argument may
    be given per call.

    Parameters
    ----------
    see `ase.data.pubchem.pubchem_search`

    returns
    -------
    conformers (list):
        a list containing the atoms objects of all the conformers
        for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]